template <typename T>
ir::ImmediateIndex GenWriter::processSeqConstant(ConstantDataSequential *seq,
int index, ConstTypeId tid) {
if (index >= 0) {
const T data = GET_EFFECT_DATA(seq, index, tid);
return ctx.newImmediate(data);
} else {
vector<T> array;
for(uint32_t i = 0; i < seq->getNumElements(); i++)
array.push_back(GET_EFFECT_DATA(seq, i, tid));
return ctx.newImmediate((T*)&array[0], array.size());
}
}
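/*! Translate a ConstantVector: either pick one element (index >= 0) or
* build a composed immediate out of the per-element immediates.
*/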
ir::ImmediateIndex GenWriter::processConstantVector(ConstantVector *cv, int index) {
if (index >= 0) {
Constant *c = cv->getOperand(index);
return processConstantImmIndex(c, -1);
} else {
vector<ir::ImmediateIndex> immVector;
for (uint32_t i = 0; i < cv->getNumOperands(); i++)
immVector.push_back(processConstantImmIndex(cv->getOperand(i)));
return ctx.newImmediate(immVector, getType(ctx, cv->getType()->getElementType()));
}
}
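/*! Translate one LLVM constant (sequential data, aggregate zero, integer,
* null pointer, undef, float/half/double or vector) into an immediate index.
* A negative index means "the whole constant"; a non-negative one selects a
* single element.
*/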
ir::ImmediateIndex GenWriter::processConstantImmIndexImpl(Constant *CPV, int32_t index)
{
GBE_ASSERT(dyn_cast<ConstantExpr>(CPV) == NULL);
#if LLVM_VERSION_MINOR > 0
ConstantDataSequential *seq = dyn_cast<ConstantDataSequential>(CPV);
if (seq) {
Type *Ty = seq->getElementType();
if (Ty == Type::getInt1Ty(CPV->getContext())) {
return processSeqConstant<bool>(seq, index, CONST_INT);
} else if (Ty == Type::getInt8Ty(CPV->getContext())) {
return processSeqConstant<uint8_t>(seq, index, CONST_INT);
} else if (Ty == Type::getInt16Ty(CPV->getContext())) {
return processSeqConstant<uint16_t>(seq, index, CONST_INT);
} else if (Ty == Type::getInt32Ty(CPV->getContext())) {
return processSeqConstant<uint32_t>(seq, index, CONST_INT);
} else if (Ty == Type::getInt64Ty(CPV->getContext())) {
return processSeqConstant<uint64_t>(seq, index, CONST_INT);
} else if (Ty == Type::getFloatTy(CPV->getContext())) {
return processSeqConstant<float>(seq, index, CONST_FLOAT);
} else if (Ty == Type::getDoubleTy(CPV->getContext())) {
return processSeqConstant<double>(seq, index, CONST_DOUBLE);
} else if (Ty == Type::getHalfTy(CPV->getContext())) {
GBE_ASSERTM(0, "Constant data array should never be half float\n");
}
} else
#endif /* LLVM_VERSION_MINOR > 0 */
if (dyn_cast<ConstantAggregateZero>(CPV)) {
Type* Ty = CPV->getType();
if(Ty->isVectorTy())
Ty = (cast<VectorType>(Ty))->getElementType();
if (Ty == Type::getInt1Ty(CPV->getContext())) {
const bool b = 0;
return ctx.newImmediate(b);
} else if (Ty == Type::getInt8Ty(CPV->getContext())) {
const uint8_t u8 = 0;
return ctx.newImmediate(u8);
} else if (Ty == Type::getInt16Ty(CPV->getContext())) {
const uint16_t u16 = 0;
return ctx.newImmediate(u16);
} else if (Ty == Type::getInt32Ty(CPV->getContext())) {
const uint32_t u32 = 0;
return ctx.newImmediate(u32);
} else if (Ty == Type::getInt64Ty(CPV->getContext())) {
const uint64_t u64 = 0;
return ctx.newImmediate(u64);
} else if (Ty == Type::getFloatTy(CPV->getContext())) {
const float f32 = 0;
return ctx.newImmediate(f32);
} else if (Ty == Type::getHalfTy(CPV->getContext())) {
const ir::half f16 = 0;
return ctx.newImmediate(f16);
} else if (Ty == Type::getDoubleTy(CPV->getContext())) {
const double f64 = 0;
return ctx.newImmediate(f64);
} else {
GBE_ASSERTM(false, "Unsupporte aggregate zero type.");
return ctx.newImmediate(uint32_t(0));
}
} else {
if (dyn_cast<ConstantVector>(CPV))
return processConstantVector(dyn_cast<ConstantVector>(CPV), index);
GBE_ASSERTM(dyn_cast<ConstantExpr>(CPV) == NULL, "Unsupported constant expression");
// Integers
if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
Type* Ty = CI->getType();
if (Ty == Type::getInt1Ty(CPV->getContext())) {
const bool b = CI->getZExtValue();
return ctx.newImmediate(b);
} else if (Ty == Type::getInt8Ty(CPV->getContext())) {
const uint8_t u8 = CI->getZExtValue();
return ctx.newImmediate(u8);
} else if (Ty == Type::getInt16Ty(CPV->getContext())) {
const uint16_t u16 = CI->getZExtValue();
return ctx.newImmediate(u16);
} else if (Ty == Type::getInt32Ty(CPV->getContext())) {
const uint32_t u32 = CI->getZExtValue();
return ctx.newImmediate(u32);
} else if (Ty == Type::getInt64Ty(CPV->getContext())) {
const uint64_t u64 = CI->getZExtValue();
return ctx.newImmediate(u64);
} else {
if (CI->getValue().getActiveBits() > 64) {
ctx.getUnit().setValid(false);
return ctx.newImmediate(uint64_t(0));
}
return ctx.newImmediate(uint64_t(CI->getZExtValue()));
}
}
// NULL pointers
if(isa<ConstantPointerNull>(CPV)) {
return ctx.newImmediate(uint32_t(0));
}
const Type::TypeID typeID = CPV->getType()->getTypeID();
if (isa<UndefValue>(CPV)) {
Type* Ty = CPV->getType();
if (Ty == Type::getInt1Ty(CPV->getContext())) return ctx.newImmediate(false);
if (Ty == Type::getInt8Ty(CPV->getContext())) return ctx.newImmediate((uint8_t)0);
if (Ty == Type::getInt16Ty(CPV->getContext())) return ctx.newImmediate((uint16_t)0);
if (Ty == Type::getInt32Ty(CPV->getContext())) return ctx.newImmediate((uint32_t)0);
if (Ty == Type::getInt64Ty(CPV->getContext())) return ctx.newImmediate((uint64_t)0);
if (Ty == Type::getFloatTy(CPV->getContext())) return ctx.newImmediate((float)0);
if (Ty == Type::getHalfTy(CPV->getContext())) return ctx.newImmediate((ir::half)0);
if (Ty == Type::getDoubleTy(CPV->getContext())) return ctx.newImmediate((double)0);
GBE_ASSERT(0 && "Unsupported undef value type.\n");
}
// Floats and doubles
switch (typeID) {
case Type::FloatTyID:
case Type::HalfTyID:
case Type::DoubleTyID:
{
ConstantFP *FPC = cast<ConstantFP>(CPV);
GBE_ASSERT(isa<UndefValue>(CPV) == false);
if (FPC->getType() == Type::getFloatTy(CPV->getContext())) {
const float f32 = FPC->getValueAPF().convertToFloat();
return ctx.newImmediate(f32);
} else if (FPC->getType() == Type::getDoubleTy(CPV->getContext())) {
const double f64 = FPC->getValueAPF().convertToDouble();
return ctx.newImmediate(f64);
} else {
llvm::APFloat apf = FPC->getValueAPF();
llvm::APInt api = apf.bitcastToAPInt();
uint64_t v64 = api.getZExtValue();
uint16_t v16 = static_cast<uint16_t>(v64);
const ir::half f16(v16);
return ctx.newImmediate(f16);
}
}
break;
default:
GBE_ASSERTM(false, "Unsupported constant type");
break;
}
}
GBE_ASSERTM(false, "Unsupported constant type");
return ctx.newImmediate(uint64_t(0));
}
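/*! Public entry point for constant translation. Constant expressions are
* not supported: we dump them and assert.
*/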
ir::ImmediateIndex GenWriter::processConstantImmIndex(Constant *CPV, int32_t index) {
if (dyn_cast<ConstantExpr>(CPV) == NULL)
return processConstantImmIndexImpl(CPV, index);
CPV->dump();
GBE_ASSERT(0 && "unsupported constant.\n");
return ctx.newImmediate((uint32_t)0);
}
const ir::Immediate &GenWriter::processConstantImm(Constant *CPV, int32_t index) {
ir::ImmediateIndex immIndex = processConstantImmIndex(CPV, index);
return ctx.getFunction().getImmediate(immIndex);
}
ir::ImmediateIndex GenWriter::newImmediate(Constant *CPV, uint32_t index) {
return processConstantImmIndex(CPV, index);
}
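/*! Allocate Gen IR register(s) for an LLVM value: scalars get one register,
* while vectors and structures are scalarized into one register per element.
*/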
void GenWriter::newRegister(Value *value, Value *key, bool uniform) {
auto type = value->getType();
auto typeID = type->getTypeID();
switch (typeID) {
case Type::IntegerTyID:
case Type::FloatTyID:
case Type::HalfTyID:
case Type::DoubleTyID:
case Type::PointerTyID:
regTranslator.newScalar(value, key, 0, uniform);
break;
case Type::VectorTyID:
{
auto vectorType = cast<VectorType>(type);
const uint32_t elemNum = vectorType->getNumElements();
for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
regTranslator.newScalar(value, key, elemID, uniform);
break;
}
case Type::StructTyID:
{
auto structType = cast<StructType>(type);
const uint32_t elemNum = structType->getNumElements();
for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
regTranslator.newScalar(value, key, elemID, uniform);
break;
}
default: NOT_SUPPORTED;
};
}
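/*! Get a register holding the given constant: global values already own a
* register, undef values load a zero, and any other constant is loaded with
* LOADI from a freshly allocated immediate.
*/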
ir::Register GenWriter::getConstantRegister(Constant *c, uint32_t elemID) {
GBE_ASSERT(c != NULL);
if(isa<GlobalValue>(c)) {
return regTranslator.getScalar(c, elemID);
}
if(isa<UndefValue>(c)) {
Type* llvmType = c->getType();
ir::Type dstType = getType(ctx, llvmType);
ir::Register reg = ctx.reg(getFamily(dstType));
ir::ImmediateIndex immIndex;
if(llvmType->isIntegerTy())
immIndex = ctx.newIntegerImmediate(0, dstType);
else if(llvmType->isFloatTy()) {
immIndex = ctx.newFloatImmediate((float)0.0);
} else {
immIndex = ctx.newDoubleImmediate((double)0.0);
}
ctx.LOADI(dstType, reg, immIndex);
return reg;
}
const ir::ImmediateIndex immIndex = this->newImmediate(c, elemID);
const ir::Immediate imm = ctx.getImmediate(immIndex);
const ir::Register reg = ctx.reg(getFamily(imm.getType()));
ctx.LOADI(imm.getType(), reg, immIndex);
return reg;
}
ir::Register GenWriter::getRegister(Value *value, uint32_t elemID) {
// The real value may be a constant, so get the real value before the constant check.
regTranslator.getRealValue(value, elemID);
if(isa<Constant>(value)) {
Constant *c = dyn_cast<Constant>(value);
return getConstantRegister(c, elemID);
} else
return regTranslator.getScalar(value, elemID);
}
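/*! The "copy" of a PHI node is keyed by the PHI pointer plus one. Since an
* LLVM Value is never allocated at an odd address, ptr+1 cannot collide with
* a real Value pointer, which makes it a cheap unique key for the copy.
*/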
INLINE Value *GenWriter::getPHICopy(Value *PHI) {
const uintptr_t ptr = (uintptr_t) PHI;
return (Value*) (ptr+1);
}
void GenWriter::newLabelIndex(const BasicBlock *bb) {
if (labelMap.find(bb) == labelMap.end()) {
const ir::LabelIndex label = ctx.label();
labelMap[bb] = label;
}
}
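/*! When the taken successor of a conditional branch is the fall-through
* block, record its single-use comparison so the branch can later be emitted
* with an inverted predicate instead of an extra jump.
*/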
void GenWriter::simplifyTerminator(BasicBlock *bb) {
Value *value = --bb->end();
BranchInst *I = NULL;
if ((I = dyn_cast<BranchInst>(value)) != NULL) {
if (I->isConditional() == false)
return;
// If the "taken" successor is the next block, we try to invert the
// branch.
BasicBlock *succ = I->getSuccessor(0);
if (std::next(Function::iterator(bb)) != Function::iterator(succ))
return;
// More than one use is too complicated: we skip it
Value *condition = I->getCondition();
if (condition->hasOneUse() == false)
return;
// Right now, we only invert comparison instructions
ICmpInst *CI = dyn_cast<ICmpInst>(condition);
if (CI != NULL) {
GBE_ASSERT(conditionSet.find(CI) == conditionSet.end());
conditionSet.insert(CI);
return;
}
}
}
void GenWriter::emitBasicBlock(BasicBlock *BB) {
GBE_ASSERT(labelMap.find(BB) != labelMap.end());
ctx.LABEL(labelMap[BB]);
for (auto II = BB->begin(), E = BB->end(); II != E; ++II) visit(*II);
}
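/*! De-SSA pass for one CFG edge: for each PHI node of the successor block,
* move the value incoming from the current block into the PHI copy register.
*/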
void GenWriter::emitMovForPHI(BasicBlock *curr, BasicBlock *succ) {
for (BasicBlock::iterator I = succ->begin(); isa<PHINode>(I); ++I) {
PHINode *PN = cast<PHINode>(I);
Value *IV = PN->getIncomingValueForBlock(curr);
Type *llvmType = PN->getType();
const ir::Type type = getType(ctx, llvmType);
Value *PHICopy = this->getPHICopy(PN);
const ir::Register dst = this->getRegister(PHICopy);
if (!isa<UndefValue>(IV)) {
// Emit the MOV required by the PHI function. We keep it simple and do
// not try to optimize it here; a later data flow analysis pass on the
// Gen IR will remove the redundant moves.
Constant *CP = dyn_cast<Constant>(IV);
if (CP) {
GBE_ASSERT(isa<GlobalValue>(CP) == false);
ConstantVector *CPV = dyn_cast<ConstantVector>(CP);
if (CPV && dyn_cast<ConstantVector>(CPV) &&
isa<UndefValue>(extractConstantElem(CPV, 0)))
continue;
ctx.MOV(type, dst, getRegister(CP));
} else if (regTranslator.valueExists(IV,0) || dyn_cast<Constant>(IV)) {
const ir::Register src = this->getRegister(IV);
ctx.MOV(type, dst, src);
}
assert(!ctx.getBlock()->undefPhiRegs.contains(dst));
ctx.getBlock()->definedPhiRegs.insert(dst);
} else {
// If this is an undefined value, we don't need to emit a phi copy here.
// But we do need to record it so that the later backward liveness
// analysis does not propagate the phi value/register into this BB where
// the phi value is undefined. Otherwise, the phi value's liveness would
// be extended incorrectly, possibly all the way to basic block zero,
// which is really bad.
ctx.getBlock()->undefPhiRegs.insert(dst);
}
}
}
/*! To track read image args and write args */
struct ImageArgsInfo{
uint32_t readImageArgs;
uint32_t writeImageArgs;
};
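/*! Count one read or write image argument (the default access qualifier is
* read_only per the OpenCL spec) and check the binding table limits.
*/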
static void collectImageArgs(std::string& accessQual, ImageArgsInfo& imageArgsInfo)
{
if(accessQual.find("read") != std::string::npos)
{
imageArgsInfo.readImageArgs++;
GBE_ASSERT(imageArgsInfo.readImageArgs <= BTI_MAX_READ_IMAGE_ARGS);
}
else if(accessQual.find("write") != std::string::npos)
{
imageArgsInfo.writeImageArgs++;
GBE_ASSERT(imageArgsInfo.writeImageArgs <= BTI_MAX_WRITE_IMAGE_ARGS);
}
else
{
//default is read_only per spec.
imageArgsInfo.readImageArgs++;
GBE_ASSERT(imageArgsInfo.readImageArgs <= BTI_MAX_READ_IMAGE_ARGS);
}
}
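/*! Parse the kernel metadata (work group size attributes, argument address
* spaces, type names and qualifiers) and create the input registers for all
* kernel arguments.
*/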
void GenWriter::emitFunctionPrototype(Function &F)
{
GBE_ASSERTM(F.hasStructRetAttr() == false,
"Returned value for kernel functions is forbidden");
// Loop over the kernel metadata to set the required work group size.
size_t reqd_wg_sz[3] = {0, 0, 0};
size_t hint_wg_sz[3] = {0, 0, 0};
ir::FunctionArgument::InfoFromLLVM llvmInfo;
MDNode *addrSpaceNode = NULL;
MDNode *typeNameNode = NULL;
MDNode *accessQualNode = NULL;
MDNode *typeQualNode = NULL;
MDNode *argNameNode = NULL;
std::string functionAttributes;
/* First find the metadata belonging to this function. */
MDNode *node = getKernelFunctionMetadata(&F);
/* Because of "-cl-kernel-arg-info", the function should always have metadata. */
if (!F.arg_empty())
assert(node);
for(uint j = 0; j < node->getNumOperands() - 1; j++) {
MDNode *attrNode = dyn_cast_or_null<MDNode>(node->getOperand(1 + j));
if (attrNode == NULL) break;
MDString *attrName = dyn_cast_or_null<MDString>(attrNode->getOperand(0));
if (!attrName) continue;
if (attrName->getString() == "reqd_work_group_size") {
GBE_ASSERT(attrNode->getNumOperands() == 4);
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
ConstantInt *x = dyn_cast<ConstantInt>(attrNode->getOperand(1));
ConstantInt *y = dyn_cast<ConstantInt>(attrNode->getOperand(2));
ConstantInt *z = dyn_cast<ConstantInt>(attrNode->getOperand(3));
#else
ConstantInt *x = mdconst::extract<ConstantInt>(attrNode->getOperand(1));
ConstantInt *y = mdconst::extract<ConstantInt>(attrNode->getOperand(2));
ConstantInt *z = mdconst::extract<ConstantInt>(attrNode->getOperand(3));
#endif
GBE_ASSERT(x && y && z);
reqd_wg_sz[0] = x->getZExtValue();
reqd_wg_sz[1] = y->getZExtValue();
reqd_wg_sz[2] = z->getZExtValue();
functionAttributes += attrName->getString();
std::stringstream param;
char buffer[100];
param <<"(";
param << reqd_wg_sz[0];
param << ",";
param << reqd_wg_sz[1];
param << ",";
param << reqd_wg_sz[2];
param <<")";
param >> buffer;
functionAttributes += buffer;
functionAttributes += " ";
break;
} else if (attrName->getString() == "kernel_arg_addr_space") {
addrSpaceNode = attrNode;
} else if (attrName->getString() == "kernel_arg_access_qual") {
accessQualNode = attrNode;
} else if (attrName->getString() == "kernel_arg_type") {
typeNameNode = attrNode;
} else if (attrName->getString() == "kernel_arg_type_qual") {
typeQualNode = attrNode;
} else if (attrName->getString() == "kernel_arg_name") {
argNameNode = attrNode;
} else if (attrName->getString() == "vec_type_hint") {
GBE_ASSERT(attrNode->getNumOperands() == 3);
functionAttributes += attrName->getString();
functionAttributes += " ";
} else if (attrName->getString() == "work_group_size_hint") {
GBE_ASSERT(attrNode->getNumOperands() == 4);
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
ConstantInt *x = dyn_cast<ConstantInt>(attrNode->getOperand(1));
ConstantInt *y = dyn_cast<ConstantInt>(attrNode->getOperand(2));
ConstantInt *z = dyn_cast<ConstantInt>(attrNode->getOperand(3));
#else
ConstantInt *x = mdconst::extract<ConstantInt>(attrNode->getOperand(1));
ConstantInt *y = mdconst::extract<ConstantInt>(attrNode->getOperand(2));
ConstantInt *z = mdconst::extract<ConstantInt>(attrNode->getOperand(3));
#endif
GBE_ASSERT(x && y && z);
hint_wg_sz[0] = x->getZExtValue();
hint_wg_sz[1] = y->getZExtValue();
hint_wg_sz[2] = z->getZExtValue();
functionAttributes += attrName->getString();
std::stringstream param;
char buffer[100];
param <<"(";
param << hint_wg_sz[0];
param << ",";
param << hint_wg_sz[1];
param << ",";
param << hint_wg_sz[2];
param <<")";
param >> buffer;
functionAttributes += buffer;
functionAttributes += " ";
}
}
ctx.getFunction().setCompileWorkGroupSize(reqd_wg_sz[0], reqd_wg_sz[1], reqd_wg_sz[2]);
ctx.getFunction().setFunctionAttributes(functionAttributes);
// Loop over the arguments and output registers for them
if (!F.arg_empty()) {
uint32_t argID = 0;
ImageArgsInfo imageArgsInfo = {};
Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
// Insert a new register for each function argument
#if LLVM_VERSION_MINOR <= 1
const AttrListPtr &PAL = F.getAttributes();
#endif /* LLVM_VERSION_MINOR <= 1 */
for (; I != E; ++I, ++argID) {
const std::string &argName = I->getName().str();
Type *type = I->getType();
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
llvmInfo.addrSpace = (cast<ConstantInt>(addrSpaceNode->getOperand(1 + argID)))->getZExtValue();
#else
llvmInfo.addrSpace = (mdconst::extract<ConstantInt>(addrSpaceNode->getOperand(1 + argID)))->getZExtValue();
#endif
llvmInfo.typeName = (cast<MDString>(typeNameNode->getOperand(1 + argID)))->getString();
llvmInfo.accessQual = (cast<MDString>(accessQualNode->getOperand(1 + argID)))->getString();
llvmInfo.typeQual = (cast<MDString>(typeQualNode->getOperand(1 + argID)))->getString();
if(argNameNode){
llvmInfo.argName = (cast<MDString>(argNameNode->getOperand(1 + argID)))->getString();
}
// function arguments are uniform values.
this->newRegister(I, NULL, true);
// add support for vector argument.
if(type->isVectorTy()) {
VectorType *vectorType = cast<VectorType>(type);
ir::Register reg = getRegister(I, 0);
Type *elemType = vectorType->getElementType();
const uint32_t elemSize = getTypeByteSize(unit, elemType);
const uint32_t elemNum = vectorType->getNumElements();
// a vector's element type is always a scalar type
ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), 0);
ir::Function& fn = ctx.getFunction();
for(uint32_t i=1; i < elemNum; i++) {
ir::PushLocation argLocation(fn, argID, elemSize*i);
reg = getRegister(I, i);
ctx.appendPushedConstant(reg, argLocation); //add to push map for reg alloc
}
continue;
}
GBE_ASSERTM(isScalarType(type) == true,
"vector type in the function argument is not supported yet");
const ir::Register reg = getRegister(I);
if (llvmInfo.isImageType()) {
ctx.input(argName, ir::FunctionArgument::IMAGE, reg, llvmInfo, 4, 4, 0);
ctx.getFunction().getImageSet()->append(reg, &ctx, BtiMap.find(I)->second);
collectImageArgs(llvmInfo.accessQual, imageArgsInfo);
continue;
}
if (llvmInfo.isSamplerType()) {
ctx.input(argName, ir::FunctionArgument::SAMPLER, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), 0);
(void)ctx.getFunction().getSamplerSet()->append(reg, &ctx);
continue;
}
if (type->isPointerTy() == false)
ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), 0);
else {
PointerType *pointerType = dyn_cast<PointerType>(type);
Type *pointed = pointerType->getElementType();
// By value structure
#if LLVM_VERSION_MINOR <= 1
if (PAL.paramHasAttr(argID+1, Attribute::ByVal)) {
#else
if (I->hasByValAttr()) {
#endif /* LLVM_VERSION_MINOR <= 1 */
const size_t structSize = getTypeByteSize(unit, pointed);
ctx.input(argName, ir::FunctionArgument::STRUCTURE, reg, llvmInfo, structSize, getAlignmentByte(unit, type), 0);
}
// Regular user provided pointer (global, local or constant)
else {
const uint32_t addr = pointerType->getAddressSpace();
const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(addr);
const uint32_t ptrSize = getTypeByteSize(unit, type);
const uint32_t align = getAlignmentByte(unit, pointed);
switch (addrSpace) {
case ir::MEM_GLOBAL:
ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, llvmInfo, ptrSize, align, BtiMap.find(I)->second);
break;
case ir::MEM_LOCAL:
ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg, llvmInfo, ptrSize, align, BTI_LOCAL);
ctx.getFunction().setUseSLM(true);
break;
case ir::MEM_CONSTANT:
ctx.input(argName, ir::FunctionArgument::CONSTANT_POINTER, reg, llvmInfo, ptrSize, align, 0x2);
break;
default: GBE_ASSERT(addrSpace != ir::MEM_PRIVATE);
}
}
}
}
}
// When returning a structure, first input register is the pointer to the
// structure
#if GBE_DEBUG
const Type *type = F.getReturnType();
GBE_ASSERTM(type->isVoidTy() == true,
"Returned value for kernel functions is forbidden");
// Variable number of arguments is not supported
FunctionType *FT = cast<FunctionType>(F.getFunctionType());
GBE_ASSERT(FT->isVarArg() == false);
#endif /* GBE_DEBUG */
}
static inline bool isFPIntBitCast(const Instruction &I) {
if (!isa<BitCastInst>(I))
return false;
Type *SrcTy = I.getOperand(0)->getType();
Type *DstTy = I.getType();
return (SrcTy->isFloatingPointTy() && DstTy->isIntegerTy()) ||
(DstTy->isFloatingPointTy() && SrcTy->isIntegerTy());
}
/*! To track last read and write of the registers */
struct RegInfoForMov {
ir::Instruction *lastWriteInsn;
ir::Instruction *lastReadInsn;
uint32_t lastWrite;
uint32_t lastRead;
};
/*! Replace register "from" by register "to" in the destination(s) */
static void replaceDst(ir::Instruction *insn, ir::Register from, ir::Register to) {
const uint32_t dstNum = insn->getDstNum();
for (uint32_t dstID = 0; dstID < dstNum; ++dstID)
if (insn->getDst(dstID) == from)
insn->setDst(dstID, to);
}
/*! Replace register "from" by register "to" in the source(s) */
static void replaceSrc(ir::Instruction *insn, ir::Register from, ir::Register to) {
const uint32_t srcNum = insn->getSrcNum();
for (uint32_t srcID = 0; srcID < srcNum; ++srcID)
if (insn->getSrc(srcID) == from)
insn->setSrc(srcID, to);
}
/*! lastUse maintains data about last uses (reads/writes) for each
* ir::Register
*/
static void buildRegInfo(ir::BasicBlock &bb, vector<RegInfoForMov> &lastUse)
{
// Clear the register usages
for (auto &x : lastUse) {
x.lastWrite = x.lastRead = 0;
x.lastWriteInsn = x.lastReadInsn = NULL;
}
// Find use intervals for all registers (distinguish sources and
// destinations)
uint32_t insnID = 2;
bb.foreach([&](ir::Instruction &insn) {
if (insn.getOpcode() == ir::OP_MOV &&
insn.getDst(0) == insn.getSrc(0)) {
insn.remove();
return;
}
const uint32_t dstNum = insn.getDstNum();
const uint32_t srcNum = insn.getSrcNum();
for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
const ir::Register reg = insn.getSrc(srcID);
lastUse[reg].lastRead = insnID;
lastUse[reg].lastReadInsn = &insn;
}
for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
const ir::Register reg = insn.getDst(dstID);
lastUse[reg].lastWrite = insnID+1;
lastUse[reg].lastWriteInsn = &insn;
}
insnID+=2;
});
}
void GenWriter::optimizePhiCopy(ir::Liveness &liveness, ir::Function &fn,
map<ir::Register, ir::Register> &replaceMap,
map<ir::Register, ir::Register> &redundantPhiCopyMap)
{
// The overall idea behind is we check whether there is any interference
// between phi and phiCopy live range. If there is no point that
// phi & phiCopy are both alive, then we can optimize off the move
// from phiCopy to phi, and use phiCopy directly instead of phi.
// Right now the algorithm is still very conservative; we need to do
// aggressive coalescing for the moves added during phi elimination.
using namespace ir;
ir::FunctionDAG *dag = new ir::FunctionDAG(liveness);
for (auto &it : phiMap) {
const Register phi = it.first;
const Register phiCopy = it.second;
const ir::DefSet *phiCopyDef = dag->getRegDef(phiCopy);
const ir::UseSet *phiUse = dag->getRegUse(phi);
const DefSet *phiDef = dag->getRegDef(phi);
bool isOpt = true;
// FIXME: under some situations phiDef may be empty, which looks like a bug
// when building the FunctionDAG; it should be fixed there.
if (phiDef->empty()) continue;
const ir::BasicBlock *phiDefBB = (*phiDef->begin())->getInstruction()->getParent();
for (auto &x : *phiCopyDef) {
const ir::Instruction * phiCopyDefInsn = x->getInstruction();
const ir::BasicBlock *bb = phiCopyDefInsn->getParent();
const Liveness::LiveOut &out = liveness.getLiveOut(bb);
// phi & phiCopy are both alive at the endpoint of bb,
// thus can not be optimized.
if (out.contains(phi)) {
isOpt = false;
break;
}
const ir::Register phiCopySrc = phiCopyDefInsn->getSrc(0);
const ir::UseSet *phiCopySrcUse = dag->getRegUse(phiCopySrc);
const ir::DefSet *phiCopySrcDef = dag->getRegDef(phiCopySrc);
// We should only do coalescing on instruction-defined, SSA values
if (phiCopySrcDef->size() == 1 && (*(phiCopySrcDef->begin()))->getType() == ValueDef::DEF_INSN_DST) {
const ir::Instruction *phiCopySrcDefInsn = (*(phiCopySrcDef->begin()))->getInstruction();
if(bb == phiDefBB && bb == phiCopySrcDefInsn->getParent()) {
// phiCopy, phiCopySrc defined in same basicblock as phi
// try to coalesce phiCopy and phiCopySrc first.
// consider below situation:
// bb1:
// ...
// bb2:
// x = phi [x1, bb1], [x2, bb2]
// x2 = x+1;
// after de-ssa:
// bb2:
// mov x, x-copy
// add x2, x, 1
// mov x-copy, x2
// obviously x, x2 and x-copy can be mapped to the same virtual register
ir::BasicBlock::const_iterator iter = ir::BasicBlock::const_iterator(phiCopySrcDefInsn);
ir::BasicBlock::const_iterator iterE = bb->end();
iter++;
// check no use of phi in this basicblock between [phiCopySrc def, bb end]
bool phiPhiCopySrcInterfere = false;
while (iter != iterE) {
const ir::Instruction *insn = iter.node();
// check phiUse
for (unsigned i = 0; i < insn->getSrcNum(); i++) {
ir::Register src = insn->getSrc(i);
if (src == phi) {
phiPhiCopySrcInterfere = true; break;
}
}
++iter;
}
if (!phiPhiCopySrcInterfere) {
replaceSrc(const_cast<Instruction *>(phiCopyDefInsn), phiCopySrc, phiCopy);
for (auto &s : *phiCopySrcDef) {
const Instruction *phiSrcDefInsn = s->getInstruction();
replaceDst(const_cast<Instruction *>(phiSrcDefInsn), phiCopySrc, phiCopy);
}
for (auto &s : *phiCopySrcUse) {
const Instruction *phiSrcUseInsn = s->getInstruction();
replaceSrc(const_cast<Instruction *>(phiSrcUseInsn), phiCopySrc, phiCopy);
}
replaceMap.insert(std::make_pair(phiCopySrc, phiCopy));
}
}
} else {
// FIXME: if phiCopySrc is itself a phi value and is used by more than one
// phiCopy, this 1:1 map will ignore all but the first one.
if (((*(phiCopySrcDef->begin()))->getType() == ValueDef::DEF_INSN_DST) &&
redundantPhiCopyMap.find(phiCopySrc) == redundantPhiCopyMap.end())
redundantPhiCopyMap.insert(std::make_pair(phiCopySrc, phiCopy));
}
// If phi is used in the same BB that define the phiCopy,
// we need to carefully check the liveness of phi & phiCopy.
// Make sure their live ranges do not interfere.
bool phiUsedInSameBB = false;
for (auto &y : *phiUse) {
const ir::Instruction *phiUseInsn = y->getInstruction();
const ir::BasicBlock *bb2 = phiUseInsn->getParent();
if (bb2 == bb) {
phiUsedInSameBB = true;
}
}
// Check that phi is not used between the phiCopy def point and bb's end
// point, which is often referred to as the 'phi swap issue', e.g.:
// MOV phiCopy_1, x;
// MOV phiCopy_2, phi_1;
if (phiUsedInSameBB) {
for (auto it = --bb->end(); it != bb->end(); --it) {
const Instruction &p = *it;
if (&p == phiCopyDefInsn) break;
// we only care MOV here
if (p.getSrcNum() == 1 && p.getSrc(0) == phi) {
isOpt = false;
break;
}
}
}
}
// coalesce phi and phiCopy
if (isOpt) {
for (auto &x : *phiDef) {
replaceDst(const_cast<Instruction *>(x->getInstruction()), phi, phiCopy);
}
for (auto &x : *phiUse) {
const Instruction *phiUseInsn = x->getInstruction();
replaceSrc(const_cast<Instruction *>(phiUseInsn), phi, phiCopy);
replaceMap.insert(std::make_pair(phi, phiCopy));
}
}
}
delete dag;
}
void GenWriter::postPhiCopyOptimization(ir::Liveness &liveness,
ir::Function &fn, map<ir::Register, ir::Register> &replaceMap,
map<ir::Register, ir::Register> &redundantPhiCopyMap)
{
// During the first pass of phi copy optimization, we skip all the phi src
// MOV cases whose phi src defs are also phi values. We handle them here,
// once all phi copy optimizations have been done, so we don't need to worry
// about reducible phi copies remaining. We only need to check whether those
// possibly redundant phi copy pairs interfere with each other globally, by
// leveraging the DAG information.
using namespace ir;
// First, validate all entries of the possibly redundant phi copy map and
// update the liveness information accordingly.
if (replaceMap.size() != 0) {
for (auto pair : replaceMap) {
if (redundantPhiCopyMap.find(pair.first) != redundantPhiCopyMap.end()) {
auto it = redundantPhiCopyMap.find(pair.first);
Register phiCopy = it->second;
Register newPhiCopySrc = pair.second;
redundantPhiCopyMap.erase(it);
redundantPhiCopyMap.insert(std::make_pair(newPhiCopySrc, phiCopy));
}
}
liveness.replaceRegs(replaceMap);
replaceMap.clear();
}
if (redundantPhiCopyMap.size() == 0)
return;
auto dag = new FunctionDAG(liveness);
map<Register, Register> newRedundant;
map<Register, Register> *curRedundant = &redundantPhiCopyMap;
map<Register, Register> *nextRedundant = &newRedundant, tmp;
map<Register, Register> replacedRegs, revReplacedRegs;
// Do multi pass redundant phi copy elimination based on the global interfering information.
// FIXME, we don't need to re-compute the whole DAG for each pass.
while (curRedundant->size() > 0) {
for (auto &pair : *curRedundant) {
auto phiCopySrc = pair.first;
auto phiCopy = pair.second;
if (replacedRegs.find(phiCopy) != replacedRegs.end() ||
revReplacedRegs.find(phiCopy) != revReplacedRegs.end() ||
revReplacedRegs.find(phiCopySrc) != revReplacedRegs.end())
continue;
if (!dag->interfere(liveness, phiCopySrc, phiCopy)) {
const ir::DefSet *phiCopySrcDef = dag->getRegDef(phiCopySrc);
const ir::UseSet *phiCopySrcUse = dag->getRegUse(phiCopySrc);
for (auto &s : *phiCopySrcDef) {
const Instruction *phiSrcDefInsn = s->getInstruction();
replaceDst(const_cast<Instruction *>(phiSrcDefInsn), phiCopySrc, phiCopy);
}
for (auto &s : *phiCopySrcUse) {
const Instruction *phiSrcUseInsn = s->getInstruction();
replaceSrc(const_cast<Instruction *>(phiSrcUseInsn), phiCopySrc, phiCopy);
}
replacedRegs.insert(std::make_pair(phiCopySrc, phiCopy));
revReplacedRegs.insert(std::make_pair(phiCopy, phiCopySrc));
curRedundant->erase(phiCopySrc);
}
}
if (replacedRegs.size() != 0) {
liveness.replaceRegs(replacedRegs);
for (auto &pair : *curRedundant) {
auto from = pair.first;
auto to = pair.second;
bool revisit = false;
if (replacedRegs.find(pair.second) != replacedRegs.end()) {
to = replacedRegs.find(to)->second;
revisit = true;
}
if (revReplacedRegs.find(from) != revReplacedRegs.end() ||
revReplacedRegs.find(to) != revReplacedRegs.end())
revisit = true;
if (revisit)
nextRedundant->insert(std::make_pair(from, to));
}
std::swap(curRedundant, nextRedundant);
} else
break;
nextRedundant->clear();
replacedRegs.clear();
revReplacedRegs.clear();
delete dag;
dag = new ir::FunctionDAG(liveness);
}
delete dag;
}
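/*! Block-local copy removal: for a MOV whose source is written and fully
* consumed inside the block, patch the producer to write the destination
* directly, e.g. (names illustrative only):
*   ADD tmp, a, b; MOV dst, tmp  ==>  ADD dst, a, b
*/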
void GenWriter::removeMOVs(const ir::Liveness &liveness, ir::Function &fn)
{
// We store the last write and last read for each register
const uint32_t regNum = fn.regNum();
vector<RegInfoForMov> lastUse;
lastUse.resize(regNum);
// Remove the MOVs per block (local analysis only) Note that we do not try
// to remove MOV for variables that outlives the block. So we use liveness
// information to figure out which variable is alive
fn.foreachBlock([&](ir::BasicBlock &bb)
{
// We need to know when each register will be read or written
buildRegInfo(bb, lastUse);
// Liveinfo helps us to know if the source outlives the block
const ir::Liveness::BlockInfo &info = liveness.getBlockInfo(&bb);
auto it = --bb.end();
if (it->isMemberOf<ir::BranchInstruction>() == true) --it;
for (auto it = --bb.end(); it != bb.end();) {
ir::Instruction *insn = &*it; it--;
const ir::Opcode op = insn->getOpcode();
if (op == ir::OP_MOV) {
const ir::Register dst = insn->getDst(0);
const ir::Register src = insn->getSrc(0);
// Outlives the block. We do not do anything
if (info.inLiveOut(src))
continue;
const RegInfoForMov &dstInfo = lastUse[dst];
const RegInfoForMov &srcInfo = lastUse[src];
// The source is not computed in this block
if (srcInfo.lastWrite == 0)
continue;
// dst is read after src is written. We cannot overwrite dst
if (dstInfo.lastRead > srcInfo.lastWrite)
continue;
// We are good. We first patch the destination then all the sources
replaceDst(srcInfo.lastWriteInsn, src, dst);
// Then we patch all subsequent uses of the source
ir::Instruction *next = static_cast<ir::Instruction*>(srcInfo.lastWriteInsn->next);
while (next != insn) {
replaceSrc(next, src, dst);
next = static_cast<ir::Instruction*>(next->next);
}
insn->remove();
} else if (op == ir::OP_LOADI)
continue;
else
break;
}
});
}
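/*! Block-local value numbering for immediates: keep the first LOADI of a
* given immediate and rewrite later LOADIs of the same immediate (when their
* destination does not outlive the block) to reuse the first register.
*/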
void GenWriter::removeLOADIs(const ir::Liveness &liveness, ir::Function &fn)
{
// We store the last write and last read for each register
const uint32_t regNum = fn.regNum();
vector<RegInfoForMov> lastUse;
lastUse.resize(regNum);
// Traverse all blocks and remove redundant immediates. Do *not* remove
// immediates that outlive the block
fn.foreachBlock([&](ir::BasicBlock &bb)
{
// Each immediate that is already loaded in the block
map<ir::Immediate, ir::Register> loadedImm;
// Immediate to immediate translation
map<ir::Register, ir::Register> immTranslate;
// Liveinfo helps us to know if the loaded immediate outlives the block
const ir::Liveness::BlockInfo &info = liveness.getBlockInfo(&bb);
// We need to know when each register will be read or written
buildRegInfo(bb, lastUse);
// Top bottom traversal -> remove useless LOADIs
uint32_t insnID = 2;
bb.foreach([&](ir::Instruction &insn)
{
// We either try to remove the LOADI or we will try to use it as a
// replacement for the next same LOADIs
if (insn.isMemberOf<ir::LoadImmInstruction>()) {
ir::LoadImmInstruction &loadImm = cast<ir::LoadImmInstruction>(insn);
const ir::Immediate imm = loadImm.getImmediate();
const ir::Register dst = loadImm.getDst(0);
// Not here: cool, we put it in the map if the register is not
// overwritten. If it is, we just ignore it for simplicity. Note that
// it should not happen with the way we "unSSA" the code
auto it = loadedImm.find(imm);
auto end = loadedImm.end();
if (it == end && lastUse[dst].lastWrite == insnID+1)
loadedImm.insert(std::make_pair(imm, dst));
// We already pushed the same immediate and we do not outlive the
// block. We are good to replace this immediate by the previous one
else if (it != end && info.inLiveOut(dst) == false) {
immTranslate.insert(std::make_pair(dst, it->second));
insn.remove();
}
}
// Traverse all the destinations and sources and perform the
// substitutions (if any)
else {
const uint32_t srcNum = insn.getSrcNum();
const uint32_t dstNum = insn.getDstNum();
for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
const ir::Register src = insn.getSrc(srcID);
auto it = immTranslate.find(src);
if (it != immTranslate.end())
insn.setSrc(srcID, it->second);
}
for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
const ir::Register dst = insn.getDst(dstID);
auto it = immTranslate.find(dst);
if (it != immTranslate.end())
insn.setDst(dstID, it->second);
}
}
insnID += 2;
});
});
}
BVAR(OCL_OPTIMIZE_PHI_MOVES, true);
BVAR(OCL_OPTIMIZE_LOADI, true);
static const Instruction *getInstructionUseLocal(const Value *v) {
// A local variable can only be used in one kernel function. So, if we find
// one instruction that uses the local variable, we can simply return.
const Instruction *insn = NULL;
for(Value::const_use_iterator iter = v->use_begin(); iter != v->use_end(); ++iter) {
// After LLVM 3.5, use_iterator points to 'Use' instead of 'User', which is more straightforward.
#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
const User *theUser = *iter;
#else
const User *theUser = iter->getUser();
#endif
if(isa<Instruction>(theUser)) return cast<Instruction>(theUser);
insn = getInstructionUseLocal(theUser);
if(insn != NULL) break;
}
return insn;
}
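/*! Give each used global variable an address register: local variables get
* an offset in SLM, constant variables the offset of their entry in the
* constant set, and the special printf buffers their binding table indices.
*/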
void GenWriter::allocateGlobalVariableRegister(Function &F)
{
// Allocate an address register for each global variable
const Module::GlobalListType &globalList = TheModule->getGlobalList();
for(auto i = globalList.begin(); i != globalList.end(); i ++) {
const GlobalVariable &v = *i;
if(!v.isConstantUsed()) continue;
ir::AddressSpace addrSpace = addressSpaceLLVMToGen(v.getType()->getAddressSpace());
if(addrSpace == ir::MEM_LOCAL) {
const Value * val = cast<const Value>(&v);
const Instruction *insn = getInstructionUseLocal(val);
GBE_ASSERT(insn && "Can't find a valid reference instruction for local variable.");
const BasicBlock * bb = insn->getParent();
const Function * func = bb->getParent();
if(func != &F) continue;
ir::Function &f = ctx.getFunction();
f.setUseSLM(true);
const Constant *c = v.getInitializer();
Type *ty = c->getType();
uint32_t oldSlm = f.getSLMSize();
uint32_t align = 8 * getAlignmentByte(unit, ty);
uint32_t padding = getPadding(oldSlm*8, align);
f.setSLMSize(oldSlm + padding/8 + getTypeByteSize(unit, ty));
this->newRegister(const_cast<GlobalVariable*>(&v));
ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(oldSlm + padding/8, ir::TYPE_S32));
} else if(addrSpace == ir::MEM_CONSTANT || v.isConstant()) {
GBE_ASSERT(v.hasInitializer());
this->newRegister(const_cast<GlobalVariable*>(&v));
ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
ir::Constant &con = unit.getConstantSet().getConstant(v.getName());
ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
} else {
if(v.getName().equals(StringRef("__gen_ocl_printf_buf"))) {
ctx.getFunction().getPrintfSet()->setBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
this->newRegister(const_cast<GlobalVariable*>(&v), NULL, true);
ctx.CVT(ir::TYPE_U32, ir::TYPE_U64, getRegister(const_cast<GlobalVariable*>(&v)), ir::ocl::printfbptr);
} else if(v.getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
ctx.getFunction().getPrintfSet()->setIndexBufBTI(BtiMap.find(const_cast