template <typename T>
ir::ImmediateIndex GenWriter::processSeqConstant(ConstantDataSequential *seq,
int index, ConstTypeId tid) {
if (index >= 0) {
const T data = GET_EFFECT_DATA(seq, index, tid);
return ctx.newImmediate(data);
} else {
vector<T> array;
for(uint32_t i = 0; i < seq->getNumElements(); i++)
array.push_back(GET_EFFECT_DATA(seq, i, tid));
return ctx.newImmediate((T*)&array[0], array.size());
}
}
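/*! Emit the immediate for one element of a constant vector (index >= 0)
* or build an immediate for the whole vector at once (index < 0) */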
ir::ImmediateIndex GenWriter::processConstantVector(ConstantVector *cv, int index) {
if (index >= 0) {
Constant *c = cv->getOperand(index);
return processConstantImmIndex(c, -1);
} else {
vector<ir::ImmediateIndex> immVector;
for (uint32_t i = 0; i < cv->getNumOperands(); i++)
immVector.push_back(processConstantImmIndex(cv->getOperand(i)));
return ctx.newImmediate(immVector, getType(ctx, cv->getType()->getElementType()));
}
}
ir::ImmediateIndex GenWriter::processConstantImmIndexImpl(Constant *CPV, int32_t index)
{
GBE_ASSERT(dyn_cast<ConstantExpr>(CPV) == NULL);
#if LLVM_VERSION_MINOR > 0
ConstantDataSequential *seq = dyn_cast<ConstantDataSequential>(CPV);
if (seq) {
Type *Ty = seq->getElementType();
if (Ty == Type::getInt1Ty(CPV->getContext())) {
return processSeqConstant(seq, index, CONST_INT);
} else if (Ty == Type::getInt8Ty(CPV->getContext())) {
return processSeqConstant(seq, index, CONST_INT);
} else if (Ty == Type::getInt16Ty(CPV->getContext())) {
return processSeqConstant(seq, index, CONST_INT);
} else if (Ty == Type::getInt32Ty(CPV->getContext())) {
return processSeqConstant(seq, index, CONST_INT);
} else if (Ty == Type::getInt64Ty(CPV->getContext())) {
return processSeqConstant(seq, index, CONST_INT);
} else if (Ty == Type::getFloatTy(CPV->getContext())) {
return processSeqConstant(seq, index, CONST_FLOAT);
} else if (Ty == Type::getDoubleTy(CPV->getContext())) {
return processSeqConstant(seq, index, CONST_DOUBLE);
}
} else
#endif /* LLVM_VERSION_MINOR > 0 */
if (dyn_cast<ConstantAggregateZero>(CPV)) {
Type* Ty = CPV->getType();
if(Ty->isVectorTy())
Ty = (cast<VectorType>(Ty))->getElementType();
if (Ty == Type::getInt1Ty(CPV->getContext())) {
const bool b = 0;
return ctx.newImmediate(b);
} else if (Ty == Type::getInt8Ty(CPV->getContext())) {
const uint8_t u8 = 0;
return ctx.newImmediate(u8);
} else if (Ty == Type::getInt16Ty(CPV->getContext())) {
const uint16_t u16 = 0;
return ctx.newImmediate(u16);
} else if (Ty == Type::getInt32Ty(CPV->getContext())) {
const uint32_t u32 = 0;
return ctx.newImmediate(u32);
} else if (Ty == Type::getInt64Ty(CPV->getContext())) {
const uint64_t u64 = 0;
return ctx.newImmediate(u64);
} else if (Ty == Type::getFloatTy(CPV->getContext())) {
const float f32 = 0;
return ctx.newImmediate(f32);
} else if (Ty == Type::getDoubleTy(CPV->getContext())) {
const double f64 = 0;
return ctx.newImmediate(f64);
} else {
GBE_ASSERTM(false, "Unsupporte aggregate zero type.");
return ctx.newImmediate(uint32_t(0));
}
} else {
if (dyn_cast<ConstantVector>(CPV))
return processConstantVector(dyn_cast<ConstantVector>(CPV), index);
GBE_ASSERTM(dyn_cast<ConstantExpr>(CPV) == NULL, "Unsupported constant expression");
// Integers
if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
Type* Ty = CI->getType();
if (Ty == Type::getInt1Ty(CPV->getContext())) {
const bool b = CI->getZExtValue();
return ctx.newImmediate(b);
} else if (Ty == Type::getInt8Ty(CPV->getContext())) {
const uint8_t u8 = CI->getZExtValue();
return ctx.newImmediate(u8);
} else if (Ty == Type::getInt16Ty(CPV->getContext())) {
const uint16_t u16 = CI->getZExtValue();
return ctx.newImmediate(u16);
} else if (Ty == Type::getInt32Ty(CPV->getContext())) {
const uint32_t u32 = CI->getZExtValue();
return ctx.newImmediate(u32);
} else if (Ty == Type::getInt64Ty(CPV->getContext())) {
const uint64_t u64 = CI->getZExtValue();
return ctx.newImmediate(u64);
} else {
if (CI->getValue().getActiveBits() > 64) {
ctx.getUnit().setValid(false);
return ctx.newImmediate(uint64_t(0));
}
return ctx.newImmediate(uint64_t(CI->getZExtValue()));
}
}
// NULL pointers
if(isa<ConstantPointerNull>(CPV)) {
return ctx.newImmediate(uint32_t(0));
}
const Type::TypeID typeID = CPV->getType()->getTypeID();
if (isa<UndefValue>(CPV)) {
Type* Ty = CPV->getType();
if (Ty == Type::getInt1Ty(CPV->getContext())) return ctx.newImmediate(false);
if (Ty == Type::getInt8Ty(CPV->getContext())) return ctx.newImmediate((uint8_t)0);
if (Ty == Type::getInt16Ty(CPV->getContext())) return ctx.newImmediate((uint16_t)0);
if (Ty == Type::getInt32Ty(CPV->getContext())) return ctx.newImmediate((uint32_t)0);
if (Ty == Type::getInt64Ty(CPV->getContext())) return ctx.newImmediate((uint64_t)0);
if (Ty == Type::getFloatTy(CPV->getContext())) return ctx.newImmediate((float)0);
if (Ty == Type::getDoubleTy(CPV->getContext())) return ctx.newImmediate((double)0);
GBE_ASSERT(0 && "Unsupported undef value type.\n");
}
// Floats and doubles
switch (typeID) {
case Type::FloatTyID:
case Type::DoubleTyID:
{
ConstantFP *FPC = cast<ConstantFP>(CPV);
GBE_ASSERT(isa<UndefValue>(CPV) == false);
if (FPC->getType() == Type::getFloatTy(CPV->getContext())) {
const float f32 = FPC->getValueAPF().convertToFloat();
return ctx.newImmediate(f32);
} else {
const double f64 = FPC->getValueAPF().convertToDouble();
return ctx.newImmediate(f64);
}
}
break;
default:
GBE_ASSERTM(false, "Unsupported constant type");
break;
}
}
GBE_ASSERTM(false, "Unsupported constant type");
return ctx.newImmediate(uint64_t(0));
}
ir::ImmediateIndex GenWriter::processConstantImmIndex(Constant *CPV, int32_t index) {
if (dyn_cast<GlobalValue>(CPV) == NULL)
return processConstantImmIndexImpl(CPV, index);
CPV->dump();
GBE_ASSERT(0 && "unsupported constant.\n");
return ctx.newImmediate((uint32_t)0);
}
const ir::Immediate &GenWriter::processConstantImm(Constant *CPV, int32_t index) {
ir::ImmediateIndex immIndex = processConstantImmIndex(CPV, index);
return ctx.getFunction().getImmediate(immIndex);
}
ir::ImmediateIndex GenWriter::newImmediate(Constant *CPV, uint32_t index) {
return processConstantImmIndex(CPV, index);
}
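/*! Allocate Gen IR scalar register(s) for an LLVM value: one register per
* element for vectors and structures, a single register otherwise */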
void GenWriter::newRegister(Value *value, Value *key, bool uniform) {
auto type = value->getType();
auto typeID = type->getTypeID();
switch (typeID) {
case Type::IntegerTyID:
case Type::FloatTyID:
case Type::DoubleTyID:
case Type::PointerTyID:
regTranslator.newScalar(value, key, 0, uniform);
break;
case Type::VectorTyID:
{
auto vectorType = cast<VectorType>(type);
const uint32_t elemNum = vectorType->getNumElements();
for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
regTranslator.newScalar(value, key, elemID, uniform);
break;
}
case Type::StructTyID:
{
auto structType = cast<StructType>(type);
const uint32_t elemNum = structType->getNumElements();
for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
regTranslator.newScalar(value, key, elemID, uniform);
break;
}
default: NOT_SUPPORTED;
};
}
ir::Register GenWriter::getConstantRegister(Constant *c, uint32_t elemID) {
GBE_ASSERT(c != NULL);
if(isa<GlobalValue>(c)) {
return regTranslator.getScalar(c, elemID);
}
if(isa<UndefValue>(c)) {
Type* llvmType = c->getType();
ir::Type dstType = getType(ctx, llvmType);
ir::Register reg = ctx.reg(getFamily(dstType));
ir::ImmediateIndex immIndex;
if(llvmType->isIntegerTy())
immIndex = ctx.newIntegerImmediate(0, dstType);
else if(llvmType->isFloatTy()) {
immIndex = ctx.newFloatImmediate((float)0.0);
} else {
immIndex = ctx.newDoubleImmediate((double)0.0);
}
ctx.LOADI(dstType, reg, immIndex);
return reg;
}
const ir::ImmediateIndex immIndex = this->newImmediate(c, elemID);
const ir::Immediate imm = ctx.getImmediate(immIndex);
const ir::Register reg = ctx.reg(getFamily(imm.getType()));
ctx.LOADI(imm.getType(), reg, immIndex);
return reg;
}
ir::Register GenWriter::getRegister(Value *value, uint32_t elemID) {
// The real value may be a constant, so resolve it before the constant check
regTranslator.getRealValue(value, elemID);
if(isa<Constant>(value)) {
Constant *c = dyn_cast<Constant>(value);
return getConstantRegister(c, elemID);
} else
return regTranslator.getScalar(value, elemID);
}
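/*! The PHI copy is keyed by the PHI pointer plus one: a cheap way to derive
* a second, distinct register-map key from the same LLVM value */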
INLINE Value *GenWriter::getPHICopy(Value *PHI) {
const uintptr_t ptr = (uintptr_t) PHI;
return (Value*) (ptr+1);
}
void GenWriter::newLabelIndex(const BasicBlock *bb) {
if (labelMap.find(bb) == labelMap.end()) {
const ir::LabelIndex label = ctx.label();
labelMap[bb] = label;
}
}
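/*! Record conditional branches whose comparison can be inverted so that the
* taken target becomes the fall-through block */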
void GenWriter::simplifyTerminator(BasicBlock *bb) {
Value *value = --bb->end();
BranchInst *I = NULL;
if ((I = dyn_cast<BranchInst>(value)) != NULL) {
if (I->isConditional() == false)
return;
// If the "taken" successor is the next block, we try to invert the
// branch.
BasicBlock *succ = I->getSuccessor(0);
if (std::next(Function::iterator(bb)) != Function::iterator(succ))
return;
// More than one use is too complicated: we skip it
Value *condition = I->getCondition();
if (condition->hasOneUse() == false)
return;
// Right now, we only invert comparison instructions
ICmpInst *CI = dyn_cast<ICmpInst>(condition);
if (CI != NULL) {
GBE_ASSERT(conditionSet.find(CI) == conditionSet.end());
conditionSet.insert(CI);
return;
}
}
}
void GenWriter::emitBasicBlock(BasicBlock *BB) {
GBE_ASSERT(labelMap.find(BB) != labelMap.end());
ctx.LABEL(labelMap[BB]);
for (auto II = BB->begin(), E = BB->end(); II != E; ++II) visit(*II);
}
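/*! Emit the MOV instructions that materialize PHI values along the edge
* from curr to succ */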
void GenWriter::emitMovForPHI(BasicBlock *curr, BasicBlock *succ) {
for (BasicBlock::iterator I = succ->begin(); isa<PHINode>(I); ++I) {
PHINode *PN = cast<PHINode>(I);
Value *IV = PN->getIncomingValueForBlock(curr);
Type *llvmType = PN->getType();
const ir::Type type = getType(ctx, llvmType);
Value *PHICopy = this->getPHICopy(PN);
const ir::Register dst = this->getRegister(PHICopy);
if (!isa<UndefValue>(IV)) {
// Emit the MOV required by the PHI function. We keep it simple and do not
// try to optimize it: a later data flow analysis pass on the Gen IR
// will remove useless moves
Constant *CP = dyn_cast<Constant>(IV);
if (CP) {
GBE_ASSERT(isa<GlobalValue>(CP) == false);
ConstantVector *CPV = dyn_cast<ConstantVector>(CP);
if (CPV && isa<UndefValue>(extractConstantElem(CPV, 0)))
continue;
ctx.MOV(type, dst, getRegister(CP));
} else if (regTranslator.valueExists(IV, 0) || dyn_cast<Constant>(IV)) {
const ir::Register src = this->getRegister(IV);
ctx.MOV(type, dst, src);
}
assert(!ctx.getBlock()->undefPhiRegs.contains(dst));
ctx.getBlock()->definedPhiRegs.insert(dst);
} else {
// If this is an undefined value, we don't need to emit a phi copy here,
// but we do need to record it. Later, during the backward liveness
// analysis, we must not propagate the phi value/register into a BB where
// the phi value is undefined. Otherwise, the phi value's liveness would be
// extended incorrectly, possibly up to basic block zero, which is really bad.
ctx.getBlock()->undefPhiRegs.insert(dst);
}
}
}
/*! To track read image args and write args */
struct ImageArgsInfo{
uint32_t readImageArgs;
uint32_t writeImageArgs;
};
static void collectImageArgs(std::string& accessQual, ImageArgsInfo& imageArgsInfo)
{
if(accessQual.find("read") != std::string::npos)
{
imageArgsInfo.readImageArgs++;
GBE_ASSERT(imageArgsInfo.readImageArgs <= BTI_MAX_READ_IMAGE_ARGS);
}
else if(accessQual.find("write") != std::string::npos)
{
imageArgsInfo.writeImageArgs++;
GBE_ASSERT(imageArgsInfo.writeImageArgs <= BTI_MAX_WRITE_IMAGE_ARGS);
}
else
{
//default is read_only per spec.
imageArgsInfo.readImageArgs++;
GBE_ASSERT(imageArgsInfo.readImageArgs <= BTI_MAX_READ_IMAGE_ARGS);
}
}
void GenWriter::emitFunctionPrototype(Function &F)
{
GBE_ASSERTM(F.hasStructRetAttr() == false,
"Returned value for kernel functions is forbidden");
// Loop over the kernel metadatas to set the required work group size.
NamedMDNode *clKernelMetaDatas = TheModule->getNamedMetadata("opencl.kernels");
size_t reqd_wg_sz[3] = {0, 0, 0};
size_t hint_wg_sz[3] = {0, 0, 0};
ir::FunctionArgument::InfoFromLLVM llvmInfo;
MDNode *node = NULL;
MDNode *addrSpaceNode = NULL;
MDNode *typeNameNode = NULL;
MDNode *accessQualNode = NULL;
MDNode *typeQualNode = NULL;
MDNode *argNameNode = NULL;
std::string functionAttributes;
/* First, find the metadata that belongs to this function. */
for(uint i = 0; i < clKernelMetaDatas->getNumOperands(); i++) {
node = clKernelMetaDatas->getOperand(i);
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
if (node->getOperand(0) == &F) break;
#else
auto *V = cast<ValueAsMetadata>(node->getOperand(0));
if (V && V->getValue() == &F) break;
#endif
node = NULL;
}
/* because "-cl-kernel-arg-info", should always have meta data. */
if (!F.arg_empty())
assert(node);
for(uint j = 0; j < node->getNumOperands() - 1; j++) {
MDNode *attrNode = dyn_cast_or_null<MDNode>(node->getOperand(1 + j));
if (attrNode == NULL) break;
MDString *attrName = dyn_cast_or_null<MDString>(attrNode->getOperand(0));
if (!attrName) continue;
if (attrName->getString() == "reqd_work_group_size") {
GBE_ASSERT(attrNode->getNumOperands() == 4);
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
ConstantInt *x = dyn_cast<ConstantInt>(attrNode->getOperand(1));
ConstantInt *y = dyn_cast<ConstantInt>(attrNode->getOperand(2));
ConstantInt *z = dyn_cast<ConstantInt>(attrNode->getOperand(3));
#else
ConstantInt *x = mdconst::extract<ConstantInt>(attrNode->getOperand(1));
ConstantInt *y = mdconst::extract<ConstantInt>(attrNode->getOperand(2));
ConstantInt *z = mdconst::extract<ConstantInt>(attrNode->getOperand(3));
#endif
GBE_ASSERT(x && y && z);
reqd_wg_sz[0] = x->getZExtValue();
reqd_wg_sz[1] = y->getZExtValue();
reqd_wg_sz[2] = z->getZExtValue();
functionAttributes += attrName->getString();
std::stringstream param;
char buffer[100];
param <<"(";
param << reqd_wg_sz[0];
param << ",";
param << reqd_wg_sz[1];
param << ",";
param << reqd_wg_sz[2];
param <<")";
param >> buffer;
functionAttributes += buffer;
functionAttributes += " ";
break;
} else if (attrName->getString() == "kernel_arg_addr_space") {
addrSpaceNode = attrNode;
} else if (attrName->getString() == "kernel_arg_access_qual") {
accessQualNode = attrNode;
} else if (attrName->getString() == "kernel_arg_type") {
typeNameNode = attrNode;
} else if (attrName->getString() == "kernel_arg_type_qual") {
typeQualNode = attrNode;
} else if (attrName->getString() == "kernel_arg_name") {
argNameNode = attrNode;
} else if (attrName->getString() == "vec_type_hint") {
GBE_ASSERT(attrNode->getNumOperands() == 3);
functionAttributes += attrName->getString();
functionAttributes += " ";
} else if (attrName->getString() == "work_group_size_hint") {
GBE_ASSERT(attrNode->getNumOperands() == 4);
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
ConstantInt *x = dyn_cast<ConstantInt>(attrNode->getOperand(1));
ConstantInt *y = dyn_cast<ConstantInt>(attrNode->getOperand(2));
ConstantInt *z = dyn_cast<ConstantInt>(attrNode->getOperand(3));
#else
ConstantInt *x = mdconst::extract<ConstantInt>(attrNode->getOperand(1));
ConstantInt *y = mdconst::extract<ConstantInt>(attrNode->getOperand(2));
ConstantInt *z = mdconst::extract<ConstantInt>(attrNode->getOperand(3));
#endif
GBE_ASSERT(x && y && z);
hint_wg_sz[0] = x->getZExtValue();
hint_wg_sz[1] = y->getZExtValue();
hint_wg_sz[2] = z->getZExtValue();
functionAttributes += attrName->getString();
std::stringstream param;
char buffer[100];
param <<"(";
param << hint_wg_sz[0];
param << ",";
param << hint_wg_sz[1];
param << ",";
param << hint_wg_sz[2];
param <<")";
param >> buffer;
functionAttributes += buffer;
functionAttributes += " ";
}
}
ctx.appendSurface(1, ir::ocl::stackbuffer);
ctx.getFunction().setCompileWorkGroupSize(reqd_wg_sz[0], reqd_wg_sz[1], reqd_wg_sz[2]);
ctx.getFunction().setFunctionAttributes(functionAttributes);
// Loop over the arguments and output registers for them
if (!F.arg_empty()) {
uint32_t argID = 0;
ImageArgsInfo imageArgsInfo = {};
Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
// Insert a new register for each function argument
#if LLVM_VERSION_MINOR <= 1
const AttrListPtr &PAL = F.getAttributes();
#endif /* LLVM_VERSION_MINOR <= 1 */
for (; I != E; ++I, ++argID) {
const std::string &argName = I->getName().str();
Type *type = I->getType();
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
llvmInfo.addrSpace = (cast<ConstantInt>(addrSpaceNode->getOperand(1 + argID)))->getZExtValue();
#else
llvmInfo.addrSpace = (mdconst::extract<ConstantInt>(addrSpaceNode->getOperand(1 + argID)))->getZExtValue();
#endif
llvmInfo.typeName = (cast<MDString>(typeNameNode->getOperand(1 + argID)))->getString();
llvmInfo.accessQual = (cast<MDString>(accessQualNode->getOperand(1 + argID)))->getString();
llvmInfo.typeQual = (cast<MDString>(typeQualNode->getOperand(1 + argID)))->getString();
if(argNameNode){
llvmInfo.argName = (cast<MDString>(argNameNode->getOperand(1 + argID)))->getString();
}
// Function arguments are uniform values.
this->newRegister(I, NULL, true);
// Add support for vector arguments.
if(type->isVectorTy()) {
VectorType *vectorType = cast<VectorType>(type);
ir::Register reg = getRegister(I, 0);
Type *elemType = vectorType->getElementType();
const uint32_t elemSize = getTypeByteSize(unit, elemType);
const uint32_t elemNum = vectorType->getNumElements();
// A vector's element type is always a scalar type
ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), 0);
ir::Function& fn = ctx.getFunction();
for(uint32_t i=1; i < elemNum; i++) {
ir::PushLocation argLocation(fn, argID, elemSize*i);
reg = getRegister(I, i);
ctx.appendPushedConstant(reg, argLocation); //add to push map for reg alloc
}
continue;
}
GBE_ASSERTM(isScalarType(type) == true,
"vector type in the function argument is not supported yet");
const ir::Register reg = getRegister(I);
if (llvmInfo.isImageType()) {
ctx.input(argName, ir::FunctionArgument::IMAGE, reg, llvmInfo, 4, 4, 0);
ctx.getFunction().getImageSet()->append(reg, &ctx, incBtiBase());
collectImageArgs(llvmInfo.accessQual, imageArgsInfo);
continue;
}
if (llvmInfo.isSamplerType()) {
ctx.input(argName, ir::FunctionArgument::SAMPLER, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), 0);
(void)ctx.getFunction().getSamplerSet()->append(reg, &ctx);
continue;
}
if (type->isPointerTy() == false)
ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), 0);
else {
PointerType *pointerType = dyn_cast<PointerType>(type);
Type *pointed = pointerType->getElementType();
// By value structure
#if LLVM_VERSION_MINOR <= 1
if (PAL.paramHasAttr(argID+1, Attribute::ByVal)) {
#else
if (I->hasByValAttr()) {
#endif /* LLVM_VERSION_MINOR <= 1 */
const size_t structSize = getTypeByteSize(unit, pointed);
ctx.input(argName, ir::FunctionArgument::STRUCTURE, reg, llvmInfo, structSize, getAlignmentByte(unit, type), 0);
}
// Regular user provided pointer (global, local or constant)
else {
const uint32_t addr = pointerType->getAddressSpace();
const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(addr);
const uint32_t ptrSize = getTypeByteSize(unit, type);
const uint32_t align = getAlignmentByte(unit, pointed);
switch (addrSpace) {
case ir::MEM_GLOBAL:
globalPointer.insert(std::make_pair(I, btiBase));
ctx.appendSurface(btiBase, reg);
ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, llvmInfo, ptrSize, align, btiBase);
incBtiBase();
break;
case ir::MEM_LOCAL:
ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg, llvmInfo, ptrSize, align, BTI_LOCAL);
ctx.getFunction().setUseSLM(true);
break;
case ir::MEM_CONSTANT:
ctx.input(argName, ir::FunctionArgument::CONSTANT_POINTER, reg, llvmInfo, ptrSize, align, 0x2);
break;
default: GBE_ASSERT(addrSpace != ir::MEM_PRIVATE);
}
}
}
}
}
// When returning a structure, first input register is the pointer to the
// structure
#if GBE_DEBUG
const Type *type = F.getReturnType();
GBE_ASSERTM(type->isVoidTy() == true,
"Returned value for kernel functions is forbidden");
// Variable number of arguments is not supported
FunctionType *FT = cast<FunctionType>(F.getFunctionType());
GBE_ASSERT(FT->isVarArg() == false);
#endif /* GBE_DEBUG */
}
static inline bool isFPIntBitCast(const Instruction &I) {
if (!isa<BitCastInst>(I))
return false;
Type *SrcTy = I.getOperand(0)->getType();
Type *DstTy = I.getType();
return (SrcTy->isFloatingPointTy() && DstTy->isIntegerTy()) ||
(DstTy->isFloatingPointTy() && SrcTy->isIntegerTy());
}
/*! To track last read and write of the registers */
struct RegInfoForMov {
ir::Instruction *lastWriteInsn;
ir::Instruction *lastReadInsn;
uint32_t lastWrite;
uint32_t lastRead;
};
/*! Replace register "from" by register "to" in the destination(s) */
static void replaceDst(ir::Instruction *insn, ir::Register from, ir::Register to) {
const uint32_t dstNum = insn->getDstNum();
for (uint32_t dstID = 0; dstID < dstNum; ++dstID)
if (insn->getDst(dstID) == from)
insn->setDst(dstID, to);
}
/*! Replace register "from" by register "to" in the source(s) */
static void replaceSrc(ir::Instruction *insn, ir::Register from, ir::Register to) {
const uint32_t srcNum = insn->getSrcNum();
for (uint32_t srcID = 0; srcID < srcNum; ++srcID)
if (insn->getSrc(srcID) == from)
insn->setSrc(srcID, to);
}
/*! lastUse maintains data about last uses (reads/writes) for each
* ir::Register
*/
static void buildRegInfo(ir::BasicBlock &bb, vector<RegInfoForMov> &lastUse)
{
// Clear the register usages
for (auto &x : lastUse) {
x.lastWrite = x.lastRead = 0;
x.lastWriteInsn = x.lastReadInsn = NULL;
}
// Find use intervals for all registers (distinguish sources and
// destinations)
uint32_t insnID = 2;
bb.foreach([&](ir::Instruction &insn) {
const uint32_t dstNum = insn.getDstNum();
const uint32_t srcNum = insn.getSrcNum();
for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
const ir::Register reg = insn.getSrc(srcID);
lastUse[reg].lastRead = insnID;
lastUse[reg].lastReadInsn = &insn;
}
for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
const ir::Register reg = insn.getDst(dstID);
lastUse[reg].lastWrite = insnID+1;
lastUse[reg].lastWriteInsn = &insn;
}
insnID+=2;
});
}
void GenWriter::optimizePhiCopy(ir::Liveness &liveness, ir::Function &fn)
{
// The overall idea is to check whether there is any interference between
// the phi and phiCopy live ranges. If there is no point at which phi and
// phiCopy are both alive, then we can optimize away the move from phiCopy
// to phi and use phiCopy directly instead of phi.
using namespace ir;
ir::FunctionDAG *dag = new ir::FunctionDAG(liveness);
for (auto &it : phiMap) {
const Register phi = it.first;
const Register phiCopy = it.second;
const ir::DefSet *phiCopyDef = dag->getRegDef(phiCopy);
const ir::UseSet *phiUse = dag->getRegUse(phi);
const DefSet *phiDef = dag->getRegDef(phi);
bool isOpt = true;
for (auto &x : *phiCopyDef) {
const ir::Instruction * phiCopyDefInsn = x->getInstruction();
const ir::BasicBlock *bb = phiCopyDefInsn->getParent();
const Liveness::LiveOut &out = liveness.getLiveOut(bb);
// phi & phiCopy are both alive at the endpoint of bb,
// thus can not be optimized.
if (out.contains(phi)) {
isOpt = false;
break;
}
// If phi is used in the same BB that defines phiCopy, we need to
// carefully check the liveness of phi & phiCopy and make sure their
// live ranges do not interfere.
bool phiUsedInSameBB = false;
for (auto &y : *phiUse) {
const ir::Instruction *phiUseInsn = y->getInstruction();
const ir::BasicBlock *bb2 = phiUseInsn->getParent();
if (bb2 == bb) {
phiUsedInSameBB = true;
}
}
// Check that phi is not used between the phiCopy def point and bb's end
// point. This is often referred to as the 'phi swap issue', as below:
// MOV phiCopy_1, x;
// MOV phiCopy_2, phi_1;
if (phiUsedInSameBB ) {
for (auto it = --bb->end(); it != bb->end() ; --it) {
const Instruction &p = *it;
if (&p == phiCopyDefInsn) break;
// we only care about MOVs here
if (p.getSrcNum() == 1 && p.getSrc(0) == phi) {
isOpt = false;
break;
}
}
}
}
// [MOV phi, phiCopy;] can be removed. So we remove it
// and replace phi uses with phiCopy
if (isOpt) {
for (auto &x : *phiDef) {
const_cast<ir::Instruction *>(x->getInstruction())->remove();
}
for (auto &x : *phiUse) {
const ir::Instruction *phiUseInsn = x->getInstruction();
replaceSrc(const_cast<ir::Instruction *>(phiUseInsn), phi, phiCopy);
}
}
}
delete dag;
}
void GenWriter::removeMOVs(const ir::Liveness &liveness, ir::Function &fn)
{
// We store the last write and last read for each register
const uint32_t regNum = fn.regNum();
vector<RegInfoForMov> lastUse;
lastUse.resize(regNum);
// Remove the MOVs per block (local analysis only) Note that we do not try
// to remove MOV for variables that outlives the block. So we use liveness
// information to figure out which variable is alive
fn.foreachBlock([&](ir::BasicBlock &bb)
{
// We need to know when each register will be read or written
buildRegInfo(bb, lastUse);
// Liveinfo helps us to know if the source outlives the block
const ir::Liveness::BlockInfo &info = liveness.getBlockInfo(&bb);
auto it = --bb.end();
if (it->isMemberOf<ir::BranchInstruction>() == true) --it;
for (auto it = --bb.end(); it != bb.end();) {
ir::Instruction *insn = &*it; it--;
const ir::Opcode op = insn->getOpcode();
if (op == ir::OP_MOV) {
const ir::Register dst = insn->getDst(0);
const ir::Register src = insn->getSrc(0);
// Outlives the block. We do not do anything
if (info.inLiveOut(src))
continue;
const RegInfoForMov &dstInfo = lastUse[dst];
const RegInfoForMov &srcInfo = lastUse[src];
// The source is not computed in this block
if (srcInfo.lastWrite == 0)
continue;
// dst is read after src is written. We cannot overwrite dst
if (dstInfo.lastRead > srcInfo.lastWrite)
continue;
// We are good. We first patch the destination then all the sources
replaceDst(srcInfo.lastWriteInsn, src, dst);
// Then we patch all subsequent uses of the source
ir::Instruction *next = static_cast<ir::Instruction*>(srcInfo.lastWriteInsn->next);
while (next != insn) {
replaceSrc(next, src, dst);
next = static_cast<ir::Instruction*>(next->next);
}
insn->remove();
} else if (op == ir::OP_LOADI)
continue;
else
break;
}
});
}
void GenWriter::removeLOADIs(const ir::Liveness &liveness, ir::Function &fn)
{
// We store the last write and last read for each register
const uint32_t regNum = fn.regNum();
vector<RegInfoForMov> lastUse;
lastUse.resize(regNum);
// Traverse all blocks and remove redundant immediates. Do *not* remove
// immediates that outlive the block
fn.foreachBlock([&](ir::BasicBlock &bb)
{
// Each immediate that is already loaded in the block
map<ir::Immediate, ir::Register> loadedImm;
// Immediate to immediate translation
map<ir::Register, ir::Register> immTranslate;
// Liveinfo helps us to know if the loaded immediate outlives the block
const ir::Liveness::BlockInfo &info = liveness.getBlockInfo(&bb);
// We need to know when each register will be read or written
buildRegInfo(bb, lastUse);
// Top-to-bottom traversal -> remove useless LOADIs
uint32_t insnID = 2;
bb.foreach([&](ir::Instruction &insn)
{
// We either try to remove the LOADI or we will try to use it as a
// replacement for the next same LOADIs
if (insn.isMemberOf<ir::LoadImmInstruction>()) {
ir::LoadImmInstruction &loadImm = cast<ir::LoadImmInstruction>(insn);
const ir::Immediate imm = loadImm.getImmediate();
const ir::Register dst = loadImm.getDst(0);
// Not here: cool, we put it in the map if the register is not
// overwritten. If it is, we just ignore it for simplicity. Note that
// it should not happen with the way we "unSSA" the code
auto it = loadedImm.find(imm);
auto end = loadedImm.end();
if (it == end && lastUse[dst].lastWrite == insnID+1)
loadedImm.insert(std::make_pair(imm, dst));
// We already pushed the same immediate and we do not outlive the
// block. We are good to replace this immediate by the previous one
else if (it != end && info.inLiveOut(dst) == false) {
immTranslate.insert(std::make_pair(dst, it->second));
insn.remove();
}
}
// Traverse all the destinations and sources and perform the
// substitutions (if any)
else {
const uint32_t srcNum = insn.getSrcNum();
const uint32_t dstNum = insn.getDstNum();
for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
const ir::Register src = insn.getSrc(srcID);
auto it = immTranslate.find(src);
if (it != immTranslate.end())
insn.setSrc(srcID, it->second);
}
for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
const ir::Register dst = insn.getDst(dstID);
auto it = immTranslate.find(dst);
if (it != immTranslate.end())
insn.setDst(dstID, it->second);
}
}
insnID += 2;
});
});
}
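// Boolean knobs for the two local optimizations below; as elsewhere in this
// backend, BVAR values can presumably be overridden from the environment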
BVAR(OCL_OPTIMIZE_PHI_MOVES, true);
BVAR(OCL_OPTIMIZE_LOADI, true);
static const Instruction *getInstructionUseLocal(const Value *v) {
// A local variable can only be used in one kernel function. So, if we find
// one instruction that uses the local variable, simply return it.
const Instruction *insn = NULL;
for(Value::const_use_iterator iter = v->use_begin(); iter != v->use_end(); ++iter) {
// After LLVM 3.5, use_iterator points to 'Use' instead of 'User', which is more straightforward.
#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
const User *theUser = *iter;
#else
const User *theUser = iter->getUser();
#endif
if(isa<Instruction>(theUser)) return cast<Instruction>(theUser);
insn = getInstructionUseLocal(theUser);
if(insn != NULL) break;
}
return insn;
}
void GenWriter::allocateGlobalVariableRegister(Function &F)
{
// Allocate an address register for each global variable
const Module::GlobalListType &globalList = TheModule->getGlobalList();
size_t j = 0;
for(auto i = globalList.begin(); i != globalList.end(); i ++) {
const GlobalVariable &v = *i;
if(!v.isConstantUsed()) continue;
ir::AddressSpace addrSpace = addressSpaceLLVMToGen(v.getType()->getAddressSpace());
if(addrSpace == ir::MEM_LOCAL) {
const Value * val = cast(&v);
const Instruction *insn = getInstructionUseLocal(val);
GBE_ASSERT(insn && "Can't find a valid reference instruction for local variable.");
const BasicBlock * bb = insn->getParent();
const Function * func = bb->getParent();
if(func != &F) continue;
ir::Function &f = ctx.getFunction();
f.setUseSLM(true);
const Constant *c = v.getInitializer();
Type *ty = c->getType();
uint32_t oldSlm = f.getSLMSize();
uint32_t align = 8 * getAlignmentByte(unit, ty);
uint32_t padding = getPadding(oldSlm*8, align);
f.setSLMSize(oldSlm + padding/8 + getTypeByteSize(unit, ty));
this->newRegister(const_cast<GlobalVariable*>(&v));
ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(oldSlm + padding/8, ir::TYPE_S32));
} else if(addrSpace == ir::MEM_CONSTANT || v.isConstant()) {
GBE_ASSERT(v.hasInitializer());
this->newRegister(const_cast<GlobalVariable*>(&v));
ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
ir::Constant &con = unit.getConstantSet().getConstant(j ++);
GBE_ASSERT(con.getName() == v.getName());
ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
} else {
if(v.getName().equals(StringRef("__gen_ocl_printf_buf"))) {
ctx.appendSurface(btiBase, ir::ocl::printfbptr);
ctx.getFunction().getPrintfSet()->setBufBTI(btiBase);
globalPointer.insert(std::make_pair(&v, incBtiBase()));
regTranslator.newScalarProxy(ir::ocl::printfbptr, const_cast<GlobalVariable*>(&v));
} else if(v.getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
ctx.appendSurface(btiBase, ir::ocl::printfiptr);
ctx.getFunction().getPrintfSet()->setIndexBufBTI(btiBase);
globalPointer.insert(std::make_pair(&v, incBtiBase()));
regTranslator.newScalarProxy(ir::ocl::printfiptr, const_cast<GlobalVariable*>(&v));
} else if(v.getName().str().substr(0, 4) == ".str") {
/* When there are multiple printf statements in multiple kernel functions within the
same translation unit, if they share the same string parameter, such as
kernel_func1 () {
printf("Line is %d\n", line_num1);
}
kernel_func2 () {
printf("Line is %d\n", line_num2);
}
Clang will generate just one global string named .strXXX to represent "Line is %d\n".
So when translating kernel_func1, we cannot unref that global variable, and we end
up here. Just ignore it to avoid the assert. */
} else {
GBE_ASSERT(0 && "Unsupported private memory access pattern");
}
}
}
}
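/*! Collect all the loops of the function. Each entry pairs a loop with the
* index of its parent loop in the output vector (-1 for outermost loops) */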
static INLINE void findAllLoops(LoopInfo *LI, std::vector<std::pair<Loop*, int>> &lp)
{
for (Loop::reverse_iterator I = LI->rbegin(), E = LI->rend(); I != E; ++I) {
lp.push_back(std::make_pair(*I, -1));
}
if (lp.size() == 0) return;
uint32_t i = 0;
do {
const std::vector<Loop*> subLoops = lp[i].first->getSubLoops();
for(auto sub : subLoops)
lp.push_back(std::make_pair(sub, i));
i++;
} while(i < lp.size());
}
void GenWriter::gatherLoopInfo(ir::Function &fn) {
vector<ir::LabelIndex> loopBBs;
vector<std::pair<ir::LabelIndex, ir::LabelIndex>> loopExits;
std::vector<std::pair<Loop*, int>> lp;
findAllLoops(LI, lp);
#if GBE_DEBUG
// check two loops' interference
for(unsigned int i = 0; i < lp.size(); i++) {
SmallVector<Loop::Edge, 8> exitBBs;
lp[i].first->getExitEdges(exitBBs);
const std::vector<BasicBlock*> &inBBs = lp[i].first->getBlocks();
std::vector<ir::LabelIndex> bbs1;
for(auto x : inBBs) {
bbs1.push_back(labelMap[x]);
}
std::sort(bbs1.begin(), bbs1.end());
for(unsigned int j = i+1; j < lp.size(); j++) {
if(! lp[i].first->contains(lp[j].first)) {
const std::vector<BasicBlock*> &inBBs2 = lp[j].first->getBlocks();
std::vector<ir::LabelIndex> bbs2;
std::vector<ir::LabelIndex> bbs3;
for(auto x : inBBs2) {
bbs2.push_back(labelMap[x]);
}
std::sort(bbs2.begin(), bbs2.end());
std::set_intersection(bbs1.begin(), bbs1.end(), bbs2.begin(), bbs2.end(), std::back_inserter(bbs3));
GBE_ASSERT(bbs3.size() < 1);
}
}
}
#endif
for (auto loop : lp) {
loopBBs.clear();
loopExits.clear();
const std::vector<BasicBlock*> &inBBs = loop.first->getBlocks();
for (auto b : inBBs) {
GBE_ASSERT(labelMap.find(b) != labelMap.end());
loopBBs.push_back(labelMap[b]);
}
SmallVector<Loop::Edge, 8> exitBBs;
loop.first->getExitEdges(exitBBs);
for(auto b : exitBBs){
GBE_ASSERT(labelMap.find(b.first) != labelMap.end());
GBE_ASSERT(labelMap.find(b.second) != labelMap.end());
loopExits.push_back(std::make_pair(labelMap[b.first], labelMap[b.second]));
}
fn.addLoop(loopBBs, loopExits);
}
}
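// Return the number of successors (children) of a basic block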
static unsigned getChildNo(BasicBlock *bb) {
TerminatorInst *term = bb->getTerminator();
return term->getNumSuccessors();
}
// Return NULL if index is beyond the number of children
static BasicBlock *getChildPossible(BasicBlock *bb, unsigned index) {
TerminatorInst *term = bb->getTerminator();
unsigned childNo = term->getNumSuccessors();
BasicBlock *child = NULL;
if(index < childNo) {
child = term->getSuccessor(index);
}
return child;
}
/*!
Sorting Basic blocks is mainly used to solve register liveness issue, take a
look at below CFG:
-<--1--
| |
| ->2
-- 3 <--- |
| ^ | -->4--
| | | | |
| | -----5<-- |
| | |
| ----------6<-----
|
-->7
1.) A register %10 is defined in bb4 and used in bb5 & bb6. In normal liveness
analysis, %10 is not alive in bb3. But under the SIMD execution model, after
executing bb4 some channels jump through bb5 to bb3 while other channels may
jump to bb6; we must execute bb3 first, then bb6, to avoid missing instructions.
The physical register of %10 is assigned some value in bb4, but when
executing bb3 its content may be overwritten, as %10 is dead in bb3. When
jumping back to execute bb6, it would read polluted data. What a disaster!
What we do here is a topological sort of the basic blocks. For this case
bb3 will be placed after bb5 & bb6. The liveness calculation then proceeds
as normal and will be correct.
2.) Another advantage of sorting basic blocks is reduced register pressure.
In the above CFG, a register defined in bb3 and used in bb7 would be
alive through 3,4,5,6,7, while in fact it should only be alive in bb3 and bb7.
After topological sorting, this kind of register is only alive in bb3
and bb7; register pressure in 4,5,6 is reduced.
3.) A classical post-order traversal automatically chooses an order for the
successors of a basic block, but this order may be hard to handle; take a look
at the CFG below:
1 <-----
/ |
2 --> 4 -
|
3
|
5
In the post-order traversal, it may be: 5->4->3->2->1, as 4 and 3 do not have
a strict order. This is a serious issue: a value defined in bb3 and used in bb5
may be overwritten in bb1. Remember the SIMD execution model? Some lanes
may execute bb4 after other lanes finish bb3, and then jump to bb1, but the live
range of the register does not cover bb1. What we do here is, for a loop
exit (here bb3), always make sure it is visited first in the post-order
traversal; for this graph, that means 5->3->4->2->1. Then a definition in bb3
that is used in bb5 will not interfere with any other value defined in the loop.
FIXME: For an irreducible graph, we need to identify it and convert it to a reducible graph.
*/
void GenWriter::sortBasicBlock(Function &F) {
BasicBlock &entry = F.getEntryBlock();
std::vector<BasicBlock*> visitStack;
std::vector<BasicBlock*> sorted;
std::set<BasicBlock*> visited;
visitStack.push_back(&entry);
visited.insert(&entry);
while (!visitStack.empty()) {
BasicBlock *top = visitStack.back();
unsigned childNo = getChildNo(top);
GBE_ASSERT(childNo <= 2);
BasicBlock *child0 = getChildPossible(top, 0);
BasicBlock *child1 = getChildPossible(top, 1);
if(childNo == 2) {
Loop *loop = LI->getLoopFor(top);
// visit loop exit node first, so loop exit block will be placed
// after blocks in loop in 'reverse post-order' list.
if (loop && loop->contains(child0) && !loop->contains(child1)) {
BasicBlock *tmp = child0; child0 = child1; child1 = tmp;
}
}
if (child0 != NULL && visited.find(child0) == visited.end()) {
visitStack.push_back(child0);
visited.insert(child0);
} else if (child1 != NULL && visited.find(child1) == visited.end()) {
visitStack.push_back(child1);
visited.insert(child1);
} else {
sorted.push_back(visitStack.back());
visitStack.pop_back();
}
}
Function::BasicBlockListType &bbList = F.getBasicBlockList();
for (std::vector<BasicBlock*>::iterator iter = sorted.begin(); iter != sorted.end(); ++iter) {
(*iter)->removeFromParent();
}
for (std::vector<BasicBlock*>::reverse_iterator iter = sorted.rbegin(); iter != sorted.rend(); ++iter) {
bbList.push_back(*iter);
}
}
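/*! Translate one kernel: emit the prototype, allocate all registers in a
* first pass over the instructions, then emit the Gen IR basic block by
* basic block, and finally run the local MOV/LOADI optimizations */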
void GenWriter::emitFunction(Function &F)
{
switch (F.getCallingConv()) {
#if LLVM_VERSION_MINOR <= 2
case CallingConv::PTX_Device: // we do not emit device function
return;
case CallingConv::PTX_Kernel:
#else
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::SPIR_KERNEL:
#endif
break;
default:
GBE_ASSERTM(false, "Unsupported calling convention");
}
ctx.startFunction(F.getName());
ir::Function &fn = ctx.getFunction();
this->regTranslator.clear();
this->labelMap.clear();
this->emitFunctionPrototype(F);
this->allocateGlobalVariableRegister(F);
sortBasicBlock(F);
// Visit all the instructions and emit the IR registers or the value to
// value mapping when a new register is not needed
pass = PASS_EMIT_REGISTERS;
for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I)
visit(*I);
// First create all the labels (one per block) ...
for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
this->newLabelIndex(BB);
// Then, for all branch instructions that have conditions, see if we can
// simplify the code by inverting condition code
for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
this->simplifyTerminator(BB);
// gather loop info, which is useful for liveness analysis
gatherLoopInfo(fn);
// ... then, emit the instructions for all basic blocks
pass = PASS_EMIT_INSTRUCTIONS;
for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
emitBasicBlock(BB);
ctx.endFunction();
// Liveness can be shared while we optimize the immediates and the MOVs
ir::Liveness liveness(fn);
if (OCL_OPTIMIZE_LOADI) this->removeLOADIs(liveness, fn);
if (OCL_OPTIMIZE_PHI_MOVES) this->optimizePhiCopy(liveness, fn);
if (OCL_OPTIMIZE_PHI_MOVES) this->removeMOVs(liveness, fn);
}
void GenWriter::regAllocateReturnInst(ReturnInst &I) {}
void GenWriter::emitReturnInst(ReturnInst &I) {
const ir::Function &fn = ctx.getFunction();
GBE_ASSERTM(fn.outputNum() <= 1, "no more than one value can be returned");
if (fn.outputNum() == 1 && I.getNumOperands() > 0) {
const ir::Register dst = fn.getOutput(0);
const ir::Register src = this->getRegister(I.getOperand(0));
const ir::RegisterFamily family = fn.getRegisterFamily(dst);
ctx.MOV(ir::getType(family), dst, src);
}
ctx.RET();
}
void GenWriter::regAllocateBinaryOperator(Instruction &I) {
this->newRegister(&I);
}
void GenWriter::emitBinaryOperator(Instruction &I) {
#if GBE_DEBUG
GBE_ASSERT(I.getType()->isPointerTy() == false);
// We accept logical operations on booleans
switch (I.getOpcode()) {
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
break;
default:
GBE_ASSERT(I.getType() != Type::getInt1Ty(I.getContext()));
}
#endif /* GBE_DEBUG */
// Get the element type for a vector
const ir::Type type = getType(ctx, I.getType());
// Emit the instructions in a row
const ir::Register dst = this->getRegister(&I);
const ir::Register src0 = this->getRegister(I.getOperand(0));
const ir::Register src1 = this->getRegister(I.getOperand(1));
switch (I.getOpcode()) {
case Instruction::Add:
case Instruction::FAdd: ctx.ADD(type, dst, src0, src1); break;
case Instruction::Sub:
case Instruction::FSub: ctx.SUB(type, dst, src0, src1); break;
case Instruction::Mul:
case Instruction::FMul: ctx.MUL(type, dst, src0, src1); break;
case Instruction::URem: ctx.REM(getUnsignedType(ctx, I.getType()), dst, src0, src1); break;
case Instruction::SRem:
case Instruction::FRem: ctx.REM(type, dst, src0, src1); break;
case Instruction::UDiv: ctx.DIV(getUnsignedType(ctx, I.getType()), dst, src0, src1); break;
case Instruction::SDiv:
case Instruction::FDiv: ctx.DIV(type, dst, src0, src1); break;
case Instruction::And: ctx.AND(type, dst, src0, src1); break;
case Instruction::Or: ctx.OR(type, dst, src0, src1); break;
case Instruction::Xor: ctx.XOR(type, dst, src0, src1); break;
case Instruction::Shl: ctx.SHL(type, dst, src0, src1); break;
case Instruction::LShr: ctx.SHR(getUnsignedType(ctx, I.getType()), dst, src0, src1); break;
case Instruction::AShr: ctx.ASR(type, dst, src0, src1); break;
default: NOT_SUPPORTED;
}
}
void GenWriter::regAllocateICmpInst(ICmpInst &I) {
this->newRegister(&I);
}
static ir::Type makeTypeSigned(const ir::Type &type) {
if (type == ir::TYPE_U8) return ir::TYPE_S8;
else if (type == ir::TYPE_U16) return ir::TYPE_S16;
else if (type == ir::TYPE_U32) return ir::TYPE_S32;
else if (type == ir::TYPE_U64) return ir::TYPE_S64;
return type;
}
static ir::Type makeTypeUnsigned(const ir::Type &type) {
if (type == ir::TYPE_S8) return ir::TYPE_U8;
else if (type == ir::TYPE_S16) return ir::TYPE_U16;
else if (type == ir::TYPE_S32) return ir::TYPE_U32;
else if (type == ir::TYPE_S64) return ir::TYPE_U64;
return type;
}
void GenWriter::emitICmpInst(ICmpInst &I) {
// Get the element type and the number of elements
Type *operandType = I.getOperand(0)->getType();
const ir::Type type = getType(ctx, operandType);
const ir::Type signedType = makeTypeSigned(type);
const ir::Type unsignedType = makeTypeUnsigned(type);
// Emit the instructions in a row
const ir::Register dst = this->getRegister(&I);
const ir::Register src0 = this->getRegister(I.getOperand(0));
const ir::Register src1 = this->getRegister(I.getOperand(1));
// We must invert the condition to simplify the branch code
if (conditionSet.find(&I) != conditionSet.end()) {
switch (I.getPredicate()) {
case ICmpInst::ICMP_EQ: ctx.NE(type, dst, src0, src1); break;
case ICmpInst::ICMP_NE: ctx.EQ(type, dst, src0, src1); break;
case ICmpInst::ICMP_ULE: ctx.GT(unsignedType, dst, src0, src1); break;
case ICmpInst::ICMP_SLE: ctx.GT(signedType, dst, src0, src1); break;
case ICmpInst::ICMP_UGE: ctx.LT(unsignedType, dst, src0, src1); break;
case ICmpInst::ICMP_SGE: ctx.LT(signedType, dst, src0, src1); break;
case ICmpInst::ICMP_ULT: ctx.GE(unsignedType, dst, src0, src1); break;
case ICmpInst::ICMP_SLT: ctx.GE(signedType, dst, src0, src1); break;
case ICmpInst::ICMP_UGT: ctx.LE(unsignedType, dst, src0, src1); break;
case ICmpInst::ICMP_SGT: ctx.LE(signedType, dst, src0, src1); break;
default: NOT_SUPPORTED;
}
}
// Nothing special to do
else {
switch (I.getPredicate()) {
case ICmpInst::ICMP_EQ: ctx.EQ(type, dst, src0, src1); break;
case ICmpInst::ICMP_NE: ctx.NE(type, dst, src0, src1); break;
case ICmpInst::ICMP_ULE: ctx.LE(unsignedType, dst, src0, src1); break;
case ICmpInst::ICMP_SLE: ctx.LE(signedType, dst, src0, src1); break;
case ICmpInst::ICMP_UGE: ctx.GE(unsignedType, dst, src0, src1); break;
case ICmpInst::ICMP_SGE: ctx.GE(signedType, dst, src0, src1); break;
case ICmpInst::ICMP_ULT: ctx.LT(unsignedType, dst, src0, src1); break;
case ICmpInst::ICMP_SLT: ctx.LT(signedType, dst, src0, src1); break;
case ICmpInst::ICMP_UGT: ctx.GT(unsignedType, dst, src0, src1); break;
case ICmpInst::ICMP_SGT: ctx.GT(signedType, dst, src0, src1); break;
default: NOT_SUPPORTED;
}
}
}
void GenWriter::regAllocateFCmpInst(FCmpInst &I) {
this->newRegister(&I);
}
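/*! Ordered float comparisons map directly onto Gen IR. Unordered ones are
* emitted as the complementary ordered comparison XORed with constant 1 */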
void GenWriter::emitFCmpInst(FCmpInst &I) {
// Get the element type and the number of elements
Type *operandType = I.getOperand(0)->getType();
const ir::Type type = getType(ctx, operandType);
const ir::Type insnType = getType(ctx, I.getType());
// Emit the instructions in a row
const ir::Register dst = this->getRegister(&I);
const ir::Register src0 = this->getRegister(I.getOperand(0));
const ir::Register src1 = this->getRegister(I.getOperand(1));
const ir::Register tmp = ctx.reg(getFamily(ctx, I.getType()));
Value *cv = ConstantInt::get(I.getType(), 1);
switch (I.getPredicate()) {
case ICmpInst::FCMP_OEQ: ctx.EQ(type, dst, src0, src1); break;
case ICmpInst::FCMP_ONE: ctx.NE(type, dst, src0, src1); break;
case ICmpInst::FCMP_OLE: ctx.LE(type, dst, src0, src1); break;
case ICmpInst::FCMP_OGE: ctx.GE(type, dst, src0, src1); break;
case ICmpInst::FCMP_OLT: ctx.LT(type, dst, src0, src1); break;
case ICmpInst::FCMP_OGT: ctx.GT(type, dst, src0, src1); break;
case ICmpInst::FCMP_ORD:
// If one of src0 and src1 is a constant, that constant value must be
// ordered; otherwise LLVM would have optimized the instruction to true.
// So discard the constant and only compare the other source with itself.
if(isa(I.getOperand(0)))
ctx.EQ(type, dst, src1, src1);
else if(isa(I.getOperand(1)))
ctx.EQ(type, dst, src0, src0);
else
ctx.ORD(type, dst, src0, src1);
break;
case ICmpInst::FCMP_UNO:
if(isa(I.getOperand(0)))
ctx.NE(type, dst, src1, src1);
else if(isa(I.getOperand(1)))
ctx.NE(type, dst, src0, src0);
else {
ctx.ORD(type, tmp, src0, src1);
ctx.XOR(insnType, dst, tmp, getRegister(cv)); //TODO: Use NOT directly
}
break;
case ICmpInst::FCMP_UEQ:
ctx.NE(type, tmp, src0, src1);
ctx.XOR(insnType, dst, tmp, getRegister(cv));
break;
case ICmpInst::FCMP_UGT:
ctx.LE(type, tmp, src0, src1);
ctx.XOR(insnType, dst, tmp, getRegister(cv));
break;
case ICmpInst::FCMP_UGE:
ctx.LT(type, tmp, src0, src1);
ctx.XOR(insnType, dst, tmp, getRegister(cv));
break;
case ICmpInst::FCMP_ULT:
ctx.GE(type, tmp, src0, src1);
ctx.XOR(insnType, dst, tmp, getRegister(cv));
break;
case ICmpInst::FCMP_ULE:
ctx.GT(type, tmp, src0, src1);
ctx.XOR(insnType, dst, tmp, getRegister(cv));
break;
case ICmpInst::FCMP_UNE:
ctx.EQ(type, tmp, src0, src1);
ctx.XOR(insnType, dst, tmp, getRegister(cv));
break;
case ICmpInst::FCMP_TRUE:
ctx.MOV(insnType, dst, getRegister(cv));
break;
default: NOT_SUPPORTED;
}
}
void GenWriter::regAllocateCastInst(CastInst &I) {
Value *dstValue = &I;
Value *srcValue = I.getOperand(0);
const auto op = I.getOpcode();
switch (op)
{
// When casting between pointers and integers, watch the type sizes
case Instruction::PtrToInt:
case Instruction::IntToPtr:
{
Type *dstType = dstValue->getType();
Type *srcType = srcValue->getType();
if (getTypeByteSize(unit, dstType) == getTypeByteSize(unit, srcType))
{
regTranslator.newValueProxy(srcValue, dstValue);
} else
this->newRegister(dstValue);
}
break;
// Bitcast just forward registers
case Instruction::BitCast:
{
Type *srcType = srcValue->getType();
Type *dstType = dstValue->getType();
if(srcType->isVectorTy() || dstType->isVectorTy())
this->newRegister(dstValue);
else
regTranslator.newValueProxy(srcValue, dstValue);
}
break;
// Various conversion operations -> just allocate registers for them
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::SExt:
case Instruction::ZExt:
case Instruction::FPExt:
case Instruction::FPTrunc:
case Instruction::Trunc:
this->newRegister(&I);
break;
default: NOT_SUPPORTED;
}
}
void GenWriter::emitCastInst(CastInst &I) {
switch (I.getOpcode())
{
case Instruction::PtrToInt:
case Instruction::IntToPtr:
{
Value *dstValue = &I;
Value *srcValue = I.getOperand(0);
Type *dstType = dstValue->getType();
Type *srcType = srcValue->getType();
if (getTypeByteSize(unit, dstType) != getTypeByteSize(unit, srcType)) {
const ir::Register dst = this->getRegister(&I);
const ir::Register src = this->getRegister(srcValue);
ctx.CVT(getType(ctx, dstType), getType(ctx, srcType), dst, src);
}
}
break;
case Instruction::BitCast:
{
Value *srcValue = I.getOperand(0);
Value *dstValue = &I;
uint32_t srcElemNum = 0, dstElemNum = 0 ;
ir::Type srcType = getVectorInfo(ctx, srcValue, srcElemNum);
ir::Type dstType = getVectorInfo(ctx, dstValue, dstElemNum);
// As long and double are not compatible in register storage
// and we do not support double yet, simply put an assert here
GBE_ASSERT(!(srcType == ir::TYPE_S64 && dstType == ir::TYPE_DOUBLE));
GBE_ASSERT(!(dstType == ir::TYPE_S64 && srcType == ir::TYPE_DOUBLE));
if(srcElemNum > 1 || dstElemNum > 1) {
// Build the tuple data in the vector
vector<ir::Register> srcTupleData;
vector<ir::Register> dstTupleData;
uint32_t elemID = 0;
for (elemID = 0; elemID < srcElemNum; ++elemID) {
ir::Register reg;
reg = this->getRegister(srcValue, elemID);
srcTupleData.push_back(reg);
}
for (elemID = 0; elemID < dstElemNum; ++elemID) {
ir::Register reg;
reg = this->getRegister(dstValue, elemID);
dstTupleData.push_back(reg);
}
const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], srcElemNum);
const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], dstElemNum);
ctx.BITCAST(dstType, srcType, dstTuple, srcTuple, dstElemNum, srcElemNum);
}
}
break; // nothing to emit here
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::SExt:
case Instruction::ZExt:
case Instruction::FPExt:
case Instruction::FPTrunc:
case Instruction::Trunc:
{
// Get the element type for a vector
Type *llvmDstType = I.getType();
Type *llvmSrcType = I.getOperand(0)->getType();
ir::Type dstType;
if (I.getOpcode() == Instruction::FPToUI)
dstType = getUnsignedType(ctx, llvmDstType);
else
dstType = getType(ctx, llvmDstType);
ir::Type srcType;
if (I.getOpcode() == Instruction::ZExt || I.getOpcode() == Instruction::UIToFP) {
srcType = getUnsignedType(ctx, llvmSrcType);
} else {
srcType = getType(ctx, llvmSrcType);
}
// We use a select (0,1) rather than a convert when the source is a boolean
if (srcType == ir::TYPE_BOOL) {
const ir::RegisterFamily family = getFamily(dstType);
const ir::ImmediateIndex zero = ctx.newIntegerImmediate(0, dstType);
ir::ImmediateIndex one;
if (I.getOpcode() == Instruction::SExt
&& (dstType == ir::TYPE_S8 || dstType == ir::TYPE_S16 || dstType == ir::TYPE_S32 || dstType == ir::TYPE_S64))
one = ctx.newIntegerImmediate(-1, dstType);
else
one = ctx.newIntegerImmediate(1, dstType);
const ir::Register zeroReg = ctx.reg(family);
const ir::Register oneReg = ctx.reg(family);
ctx.LOADI(dstType, zeroReg, zero);
ctx.LOADI(dstType, oneReg, one);
const ir::Register dst = this->getRegister(&I);
const ir::Register src = this->getRegister(I.getOperand(0));
ctx.SEL(dstType, dst, src, oneReg, zeroReg);
}
// Use a convert for the other cases
else {
const ir::Register dst = this->getRegister(&I);
const ir::Register src = this->getRegister(I.getOperand(0));
ctx.CVT(dstType, srcType, dst, src);
}
}
break;
default: NOT_SUPPORTED;
}
}
/*! Because there are still fake insert/extract instructions for
* load/store, we keep empty functions here */
void GenWriter::regAllocateInsertElement(InsertElementInst &I) {}
void GenWriter::emitInsertElement(InsertElementInst &I) {
const VectorType *type = dyn_cast<VectorType>(I.getType());
GBE_ASSERT(type);
const int elemNum = type->getNumElements();
Value *vec = I.getOperand(0);
Value *value = I.getOperand(1);
const Value *index = I.getOperand(2);
const ConstantInt *c = dyn_cast<ConstantInt>(index);
int i = c->getValue().getSExtValue();
// The inserted lane aliases the scalar value; every other lane aliases the
// corresponding lane of the source vector
for(int j = 0; j < elemNum; j++) {
if(i == j)
regTranslator.newValueProxy(value, &I, 0, i);
else
regTranslator.newValueProxy(vec, &I, j, j);
}
}
void GenWriter::regAllocateExtractElement(ExtractElementInst &I) {
Value *vec = I.getVectorOperand();
const Value *index = I.getIndexOperand();
const ConstantInt *c = dyn_cast<ConstantInt>(index);
GBE_ASSERT(c);
int i = c->getValue().getSExtValue();
regTranslator.newValueProxy(vec, &I, i, 0);
}
void GenWriter::emitExtractElement(ExtractElementInst &I) {
}
void GenWriter::regAllocateExtractValue(ExtractValueInst &I) {
Value *agg = I.getAggregateOperand();
for (const unsigned *i = I.idx_begin(), *e = I.idx_end(); i != e; i++)
regTranslator.newValueProxy(agg, &I, *i, 0);
}
void GenWriter::emitExtractValue(ExtractValueInst &I) {
}
void GenWriter::regAllocateShuffleVectorInst(ShuffleVectorInst &I) {}
void GenWriter::emitShuffleVectorInst(ShuffleVectorInst &I) {}
void GenWriter::regAllocateSelectInst(SelectInst &I) {
this->newRegister(&I);
}
void GenWriter::emitSelectInst(SelectInst &I) {
// Get the element type for a vector
const ir::Type type = getType(ctx, I.getType());
// Emit the instructions in a row
const ir::Register dst = this->getRegister(&I);
const ir::Register cond = this->getRegister(I.getOperand(0));
const ir::Register src0 = this->getRegister(I.getOperand(1));
const ir::Register src1 = this->getRegister(I.getOperand(2));
ctx.SEL(type, dst, cond, src0, src1);
}
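/*! A PHI node gets two registers: the PHI itself and a copy written on the
* incoming edges, which avoids the classic lost-copy problem when getting
* out of SSA */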
void GenWriter::regAllocatePHINode(PHINode &I) {
// Copy 1 for the PHI
this->newRegister(&I);
// Copy 2 to avoid lost copy issue
Value *copy = this->getPHICopy(&I);
this->newRegister(&I, copy);
}
void GenWriter::emitPHINode(PHINode &I) {
Value *copy = this->getPHICopy(&I);
const ir::Type type = getType(ctx, I.getType());
const ir::Register dst = this->getRegister(&I);
const ir::Register src = this->getRegister(copy);
ctx.MOV(type, dst, src);
phiMap.insert(std::make_pair(dst, src));
}
void GenWriter::regAllocateBranchInst(BranchInst &I) {}
void GenWriter::emitBranchInst(BranchInst &I) {
// Emit MOVs if required
BasicBlock *bb = I.getParent();
this->emitMovForPHI(bb, I.getSuccessor(0));
if (I.isConditional())
this->emitMovForPHI(bb, I.getSuccessor(1));
// Unconditional branch. We only emit a jump when the target is not the
// fall-through block
if (I.isConditional() == false) {
BasicBlock *target = I.getSuccessor(0);
if (std::next(Function::iterator(bb)) != Function::iterator(target)) {
GBE_ASSERT(labelMap.find(target) != labelMap.end());
const ir::LabelIndex labelIndex = labelMap[target];
ctx.BRA(labelIndex);
}
}
// The LLVM branch has two targets
else {
BasicBlock *taken = NULL, *nonTaken = NULL;
Value *condition = I.getCondition();
// We may have inverted the branch condition to simplify the branching code
const bool inverted = conditionSet.find(condition) != conditionSet.end();
taken = inverted ? I.getSuccessor(1) : I.getSuccessor(0);
nonTaken = inverted ? I.getSuccessor(0) : I.getSuccessor(1);
// Get both taken label and predicate register
GBE_ASSERT(labelMap.find(taken) != labelMap.end());
const ir::LabelIndex index = labelMap[taken];
const ir::Register reg = this->getRegister(condition);
ctx.BRA(index, reg);
// If non-taken target is the next block, there is nothing to do
BasicBlock *bb = I.getParent();
if (std::next(Function::iterator(bb)) == Function::iterator(nonTaken))
return;
// This is slightly more complicated here. We need to issue one more
// branch for the non-taken condition.
GBE_ASSERT(labelMap.find(nonTaken) != labelMap.end());
const ir::LabelIndex untakenIndex = ctx.label();
ctx.LABEL(untakenIndex);
ctx.BRA(labelMap[nonTaken]);
}
}
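/*! Calls are either LLVM intrinsics or Gen OCL built-ins. Well-known
* built-ins (work-item queries) map directly onto special registers; the
* other calls just get their destination register allocated here */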
void GenWriter::regAllocateCallInst(CallInst &I) {
Value *dst = &I;
Value *Callee = I.getCalledValue();
GBE_ASSERT(ctx.getFunction().getProfile() == ir::PROFILE_OCL);
GBE_ASSERT(isa<InlineAsm>(I.getCalledValue()) == false);
GBE_ASSERT(I.hasStructRetAttr() == false);
// We only support a small number of intrinsics right now
if (Function *F = I.getCalledFunction()) {
const Intrinsic::ID intrinsicID = (Intrinsic::ID) F->getIntrinsicID();
if (intrinsicID != 0) {
switch (F->getIntrinsicID()) {
case Intrinsic::stacksave:
this->newRegister(&I);
break;
case Intrinsic::stackrestore:
break;
#if LLVM_VERSION_MINOR >= 2
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
break;
case Intrinsic::fmuladd:
this->newRegister(&I);
break;
#endif /* LLVM_VERSION_MINOR >= 2 */
case Intrinsic::debugtrap:
case Intrinsic::dbg_value:
case Intrinsic::dbg_declare:
break;
case Intrinsic::sadd_with_overflow:
case Intrinsic::uadd_with_overflow:
case Intrinsic::ssub_with_overflow:
case Intrinsic::usub_with_overflow:
case Intrinsic::smul_with_overflow:
case Intrinsic::umul_with_overflow:
this->newRegister(&I);
break;
case Intrinsic::ctlz:
case Intrinsic::bswap:
this->newRegister(&I);
break;
case Intrinsic::fabs:
case Intrinsic::sqrt:
case Intrinsic::ceil:
case Intrinsic::fma:
case Intrinsic::trunc:
case Intrinsic::rint:
case Intrinsic::floor:
case Intrinsic::sin:
case Intrinsic::cos:
case Intrinsic::log2:
case Intrinsic::exp2:
case Intrinsic::pow:
this->newRegister(&I);
break;
default:
GBE_ASSERTM(false, "Unsupported intrinsics");
}
return;
}
}
// Get the name of the called function and handle it
const std::string fnName = Callee->stripPointerCasts()->getName();
auto genIntrinsicID = intrinsicMap.find(fnName);
switch (genIntrinsicID) {
case GEN_OCL_GET_GROUP_ID0:
regTranslator.newScalarProxy(ir::ocl::groupid0, dst); break;
case GEN_OCL_GET_GROUP_ID1:
regTranslator.newScalarProxy(ir::ocl::groupid1, dst); break;
case GEN_OCL_GET_GROUP_ID2:
regTranslator.newScalarProxy(ir::ocl::groupid2, dst); break;
case GEN_OCL_GET_LOCAL_ID0:
regTranslator.newScalarProxy(ir::ocl::lid0, dst); break;
case GEN_OCL_GET_LOCAL_ID1:
regTranslator.newScalarProxy(ir::ocl::lid1, dst); break;
case GEN_OCL_GET_LOCAL_ID2:
regTranslator.newScalarProxy(ir::ocl::lid2, dst); break;
case GEN_OCL_GET_NUM_GROUPS0:
regTranslator.newScalarProxy(ir::ocl::numgroup0, dst); break;
case GEN_OCL_GET_NUM_GROUPS1:
regTranslator.newScalarProxy(ir::ocl::numgroup1, dst); break;
case GEN_OCL_GET_NUM_GROUPS2:
regTranslator.newScalarProxy(ir::ocl::numgroup2, dst); break;
case GEN_OCL_GET_LOCAL_SIZE0:
regTranslator.newScalarProxy(ir::ocl::lsize0, dst); break;
case GEN_OCL_GET_LOCAL_SIZE1:
regTranslator.newScalarProxy(ir::ocl::lsize1, dst); break;
case GEN_OCL_GET_LOCAL_SIZE2:
regTranslator.newScalarProxy(ir::ocl::lsize2, dst); break;
case GEN_OCL_GET_GLOBAL_SIZE0:
regTranslator.newScalarProxy(ir::ocl::gsize0, dst); break;
case GEN_OCL_GET_GLOBAL_SIZE1:
regTranslator.newScalarProxy(ir::ocl::gsize1, dst); break;
case GEN_OCL_GET_GLOBAL_SIZE2:
regTranslator.newScalarProxy(ir::ocl::gsize2, dst); break;
case GEN_OCL_GET_GLOBAL_OFFSET0:
regTranslator.newScalarProxy(ir::ocl::goffset0, dst); break;
case GEN_OCL_GET_GLOBAL_OFFSET1:
regTranslator.newScalarProxy(ir::ocl::goffset1, dst); break;
case GEN_OCL_GET_GLOBAL_OFFSET2:
regTranslator.newScalarProxy(ir::ocl::goffset2, dst); break;
case GEN_OCL_GET_WORK_DIM:
regTranslator.newScalarProxy(ir::ocl::workdim, dst); break;
case GEN_OCL_FBH:
case GEN_OCL_FBL:
case GEN_OCL_CBIT:
case GEN_OCL_RSQ:
case GEN_OCL_RCP:
case GEN_OCL_ABS:
case GEN_OCL_GET_IMAGE_WIDTH:
case GEN_OCL_GET_IMAGE_HEIGHT:
case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE:
case GEN_OCL_GET_IMAGE_CHANNEL_ORDER:
case GEN_OCL_GET_IMAGE_DEPTH:
case GEN_OCL_ATOMIC_ADD0:
case GEN_OCL_ATOMIC_ADD1:
case GEN_OCL_ATOMIC_SUB0:
case GEN_OCL_ATOMIC_SUB1:
case GEN_OCL_ATOMIC_AND0:
case GEN_OCL_ATOMIC_AND1:
case GEN_OCL_ATOMIC_OR0:
case GEN_OCL_ATOMIC_OR1:
case GEN_OCL_ATOMIC_XOR0:
case GEN_OCL_ATOMIC_XOR1:
case GEN_OCL_ATOMIC_XCHG0:
case GEN_OCL_ATOMIC_XCHG1:
case GEN_OCL_ATOMIC_UMAX0:
case GEN_OCL_ATOMIC_UMAX1:
case GEN_OCL_ATOMIC_UMIN0:
case GEN_OCL_ATOMIC_UMIN1:
case GEN_OCL_ATOMIC_IMAX0:
case GEN_OCL_ATOMIC_IMAX1:
case GEN_OCL_ATOMIC_IMIN0:
case GEN_OCL_ATOMIC_IMIN1:
case GEN_OCL_ATOMIC_INC0:
case GEN_OCL_ATOMIC_INC1:
case GEN_OCL_ATOMIC_DEC0:
case GEN_OCL_ATOMIC_DEC1:
case GEN_OCL_ATOMIC_CMPXCHG0:
case GEN_OCL_ATOMIC_CMPXCHG1:
// No structure can be returned
this->newRegister(&I);
break;
case GEN_OCL_FORCE_SIMD8:
case GEN_OCL_FORCE_SIMD16:
case GEN_OCL_LBARRIER:
case GEN_OCL_GBARRIER:
case GEN_OCL_LGBARRIER:
ctx.getFunction().setUseSLM(true);
break;
case GEN_OCL_WRITE_IMAGE_I:
case GEN_OCL_WRITE_IMAGE_UI:
case GEN_OCL_WRITE_IMAGE_F:
break;
case GEN_OCL_READ_IMAGE_I:
case GEN_OCL_READ_IMAGE_UI:
case GEN_OCL_READ_IMAGE_F:
{
// dst is a 4-element vector. We allocate all 4 registers here.
uint32_t elemNum;
(void)getVectorInfo(ctx, &I, elemNum);
GBE_ASSERT(elemNum == 4);
this->newRegister(&I);
break;
}
case GEN_OCL_MUL_HI_INT:
case GEN_OCL_MUL_HI_UINT:
case GEN_OCL_MUL_HI_I64:
case GEN_OCL_MUL_HI_UI64:
case GEN_OCL_UPSAMPLE_SHORT:
case GEN_OCL_UPSAMPLE_INT:
case GEN_OCL_UPSAMPLE_LONG:
case GEN_OCL_FMAX:
case GEN_OCL_FMIN:
case GEN_OCL_SADD_SAT_CHAR:
case GEN_OCL_SADD_SAT_SHORT:
case GEN_OCL_SADD_SAT_INT:
case GEN_OCL_SADD_SAT_LONG:
case GEN_OCL_UADD_SAT_CHAR:
case GEN_OCL_UADD_SAT_SHORT:
case GEN_OCL_UADD_SAT_INT:
case GEN_OCL_UADD_SAT_LONG:
case GEN_OCL_SSUB_SAT_CHAR:
case GEN_OCL_SSUB_SAT_SHORT:
case GEN_OCL_SSUB_SAT_INT:
case GEN_OCL_SSUB_SAT_LONG:
case GEN_OCL_USUB_SAT_CHAR:
case GEN_OCL_USUB_SAT_SHORT:
case GEN_OCL_USUB_SAT_INT:
case GEN_OCL_USUB_SAT_LONG:
case GEN_OCL_HADD:
case GEN_OCL_RHADD:
case GEN_OCL_I64HADD:
case GEN_OCL_I64RHADD:
case GEN_OCL_I64_MAD_SAT:
case GEN_OCL_I64_MAD_SATU:
case GEN_OCL_SAT_CONV_U8_TO_I8:
case GEN_OCL_SAT_CONV_I16_TO_I8:
case GEN_OCL_SAT_CONV_U16_TO_I8:
case GEN_OCL_SAT_CONV_I32_TO_I8:
case GEN_OCL_SAT_CONV_U32_TO_I8:
case GEN_OCL_SAT_CONV_F32_TO_I8:
case GEN_OCL_SAT_CONV_I8_TO_U8:
case GEN_OCL_SAT_CONV_I16_TO_U8:
case GEN_OCL_SAT_CONV_U16_TO_U8:
case GEN_OCL_SAT_CONV_I32_TO_U8:
case GEN_OCL_SAT_CONV_U32_TO_U8:
case GEN_OCL_SAT_CONV_F32_TO_U8:
case GEN_OCL_SAT_CONV_U16_TO_I16:
case GEN_OCL_SAT_CONV_I32_TO_I16:
case GEN_OCL_SAT_CONV_U32_TO_I16:
case GEN_OCL_SAT_CONV_F32_TO_I16:
case GEN_OCL_SAT_CONV_I16_TO_U16:
case GEN_OCL_SAT_CONV_I32_TO_U16:
case GEN_OCL_SAT_CONV_U32_TO_U16:
case GEN_OCL_SAT_CONV_F32_TO_U16:
case GEN_OCL_SAT_CONV_U32_TO_I32:
case GEN_OCL_SAT_CONV_F32_TO_I32:
case GEN_OCL_SAT_CONV_I32_TO_U32:
case GEN_OCL_SAT_CONV_F32_TO_U32:
case GEN_OCL_CONV_F16_TO_F32:
case GEN_OCL_CONV_F32_TO_F16:
case GEN_OCL_SIMD_ANY:
case GEN_OCL_SIMD_ALL:
case GEN_OCL_SIMD_SIZE:
case GEN_OCL_READ_TM:
case GEN_OCL_REGION:
case GEN_OCL_SIMD_ID:
case GEN_OCL_SIMD_SHUFFLE:
this->newRegister(&I);
break;
case GEN_OCL_PRINTF:
break;
case GEN_OCL_NOT_FOUND:
default:
std::cerr << "Caller instruction: " << std::endl;
I.dump();
std::cerr << "Callee function: " << std::endl;
Callee->dump();
GBE_ASSERT(0);
}
}
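// Emit a one-source ALU instruction for a unary built-in: the single call
// argument is the source and the call value itself is the destination.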
void GenWriter::emitUnaryCallInst(CallInst &I, CallSite &CS, ir::Opcode opcode, ir::Type type) {
CallSite::arg_iterator AI = CS.arg_begin();
#if GBE_DEBUG
CallSite::arg_iterator AE = CS.arg_end();
#endif /* GBE_DEBUG */
GBE_ASSERT(AI != AE);
const ir::Register src = this->getRegister(*AI);
const ir::Register dst = this->getRegister(&I);
ctx.ALU1(opcode, type, dst, src);
}
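// Emit an atomic operation: gather the BTIs the pointer may alias to get the
// address space, then pass all call arguments (address and operands) to the
// ATOMIC instruction as one source tuple.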
void GenWriter::emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode) {
CallSite::arg_iterator AI = CS.arg_begin();
CallSite::arg_iterator AE = CS.arg_end();
GBE_ASSERT(AI != AE);
const ir::Register dst = this->getRegister(&I);
ir::BTI bti;
gatherBTI(&I, bti);
const ir::AddressSpace addrSpace = btiToGen(bti);
vector<ir::Register> src;
uint32_t srcNum = 0;
while(AI != AE) {
src.push_back(this->getRegister(*(AI++)));
srcNum++;
}
const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], srcNum);
ctx.ATOMIC(opcode, dst, addrSpace, bti, srcTuple);
}
/* Append a new sampler. Should be called before any reference to
* a sampler_t value. */
uint8_t GenWriter::appendSampler(CallSite::arg_iterator AI) {
Constant *CPV = dyn_cast<Constant>(*AI);
uint8_t index;
if (CPV != NULL)
{
// This is not a kernel-argument sampler; we need to append it to the
// sampler set and allocate a sampler slot for it.
const ir::Immediate &x = processConstantImm(CPV);
GBE_ASSERTM(x.getType() == ir::TYPE_U32 || x.getType() == ir::TYPE_S32, "Invalid sampler type");
index = ctx.getFunction().getSamplerSet()->append(x.getIntegerValue(), &ctx);
} else {
const ir::Register samplerReg = this->getRegister(*AI);
index = ctx.getFunction().getSamplerSet()->append(samplerReg, &ctx);
}
return index;
}
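// Return the image slot allocated for the image argument (operand 0).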
uint8_t GenWriter::getImageID(CallInst &I) {
const ir::Register imageReg = this->getRegister(I.getOperand(0));
return ctx.getFunction().getImageSet()->getIdx(imageReg);
}
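// Emit the IR for a call site. LLVM intrinsics are lowered first; any other
// callee must be a Gen OCL built-in resolved by name through intrinsicMap.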
void GenWriter::emitCallInst(CallInst &I) {
if (Function *F = I.getCalledFunction()) {
if (F->getIntrinsicID() != 0) {
const ir::Function &fn = ctx.getFunction();
// Get the function arguments
CallSite CS(&I);
CallSite::arg_iterator AI = CS.arg_begin();
#if GBE_DEBUG
CallSite::arg_iterator AE = CS.arg_end();
#endif /* GBE_DEBUG */
switch (F->getIntrinsicID()) {
case Intrinsic::stacksave:
{
const ir::Register dst = this->getRegister(&I);
const ir::Register src = ir::ocl::stackptr;
const ir::RegisterFamily family = fn.getRegisterFamily(dst);
ctx.MOV(ir::getType(family), dst, src);
}
break;
case Intrinsic::stackrestore:
{
const ir::Register dst = ir::ocl::stackptr;
const ir::Register src = this->getRegister(I.getOperand(0));
const ir::RegisterFamily family = fn.getRegisterFamily(dst);
ctx.MOV(ir::getType(family), dst, src);
}
break;
#if LLVM_VERSION_MINOR >= 2
case Intrinsic::fmuladd:
{
const ir::Register tmp = ctx.reg(ir::FAMILY_DWORD);
const ir::Register dst = this->getRegister(&I);
const ir::Register src0 = this->getRegister(I.getOperand(0));
const ir::Register src1 = this->getRegister(I.getOperand(1));
const ir::Register src2 = this->getRegister(I.getOperand(2));
ctx.MUL(ir::TYPE_FLOAT, tmp, src0, src1);
ctx.ADD(ir::TYPE_FLOAT, dst, tmp, src2);
}
break;
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
break;
#endif /* LLVM_VERSION_MINOR >= 2 */
case Intrinsic::debugtrap:
case Intrinsic::dbg_value:
case Intrinsic::dbg_declare:
break;
case Intrinsic::uadd_with_overflow:
{
Type *llvmDstType = I.getType();
GBE_ASSERT(llvmDstType->isStructTy());
ir::Type dst0Type = getType(ctx, llvmDstType->getStructElementType(0));
const ir::Register dst0 = this->getRegister(&I, 0);
const ir::Register src0 = this->getRegister(I.getOperand(0));
const ir::Register src1 = this->getRegister(I.getOperand(1));
ctx.ADD(dst0Type, dst0, src0, src1);
ir::Register overflow = this->getRegister(&I, 1);
const ir::Type unsignedType = makeTypeUnsigned(dst0Type);
ctx.LT(unsignedType, overflow, dst0, src1);
}
break;
case Intrinsic::usub_with_overflow:
{
Type *llvmDstType = I.getType();
GBE_ASSERT(llvmDstType->isStructTy());
ir::Type dst0Type = getType(ctx, llvmDstType->getStructElementType(0));
const ir::Register dst0 = this->getRegister(&I, 0);
const ir::Register src0 = this->getRegister(I.getOperand(0));
const ir::Register src1 = this->getRegister(I.getOperand(1));
ctx.SUB(dst0Type, dst0, src0, src1);
ir::Register overflow = this->getRegister(&I, 1);
const ir::Type unsignedType = makeTypeUnsigned(dst0Type);
ctx.GT(unsignedType, overflow, dst0, src0);
}
break;
case Intrinsic::sadd_with_overflow:
case Intrinsic::ssub_with_overflow:
case Intrinsic::smul_with_overflow:
case Intrinsic::umul_with_overflow:
NOT_IMPLEMENTED;
break;
case Intrinsic::ctlz:
{
Type *llvmDstType = I.getType();
ir::Type dstType = getType(ctx, llvmDstType);
Type *llvmSrcType = I.getOperand(0)->getType();
ir::Type srcType = getUnsignedType(ctx, llvmSrcType);
//the llvm.ctlz.i64 is lowered to two llvm.ctlz.i32 calls in ocl_clz.ll
GBE_ASSERT(srcType != ir::TYPE_U64);
const ir::Register dst = this->getRegister(&I);
const ir::Register src = this->getRegister(I.getOperand(0));
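// LZD counts leading zeros on a 32-bit value, so for narrower types we must
// subtract the zero bits introduced by the widening conversion: 16 for a
// 16-bit source and 24 for an 8-bit one.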
int imm_value = 0;
if(srcType == ir::TYPE_U16) {
imm_value = 16;
}else if(srcType == ir::TYPE_U8) {
imm_value = 24;
}
if(srcType == ir::TYPE_U16 || srcType == ir::TYPE_U8) {
ir::ImmediateIndex imm;
ir::Type tmpType = ir::TYPE_S32;
imm = ctx.newIntegerImmediate(imm_value, tmpType);
const ir::RegisterFamily family = getFamily(tmpType);
const ir::Register immReg = ctx.reg(family);
ctx.LOADI(ir::TYPE_S32, immReg, imm);
ir::Register tmp0 = ctx.reg(getFamily(tmpType));
ir::Register tmp1 = ctx.reg(getFamily(tmpType));
ir::Register tmp2 = ctx.reg(getFamily(tmpType));
ctx.CVT(tmpType, srcType, tmp0, src);
ctx.ALU1(ir::OP_LZD, tmpType, tmp1, tmp0);
ctx.SUB(tmpType, tmp2, tmp1, immReg);
ctx.CVT(dstType, tmpType, dst, tmp2);
}
else
{
ctx.ALU1(ir::OP_LZD, dstType, dst, src);
}
}
break;
case Intrinsic::fma:
{
ir::Type srcType = getType(ctx, I.getType());
const ir::Register dst = this->getRegister(&I);
const ir::Register src0 = this->getRegister(I.getOperand(0));
const ir::Register src1 = this->getRegister(I.getOperand(1));
const ir::Register src2 = this->getRegister(I.getOperand(2));
ctx.MAD(srcType, dst, src0, src1, src2);
}
break;
case Intrinsic::sqrt: this->emitUnaryCallInst(I,CS,ir::OP_SQR); break;
case Intrinsic::ceil: this->emitUnaryCallInst(I,CS,ir::OP_RNDU); break;
case Intrinsic::fabs: this->emitUnaryCallInst(I,CS,ir::OP_ABS); break;
case Intrinsic::trunc: this->emitUnaryCallInst(I,CS,ir::OP_RNDZ); break;
case Intrinsic::rint: this->emitUnaryCallInst(I,CS,ir::OP_RNDE); break;
case Intrinsic::floor: this->emitUnaryCallInst(I,CS,ir::OP_RNDD); break;
case Intrinsic::sin: this->emitUnaryCallInst(I,CS,ir::OP_SIN); break;
case Intrinsic::cos: this->emitUnaryCallInst(I,CS,ir::OP_COS); break;
case Intrinsic::log2: this->emitUnaryCallInst(I,CS,ir::OP_LOG); break;
case Intrinsic::exp2: this->emitUnaryCallInst(I,CS,ir::OP_EXP); break;
case Intrinsic::bswap:
this->emitUnaryCallInst(I,CS,ir::OP_BSWAP, getUnsignedType(ctx, I.getType())); break;
case Intrinsic::pow:
{
const ir::Register src0 = this->getRegister(*AI); ++AI;
const ir::Register src1 = this->getRegister(*AI);
const ir::Register dst = this->getRegister(&I);
ctx.POW(ir::TYPE_FLOAT, dst, src0, src1);
break;
}
default: NOT_IMPLEMENTED;
}
} else {
// Get the name of the called function and handle it
Value *Callee = I.getCalledValue();
const std::string fnName = Callee->stripPointerCasts()->getName();
auto genIntrinsicID = intrinsicMap.find(fnName);
// Get the function arguments
CallSite CS(&I);
CallSite::arg_iterator AI = CS.arg_begin();
#if GBE_DEBUG
CallSite::arg_iterator AE = CS.arg_end();
#endif /* GBE_DEBUG */
switch (genIntrinsicID) {
case GEN_OCL_FBH: this->emitUnaryCallInst(I,CS,ir::OP_FBH); break;
case GEN_OCL_FBL: this->emitUnaryCallInst(I,CS,ir::OP_FBL); break;
case GEN_OCL_CBIT: this->emitUnaryCallInst(I,CS,ir::OP_CBIT, getUnsignedType(ctx, (*AI)->getType())); break;
case GEN_OCL_ABS:
{
const ir::Register src = this->getRegister(*AI);
const ir::Register dst = this->getRegister(&I);
ctx.ALU1(ir::OP_ABS, getType(ctx, (*AI)->getType()), dst, src);
break;
}
case GEN_OCL_SIMD_ALL:
{
const ir::Register src = this->getRegister(*AI);
const ir::Register dst = this->getRegister(&I);
ctx.ALU1(ir::OP_SIMD_ALL, ir::TYPE_S32, dst, src);
break;
}
case GEN_OCL_SIMD_ANY:
{
const ir::Register src = this->getRegister(*AI);
const ir::Register dst = this->getRegister(&I);
ctx.ALU1(ir::OP_SIMD_ANY, ir::TYPE_S32, dst, src);
break;
}
case GEN_OCL_READ_TM:
{
const ir::Register dst = this->getRegister(&I);
ctx.READ_ARF(ir::TYPE_U32, dst, ir::ARF_TM);
break;
}
case GEN_OCL_REGION:
{
const ir::Register dst = this->getRegister(&I);
// offset must be immediate
GBE_ASSERT(AI != AE); Constant *CPV = dyn_cast<Constant>(*AI);
assert(CPV);
const ir::Immediate &x = processConstantImm(CPV);
AI++;
const ir::Register src = this->getRegister(*AI);
ctx.REGION(dst, src, x.getIntegerValue());
break;
}
case GEN_OCL_RSQ: this->emitUnaryCallInst(I,CS,ir::OP_RSQ); break;
case GEN_OCL_RCP: this->emitUnaryCallInst(I,CS,ir::OP_RCP); break;
case GEN_OCL_FORCE_SIMD8: ctx.setSimdWidth(8); break;
case GEN_OCL_FORCE_SIMD16: ctx.setSimdWidth(16); break;
case GEN_OCL_LBARRIER: ctx.SYNC(ir::syncLocalBarrier); break;
case GEN_OCL_GBARRIER: ctx.SYNC(ir::syncGlobalBarrier); break;
case GEN_OCL_LGBARRIER: ctx.SYNC(ir::syncLocalBarrier | ir::syncGlobalBarrier); break;
case GEN_OCL_ATOMIC_ADD0:
case GEN_OCL_ATOMIC_ADD1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_ADD); break;
case GEN_OCL_ATOMIC_SUB0:
case GEN_OCL_ATOMIC_SUB1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_SUB); break;
case GEN_OCL_ATOMIC_AND0:
case GEN_OCL_ATOMIC_AND1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_AND); break;
case GEN_OCL_ATOMIC_OR0:
case GEN_OCL_ATOMIC_OR1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_OR); break;
case GEN_OCL_ATOMIC_XOR0:
case GEN_OCL_ATOMIC_XOR1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_XOR); break;
case GEN_OCL_ATOMIC_XCHG0:
case GEN_OCL_ATOMIC_XCHG1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_XCHG); break;
case GEN_OCL_ATOMIC_INC0:
case GEN_OCL_ATOMIC_INC1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_INC); break;
case GEN_OCL_ATOMIC_DEC0:
case GEN_OCL_ATOMIC_DEC1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_DEC); break;
case GEN_OCL_ATOMIC_UMIN0:
case GEN_OCL_ATOMIC_UMIN1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_UMIN); break;
case GEN_OCL_ATOMIC_UMAX0:
case GEN_OCL_ATOMIC_UMAX1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_UMAX); break;
case GEN_OCL_ATOMIC_IMIN0:
case GEN_OCL_ATOMIC_IMIN1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_IMIN); break;
case GEN_OCL_ATOMIC_IMAX0:
case GEN_OCL_ATOMIC_IMAX1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_IMAX); break;
case GEN_OCL_ATOMIC_CMPXCHG0:
case GEN_OCL_ATOMIC_CMPXCHG1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_CMPXCHG); break;
case GEN_OCL_GET_IMAGE_WIDTH:
case GEN_OCL_GET_IMAGE_HEIGHT:
case GEN_OCL_GET_IMAGE_DEPTH:
case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE:
case GEN_OCL_GET_IMAGE_CHANNEL_ORDER:
{
const uint8_t imageID = getImageID(I);
GBE_ASSERT(AI != AE); ++AI;
const ir::Register reg = this->getRegister(&I, 0);
int infoType = genIntrinsicID - GEN_OCL_GET_IMAGE_WIDTH;
ir::ImageInfoKey key(imageID, infoType);
const ir::Register infoReg = ctx.getFunction().getImageSet()->appendInfo(key, &ctx);
ctx.GET_IMAGE_INFO(infoType, reg, imageID, infoReg);
break;
}
case GEN_OCL_READ_IMAGE_I:
case GEN_OCL_READ_IMAGE_UI:
case GEN_OCL_READ_IMAGE_F:
{
const uint8_t imageID = getImageID(I);
GBE_ASSERT(AI != AE); ++AI;
GBE_ASSERT(AI != AE);
const uint8_t sampler = this->appendSampler(AI);
++AI; GBE_ASSERT(AI != AE);
uint32_t coordNum;
const ir::Type coordType = getVectorInfo(ctx, *AI, coordNum);
if (coordNum == 4)
coordNum = 3;
const uint32_t imageDim = coordNum;
GBE_ASSERT(imageDim >= 1 && imageDim <= 3);
uint8_t samplerOffset = 0;
Value *coordVal = *AI;
++AI; GBE_ASSERT(AI != AE);
Value *samplerOffsetVal = *AI;
#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
Constant *CPV = dyn_cast<Constant>(samplerOffsetVal);
assert(CPV);
const ir::Immediate &x = processConstantImm(CPV);
GBE_ASSERTM(x.getType() == ir::TYPE_U32 || x.getType() == ir::TYPE_S32, "Invalid sampler type");
samplerOffset = x.getIntegerValue();
#endif
bool isFloatCoord = coordType == ir::TYPE_FLOAT;
bool requiredFloatCoord = samplerOffset == 0;
GBE_ASSERT(isFloatCoord == requiredFloatCoord);
vector<ir::Register> dstTupleData, srcTupleData;
for (uint32_t elemID = 0; elemID < imageDim; elemID++)
srcTupleData.push_back(this->getRegister(coordVal, elemID));
uint32_t elemNum;
ir::Type dstType = getVectorInfo(ctx, &I, elemNum);
GBE_ASSERT(elemNum == 4);
for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
const ir::Register reg = this->getRegister(&I, elemID);
dstTupleData.push_back(reg);
}
const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], elemNum);
const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], imageDim);
ctx.SAMPLE(imageID, dstTuple, srcTuple, imageDim, dstType == ir::TYPE_FLOAT,
requiredFloatCoord, sampler, samplerOffset);
break;
}
case GEN_OCL_WRITE_IMAGE_I:
case GEN_OCL_WRITE_IMAGE_UI:
case GEN_OCL_WRITE_IMAGE_F:
{
const uint8_t imageID = getImageID(I);
GBE_ASSERT(AI != AE); ++AI; GBE_ASSERT(AI != AE);
uint32_t coordNum;
(void)getVectorInfo(ctx, *AI, coordNum);
if (coordNum == 4)
coordNum = 3;
const uint32_t imageDim = coordNum;
vector<ir::Register> srcTupleData;
GBE_ASSERT(imageDim >= 1 && imageDim <= 3);
for (uint32_t elemID = 0; elemID < imageDim; elemID++)
srcTupleData.push_back(this->getRegister(*AI, elemID));
++AI; GBE_ASSERT(AI != AE);
uint32_t elemNum;
ir::Type srcType = getVectorInfo(ctx, *AI, elemNum);
GBE_ASSERT(elemNum == 4);
for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
const ir::Register reg = this->getRegister(*AI, elemID);
srcTupleData.push_back(reg);
}
const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], imageDim + 4);
ctx.TYPED_WRITE(imageID, srcTuple, imageDim + 4, srcType, ir::TYPE_U32);
break;
}
case GEN_OCL_MUL_HI_INT:
{
GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
const ir::Register dst = this->getRegister(&I);
ctx.MUL_HI(getType(ctx, I.getType()), dst, src0, src1);
break;
}
case GEN_OCL_MUL_HI_UINT:
{
GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
const ir::Register dst = this->getRegister(&I);
ctx.MUL_HI(getUnsignedType(ctx, I.getType()), dst, src0, src1);
break;
}
case GEN_OCL_MUL_HI_I64:
{
GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
const ir::Register dst = this->getRegister(&I);
ctx.I64_MUL_HI(getType(ctx, I.getType()), dst, src0, src1);
break;
}
case GEN_OCL_MUL_HI_UI64:
{
GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
const ir::Register dst = this->getRegister(&I);
ctx.I64_MUL_HI(getUnsignedType(ctx, I.getType()), dst, src0, src1);
break;
}
case GEN_OCL_UPSAMPLE_SHORT:
{
GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
const ir::Register dst = this->getRegister(&I);
ctx.UPSAMPLE_SHORT(getType(ctx, I.getType()), dst, src0, src1);
break;
}
case GEN_OCL_UPSAMPLE_INT:
{
GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
const ir::Register dst = this->getRegister(&I);
ctx.UPSAMPLE_INT(getType(ctx, I.getType()), dst, src0, src1);
break;
}
case GEN_OCL_UPSAMPLE_LONG:
{
GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
const ir::Register dst = this->getRegister(&I);
ctx.UPSAMPLE_LONG(getType(ctx, I.getType()), dst, src0, src1);
break;
}
case GEN_OCL_SADD_SAT_CHAR:
case GEN_OCL_SADD_SAT_SHORT:
case GEN_OCL_SADD_SAT_INT:
case GEN_OCL_SADD_SAT_LONG:
{
GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
const ir::Register dst = this->getRegister(&I);
ctx.ADDSAT(getType(ctx, I.getType()), dst, src0, src1);
break;
}
case GEN_OCL_UADD_SAT_CHAR:
case GEN_OCL_UADD_SAT_SHORT:
case GEN_OCL_UADD_SAT_INT:
case GEN_OCL_UADD_SAT_LONG:
{
GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
const ir::Register dst = this->getRegister(&I);
ctx.ADDSAT(getUnsignedType(ctx, I.getType()), dst, src0, src1);
break;
}
case GEN_OCL_SSUB_SAT_CHAR:
case GEN_OCL_SSUB_SAT_SHORT:
case GEN_OCL_SSUB_SAT_INT:
case GEN_OCL_SSUB_SAT_LONG:
{
GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
const ir::Register dst = this->getRegister(&I);
ctx.SUBSAT(getType(ctx, I.getType()), dst, src0, src1);
break;
}
case GEN_OCL_USUB_SAT_CHAR:
case GEN_OCL_USUB_SAT_SHORT:
case GEN_OCL_USUB_SAT_INT:
case GEN_OCL_USUB_SAT_LONG:
{
GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
const ir::Register dst = this->getRegister(&I);
ctx.SUBSAT(getUnsignedType(ctx, I.getType()), dst, src0, src1);
break;
}
case GEN_OCL_I64_MAD_SAT:
{
GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register src2 = this->getRegister(*AI); ++AI;
const ir::Register dst = this->getRegister(&I);
ctx.I64MADSAT(getType(ctx, I.getType()), dst, src0, src1, src2);
break;
}
case GEN_OCL_I64_MAD_SATU:
{
GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register src2 = this->getRegister(*AI); ++AI;
const ir::Register dst = this->getRegister(&I);
ctx.I64MADSAT(getUnsignedType(ctx, I.getType()), dst, src0, src1, src2);
break;
}
case GEN_OCL_FMAX:
case GEN_OCL_FMIN:{
GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
const ir::Register dst = this->getRegister(&I);
const ir::Register cmp = ctx.reg(ir::FAMILY_BOOL);
//Because the cmp sources are the same as the sel sources, the cmp and sel
//instructions will be merged into one sel_cmp instruction during Gen
//instruction selection. For simplicity we emit both instructions here.
if(genIntrinsicID == GEN_OCL_FMAX)
ctx.GE(getType(ctx, I.getType()), cmp, src0, src1);
else
ctx.LT(getType(ctx, I.getType()), cmp, src0, src1);
ctx.SEL(getType(ctx, I.getType()), dst, cmp, src0, src1);
break;
}
case GEN_OCL_HADD: {
GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
const ir::Register dst = this->getRegister(&I);
ctx.HADD(getUnsignedType(ctx, I.getType()), dst, src0, src1);
break;
}
case GEN_OCL_I64HADD:
{
GBE_ASSERT(AI != AE);
const ir::Register src0 = this->getRegister(*(AI++));
GBE_ASSERT(AI != AE);
const ir::Register src1 = this->getRegister(*(AI++));
const ir::Register dst = this->getRegister(&I);
ctx.I64HADD(ir::TYPE_U64, dst, src0, src1);
break;
}
case GEN_OCL_RHADD: {
GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
const ir::Register dst = this->getRegister(&I);
ctx.RHADD(getUnsignedType(ctx, I.getType()), dst, src0, src1);
break;
}
case GEN_OCL_I64RHADD:
{
GBE_ASSERT(AI != AE);
const ir::Register src0 = this->getRegister(*(AI++));
GBE_ASSERT(AI != AE);
const ir::Register src1 = this->getRegister(*(AI++));
const ir::Register dst = this->getRegister(&I);
ctx.I64RHADD(ir::TYPE_U64, dst, src0, src1);
break;
}
#define DEF(DST_TYPE, SRC_TYPE) \
{ ctx.SAT_CVT(DST_TYPE, SRC_TYPE, getRegister(&I), getRegister(I.getOperand(0))); break; }
case GEN_OCL_SAT_CONV_U8_TO_I8:
DEF(ir::TYPE_S8, ir::TYPE_U8);
case GEN_OCL_SAT_CONV_I16_TO_I8:
DEF(ir::TYPE_S8, ir::TYPE_S16);
case GEN_OCL_SAT_CONV_U16_TO_I8:
DEF(ir::TYPE_S8, ir::TYPE_U16);
case GEN_OCL_SAT_CONV_I32_TO_I8:
DEF(ir::TYPE_S8, ir::TYPE_S32);
case GEN_OCL_SAT_CONV_U32_TO_I8:
DEF(ir::TYPE_S8, ir::TYPE_U32);
case GEN_OCL_SAT_CONV_F32_TO_I8:
DEF(ir::TYPE_S8, ir::TYPE_FLOAT);
case GEN_OCL_SAT_CONV_I8_TO_U8:
DEF(ir::TYPE_U8, ir::TYPE_S8);
case GEN_OCL_SAT_CONV_I16_TO_U8:
DEF(ir::TYPE_U8, ir::TYPE_S16);
case GEN_OCL_SAT_CONV_U16_TO_U8:
DEF(ir::TYPE_U8, ir::TYPE_U16);
case GEN_OCL_SAT_CONV_I32_TO_U8:
DEF(ir::TYPE_U8, ir::TYPE_S32);
case GEN_OCL_SAT_CONV_U32_TO_U8:
DEF(ir::TYPE_U8, ir::TYPE_U32);
case GEN_OCL_SAT_CONV_F32_TO_U8:
DEF(ir::TYPE_U8, ir::TYPE_FLOAT);
case GEN_OCL_SAT_CONV_U16_TO_I16:
DEF(ir::TYPE_S16, ir::TYPE_U16);
case GEN_OCL_SAT_CONV_I32_TO_I16:
DEF(ir::TYPE_S16, ir::TYPE_S32);
case GEN_OCL_SAT_CONV_U32_TO_I16:
DEF(ir::TYPE_S16, ir::TYPE_U32);
case GEN_OCL_SAT_CONV_F32_TO_I16:
DEF(ir::TYPE_S16, ir::TYPE_FLOAT);
case GEN_OCL_SAT_CONV_I16_TO_U16:
DEF(ir::TYPE_U16, ir::TYPE_S16);
case GEN_OCL_SAT_CONV_I32_TO_U16:
DEF(ir::TYPE_U16, ir::TYPE_S32);
case GEN_OCL_SAT_CONV_U32_TO_U16:
DEF(ir::TYPE_U16, ir::TYPE_U32);
case GEN_OCL_SAT_CONV_F32_TO_U16:
DEF(ir::TYPE_U16, ir::TYPE_FLOAT);
case GEN_OCL_SAT_CONV_U32_TO_I32:
DEF(ir::TYPE_S32, ir::TYPE_U32);
case GEN_OCL_SAT_CONV_F32_TO_I32:
DEF(ir::TYPE_S32, ir::TYPE_FLOAT);
case GEN_OCL_SAT_CONV_I32_TO_U32:
DEF(ir::TYPE_U32, ir::TYPE_S32);
case GEN_OCL_SAT_CONV_F32_TO_U32:
DEF(ir::TYPE_U32, ir::TYPE_FLOAT);
case GEN_OCL_CONV_F16_TO_F32:
ctx.F16TO32(ir::TYPE_FLOAT, ir::TYPE_U16, getRegister(&I), getRegister(I.getOperand(0)));
break;
case GEN_OCL_CONV_F32_TO_F16:
ctx.F32TO16(ir::TYPE_U16, ir::TYPE_FLOAT, getRegister(&I), getRegister(I.getOperand(0)));
break;
#undef DEF
case GEN_OCL_PRINTF:
{
ir::PrintfSet::PrintfFmt* fmt = (ir::PrintfSet::PrintfFmt*)getPrintfInfo(&I);
assert(fmt);
ctx.getFunction().getPrintfSet()->append(fmt, unit);
break;
}
case GEN_OCL_SIMD_SIZE:
{
const ir::Register dst = this->getRegister(&I);
ctx.ALU0(ir::OP_SIMD_SIZE, getType(ctx, I.getType()), dst);
break;
}
case GEN_OCL_SIMD_ID:
{
const ir::Register dst = this->getRegister(&I);
ctx.ALU0(ir::OP_SIMD_ID, getType(ctx, I.getType()), dst);
break;
}
case GEN_OCL_SIMD_SHUFFLE:
{
const ir::Register src0 = this->getRegister(*AI); ++AI;
const ir::Register src1 = this->getRegister(*AI); ++AI;
const ir::Register dst = this->getRegister(&I);
ctx.SIMD_SHUFFLE(getType(ctx, I.getType()), dst, src0, src1);
break;
}
default: break;
}
}
}
}
void GenWriter::regAllocateAllocaInst(AllocaInst &I) {
this->newRegister(&I);
}
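// Allocate stack memory: align the stack pointer to the element alignment,
// return the current stack pointer as the allocated address, then bump the
// stack pointer by the allocation size. For instance, "float tmp[16];" in a
// kernel yields dst = stackptr and stackptr += 64 (suitably aligned).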
void GenWriter::emitAllocaInst(AllocaInst &I) {
Value *src = I.getOperand(0);
Type *elemType = I.getType()->getElementType();
ir::ImmediateIndex immIndex;
uint32_t elementSize = getTypeByteSize(unit, elemType);
// Be aware, we manipulate pointers
if (ctx.getPointerSize() == ir::POINTER_32_BITS)
immIndex = ctx.newImmediate(uint32_t(elementSize));
else
immIndex = ctx.newImmediate(uint64_t(elementSize));
// Check whether we know at compile time the size we need to allocate
if (I.isArrayAllocation() == true) {
Constant *CPV = dyn_cast(src);
GBE_ASSERT(CPV);
const ir::Immediate &imm = processConstantImm(CPV);
const uint64_t elemNum = imm.getIntegerValue();
elementSize *= elemNum;
if (ctx.getPointerSize() == ir::POINTER_32_BITS)
immIndex = ctx.newImmediate(uint32_t(ALIGN(elementSize, 4)));
else
immIndex = ctx.newImmediate(uint64_t(ALIGN(elementSize, 4)));
}
// Now emit the stream of instructions to get the allocated pointer
const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
const ir::Register dst = this->getRegister(&I);
const ir::Register stack = ir::ocl::stackptr;
const ir::Register reg = ctx.reg(pointerFamily);
const ir::Immediate imm = ctx.getImmediate(immIndex);
uint32_t align = getAlignmentByte(unit, elemType);
// the code below assumes align is a power of 2
GBE_ASSERT(align && (align & (align-1)) == 0);
// align the stack pointer according to data alignment
if(align > 1) {
uint32_t prevStackPtr = ctx.getFunction().getStackSize();
uint32_t step = ((prevStackPtr + (align - 1)) & ~(align - 1)) - prevStackPtr;
if (step != 0) {
ir::ImmediateIndex stepImm = ctx.newIntegerImmediate(step, ir::TYPE_U32);
ir::Register stepReg = ctx.reg(ctx.getPointerFamily());
ctx.LOADI(ir::TYPE_U32, stepReg, stepImm);
ctx.ADD(ir::TYPE_U32, stack, stack, stepReg);
ctx.getFunction().pushStackSize(step);
}
}
// Set the destination register properly
ctx.MOV(imm.getType(), dst, stack);
ctx.LOADI(imm.getType(), reg, immIndex);
ctx.ADD(imm.getType(), stack, stack, reg);
ctx.getFunction().pushStackSize(elementSize);
}
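// Both overloads return the value moved by the memory operation so that
// emitLoadOrStore can handle loads and stores uniformly: a load instruction
// is itself the loaded value, while a store moves its value operand.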
static INLINE Value *getLoadOrStoreValue(LoadInst &I) {
return &I;
}
static INLINE Value *getLoadOrStoreValue(StoreInst &I) {
return I.getValueOperand();
}
void GenWriter::regAllocateLoadInst(LoadInst &I) {
this->newRegister(&I);
}
void GenWriter::regAllocateStoreInst(StoreInst &I) {}
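// Split a wide vector load/store into several messages: each message moves
// perMsgNum elements and the address is advanced by a 16-byte stride per
// message, since one message can read or write at most 16 bytes.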
void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
Value *llvmValues, const ir::Register ptr,
const ir::AddressSpace addrSpace,
Type * elemType, bool isLoad, ir::BTI bti,
bool dwAligned) {
const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
const uint32_t perMsgNum = elemNum / msgNum;
for (uint32_t msg = 0; msg < msgNum; ++msg) {
// Build the tuple data in the vector
vector<ir::Register> tupleData; // put registers here
for (uint32_t elemID = 0; elemID < perMsgNum; ++elemID) {
ir::Register reg;
if(regTranslator.isUndefConst(llvmValues, elemID)) {
Value *v = Constant::getNullValue(elemType);
reg = this->getRegister(v);
} else
reg = this->getRegister(llvmValues, perMsgNum*msg+elemID);
tupleData.push_back(reg);
}
const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], perMsgNum);
// We may need to offset the pointer for this message
ir::Register addr;
if (msg == 0)
addr = ptr;
else {
const ir::Register offset = ctx.reg(pointerFamily);
ir::ImmediateIndex immIndex;
ir::Type immType;
// each message can read/write 16 bytes
const int32_t stride = 16;
if (pointerFamily == ir::FAMILY_DWORD) {
immIndex = ctx.newImmediate(int32_t(msg*stride));
immType = ir::TYPE_S32;
} else {
immIndex = ctx.newImmediate(int64_t(msg*stride));
immType = ir::TYPE_S64;
}
addr = ctx.reg(pointerFamily);
ctx.LOADI(immType, offset, immIndex);
ctx.ADD(immType, addr, ptr, offset);
}
// Emit the instruction
if (isLoad)
ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, dwAligned, bti);
else
ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, dwAligned, bti);
}
}
// The idea is to search along the use-def chain and find all possible
// sources of the pointer. Then, in later code generation, we can emit
// load/store instructions for the BTIs gathered here. A pointer that may
// point to, e.g., either __global or __local memory thus yields two BTIs.
void GenWriter::gatherBTI(Value *insn, ir::BTI &bti) {
PtrOrigMapIter iter = pointerOrigMap.find(insn);
if (iter != pointerOrigMap.end()) {
SmallVectorImpl<Value*> &origins = iter->second;
uint8_t nBTI = 0;
for (unsigned i = 0; i < origins.size(); i++) {
uint8_t new_bti = 0;
Value *origin = origins[i];
// all constants go into the constant cache, including __constant & const __private data
if (isa<GlobalVariable>(origin)
&& dyn_cast<GlobalVariable>(origin)->isConstant()) {
new_bti = BTI_CONSTANT;
} else {
unsigned space = origin->getType()->getPointerAddressSpace();
switch (space) {
case 0:
new_bti = BTI_PRIVATE;
break;
case 1:
{
GlobalPtrIter iter = globalPointer.find(origin);
GBE_ASSERT(iter != globalPointer.end());
new_bti = iter->second;
break;
}
case 2:
new_bti = BTI_CONSTANT;
break;
case 3:
new_bti = BTI_LOCAL;
break;
default:
GBE_ASSERT(0 && "address space not unhandled in gatherBTI()\n");
break;
}
}
// avoid duplicates
bool bFound = false;
for (int j = 0; j < nBTI; j++) {
if (bti.bti[j] == new_bti) {
bFound = true; break;
}
}
if (bFound == false) {
bti.bti[nBTI++] = new_bti;
bti.count = nBTI;
}
}
} else {
insn->dump();
std::cerr << "Illegal pointer which is not from a valid memory space." << std::endl;
std::cerr << "Aborting..." << std::endl;
exit(-1);
}
GBE_ASSERT(bti.count <= MAX_MIXED_POINTER);
}
// Handle load/store of dword/qword data at an unaligned address: move the
// data as individual bytes and bitcast between the byte tuple and the typed
// registers.
void GenWriter::emitUnalignedDQLoadStore(Value *llvmPtr, Value *llvmValues, ir::AddressSpace addrSpace, ir::BTI &binding, bool isLoad, bool dwAligned)
{
Type *llvmType = llvmValues->getType();
const ir::Type type = getType(ctx, llvmType);
unsigned byteSize = getTypeByteSize(unit, llvmType);
const ir::Register ptr = this->getRegister(llvmPtr);
Type *elemType = llvmType;
unsigned elemNum = 1;
if (!isScalarType(llvmType)) {
VectorType *vectorType = cast<VectorType>(llvmType);
elemType = vectorType->getElementType();
elemNum = vectorType->getNumElements();
}
vector<ir::Register> tupleData;
for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
ir::Register reg;
if(regTranslator.isUndefConst(llvmValues, elemID)) {
Value *v = Constant::getNullValue(elemType);
reg = this->getRegister(v);
} else
reg = this->getRegister(llvmValues, elemID);
tupleData.push_back(reg);
}
const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], elemNum);
vector<ir::Register> byteTupleData;
for (uint32_t elemID = 0; elemID < byteSize; ++elemID) {
byteTupleData.push_back(ctx.reg(ir::FAMILY_BYTE));
}
const ir::Tuple byteTuple = ctx.arrayTuple(&byteTupleData[0], byteSize);
if (isLoad) {
ctx.LOAD(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, binding);
ctx.BITCAST(type, ir::TYPE_U8, tuple, byteTuple, elemNum, byteSize);
} else {
ctx.BITCAST(ir::TYPE_U8, type, byteTuple, tuple, byteSize, elemNum);
// FIXME: byte scatter does not handle vector stores correctly. Once that is
// fixed, we can directly use one store instruction like:
// ctx.STORE(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, binding);
const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
for (uint32_t elemID = 0; elemID < byteSize; elemID++) {
const ir::Register reg = byteTupleData[elemID];
ir::Register addr;
if (elemID == 0)
addr = ptr;
else {
const ir::Register offset = ctx.reg(pointerFamily);
ir::ImmediateIndex immIndex;
immIndex = ctx.newImmediate(int32_t(elemID));
addr = ctx.reg(pointerFamily);
ctx.LOADI(ir::TYPE_S32, offset, immIndex);
ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
}
ctx.STORE(ir::TYPE_U8, addr, addrSpace, dwAligned, binding, reg);
}
}
}
extern int OCL_SIMD_WIDTH;
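// Generic load/store emission. Scalars map to a single load/store; vectors
// are packed into register tuples, split into batched messages, or
// scalarized element by element depending on the element family, the address
// space and the alignment.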
template <bool isLoad, typename T>
INLINE void GenWriter::emitLoadOrStore(T &I)
{
Value *llvmPtr = I.getPointerOperand();
Value *llvmValues = getLoadOrStoreValue(I);
Type *llvmType = llvmValues->getType();
const bool dwAligned = (I.getAlignment() % 4) == 0;
const ir::Register ptr = this->getRegister(llvmPtr);
ir::BTI binding;
gatherBTI(&I, binding);
const ir::AddressSpace addrSpace = btiToGen(binding);
Type *scalarType = llvmType;
if (!isScalarType(llvmType)) {
VectorType *vectorType = cast<VectorType>(llvmType);
scalarType = vectorType->getElementType();
}
if (!dwAligned
&& (scalarType == IntegerType::get(I.getContext(), 64)
|| scalarType == IntegerType::get(I.getContext(), 32))
) {
emitUnalignedDQLoadStore(llvmPtr, llvmValues, addrSpace, binding, isLoad, dwAligned);
return;
}
// Scalar is easy. We need not build register tuples
if (isScalarType(llvmType) == true) {
const ir::Type type = getType(ctx, llvmType);
const ir::Register values = this->getRegister(llvmValues);
if (isLoad)
ctx.LOAD(type, ptr, addrSpace, dwAligned, binding, values);
else
ctx.STORE(type, ptr, addrSpace, dwAligned, binding, values);
}
// A vector type requires building a tuple
else {
VectorType *vectorType = cast<VectorType>(llvmType);
Type *elemType = vectorType->getElementType();
// We follow the OCL spec and support vectors of 2,3,4,8,16 elements only
uint32_t elemNum = vectorType->getNumElements();
GBE_ASSERTM(elemNum == 2 || elemNum == 3 || elemNum == 4 || elemNum == 8 || elemNum == 16,
"Only vectors of 2,3,4,8 or 16 elements are supported");
// Per the OpenCL 1.2 spec, section 6.1.5:
// For 3-component vector data types, the size of the data type is
// 4 * sizeof(component). LLVM accordingly casts type3 data to type4 for
// load/store instructions, so a 4-element vector may hold only 3 valid
// elements. Fix the element count here accordingly.
if (elemNum == 4 && regTranslator.isUndefConst(llvmValues, 3))
elemNum = 3;
// The code differs fairly significantly from type to type (based on the
// size of each vector element)
const ir::Type type = getType(ctx, elemType);
const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
const ir::RegisterFamily dataFamily = getFamily(type);
if(dataFamily == ir::FAMILY_DWORD && addrSpace != ir::MEM_CONSTANT && addrSpace != ir::MEM_MIXED) {
// One message is enough here. Nothing special to do
if (elemNum <= 4) {
// Build the tuple data in the vector
vector<ir::Register> tupleData; // put registers here
for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
ir::Register reg;
if(regTranslator.isUndefConst(llvmValues, elemID)) {
Value *v = Constant::getNullValue(elemType);
reg = this->getRegister(v);
} else
reg = this->getRegister(llvmValues, elemID);
tupleData.push_back(reg);
}
const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], elemNum);
// Emit the instruction
if (isLoad)
ctx.LOAD(type, tuple, ptr, addrSpace, elemNum, dwAligned, binding);
else
ctx.STORE(type, tuple, ptr, addrSpace, elemNum, dwAligned, binding);
}
// Not supported by the hardware. So, we split the message and we use
// strided loads and stores
else {
emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
}
}
else if((dataFamily == ir::FAMILY_WORD && (isLoad || elemNum % 2 == 0)) ||
(dataFamily == ir::FAMILY_BYTE && (isLoad || elemNum % 4 == 0))) {
emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
} else {
for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
if(regTranslator.isUndefConst(llvmValues, elemID))
continue;
const ir::Register reg = this->getRegister(llvmValues, elemID);
ir::Register addr;
if (elemID == 0)
addr = ptr;
else {
const ir::Register offset = ctx.reg(pointerFamily);
ir::ImmediateIndex immIndex;
int elemSize = getTypeByteSize(unit, elemType);
immIndex = ctx.newImmediate(int32_t(elemID * elemSize));
addr = ctx.reg(pointerFamily);
ctx.LOADI(ir::TYPE_S32, offset, immIndex);
ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
}
if (isLoad)
ctx.LOAD(type, addr, addrSpace, dwAligned, binding, reg);
else
ctx.STORE(type, addr, addrSpace, dwAligned, binding, reg);
}
}
}
}
void GenWriter::emitLoadInst(LoadInst &I) {
this->emitLoadOrStore<true>(I);
}
void GenWriter::emitStoreInst(StoreInst &I) {
this->emitLoadOrStore<false>(I);
}
llvm::FunctionPass *createGenPass(ir::Unit &unit) {
return new GenWriter(unit);
}
} /* namespace gbe */