diff options
author | Homer Hsing <homer.xing@intel.com> | 2013-06-21 12:26:31 +0800 |
---|---|---|
committer | Zhigang Gong <zhigang.gong@linux.intel.com> | 2013-06-21 14:27:57 +0800 |
commit | 675086fabb6f32e38c5912931c8566ed8819eb79 (patch) | |
tree | 214c52f73ef9efe064c80765a2d889b56ba42b9a /backend | |
parent | 104839f4c631dad188791c6584c9a114bc551a7b (diff) |
Support 64-bit float
support:
arithmetic(+ - *)
store load
immediate_value
if else
select
other change:
add "nib control" field in machine instruction format
support "nib control"
fix "directly store float-64 after load float-64".
change hard coded store size (4) to flexible size (4 or 8)
when using float-64 load(store), change to SIMD8
example:
/* support arithmetic store load immediate_value */
kernel void f(global double *src, global double *dst) {
int i = get_global_id(0);
double d = 1.234567890123456789;
dst[i] = d * (src[i] + d);
}
/* support if else */
kernel void f(global float *src, global double *dst) {
int i = get_global_id(0);
float d = 1.234567890123456789f;
if (i < 14)
dst[i] = d * (d + src[i]);
else
dst[i] = 14;
}
/* support select */
kernel void f(global float *src, global double *dst) {
int i = get_global_id(0);
float d = 1.234567890123456789f;
dst[i] = i < 14 ? d : 14;
}
Signed-off-by: Homer Hsing <homer.xing@intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
Tested-by: Yang Rong <rong.r.yang@intel.com>
Diffstat (limited to 'backend')
-rw-r--r-- | backend/src/backend/gen_context.cpp | 17 | ||||
-rw-r--r-- | backend/src/backend/gen_context.hpp | 2 | ||||
-rw-r--r-- | backend/src/backend/gen_defs.hpp | 10 | ||||
-rw-r--r-- | backend/src/backend/gen_encoder.cpp | 203 | ||||
-rw-r--r-- | backend/src/backend/gen_encoder.hpp | 6 | ||||
-rw-r--r-- | backend/src/backend/gen_insn_gen7_schedule_info.hxx | 2 | ||||
-rw-r--r-- | backend/src/backend/gen_insn_selection.cpp | 148 | ||||
-rw-r--r-- | backend/src/backend/gen_insn_selection.hxx | 4 | ||||
-rw-r--r-- | backend/src/backend/gen_reg_allocation.cpp | 1 | ||||
-rw-r--r-- | backend/src/backend/gen_register.hpp | 102 | ||||
-rw-r--r-- | backend/src/llvm/llvm_gen_backend.cpp | 3 |
11 files changed, 476 insertions, 22 deletions
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 70c5bcf1..53ba73c1 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -153,6 +153,8 @@ namespace gbe const GenRegister src0 = ra->genReg(insn.src(0)); const GenRegister src1 = ra->genReg(insn.src(1)); switch (insn.opcode) { + case SEL_OP_LOAD_DF_IMM: p->LOAD_DF_IMM(dst, src1, src0.value.df); break; + case SEL_OP_MOV_DF: p->MOV_DF(dst, src0, src1); break; case SEL_OP_SEL: p->SEL(dst, src0, src1); break; case SEL_OP_AND: p->AND(dst, src0, src1); break; case SEL_OP_OR: p->OR (dst, src0, src1); break; @@ -269,6 +271,14 @@ namespace gbe p->pop(); } + void GenContext::emitReadFloat64Instruction(const SelectionInstruction &insn) { + const GenRegister dst = ra->genReg(insn.dst(0)); + const GenRegister src = ra->genReg(insn.src(0)); + const uint32_t bti = insn.extra.function; + const uint32_t elemNum = insn.extra.elem; + p->READ_FLOAT64(dst, src, bti, elemNum); + } + void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) { const GenRegister dst = ra->genReg(insn.dst(0)); const GenRegister src = ra->genReg(insn.src(0)); @@ -277,6 +287,13 @@ namespace gbe p->UNTYPED_READ(dst, src, bti, elemNum); } + void GenContext::emitWriteFloat64Instruction(const SelectionInstruction &insn) { + const GenRegister src = ra->genReg(insn.src(0)); + const uint32_t bti = insn.extra.function; + const uint32_t elemNum = insn.extra.elem; + p->WRITE_FLOAT64(src, bti, elemNum); + } + void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) { const GenRegister src = ra->genReg(insn.src(0)); const uint32_t bti = insn.extra.function; diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp index 1566cbb0..804384d8 100644 --- a/backend/src/backend/gen_context.hpp +++ b/backend/src/backend/gen_context.hpp @@ -87,6 +87,8 @@ namespace gbe void emitBarrierInstruction(const SelectionInstruction &insn); void emitFenceInstruction(const SelectionInstruction &insn); void emitMathInstruction(const SelectionInstruction &insn); + void emitReadFloat64Instruction(const SelectionInstruction &insn); + void emitWriteFloat64Instruction(const SelectionInstruction &insn); void emitUntypedReadInstruction(const SelectionInstruction &insn); void emitUntypedWriteInstruction(const SelectionInstruction &insn); void emitByteGatherInstruction(const SelectionInstruction &insn); diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp index f4e4938d..9d8db5bf 100644 --- a/backend/src/backend/gen_defs.hpp +++ b/backend/src/backend/gen_defs.hpp @@ -215,6 +215,7 @@ enum GenMessageTarget { #define GEN_TYPE_VF 5 /* packed float vector, immediates only? */ #define GEN_TYPE_HF 6 #define GEN_TYPE_V 6 /* packed int vector, immediates only, uword dest only */ +#define GEN_TYPE_DF 6 #define GEN_TYPE_F 7 #define GEN_ARF_NULL 0x00 @@ -303,6 +304,7 @@ enum GenMessageTarget { #define GEN_BYTE_SCATTER_BYTE 0 #define GEN_BYTE_SCATTER_WORD 1 #define GEN_BYTE_SCATTER_DWORD 2 +#define GEN_BYTE_SCATTER_QWORD 3 #define GEN_SAMPLER_RETURN_FORMAT_FLOAT32 0 #define GEN_SAMPLER_RETURN_FORMAT_UINT32 2 @@ -418,7 +420,7 @@ struct GenInstruction uint32_t src0_reg_type:3; uint32_t src1_reg_file:2; uint32_t src1_reg_type:3; - uint32_t pad:1; + uint32_t nib_ctrl:1; uint32_t dest_subreg_nr:5; uint32_t dest_reg_nr:8; uint32_t dest_horiz_stride:2; @@ -432,7 +434,7 @@ struct GenInstruction uint32_t src0_reg_type:3; uint32_t src1_reg_file:2; /* 0x00000c00 */ uint32_t src1_reg_type:3; /* 0x00007000 */ - uint32_t pad:1; + uint32_t nib_ctrl:1; int dest_indirect_offset:10; /* offset against the deref'd address reg */ uint32_t dest_subreg_nr:3; /* subnr for the address reg a0.x */ uint32_t dest_horiz_stride:2; @@ -446,7 +448,7 @@ struct GenInstruction uint32_t src0_reg_type:3; uint32_t src1_reg_file:2; uint32_t src1_reg_type:3; - uint32_t pad:1; + uint32_t nib_ctrl:1; uint32_t dest_writemask:4; uint32_t dest_subreg_nr:1; uint32_t dest_reg_nr:8; @@ -459,7 +461,7 @@ struct GenInstruction uint32_t dest_reg_type:3; uint32_t src0_reg_file:2; uint32_t src0_reg_type:3; - uint32_t pad0:6; + uint32_t nib_ctrl:1; uint32_t dest_writemask:4; int dest_indirect_offset:6; uint32_t dest_subreg_nr:3; diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp index 859a1b98..3d8afe89 100644 --- a/backend/src/backend/gen_encoder.cpp +++ b/backend/src/backend/gen_encoder.cpp @@ -235,6 +235,7 @@ namespace gbe NOT_IMPLEMENTED; insn->header.acc_wr_control = this->curr.accWrEnable; insn->header.quarter_control = this->curr.quarterControl; + insn->bits1.ia1.nib_ctrl = this->curr.nibControl; insn->header.mask_control = this->curr.noMask; insn->bits2.ia1.flag_reg_nr = this->curr.flag; insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag; @@ -355,6 +356,105 @@ namespace gbe 0 }; + static int dst_type(int exec_width) { + if (exec_width == 8) + return GEN_TYPE_UD; + if (exec_width == 16) + return GEN_TYPE_UW; + NOT_IMPLEMENTED; + return 0; + } + + void GenEncoder::READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) { + int w = curr.execWidth; + dst = GenRegister::h2(dst); + dst.type = GEN_TYPE_UD; + src.type = GEN_TYPE_UD; + GenRegister r = GenRegister::retype(GenRegister::suboffset(src, w*2), GEN_TYPE_UD); + GenRegister imm4 = GenRegister::immud(4); + GenInstruction *insn; + insn = next(GEN_OPCODE_SEND); + setHeader(insn); + setDst(insn, GenRegister::uw16grf(r.nr, 0)); + setSrc0(insn, GenRegister::ud8grf(src.nr, 0)); + setSrc1(insn, GenRegister::immud(0)); + setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8); + push(); + curr.quarterControl = 0; + curr.nibControl = 0; + MOV(dst, r); + if (w == 8) + curr.nibControl = 1; + else + curr.quarterControl = 1; + MOV(GenRegister::suboffset(dst, w), GenRegister::suboffset(r, w / 2)); + pop(); + ADD(src, src, imm4); + insn = next(GEN_OPCODE_SEND); + setHeader(insn); + setDst(insn, GenRegister::uw16grf(r.nr, 0)); + setSrc0(insn, GenRegister::ud8grf(src.nr, 0)); + setSrc1(insn, GenRegister::immud(0)); + setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8); + push(); + curr.quarterControl = 0; + curr.nibControl = 0; + MOV(GenRegister::suboffset(dst, 1), r); + if (w == 8) + curr.nibControl = 1; + else + curr.quarterControl = 1; + MOV(GenRegister::suboffset(dst, w + 1), GenRegister::suboffset(r, w / 2)); + pop(); + } + + void GenEncoder::WRITE_FLOAT64(GenRegister msg, uint32_t bti, uint32_t elemNum) { + int w = curr.execWidth; + GenRegister r = GenRegister::retype(GenRegister::suboffset(msg, w*3), GEN_TYPE_UD); + r.type = GEN_TYPE_UD; + GenRegister hdr = GenRegister::h2(r); + GenRegister src = GenRegister::ud16grf(msg.nr + w / 8, 0); + src.hstride = GEN_HORIZONTAL_STRIDE_2; + GenRegister data = GenRegister::offset(r, w / 8); + GenRegister imm4 = GenRegister::immud(4); + MOV(r, GenRegister::ud8grf(msg.nr, 0)); + push(); + curr.quarterControl = 0; + curr.nibControl = 0; + MOV(data, src); + if (w == 8) + curr.nibControl = 1; + else + curr.quarterControl = 1; + MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w)); + pop(); + GenInstruction *insn; + insn = next(GEN_OPCODE_SEND); + setHeader(insn); + setDst(insn, GenRegister::retype(GenRegister::null(), dst_type(curr.execWidth))); + setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0)); + setSrc1(insn, GenRegister::immud(0)); + setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_WRITE, curr.execWidth / 4, 0); + + ADD(r, GenRegister::ud8grf(msg.nr, 0), imm4); + push(); + curr.quarterControl = 0; + curr.nibControl = 0; + MOV(data, GenRegister::suboffset(src, 1)); + if (w == 8) + curr.nibControl = 1; + else + curr.quarterControl = 1; + MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w + 1)); + pop(); + insn = next(GEN_OPCODE_SEND); + setHeader(insn); + setDst(insn, GenRegister::retype(GenRegister::null(), dst_type(curr.execWidth))); + setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0)); + setSrc1(insn, GenRegister::immud(0)); + setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_WRITE, curr.execWidth / 4, 0); + } + void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) { GenInstruction *insn = this->next(GEN_OPCODE_SEND); assert(elemNum >= 1 || elemNum <= 4); @@ -467,7 +567,25 @@ namespace gbe } INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) { - if (needToSplitAlu1(p, dst, src) == false) { + if (dst.isdf() && src.isdf()) { + int w = p->curr.execWidth; + p->push(); + p->curr.quarterControl = 0; + p->curr.nibControl = 0; + GenInstruction *insn = p->next(opcode); + p->setHeader(insn); + p->setDst(insn, dst); + p->setSrc0(insn, src); + if (w == 8) + p->curr.nibControl = 1; // second 1/8 mask + else // w == 16 + p->curr.quarterControl = 1; // second 1/4 mask + insn = p->next(opcode); + p->setHeader(insn); + p->setDst(insn, GenRegister::suboffset(dst, w / 2)); + p->setSrc0(insn, GenRegister::suboffset(src, w / 2)); + p->pop(); + } else if (needToSplitAlu1(p, dst, src) == false) { GenInstruction *insn = p->next(opcode); p->setHeader(insn); p->setDst(insn, dst); @@ -499,7 +617,27 @@ namespace gbe GenRegister src0, GenRegister src1) { - if (needToSplitAlu2(p, dst, src0, src1) == false) { + if (dst.isdf() && src0.isdf() && src1.isdf()) { + int w = p->curr.execWidth; + p->push(); + p->curr.quarterControl = 0; + p->curr.nibControl = 0; + GenInstruction *insn = p->next(opcode); + p->setHeader(insn); + p->setDst(insn, dst); + p->setSrc0(insn, src0); + p->setSrc1(insn, src1); + if (w == 8) + p->curr.nibControl = 1; // second 1/8 mask + else // w == 16 + p->curr.quarterControl = 1; // second 1/4 mask + insn = p->next(opcode); + p->setHeader(insn); + p->setDst(insn, GenRegister::suboffset(dst, w / 2)); + p->setSrc0(insn, GenRegister::suboffset(src0, w / 2)); + p->setSrc1(insn, GenRegister::suboffset(src1, w / 2)); + p->pop(); + } else if (needToSplitAlu2(p, dst, src0, src1) == false) { GenInstruction *insn = p->next(opcode); p->setHeader(insn); p->setDst(insn, dst); @@ -620,6 +758,67 @@ namespace gbe alu3(this, GEN_OPCODE_##OP, dest, src0, src1, src2); \ } + void GenEncoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) { + union { double d; unsigned u[2]; } u; + u.d = value; + GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD); + push(); + curr.predicate = GEN_PREDICATE_NONE; + curr.execWidth = 1; + MOV(r, GenRegister::immud(u.u[1])); + MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[0])); + pop(); + r.type = GEN_TYPE_DF; + r.vstride = GEN_VERTICAL_STRIDE_0; + r.width = GEN_WIDTH_1; + r.hstride = GEN_HORIZONTAL_STRIDE_0; + push(); + MOV(dest, r); + pop(); + } + + void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) { + int w = curr.execWidth; + if (src0.isdf()) { + push(); + curr.execWidth = 16; + MOV(dest, src0); + if (w == 16) { + curr.quarterControl = 1; + MOV(GenRegister::QnPhysical(dest, w / 4), GenRegister::QnPhysical(src0, w / 4)); + } + pop(); + } else { + GenRegister r0 = GenRegister::h2(r); + push(); + curr.execWidth = 8; + curr.predicate = GEN_PREDICATE_NONE; + MOV(r0, src0); + MOV(GenRegister::suboffset(r0, 8), GenRegister::suboffset(src0, 4)); + curr.predicate = GEN_PREDICATE_NORMAL; + curr.quarterControl = 0; + curr.nibControl = 0; + MOV(dest, r); + curr.nibControl = 1; + MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(r, 8)); + pop(); + if (w == 16) { + push(); + curr.execWidth = 8; + curr.predicate = GEN_PREDICATE_NONE; + MOV(r0, GenRegister::suboffset(src0, 8)); + MOV(GenRegister::suboffset(r0, 8), GenRegister::suboffset(src0, 12)); + curr.predicate = GEN_PREDICATE_NORMAL; + curr.quarterControl = 1; + curr.nibControl = 0; + MOV(GenRegister::suboffset(dest, 8), r); + curr.nibControl = 1; + MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(r, 8)); + pop(); + } + } + } + ALU1(MOV) ALU1(RNDZ) ALU1(RNDE) diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp index c98774f8..1a5dcf9d 100644 --- a/backend/src/backend/gen_encoder.hpp +++ b/backend/src/backend/gen_encoder.hpp @@ -113,9 +113,11 @@ namespace gbe ALU2(LINE) ALU2(PLN) ALU3(MAD) + ALU2(MOV_DF); #undef ALU1 #undef ALU2 #undef ALU3 + void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value); /*! Barrier message (to synchronize threads of a workgroup) */ void BARRIER(GenRegister src); /*! Memory fence message (to order loads and stores between threads) */ @@ -132,6 +134,10 @@ namespace gbe void NOP(void); /*! Wait instruction (used for the barrier) */ void WAIT(void); + /*! Read 64-bits float arrays */ + void READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum); + /*! Write 64-bits float arrays */ + void WRITE_FLOAT64(GenRegister src, uint32_t bti, uint32_t elemNum); /*! Untyped read (upto 4 channels) */ void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum); /*! Untyped write (upto 4 channels) */ diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx index 098d9ecc..a3b4621a 100644 --- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx +++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx @@ -12,6 +12,8 @@ DECL_GEN7_SCHEDULE(Wait, 20, 2, 2) DECL_GEN7_SCHEDULE(Math, 20, 4, 2) DECL_GEN7_SCHEDULE(Barrier, 80, 1, 1) DECL_GEN7_SCHEDULE(Fence, 80, 1, 1) +DECL_GEN7_SCHEDULE(ReadFloat64, 80, 1, 1) +DECL_GEN7_SCHEDULE(WriteFloat64, 80, 1, 1) DECL_GEN7_SCHEDULE(UntypedRead, 80, 1, 1) DECL_GEN7_SCHEDULE(UntypedWrite, 80, 1, 1) DECL_GEN7_SCHEDULE(ByteGather, 80, 1, 1) diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index 4e7cebd5..59014191 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -129,6 +129,7 @@ namespace gbe case TYPE_S32: return GEN_TYPE_D; case TYPE_U32: return GEN_TYPE_UD; case TYPE_FLOAT: return GEN_TYPE_F; + case TYPE_DOUBLE: return GEN_TYPE_DF; default: NOT_SUPPORTED; return GEN_TYPE_F; } } @@ -166,11 +167,13 @@ namespace gbe bool SelectionInstruction::isRead(void) const { return this->opcode == SEL_OP_UNTYPED_READ || + this->opcode == SEL_OP_READ_FLOAT64 || this->opcode == SEL_OP_BYTE_GATHER; } bool SelectionInstruction::isWrite(void) const { return this->opcode == SEL_OP_UNTYPED_WRITE || + this->opcode == SEL_OP_WRITE_FLOAT64 || this->opcode == SEL_OP_BYTE_SCATTER; } @@ -406,6 +409,8 @@ namespace gbe #define ALU3(OP) \ INLINE void OP(Reg dst, Reg src0, Reg src1, Reg src2) { ALU3(SEL_OP_##OP, dst, src0, src1, src2); } ALU1(MOV) + ALU2(MOV_DF) + ALU2(LOAD_DF_IMM) ALU1(RNDZ) ALU1(RNDE) ALU2(SEL) @@ -449,6 +454,10 @@ namespace gbe void NOP(void); /*! Wait instruction (used for the barrier) */ void WAIT(void); + /*! Read 64 bits float array */ + void READ_FLOAT64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti); + /*! Write 64 bits float array */ + void WRITE_FLOAT64(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t bti); /*! Untyped read (up to 4 elements) */ void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti); /*! Untyped write (up to 4 elements) */ @@ -610,20 +619,23 @@ namespace gbe ir::Register Selection::Opaque::replaceDst(SelectionInstruction *insn, uint32_t regID) { SelectionBlock *block = insn->parent; - const uint32_t simdWidth = ctx.getSimdWidth(); + uint32_t simdWidth = ctx.getSimdWidth(); ir::Register tmp; + ir::RegisterFamily f = file.get(insn->dst(regID).reg()).family; + int genType = f == ir::FAMILY_QWORD ? GEN_TYPE_DF : GEN_TYPE_F; + GenRegister gr; // This will append the temporary register in the instruction block this->block = block; - tmp = this->reg(ir::FAMILY_DWORD); + tmp = this->reg(f); // Generate the MOV instruction and replace the register in the instruction SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1); - mov->dst(0) = GenRegister::retype(insn->dst(regID), GEN_TYPE_F); + mov->dst(0) = GenRegister::retype(insn->dst(regID), genType); mov->state = GenInstructionState(simdWidth); - insn->dst(regID) = mov->src(0) = GenRegister::fxgrf(simdWidth, tmp); + gr = f == ir::FAMILY_QWORD ? GenRegister::dfxgrf(simdWidth, tmp) : GenRegister::fxgrf(simdWidth, tmp); + insn->dst(regID) = mov->src(0) = gr; insn->append(*mov); - return tmp; } @@ -657,6 +669,7 @@ namespace gbe case FAMILY_WORD: SEL_REG(uw16grf, uw8grf, uw1grf); break; case FAMILY_BYTE: SEL_REG(ub16grf, ub8grf, ub1grf); break; case FAMILY_DWORD: SEL_REG(f16grf, f8grf, f1grf); break; + case FAMILY_QWORD: SEL_REG(df16grf, df8grf, df1grf); break; default: NOT_SUPPORTED; } GBE_ASSERT(false); @@ -719,6 +732,33 @@ namespace gbe void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); } void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); } + void Selection::Opaque::READ_FLOAT64(Reg addr, + const GenRegister *dst, + uint32_t elemNum, + uint32_t bti) + { + SelectionInstruction *insn = this->appendInsn(SEL_OP_READ_FLOAT64, elemNum, 1); + SelectionVector *srcVector = this->appendVector(); + SelectionVector *dstVector = this->appendVector(); + + // Regular instruction to encode + for (uint32_t elemID = 0; elemID < elemNum; ++elemID) + insn->dst(elemID) = dst[elemID]; + insn->src(0) = addr; + insn->extra.function = bti; + insn->extra.elem = elemNum; + + // Sends require contiguous allocation + dstVector->regNum = elemNum; + dstVector->isSrc = 0; + dstVector->reg = &insn->dst(0); + + // Source cannot be scalar (yet) + srcVector->regNum = 1; + srcVector->isSrc = 1; + srcVector->reg = &insn->src(0); + } + void Selection::Opaque::UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, @@ -746,6 +786,27 @@ namespace gbe srcVector->reg = &insn->src(0); } + void Selection::Opaque::WRITE_FLOAT64(Reg addr, + const GenRegister *src, + uint32_t elemNum, + uint32_t bti) + { + SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE_FLOAT64, 0, elemNum+1); + SelectionVector *vector = this->appendVector(); + + // Regular instruction to encode + insn->src(0) = addr; + for (uint32_t elemID = 0; elemID < elemNum; ++elemID) + insn->src(elemID+1) = src[elemID]; + insn->extra.function = bti; + insn->extra.elem = elemNum; + + // Sends require contiguous allocation for the sources + vector->regNum = elemNum+1; + vector->reg = &insn->src(0); + vector->isSrc = 1; + } + void Selection::Opaque::UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t elemNum, @@ -1092,6 +1153,15 @@ namespace gbe // Implementation of all patterns /////////////////////////////////////////////////////////////////////////// + bool canGetRegisterFromImmediate(const ir::Instruction &insn) { + using namespace ir; + const auto &childInsn = cast<LoadImmInstruction>(insn); + const auto &imm = childInsn.getImmediate(); + if(imm.type != TYPE_DOUBLE) + return true; + return false; + } + GenRegister getRegisterFromImmediate(ir::Immediate imm) { using namespace ir; @@ -1103,6 +1173,7 @@ namespace gbe case TYPE_S16: return GenRegister::immw(imm.data.s16); case TYPE_U8: return GenRegister::immuw(imm.data.u8); case TYPE_S8: return GenRegister::immw(imm.data.s8); + case TYPE_DOUBLE: return GenRegister::immdf(imm.data.f64); default: NOT_SUPPORTED; return GenRegister::immuw(0); } } @@ -1146,7 +1217,13 @@ namespace gbe const GenRegister src = sel.selReg(insn.getSrc(0)); switch (opcode) { case ir::OP_ABS: sel.MOV(dst, GenRegister::abs(src)); break; - case ir::OP_MOV: sel.MOV(dst, src); break; + case ir::OP_MOV: + if (dst.isdf()) { + ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD); + sel.MOV_DF(dst, src, sel.selReg(r)); + } else + sel.MOV(dst, src); + break; case ir::OP_RNDD: sel.RNDD(dst, src); break; case ir::OP_RNDE: sel.RNDE(dst, src); break; case ir::OP_RNDU: sel.RNDU(dst, src); break; @@ -1225,14 +1302,14 @@ namespace gbe SelectionDAG *dag1 = dag.child[1]; // Right source can always be an immediate - if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI) { + if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag1->insn)) { const auto &childInsn = cast<LoadImmInstruction>(dag1->insn); src0 = sel.selReg(insn.getSrc(0), type); src1 = getRegisterFromImmediate(childInsn.getImmediate()); if (dag0) dag0->isRoot = 1; } // Left source cannot be immediate but it is OK if we can commute - else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && insn.commutes() && dag0->insn.getOpcode() == OP_LOADI) { + else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && insn.commutes() && dag0->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag0->insn)) { const auto &childInsn = cast<LoadImmInstruction>(dag0->insn); src0 = sel.selReg(insn.getSrc(1), type); src1 = getRegisterFromImmediate(childInsn.getImmediate()); @@ -1268,7 +1345,7 @@ namespace gbe case OP_SHR: sel.SHR(dst, src0, src1); break; case OP_ASR: sel.ASR(dst, src0, src1); break; case OP_MUL: - if (type == TYPE_FLOAT) + if (type == TYPE_FLOAT || type == TYPE_DOUBLE) sel.MUL(dst, src0, src1); else if (type == TYPE_U32 || type == TYPE_S32) { sel.pop(); @@ -1599,6 +1676,7 @@ namespace gbe case TYPE_S16: sel.MOV(dst, GenRegister::immw(imm.data.s16)); break; case TYPE_U8: sel.MOV(dst, GenRegister::immuw(imm.data.u8)); break; case TYPE_S8: sel.MOV(dst, GenRegister::immw(imm.data.s8)); break; + case TYPE_DOUBLE: sel.LOAD_DF_IMM(dst, GenRegister::immdf(imm.data.f64), sel.selReg(sel.reg(FAMILY_QWORD))); break; default: NOT_SUPPORTED; } sel.pop(); @@ -1650,6 +1728,8 @@ namespace gbe INLINE uint32_t getByteScatterGatherSize(ir::Type type) { using namespace ir; switch (type) { + case TYPE_DOUBLE: + return GEN_BYTE_SCATTER_QWORD; case TYPE_FLOAT: case TYPE_U32: case TYPE_S32: @@ -1681,6 +1761,22 @@ namespace gbe sel.UNTYPED_READ(addr, dst.data(), valueNum, bti); } + void emitReadFloat64(Selection::Opaque &sel, + const ir::LoadInstruction &insn, + GenRegister addr, + uint32_t bti) const + { + using namespace ir; + const uint32_t valueNum = insn.getValueNum(); + vector<GenRegister> dst(valueNum); + for (uint32_t dstID = 0; dstID < valueNum; ++dstID) + dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), GEN_TYPE_F); + dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD))); + if (sel.ctx.getSimdWidth() == 16) + dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD))); + sel.READ_FLOAT64(addr, dst.data(), dst.size(), bti); + } + void emitByteGather(Selection::Opaque &sel, const ir::LoadInstruction &insn, const uint32_t elemSize, @@ -1732,6 +1828,8 @@ namespace gbe const uint32_t elemSize = getByteScatterGatherSize(type); if (insn.getAddressSpace() == MEM_CONSTANT) this->emitIndirectMove(sel, insn, address); + else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD) + this->emitReadFloat64(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00); else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD) this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00); else { @@ -1762,6 +1860,25 @@ namespace gbe sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti); } + void emitWriteFloat64(Selection::Opaque &sel, + const ir::StoreInstruction &insn, + uint32_t bti) const + { + using namespace ir; + const uint32_t valueNum = insn.getValueNum(); + const uint32_t addrID = ir::StoreInstruction::addressIndex; + GenRegister addr; + vector<GenRegister> value(valueNum); + + addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F); + for (uint32_t valueID = 0; valueID < valueNum; ++valueID) + value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F); + value.push_back(sel.selReg(sel.reg(FAMILY_QWORD))); + if (sel.ctx.getSimdWidth() == 16) + value.push_back(sel.selReg(sel.reg(FAMILY_QWORD))); + sel.WRITE_FLOAT64(addr, value.data(), value.size(), bti); + } + void emitByteScatter(Selection::Opaque &sel, const ir::StoreInstruction &insn, const uint32_t elemSize, @@ -1791,7 +1908,9 @@ namespace gbe const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01; const Type type = insn.getValueType(); const uint32_t elemSize = getByteScatterGatherSize(type); - if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD) + if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD) + this->emitWriteFloat64(sel, insn, bti); + else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD) this->emitUntypedWrite(sel, insn, bti); else { const GenRegister address = sel.selReg(insn.getAddress()); @@ -1839,7 +1958,7 @@ namespace gbe SelectionDAG *dag1 = dag.child[1]; // Right source can always be an immediate - if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI) { + if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag1->insn)) { const auto &childInsn = cast<LoadImmInstruction>(dag1->insn); src0 = sel.selReg(insn.getSrc(0), type); src1 = getRegisterFromImmediate(childInsn.getImmediate()); @@ -1873,7 +1992,7 @@ namespace gbe const GenRegister src = sel.selReg(insn.getSrc(0), srcType); // We need two instructions to make the conversion - if (dstFamily != FAMILY_DWORD && srcFamily == FAMILY_DWORD) { + if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && srcFamily == FAMILY_DWORD) { GenRegister unpacked; if (dstFamily == FAMILY_WORD) { const uint32_t type = TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W; @@ -1886,6 +2005,9 @@ namespace gbe } sel.MOV(unpacked, src); sel.MOV(dst, unpacked); + } else if (dst.isdf()) { + ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD); + sel.MOV_DF(dst, src, sel.selReg(r)); } else sel.MOV(dst, src); return true; @@ -1919,7 +2041,7 @@ namespace gbe SelectionDAG *dag2 = dag.child[2]; // Right source can always be an immediate - if (OCL_OPTIMIZE_IMMEDIATE && dag2 != NULL && dag2->insn.getOpcode() == OP_LOADI) { + if (OCL_OPTIMIZE_IMMEDIATE && dag2 != NULL && dag2->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag2->insn)) { const auto &childInsn = cast<LoadImmInstruction>(dag2->insn); src0 = sel.selReg(insn.getSrc(SelectInstruction::src0Index), type); src1 = getRegisterFromImmediate(childInsn.getImmediate()); diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx index 789c81ca..4b5525b4 100644 --- a/backend/src/backend/gen_insn_selection.hxx +++ b/backend/src/backend/gen_insn_selection.hxx @@ -1,5 +1,7 @@ DECL_SELECTION_IR(LABEL, LabelInstruction) DECL_SELECTION_IR(MOV, UnaryInstruction) +DECL_SELECTION_IR(MOV_DF, BinaryInstruction) +DECL_SELECTION_IR(LOAD_DF_IMM, BinaryInstruction) DECL_SELECTION_IR(NOT, UnaryInstruction) DECL_SELECTION_IR(LZD, UnaryInstruction) DECL_SELECTION_IR(RNDZ, UnaryInstruction) @@ -32,6 +34,8 @@ DECL_SELECTION_IR(BARRIER, BarrierInstruction) DECL_SELECTION_IR(FENCE, FenceInstruction) DECL_SELECTION_IR(UNTYPED_READ, UntypedReadInstruction) DECL_SELECTION_IR(UNTYPED_WRITE, UntypedWriteInstruction) +DECL_SELECTION_IR(READ_FLOAT64, ReadFloat64Instruction) +DECL_SELECTION_IR(WRITE_FLOAT64, WriteFloat64Instruction) DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction) DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction) DECL_SELECTION_IR(SAMPLE, SampleInstruction) diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp index 9765b025..e7c96ac4 100644 --- a/backend/src/backend/gen_reg_allocation.cpp +++ b/backend/src/backend/gen_reg_allocation.cpp @@ -458,7 +458,6 @@ namespace gbe } bool GenRegAllocator::Opaque::allocateGRFs(Selection &selection) { - // Perform the linear scan allocator const uint32_t regNum = ctx.sel->getRegNum(); for (uint32_t startID = 0; startID < regNum; ++startID) { diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp index d772b0d6..fedb743a 100644 --- a/backend/src/backend/gen_register.hpp +++ b/backend/src/backend/gen_register.hpp @@ -69,11 +69,12 @@ namespace gbe /*! Type size in bytes for each Gen type */ INLINE int typeSize(uint32_t type) { switch(type) { + case GEN_TYPE_DF: + return 8; case GEN_TYPE_UD: case GEN_TYPE_D: case GEN_TYPE_F: return 4; - case GEN_TYPE_HF: case GEN_TYPE_UW: case GEN_TYPE_W: return 2; @@ -110,6 +111,7 @@ namespace gbe INLINE GenInstructionState(uint32_t simdWidth = 8) { this->execWidth = simdWidth; this->quarterControl = GEN_COMPRESSION_Q1; + this->nibControl = 0; this->accWrEnable = 0; this->noMask = 0; this->flag = 0; @@ -126,6 +128,7 @@ namespace gbe uint32_t flagIndex:16; //!< Only if virtual flag (index of the register) uint32_t execWidth:5; uint32_t quarterControl:1; + uint32_t nibControl:1; uint32_t accWrEnable:1; uint32_t noMask:1; uint32_t predicate:4; @@ -192,6 +195,7 @@ namespace gbe /*! For immediates or virtual register */ union { + double df; float f; int32_t d; uint32_t ud; @@ -211,6 +215,31 @@ namespace gbe uint32_t quarter:1; //!< To choose which part we want (Q1 / Q2) uint32_t address_mode:1; //!< direct or indirect + static INLINE GenRegister offset(GenRegister reg, int nr, int subnr = 0) { + GenRegister r = reg; + r.nr += nr; + r.subnr += subnr; + return r; + } + + INLINE bool isimmdf(void) const { + if (type == GEN_TYPE_DF && file == GEN_IMMEDIATE_VALUE) + return true; + return false; + } + + INLINE bool isdf(void) const { + if (type == GEN_TYPE_DF && file == GEN_GENERAL_REGISTER_FILE) + return true; + return false; + } + + static INLINE GenRegister h2(GenRegister reg) { + GenRegister r = reg; + r.hstride = GEN_HORIZONTAL_STRIDE_2; + return r; + } + static INLINE GenRegister QnVirtual(GenRegister reg, uint32_t quarter) { GBE_ASSERT(reg.physical == 0); if (reg.hstride == GEN_HORIZONTAL_STRIDE_0) // scalar register @@ -293,6 +322,18 @@ namespace gbe return reg; } + static INLINE GenRegister df16(uint32_t file, ir::Register reg) { + return retype(vec16(file, reg), GEN_TYPE_DF); + } + + static INLINE GenRegister df8(uint32_t file, ir::Register reg) { + return retype(vec8(file, reg), GEN_TYPE_DF); + } + + static INLINE GenRegister df1(uint32_t file, ir::Register reg) { + return retype(vec1(file, reg), GEN_TYPE_DF); + } + static INLINE GenRegister ud16(uint32_t file, ir::Register reg) { return retype(vec16(file, reg), GEN_TYPE_UD); } @@ -371,6 +412,12 @@ namespace gbe GEN_HORIZONTAL_STRIDE_0); } + static INLINE GenRegister immdf(double df) { + GenRegister immediate = imm(GEN_TYPE_DF); + immediate.value.df = df; + return immediate; + } + static INLINE GenRegister immf(float f) { GenRegister immediate = imm(GEN_TYPE_F); immediate.value.f = f; @@ -448,6 +495,18 @@ namespace gbe return vec16(GEN_GENERAL_REGISTER_FILE, reg); } + static INLINE GenRegister df1grf(ir::Register reg) { + return df1(GEN_GENERAL_REGISTER_FILE, reg); + } + + static INLINE GenRegister df8grf(ir::Register reg) { + return df8(GEN_GENERAL_REGISTER_FILE, reg); + } + + static INLINE GenRegister df16grf(ir::Register reg) { + return df16(GEN_GENERAL_REGISTER_FILE, reg); + } + static INLINE GenRegister ud16grf(ir::Register reg) { return ud16(GEN_GENERAL_REGISTER_FILE, reg); } @@ -608,11 +667,37 @@ namespace gbe GEN_HORIZONTAL_STRIDE_0); } + static INLINE int hstride_size(GenRegister reg) { + switch (reg.hstride) { + case GEN_HORIZONTAL_STRIDE_0: return 0; + case GEN_HORIZONTAL_STRIDE_1: return 1; + case GEN_HORIZONTAL_STRIDE_2: return 2; + case GEN_HORIZONTAL_STRIDE_4: return 4; + default: NOT_IMPLEMENTED; return 0; + } + } + static INLINE GenRegister suboffset(GenRegister reg, uint32_t delta) { - reg.subnr += delta * typeSize(reg.type); + if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) { + reg.subnr += delta * typeSize(reg.type); + reg.nr += reg.subnr / 32; + reg.subnr %= 32; + } return reg; } + static INLINE GenRegister df16(uint32_t file, uint32_t nr, uint32_t subnr) { + return retype(vec16(file, nr, subnr), GEN_TYPE_DF); + } + + static INLINE GenRegister df8(uint32_t file, uint32_t nr, uint32_t subnr) { + return retype(vec8(file, nr, subnr), GEN_TYPE_DF); + } + + static INLINE GenRegister df1(uint32_t file, uint32_t nr, uint32_t subnr) { + return retype(vec1(file, nr, subnr), GEN_TYPE_DF); + } + static INLINE GenRegister ud16(uint32_t file, uint32_t nr, uint32_t subnr) { return retype(vec16(file, nr, subnr), GEN_TYPE_UD); } @@ -685,6 +770,18 @@ namespace gbe return vec16(GEN_GENERAL_REGISTER_FILE, nr, subnr); } + static INLINE GenRegister df16grf(uint32_t nr, uint32_t subnr) { + return df16(GEN_GENERAL_REGISTER_FILE, nr, subnr); + } + + static INLINE GenRegister df8grf(uint32_t nr, uint32_t subnr) { + return df8(GEN_GENERAL_REGISTER_FILE, nr, subnr); + } + + static INLINE GenRegister df1grf(uint32_t nr, uint32_t subnr) { + return df1(GEN_GENERAL_REGISTER_FILE, nr, subnr); + } + static INLINE GenRegister ud16grf(uint32_t nr, uint32_t subnr) { return ud16(GEN_GENERAL_REGISTER_FILE, nr, subnr); } @@ -790,6 +887,7 @@ namespace gbe return SIMD1(values...); \ } \ } + DECL_REG_ENCODER(dfxgrf, df16grf, df8grf, df1grf); DECL_REG_ENCODER(fxgrf, f16grf, f8grf, f1grf); DECL_REG_ENCODER(uwxgrf, uw16grf, uw8grf, uw1grf); DECL_REG_ENCODER(udxgrf, ud16grf, ud8grf, ud1grf); diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index 3a59da38..5b7754cc 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -2164,6 +2164,7 @@ namespace gbe } void GenWriter::regAllocateStoreInst(StoreInst &I) {} + extern int OCL_SIMD_WIDTH; template <bool isLoad, typename T> INLINE void GenWriter::emitLoadOrStore(T &I) { @@ -2178,6 +2179,8 @@ namespace gbe // Scalar is easy. We neednot build register tuples if (isScalarType(llvmType) == true) { const ir::Type type = getType(ctx, llvmType); + if(type == ir::TYPE_DOUBLE) // 64bit-float load(store) don't support SIMD16 + OCL_SIMD_WIDTH = 8; const ir::Register values = this->getRegister(llvmValues); if (isLoad) ctx.LOAD(type, ptr, addrSpace, dwAligned, values); |