diff options
author | Junyan He <junyan.he@linux.intel.com> | 2015-11-17 07:40:21 +0800 |
---|---|---|
committer | Yang Rong <rong.r.yang@intel.com> | 2015-11-17 16:24:04 +0800 |
commit | 068a00cf456f50d09e53450c1c25fc89600cc32a (patch) | |
tree | 9ce9d759f8ade8fb7ca60754f8e94176d8996157 | |
parent | 4b99e6116d20fe25611de2f22ca055edd95b3d9f (diff) |
Backend: Implement StoreProfilingInstruction in GenContext.
The offset 0 of the profiling buffer contains the log number.
We will use atomic instruction to inc it every time a log
is generated.
We will generate one log for each HW gpu thread. The log
contains the XYZ range of global work items which are executed
on this thread, the EU id, the Sub Slice id, thread number,
and 20 points' timestamp which we are interested in.
Signed-off-by: Junyan He <junyan.he@linux.intel.com>
Reviewed-by: Yang Rong <rong.r.yang@intel.com>
-rw-r--r-- | backend/src/backend/gen_context.cpp | 167 |
1 files changed, 167 insertions, 0 deletions
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 9047db6f..41fe72dc 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -2674,6 +2674,173 @@ namespace gbe } void GenContext::emitStoreProfilingInstruction(const SelectionInstruction &insn) { + uint32_t simdType; + if (this->simdWidth == 16) { + simdType = ir::ProfilingInfo::ProfilingSimdType16; + } else if (this->simdWidth == 8) { + simdType = ir::ProfilingInfo::ProfilingSimdType8; + } else { + simdType = ir::ProfilingInfo::ProfilingSimdType1; + GBE_ASSERT(0); + } + + p->NOP(); + p->NOP(); + + GenRegister tmArf = GenRegister::tm0(); + GenRegister profilingReg[5]; + if (p->curr.execWidth == 16) { + profilingReg[0] = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD); + profilingReg[1] = GenRegister::offset(profilingReg[0], 1); + profilingReg[2] = GenRegister::retype(ra->genReg(insn.src(1)), GEN_TYPE_UD); + profilingReg[3] = GenRegister::offset(profilingReg[2], 1); + profilingReg[4] = GenRegister::retype(ra->genReg(insn.src(2)), GEN_TYPE_UD); + } else { + GBE_ASSERT(p->curr.execWidth == 8); + profilingReg[0] = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD); + profilingReg[1] = GenRegister::retype(ra->genReg(insn.src(1)), GEN_TYPE_UD); + profilingReg[2] = GenRegister::retype(ra->genReg(insn.src(2)), GEN_TYPE_UD); + profilingReg[3] = GenRegister::retype(ra->genReg(insn.src(3)), GEN_TYPE_UD); + profilingReg[4] = GenRegister::retype(ra->genReg(insn.src(4)), GEN_TYPE_UD); + } + GenRegister tmp = ra->genReg(insn.dst(0)); + uint32_t profilingType = insn.extra.profilingType; + uint32_t bti = insn.extra.profilingBTI; + GBE_ASSERT(profilingType == 1); + GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag); + GenRegister lastTsReg = GenRegister::toUniform(profilingReg[3], GEN_TYPE_UL); + lastTsReg = GenRegister::offset(lastTsReg, 0, 2*sizeof(uint64_t)); + GenRegister realClock = GenRegister::offset(lastTsReg, 0, sizeof(uint64_t)); + GenRegister tmp0 = GenRegister::toUniform(profilingReg[3], GEN_TYPE_UL); + + /* MOV(4) tmp0<1>:UW arf_tm<4,4,1>:UW */ + p->push(); { + p->curr.execWidth = 4; + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + GenRegister _tmp0 = tmp0; + _tmp0.type = GEN_TYPE_UW; + _tmp0.hstride = GEN_HORIZONTAL_STRIDE_1; + _tmp0.vstride = GEN_VERTICAL_STRIDE_4; + _tmp0.width = GEN_WIDTH_4; + p->MOV(_tmp0, tmArf); + } p->pop(); + + /* Calc the time elapsed. */ + subTimestamps(tmp0, lastTsReg, tmp); + /* Update the real clock */ + addTimestamps(realClock, tmp0, tmp); + + //the epilog, record the last timestamp and return. + /* MOV(1) epilog<1>:UL realclock<0,1,0>:UL */ + /* ADD(1) epilog<1>:UL prolog<0,1,0>:UL */ + GenRegister prolog = GenRegister::toUniform(profilingReg[2], GEN_TYPE_UD); + prolog = GenRegister::offset(prolog, 0, 4*sizeof(uint32_t)); + GenRegister epilog = GenRegister::offset(prolog, 0, 2*sizeof(uint32_t)); + p->push(); { + p->curr.execWidth = 1; + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + p->MOV(epilog, GenRegister::retype(realClock, GEN_TYPE_UD)); + p->MOV(GenRegister::offset(epilog, 0, sizeof(uint32_t)), + GenRegister::offset(GenRegister::retype(realClock, GEN_TYPE_UD), 0, sizeof(uint32_t))); + addTimestamps(epilog, prolog, tmp); + } p->pop(); + + /* Now, begin to write the results out. */ + // Inc the log items number. + p->push(); { + //ptr[0] is the total count of the log items. + GenRegister sndMsg = GenRegister::retype(tmp, GEN_TYPE_UD); + sndMsg.width = GEN_WIDTH_8; + sndMsg.hstride = GEN_HORIZONTAL_STRIDE_1; + sndMsg.vstride = GEN_VERTICAL_STRIDE_8; + p->curr.execWidth = 8; + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + p->MOV(sndMsg, GenRegister::immud(0x0)); + + GenRegister incRes = GenRegister::offset(sndMsg, 1); + p->push(); + { + p->curr.execWidth = 1; + p->MOV(flagReg, GenRegister::immuw(0x01)); + } + p->pop(); + p->curr.useFlag(insn.state.flag, insn.state.subFlag); + p->curr.predicate = GEN_PREDICATE_NORMAL; + p->ATOMIC(incRes, GEN_ATOMIC_OP_INC, sndMsg, GenRegister::immud(bti), 1); + } p->pop(); + + // Calculate the final addr + GenRegister addr = GenRegister::retype(tmp, GEN_TYPE_UD); + addr.width = GEN_WIDTH_8; + addr.hstride = GEN_HORIZONTAL_STRIDE_1; + addr.vstride = GEN_VERTICAL_STRIDE_8; + p->push(); { + GenRegister offset = GenRegister::offset(addr, 1); + + p->curr.execWidth = 8; + p->curr.noMask = 1; + p->curr.predicate = GEN_PREDICATE_NONE; + p->MUL(addr, GenRegister::toUniform(offset, GEN_TYPE_UD), + GenRegister::immud(sizeof(ir::ProfilingInfo::ProfilingReportItem))); + p->ADD(addr, addr, GenRegister::immud(4)); // for the counter + p->curr.execWidth = 1; + for (int i = 1; i < 8; i++) { + p->ADD(GenRegister::toUniform(GenRegister::offset(addr, 0, i*sizeof(uint32_t)), GEN_TYPE_UD), + GenRegister::toUniform(GenRegister::offset(addr, 0, i*sizeof(uint32_t)), GEN_TYPE_UD), + GenRegister::immud(i*sizeof(uint32_t))); + } + } p->pop(); + + GenRegister data = GenRegister::offset(addr, 1); + p->push(); { + p->curr.execWidth = 8; + p->curr.noMask = 1; + p->curr.predicate = GEN_PREDICATE_NONE; + p->MOV(data, profilingReg[4]); + } p->pop(); + + // Write the result out + p->push(); { + GenRegister ffid = GenRegister::toUniform(data, GEN_TYPE_UD); + GenRegister tmp = GenRegister::toUniform(profilingReg[3], GEN_TYPE_UD); + GenRegister stateReg = GenRegister(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_STATE, 0, + GEN_TYPE_UD, GEN_VERTICAL_STRIDE_0, GEN_WIDTH_1, GEN_HORIZONTAL_STRIDE_1); + p->curr.predicate = GEN_PREDICATE_NONE; + p->curr.noMask = 1; + p->curr.execWidth = 1; + p->MOV(ffid, stateReg); + p->SHR(ffid, ffid, GenRegister::immud(24)); + p->AND(ffid, ffid, GenRegister::immud(0x0ff)); + p->OR(ffid, ffid, GenRegister::immud(simdType << 4)); + + GenRegister genInfo = GenRegister::offset(ffid, 0, 4); + p->MOV(genInfo, stateReg); + p->AND(genInfo, genInfo, GenRegister::immud(0x0ff07)); + //The dispatch mask + stateReg = GenRegister(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_STATE, 2, + GEN_TYPE_UD, GEN_VERTICAL_STRIDE_0, GEN_WIDTH_1, GEN_HORIZONTAL_STRIDE_1); + p->MOV(tmp, stateReg); + p->AND(tmp, tmp, GenRegister::immud(0x0000ffff)); + p->SHL(tmp, tmp, GenRegister::immud(16)); + p->OR(genInfo, genInfo, tmp); + + // Write it out. + p->curr.execWidth = 8; + p->curr.noMask = 1; + p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1); + p->ADD(addr, addr, GenRegister::immud(32)); + + // time stamps + for (int i = 0; i < 3; i++) { + p->curr.execWidth = 8; + p->MOV(data, GenRegister::retype(profilingReg[i], GEN_TYPE_UD)); + p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1); + p->ADD(addr, addr, GenRegister::immud(32)); + } + } p->pop(); } void GenContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) { |