summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGuo Yejun <yejun.guo@intel.com>2015-12-04 03:22:20 +0800
committerYang Rong <rong.r.yang@intel.com>2015-12-18 17:05:26 +0800
commit4f0721d1c146b65f940bc8a539007daeec431024 (patch)
treec9185af90491b36064eb6dc63dc06765898345e0
parent42489ae261f2c0104eb02fa1effc9c8bab34236b (diff)
add Broxton support
Special versions of the Linux kernel and libdrm are needed. utest and conformance tests PASSED. Signed-off-by: Guo Yejun <yejun.guo@intel.com> Reviewed-by: Junyan He <junyan.he@linux.intel.com>
-rwxr-xr-xGetGenID.sh2
-rw-r--r--backend/src/backend/gen8_context.cpp2
-rw-r--r--backend/src/backend/gen8_context.hpp2
-rw-r--r--backend/src/backend/gen9_context.cpp110
-rw-r--r--backend/src/backend/gen9_context.hpp22
-rw-r--r--backend/src/backend/gen_insn_selection.cpp9
-rw-r--r--backend/src/backend/gen_insn_selection.hpp7
-rw-r--r--backend/src/backend/gen_program.cpp13
-rw-r--r--backend/src/gbe_bin_generater.cpp4
-rw-r--r--src/cl_device_data.h9
-rw-r--r--src/cl_device_id.c33
-rw-r--r--src/intel/intel_gpgpu.c5
12 files changed, 208 insertions, 10 deletions
diff --git a/GetGenID.sh b/GetGenID.sh
index 7acf9bda..30296da7 100755
--- a/GetGenID.sh
+++ b/GetGenID.sh
@@ -1,5 +1,5 @@
#!/bin/bash
-genpciid=(0152 0162 0156 0166 015a 016a 0f31 0402 0412 0422 040a 041a 042a 0406 0416 0426 0c02 0c12 0c22 0c0a 0c1a 0c2a 0c06 0c16 0c26 0a02 0a12 0a22 0a0a 0a1a 0a2a 0a06 0a16 0a26 0d02 0d12 0d22 0d0a 0d1a 0d2a 0d06 0d16 0d26)
+genpciid=(0152 0162 0156 0166 015a 016a 0f31 0402 0412 0422 040a 041a 042a 0406 0416 0426 0c02 0c12 0c22 0c0a 0c1a 0c2a 0c06 0c16 0c26 0a02 0a12 0a22 0a0a 0a1a 0a2a 0a06 0a16 0a26 0d02 0d12 0d22 0d0a 0d1a 0d2a 0d06 0d16 0d26 5a84)
pciid=($(lspci -nn | grep "\[8086:.*\]" -o | awk -F : '{print $2}' | awk -F ] '{print $1}'))
n=${#pciid[*]}
i=0
diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index b497ee5c..a92bddec 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -356,7 +356,7 @@ namespace gbe
GBE_ASSERT(0);
}
- static GenRegister unpacked_ud(GenRegister reg, uint32_t offset = 0)
+ GenRegister Gen8Context::unpacked_ud(GenRegister reg, uint32_t offset)
{
if(reg.hstride == GEN_HORIZONTAL_STRIDE_0) {
if(offset == 0)
diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp
index 84508e95..b33aeebb 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -74,6 +74,8 @@ namespace gbe
virtual void emitPackLongInstruction(const SelectionInstruction &insn);
virtual void emitUnpackLongInstruction(const SelectionInstruction &insn);
+ static GenRegister unpacked_ud(GenRegister reg, uint32_t offset = 0);
+
protected:
virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0);
virtual GenEncoder* generateEncoder(void) {
diff --git a/backend/src/backend/gen9_context.cpp b/backend/src/backend/gen9_context.cpp
index 326f5a12..6b016570 100644
--- a/backend/src/backend/gen9_context.cpp
+++ b/backend/src/backend/gen9_context.cpp
@@ -54,4 +54,114 @@ namespace gbe
p->WAIT();
p->pop();
}
+
+ void BxtContext::newSelection(void) { // Creates a SelectionBxt bound to this context and stores it in this->sel.
+ this->sel = GBE_NEW(SelectionBxt, *this);
+ }
+
+ void BxtContext::calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+ GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l)
+ { /* Emits a full unsigned 64x64 multiply of src0 and src1 built from four 32x32 partial products: the high half accumulates in dst_h, the low half in dst_l. s0l_s1h and s0h_s1l hold the cross products and are also reused as scratch. */
+ src0.type = src1.type = GEN_TYPE_UD;
+ dst_h.type = dst_l.type = GEN_TYPE_UL;
+ s0l_s1h.type = s0h_s1l.type = GEN_TYPE_UL;
+ // Above: the sources are viewed as UD, the results and cross-product temporaries as UL.
+ //GenRegister tmp;
+
+ GenRegister s0l = unpacked_ud(src0);
+ GenRegister s1l = unpacked_ud(src1);
+ GenRegister s0h = unpacked_ud(s0l_s1h); //s0h only used before s0l_s1h, reuse s0l_s1h
+ GenRegister s1h = unpacked_ud(dst_l); //s1h only used before dst_l, reuse dst_l
+ // Stage the data at offset (0, 4) of s0l/s1l into s0h/s1h; presumably the high dword of each 64-bit source element - TODO confirm against GenRegister::offset.
+ p->MOV(s0h, GenRegister::offset(s0l, 0, 4));
+ p->MOV(s1h, GenRegister::offset(s1l, 0, 4));
+
+ /* High 32 bits X High 32 bits. */
+ p->MUL(dst_h, s0h, s1h);
+ /* High 32 bits X low 32 bits. */
+ p->MUL(s0h_s1l, s0h, s1l);
+ /* Low 32 bits X high 32 bits. */
+ p->MUL(s0l_s1h, s0l, s1h);
+ /* Low 32 bits X low 32 bits. */
+ p->MUL(dst_l, s0l, s1l);
+
+ /* The max product of s0l*s1h is (2^N - 1) * (2^N - 1) = 2^2N + 1 - 2^(N+1), where N = 32.
+ The max result of adding two 32-bit integers to it is
+ 2^2N + 1 - 2^(N+1) + 2*(2^N - 1) = 2^2N - 1,
+ which means that adding dst_l's high 32 bits and s0l_s1h's low 32 bits to the product s0h_s1l
+ will not overflow and produces no carry.
+ This way we can avoid using the acc register, which has a lot of restrictions. */
+
+ GenRegister s0l_s1h_l = unpacked_ud(s0l_s1h);
+ p->ADD(s0h_s1l, s0h_s1l, s0l_s1h_l); // s0h_s1l += low 32 bits of s0l*s1h
+
+ p->SHR(s0l_s1h, s0l_s1h, GenRegister::immud(32)); // bring the high 32 bits of s0l*s1h down
+ GenRegister s0l_s1h_h = unpacked_ud(s0l_s1h);
+ p->ADD(dst_h, dst_h, s0l_s1h_h); // dst_h += high 32 bits of s0l*s1h
+
+ GenRegister dst_l_h = unpacked_ud(s0l_s1h); // s0l_s1h's storage is reused as scratch for dst_l's dword 1
+ p->MOV(dst_l_h, unpacked_ud(dst_l, 1));
+ p->ADD(s0h_s1l, s0h_s1l, dst_l_h); // s0h_s1l += the dword taken from dst_l above
+
+ // No longer need s0l_s1h
+ GenRegister tmp = s0l_s1h;
+
+ p->SHL(tmp, s0h_s1l, GenRegister::immud(32)); // low 32 bits of the cross sum, shifted into the upper half
+ GenRegister tmp_unpacked = unpacked_ud(tmp, 1);
+ p->MOV(unpacked_ud(dst_l, 1), tmp_unpacked); // ...and written into dword 1 of dst_l
+
+ p->SHR(tmp, s0h_s1l, GenRegister::immud(32)); // high 32 bits of the cross sum
+ p->ADD(dst_h, dst_h, tmp); // ...are added into dst_h
+ }
+
+ void BxtContext::emitI64MULInstruction(const SelectionInstruction &insn)
+ { /* Emits a 64-bit multiply of insn.src(0) by insn.src(1) into insn.dst(0) as low*low + ((low*high) << 32) + ((high*low) << 32); insn.dst(1) is used as scratch. The high*high term is not computed. */
+ GenRegister src0 = ra->genReg(insn.src(0));
+ GenRegister src1 = ra->genReg(insn.src(1));
+ GenRegister dst = ra->genReg(insn.dst(0));
+ GenRegister res = ra->genReg(insn.dst(1)); // scratch register for the cross products
+
+ src0.type = src1.type = GEN_TYPE_UD;
+ dst.type = GEN_TYPE_UL;
+ res.type = GEN_TYPE_UL;
+
+ /* Low 32 bits X low 32 bits. */
+ GenRegister s0l = unpacked_ud(src0);
+ GenRegister s1l = unpacked_ud(src1);
+ p->MUL(dst, s0l, s1l);
+
+ /* Low 32 bits X high 32 bits. */
+ GenRegister s1h = unpacked_ud(res); // stage src1's dword 1 in res before res is overwritten by the MUL
+ p->MOV(s1h, unpacked_ud(src1, 1));
+
+ p->MUL(res, s0l, s1h);
+ p->SHL(res, res, GenRegister::immud(32)); // only the low 32 bits of the cross product survive the shift
+ p->ADD(dst, dst, res);
+
+ /* High 32 bits X low 32 bits. */
+ GenRegister s0h = unpacked_ud(res); // res is reused again, now to stage src0's dword 1
+ p->MOV(s0h, unpacked_ud(src0, 1));
+
+ p->MUL(res, s0h, s1l);
+ p->SHL(res, res, GenRegister::immud(32));
+ p->ADD(dst, dst, res);
+ }
+
+ void BxtContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) { // Writes the first sz entries of new_a0 to the address registers; max_offset is not used in this body.
+ if (sz == 0) // sz == 0 selects the default of 16 entries
+ sz = 16;
+ GBE_ASSERT(sz%4 == 0);
+ GBE_ASSERT(new_a0[0] >= 0 && new_a0[0] < 4096); // new_a0 is uint16_t, so the ">= 0" half is always true; only entry 0 is range-checked
+
+ p->push();
+ p->curr.execWidth = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ for (int i = 0; i < sz/2; i++) { // each iteration packs two 16-bit entries into one UD immediate
+ p->MOV(GenRegister::retype(GenRegister::addr1(i*2), GEN_TYPE_UD),
+ GenRegister::immud(new_a0[i*2 + 1] << 16 | new_a0[i*2])); // odd entry in the upper 16 bits, even entry in the lower 16 bits
+ }
+ p->pop();
+ }
+
}
diff --git a/backend/src/backend/gen9_context.hpp b/backend/src/backend/gen9_context.hpp
index 8acad8cc..a2931cca 100644
--- a/backend/src/backend/gen9_context.hpp
+++ b/backend/src/backend/gen9_context.hpp
@@ -46,5 +46,27 @@ namespace gbe
private:
virtual void newSelection(void);
};
+
+ //Most of BxtContext's code is copied from ChvContext, which results in two physical copies of the same code.
+ //There are two possible ways to resolve this: 1) virtual inheritance 2) class template,
+ //but either way ties BxtContext and ChvContext closely together, which might limit the flexibility of future changes.
+ //So the approach of keeping two physical copies was chosen.
+ class BxtContext : public Gen9Context
+ { /* Gen9Context subclass that overrides 64-bit multiply emission, address-register setup and selection creation. */
+ public:
+ virtual ~BxtContext(void) { }
+ BxtContext(const ir::Unit &unit, const std::string &name, uint32_t deviceID, bool relaxMath = false)
+ : Gen9Context(unit, name, deviceID, relaxMath) {
+ }; // the constructor only forwards its arguments to Gen9Context
+ virtual void emitI64MULInstruction(const SelectionInstruction &insn); // 64-bit multiply built from 32-bit multiplies
+
+ protected:
+ virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0); // sz == 0 is treated as 16 by the implementation
+
+ private:
+ virtual void newSelection(void); // creates the SelectionBxt instance
+ virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+ GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l); // full 64x64 multiply: high half in dst_h, low half in dst_l
+ };
}
#endif /* __GBE_GEN9_CONTEXT_HPP__ */
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index aef7f154..7eec2b32 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -2072,6 +2072,15 @@ namespace gbe
this->opaque->setHasHalfType(true);
}
+ SelectionBxt::SelectionBxt(GenContext &ctx) : Selection(ctx) { // Sets the selection options below on the opaque implementation object.
+ this->opaque->setHas32X32Mul(true);
+ this->opaque->setHasLongType(true);
+ this->opaque->setLongRegRestrict(true);
+ this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL); // uses the SKL load-message order constant
+ this->opaque->setSlowByteGather(true);
+ this->opaque->setHasHalfType(true);
+ }
+
void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t msgNum,
uint32_t bti, bool is3D) {
uint32_t elemID = 0;
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index ffc79e15..3bb00dd8 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -298,6 +298,13 @@ namespace gbe
Selection9(GenContext &ctx);
};
+ class SelectionBxt: public Selection // Selection subclass that adds only a constructor
+ {
+ public:
+ /*! Initialize internal structures used for the selection */
+ SelectionBxt(GenContext &ctx);
+ };
+
} /* namespace gbe */
#endif /* __GEN_INSN_SELECTION_HPP__ */
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 04da692b..a322da86 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -170,6 +170,8 @@ namespace gbe {
ctx = GBE_NEW(ChvContext, unit, name, deviceID, relaxMath);
} else if (IS_SKYLAKE(deviceID)) {
ctx = GBE_NEW(Gen9Context, unit, name, deviceID, relaxMath);
+ } else if (IS_BROXTON(deviceID)) {
+ ctx = GBE_NEW(BxtContext, unit, name, deviceID, relaxMath);
}
GBE_ASSERTM(ctx != NULL, "Fail to create the gen context\n");
ctx->setASMFileName(this->asm_file_name);
@@ -214,7 +216,8 @@ namespace gbe {
(IS_HASWELL(typeA) && !strcmp(src_hw_info, "HSW")) || \
(IS_BROADWELL(typeA) && !strcmp(src_hw_info, "BDW")) || \
(IS_CHERRYVIEW(typeA) && !strcmp(src_hw_info, "CHV")) || \
- (IS_SKYLAKE(typeA) && !strcmp(src_hw_info, "SKL")) )
+ (IS_SKYLAKE(typeA) && !strcmp(src_hw_info, "SKL")) || \
+ (IS_BROXTON(typeA) && !strcmp(src_hw_info, "BXT")) )
static gbe_program genProgramNewFromBinary(uint32_t deviceID, const char *binary, size_t size) {
using namespace gbe;
@@ -328,6 +331,14 @@ namespace gbe {
src_hw_info[0]='S';
src_hw_info[1]='K';
src_hw_info[2]='L';
+ }else if(IS_BROXTON(prog->deviceID)){
+ src_hw_info[0]='B';
+ src_hw_info[1]='X';
+ src_hw_info[2]='T';
+ }else {
+ free(*binary);
+ *binary = NULL;
+ return 0;
}
FILL_DEVICE_ID(*binary, src_hw_info);
memcpy(*binary+BINARY_HEADER_LENGTH, oss.str().c_str(), sz*sizeof(char));
diff --git a/backend/src/gbe_bin_generater.cpp b/backend/src/gbe_bin_generater.cpp
index 86197e19..8225d4a3 100644
--- a/backend/src/gbe_bin_generater.cpp
+++ b/backend/src/gbe_bin_generater.cpp
@@ -186,6 +186,10 @@ void program_build_instance::serialize_program(void) throw(int)
src_hw_info[0]='S';
src_hw_info[1]='K';
src_hw_info[2]='L';
+ }else if(IS_BROXTON(gen_pci_id)){
+ src_hw_info[0]='B';
+ src_hw_info[1]='X';
+ src_hw_info[2]='T';
}
if (str_fmt_out) {
diff --git a/src/cl_device_data.h b/src/cl_device_data.h
index 3552a166..63e078fd 100644
--- a/src/cl_device_data.h
+++ b/src/cl_device_data.h
@@ -287,7 +287,14 @@
devid == PCI_CHIP_SKYLAKE_SRV_GT4)
#define IS_SKYLAKE(devid) (IS_SKL_GT1(devid) || IS_SKL_GT2(devid) || IS_SKL_GT3(devid) || IS_SKL_GT4(devid))
-#define IS_GEN9(devid) IS_SKYLAKE(devid)
+
+/* BXT (Broxton): one PCI ID so far; IS_GEN9 below now matches Skylake or Broxton */
+#define PCI_CHIP_BROXTON_P 0x5A84 /* Intel(R) BXT-P for mobile desktop */
+
+#define IS_BROXTON(devid) \
+ (devid == PCI_CHIP_BROXTON_P)
+
+#define IS_GEN9(devid) (IS_SKYLAKE(devid) || IS_BROXTON(devid))
#endif /* __CL_DEVICE_DATA_H__ */
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index e25adb9a..fa53a772 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -198,6 +198,17 @@ static struct _cl_device_id intel_skl_gt4_device = {
#include "cl_gen75_device.h"
};
+static struct _cl_device_id intel_bxt_device = { /* Static device description; remaining fields come from the included cl_gen75_device.h */
+ INIT_ICD(dispatch)
+ .max_compute_unit = 18,
+ .max_thread_per_unit = 6,
+ .sub_slice_count = 3,
+ .max_work_item_sizes = {512, 512, 512},
+ .max_work_group_size = 512,
+ .max_clock_frequency = 1000, /* NOTE(review): unit not shown here, presumably MHz - confirm */
+#include "cl_gen75_device.h"
+};
+
LOCAL cl_device_id
cl_get_gt_device(void)
{
@@ -518,6 +529,15 @@ skl_gt4_break:
cl_intel_platform_enable_fp16_extension(ret);
break;
+ case PCI_CHIP_BROXTON_P:
+ DECL_INFO_STRING(bxt_break, intel_bxt_device, name, "Intel(R) HD Graphics Broxton-P");
+bxt_break:
+ intel_bxt_device.device_id = device_id;
+ intel_bxt_device.platform = cl_get_platform_default();
+ ret = &intel_bxt_device;
+ cl_intel_platform_enable_fp16_extension(ret);
+ break;
+
case PCI_CHIP_SANDYBRIDGE_BRIDGE:
case PCI_CHIP_SANDYBRIDGE_GT1:
case PCI_CHIP_SANDYBRIDGE_GT2:
@@ -732,7 +752,8 @@ cl_get_device_info(cl_device_id device,
device != &intel_skl_gt1_device &&
device != &intel_skl_gt2_device &&
device != &intel_skl_gt3_device &&
- device != &intel_skl_gt4_device
+ device != &intel_skl_gt4_device &&
+ device != &intel_bxt_device
))
return CL_INVALID_DEVICE;
@@ -844,7 +865,9 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
device != &intel_skl_gt1_device &&
device != &intel_skl_gt2_device &&
device != &intel_skl_gt3_device &&
- device != &intel_skl_gt4_device))
+ device != &intel_skl_gt4_device &&
+ device != &intel_bxt_device
+ ))
return CL_INVALID_DEVICE;
if (ver == NULL)
return CL_SUCCESS;
@@ -859,7 +882,8 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
|| device == &intel_brw_gt3_device || device == &intel_chv_device) {
*ver = 8;
} else if (device == &intel_skl_gt1_device || device == &intel_skl_gt2_device
- || device == &intel_skl_gt3_device || device == &intel_skl_gt4_device) {
+ || device == &intel_skl_gt3_device || device == &intel_skl_gt4_device
+ || device == &intel_bxt_device) {
*ver = 9;
} else
return CL_INVALID_VALUE;
@@ -947,7 +971,8 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
device != &intel_skl_gt1_device &&
device != &intel_skl_gt2_device &&
device != &intel_skl_gt3_device &&
- device != &intel_skl_gt4_device))
+ device != &intel_skl_gt4_device &&
+ device != &intel_bxt_device))
return CL_INVALID_DEVICE;
CHECK_KERNEL(kernel);
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index e96bb95d..04c67f80 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -1146,7 +1146,8 @@ static uint32_t get_surface_type(intel_gpgpu_t *gpgpu, int index, cl_mem_object_
IS_HASWELL(gpgpu->drv->device_id) ||
IS_BROADWELL(gpgpu->drv->device_id) ||
IS_CHERRYVIEW(gpgpu->drv->device_id) ||
- IS_SKYLAKE(gpgpu->drv->device_id))) &&
+ IS_SKYLAKE(gpgpu->drv->device_id) ||
+ IS_BROXTON(gpgpu->drv->device_id))) &&
index >= BTI_WORKAROUND_IMAGE_OFFSET + BTI_RESERVED_NUM &&
type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
surface_type = I965_SURFACE_2D;
@@ -2181,7 +2182,7 @@ intel_set_gpgpu_callbacks(int device_id)
intel_gpgpu_select_pipeline = intel_gpgpu_select_pipeline_gen7;
return;
}
- if (IS_SKYLAKE(device_id)) {
+ if (IS_SKYLAKE(device_id) || IS_BROXTON(device_id)) {
cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen9;
intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen8;
cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen9;