CL/Driver: enable atomics in L3 for HSW.

This can give more than a 10x performance boost for some atomic stress workloads.

Signed-off-by: Zhigang Gong <zhigang.gong@intel.com>
Reviewed-by: "Yang, Rong R" <rong.r.yang@intel.com>
author: Zhigang Gong <zhigang.gong@intel.com> 2014-12-31 10:02:30 +0800
committer: Zhigang Gong <zhigang.gong@intel.com> 2015-01-07 16:06:03 +0800
commit: ef7127c03bd533277afc443b335c37a69927250a (patch)
tree: cf55ba1a532fba1af4bec6354ca1f775db6fc711 /src
parent: 688867be7e09b163e86966b1e6581bcfe22a0dfe (diff)
2 files changed, 14 insertions, 1 deletions
diff --git a/src/intel/intel_defines.h b/src/intel/intel_defines.h
index e983718e..044c0049 100644
--- a/src/intel/intel_defines.h
+++ b/src/intel/intel_defines.h
@@ -304,6 +304,10 @@
 #define URB_SIZE(intel)         (IS_IGDNG(intel->device_id) ? 1024 : \
                                  IS_G4X(intel->device_id) ? 384 : 256)
 
+// HSW: register offsets written via CMD_LOAD_REGISTER_IMM to enable atomics in L3
+#define HSW_SCRATCH1_OFFSET                      (0xB038)
+#define HSW_ROW_CHICKEN3_HDC_OFFSET              (0xE49C)
+
 // L3 cache stuff 
 #define GEN7_L3_SQC_REG1_ADDRESS_OFFSET          (0XB010)
 #define GEN7_L3_CNTL_REG2_ADDRESS_OFFSET         (0xB020)
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index fb03bcc7..e6e37fb7 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -632,7 +632,16 @@ static void
 intel_gpgpu_set_L3_gen75(intel_gpgpu_t *gpgpu, uint32_t use_slm)
 {
   /* still set L3 in batch buffer for fulsim. */
-  BEGIN_BATCH(gpgpu->batch, 9);
+  BEGIN_BATCH(gpgpu->batch, 15);
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  /* FIXME: KMD always disable the atomic in L3 for some reason.
+     I checked the spec, and don't think we need that workaround now.
+     Before I send a patch to kernel, let's just enable it here. */
+  OUT_BATCH(gpgpu->batch, HSW_SCRATCH1_OFFSET);
+  OUT_BATCH(gpgpu->batch, 0);                         /* enable atomic in L3 */
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, HSW_ROW_CHICKEN3_HDC_OFFSET);
+  OUT_BATCH(gpgpu->batch, (1 << 6ul) << 16);          /* enable atomic in L3 */
   OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
   OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
   OUT_BATCH(gpgpu->batch, 0x00800000);
author	Zhigang Gong <zhigang.gong@intel.com>	2014-12-31 10:02:30 +0800
committer	Zhigang Gong <zhigang.gong@intel.com>	2015-01-07 16:06:03 +0800
commit	ef7127c03bd533277afc443b335c37a69927250a (patch)
tree	cf55ba1a532fba1af4bec6354ca1f775db6fc711 /src
parent	688867be7e09b163e86966b1e6581bcfe22a0dfe (diff)