diff options
author | Jordan Justen <jordan.l.justen@intel.com> | 2016-05-26 13:51:21 -0700 |
---|---|---|
committer | Jordan Justen <jordan.l.justen@intel.com> | 2016-05-27 11:17:12 -0700 |
commit | 40960ebb4768032b0d6949dc07afdaa00b2a0e68 (patch) | |
tree | b01b743eaadc8176aa61ca9527eda9b49324d9f6 | |
parent | 594ef4288ebc8bd3cecfa143b6cd6e303362f770 (diff) |
squash anv: Support new local ID generation & cross-thread constants [hsw-cs-cross-thread-constants-v2]
Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
-rw-r--r-- | src/intel/vulkan/anv_cmd_buffer.c | 52 | ||||
-rw-r--r-- | src/intel/vulkan/anv_pipeline.c | 4 | ||||
-rw-r--r-- | src/intel/vulkan/anv_private.h | 1 | ||||
-rw-r--r-- | src/intel/vulkan/gen7_cmd_buffer.c | 10 | ||||
-rw-r--r-- | src/intel/vulkan/gen8_cmd_buffer.c | 13 | ||||
-rw-r--r-- | src/intel/vulkan/genX_cmd_buffer.c | 4 | ||||
-rw-r--r-- | src/intel/vulkan/genX_pipeline.c | 12 |
7 files changed, 42 insertions, 54 deletions
diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index bba24e8233..464b56acf5 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -1059,24 +1059,14 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer) const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; - const unsigned local_id_dwords = cs_prog_data->local_invocation_id_regs * 8; - const unsigned push_constant_data_size = - (local_id_dwords + prog_data->nr_params) * 4; - const unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32); - const unsigned param_aligned_count = - reg_aligned_constant_size / sizeof(uint32_t); - /* If we don't actually have any push constants, bail. */ - if (reg_aligned_constant_size == 0) + if (cs_prog_data->push.total.size == 0) return (struct anv_state) { .offset = 0 }; - const unsigned threads = pipeline->cs_thread_width_max; - const unsigned total_push_constants_size = - reg_aligned_constant_size * threads; const unsigned push_constant_alignment = cmd_buffer->device->info.gen < 8 ? 
32 : 64; const unsigned aligned_total_push_constants_size = - ALIGN(total_push_constants_size, push_constant_alignment); + ALIGN(cs_prog_data->push.total.size, push_constant_alignment); struct anv_state state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, aligned_total_push_constants_size, @@ -1085,21 +1075,33 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer) /* Walk through the param array and fill the buffer with data */ uint32_t *u32_map = state.map; - brw_cs_fill_local_id_payload(cs_prog_data, u32_map, threads, - reg_aligned_constant_size); - - /* Setup uniform data for the first thread */ - for (unsigned i = 0; i < prog_data->nr_params; i++) { - uint32_t offset = (uintptr_t)prog_data->param[i]; - u32_map[local_id_dwords + i] = *(uint32_t *)((uint8_t *)data + offset); + if (cs_prog_data->push.cross_thread.size > 0) { + assert(cs_prog_data->thread_local_id_index < 0 || + cs_prog_data->thread_local_id_index >= + cs_prog_data->push.cross_thread.dwords); + for (unsigned i = 0; + i < cs_prog_data->push.cross_thread.dwords; + i++) { + uint32_t offset = (uintptr_t)prog_data->param[i]; + u32_map[i] = *(uint32_t *)((uint8_t *)data + offset); + } } - /* Copy uniform data from the first thread to every other thread */ - const size_t uniform_data_size = prog_data->nr_params * sizeof(uint32_t); - for (unsigned t = 1; t < threads; t++) { - memcpy(&u32_map[t * param_aligned_count + local_id_dwords], - &u32_map[local_id_dwords], - uniform_data_size); + if (cs_prog_data->push.per_thread.size > 0) { + for (unsigned t = 0; t < cs_prog_data->threads; t++) { + uint32_t *t_u32_map = + &u32_map[8 * t + cs_prog_data->push.cross_thread.dwords]; + for (unsigned si = cs_prog_data->push.cross_thread.dwords, di = 0; + si < prog_data->nr_params; + si++, di++) { + if (si != cs_prog_data->thread_local_id_index) { + uint32_t offset = (uintptr_t)prog_data->param[si]; + t_u32_map[di] = *(uint32_t *)((uint8_t *)data + offset); + } else { + t_u32_map[di] = t * 
cs_prog_data->simd_size; + } + } + } } if (!cmd_buffer->device->info.has_llc) diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index 8021348ff6..0a61e3a477 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -204,6 +204,7 @@ anv_shader_compile_to_nir(struct anv_device *device, nir = brw_preprocess_nir(compiler, nir); nir_lower_system_values(nir); + brw_nir_lower_intrinsics(nir); const bool is_scalar = compiler->scalar_stage[nir->stage]; anv_nir_lower_uniforms(nir, is_scalar); @@ -368,6 +369,9 @@ anv_pipeline_compile(struct anv_pipeline *pipeline, pipeline->needs_data_cache = true; } + if (stage == MESA_SHADER_COMPUTE) + prog_data->nr_params++; /* The CS Thread ID uniform */ + if (nir->info.num_ssbos > 0) pipeline->needs_data_cache = true; diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 6ac0745b66..a9ad8c94b3 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1433,7 +1433,6 @@ struct anv_pipeline { bool primitive_restart; uint32_t topology; - uint32_t cs_thread_width_max; uint32_t cs_right_mask; struct { diff --git a/src/intel/vulkan/gen7_cmd_buffer.c b/src/intel/vulkan/gen7_cmd_buffer.c index 7b55133fed..3fd7845142 100644 --- a/src/intel/vulkan/gen7_cmd_buffer.c +++ b/src/intel/vulkan/gen7_cmd_buffer.c @@ -234,12 +234,6 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; - unsigned local_id_dwords = cs_prog_data->local_invocation_id_regs * 8; - unsigned push_constant_data_size = - (prog_data->nr_params + local_id_dwords) * 4; - unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32); - unsigned push_constant_regs = reg_aligned_constant_size / 32; - if (push_state.alloc_size) { anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) { 
curbe.CURBETotalDataLength = push_state.alloc_size; @@ -264,14 +258,14 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) .BindingTablePointer = surfaces.offset, .SamplerStatePointer = samplers.offset, .ConstantURBEntryReadLength = - push_constant_regs, + cs_prog_data->push.per_thread.regs, #if !GEN_IS_HASWELL .ConstantURBEntryReadOffset = 0, #endif .BarrierEnable = cs_prog_data->uses_barrier, .SharedLocalMemorySize = slm_size, .NumberofThreadsinGPGPUThreadGroup = - pipeline->cs_thread_width_max); + cs_prog_data->threads); const uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); anv_batch_emit(&cmd_buffer->batch, diff --git a/src/intel/vulkan/gen8_cmd_buffer.c b/src/intel/vulkan/gen8_cmd_buffer.c index 065cf9e763..4e3eb38865 100644 --- a/src/intel/vulkan/gen8_cmd_buffer.c +++ b/src/intel/vulkan/gen8_cmd_buffer.c @@ -319,12 +319,6 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; - unsigned local_id_dwords = cs_prog_data->local_invocation_id_regs * 8; - unsigned push_constant_data_size = - (prog_data->nr_params + local_id_dwords) * 4; - unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32); - unsigned push_constant_regs = reg_aligned_constant_size / 32; - if (push_state.alloc_size) { anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) { curbe.CURBETotalDataLength = push_state.alloc_size; @@ -351,12 +345,15 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) .BindingTableEntryCount = 0, .SamplerStatePointer = samplers.offset, .SamplerCount = 0, - .ConstantIndirectURBEntryReadLength = push_constant_regs, + .ConstantIndirectURBEntryReadLength = + cs_prog_data->push.per_thread.regs, .ConstantURBEntryReadOffset = 0, .BarrierEnable = cs_prog_data->uses_barrier, .SharedLocalMemorySize = slm_size, .NumberofThreadsinGPGPUThreadGroup = 
- pipeline->cs_thread_width_max); + cs_prog_data->threads, + .CrossThreadConstantDataReadLength = + cs_prog_data->push.cross_thread.regs); uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); anv_batch_emit(&cmd_buffer->batch, diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index ee47c2926e..d22fe2ef20 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -690,7 +690,7 @@ void genX(CmdDispatch)( ggw.SIMDSize = prog_data->simd_size / 16; ggw.ThreadDepthCounterMaximum = 0; ggw.ThreadHeightCounterMaximum = 0; - ggw.ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1; + ggw.ThreadWidthCounterMaximum = prog_data->threads - 1; ggw.ThreadGroupIDXDimension = x; ggw.ThreadGroupIDYDimension = y; ggw.ThreadGroupIDZDimension = z; @@ -791,7 +791,7 @@ void genX(CmdDispatchIndirect)( ggw.SIMDSize = prog_data->simd_size / 16; ggw.ThreadDepthCounterMaximum = 0; ggw.ThreadHeightCounterMaximum = 0; - ggw.ThreadWidthCounterMaximum = pipeline->cs_thread_width_max - 1; + ggw.ThreadWidthCounterMaximum = prog_data->threads - 1; ggw.RightExecutionMask = pipeline->cs_right_mask; ggw.BottomExecutionMask = 0xffffffff; } diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index 918a9a4f03..458e80c82b 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -87,18 +87,9 @@ genX(compute_pipeline_create)( anv_setup_pipeline_l3_config(pipeline); const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); - const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; - - unsigned local_id_dwords = cs_prog_data->local_invocation_id_regs * 8; - unsigned push_constant_data_size = - (prog_data->nr_params + local_id_dwords) * 4; - unsigned reg_aligned_constant_size = ALIGN(push_constant_data_size, 32); - unsigned push_constant_regs = reg_aligned_constant_size / 32; uint32_t group_size = cs_prog_data->local_size[0] * 
cs_prog_data->local_size[1] * cs_prog_data->local_size[2]; - pipeline->cs_thread_width_max = - DIV_ROUND_UP(group_size, cs_prog_data->simd_size); uint32_t remainder = group_size & (cs_prog_data->simd_size - 1); if (remainder > 0) @@ -107,7 +98,8 @@ genX(compute_pipeline_create)( pipeline->cs_right_mask = ~0u >> (32 - cs_prog_data->simd_size); const uint32_t vfe_curbe_allocation = - push_constant_regs * pipeline->cs_thread_width_max; + ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads + + cs_prog_data->push.cross_thread.regs, 2); anv_batch_emit(&pipeline->batch, GENX(MEDIA_VFE_STATE), vfe) { vfe.ScratchSpaceBasePointer = pipeline->scratch_start[MESA_SHADER_COMPUTE]; |