diff options
author | Tom Stellard <thomas.stellard@amd.com> | 2014-12-10 09:13:59 -0500 |
---|---|---|
committer | Tom Stellard <thomas.stellard@amd.com> | 2015-01-07 16:04:28 -0500 |
commit | 4040674b1285d357fc4ed68622197c81d68b5c8d (patch) | |
tree | 6a3170855de4cc31ab2bb5d59aa25fba824683f3 | |
parent | 5268fb6e83e03ef415d2f844f9adc0fc4268c3ab (diff) |
radeonsi: Enable VGPR spilling for all shader typesvgpr-spilling-Jan07-2014
-rw-r--r-- | src/gallium/drivers/radeonsi/si_compute.c | 40 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader.c | 69 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader.h | 7 | ||||
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state_shaders.c | 36 |
4 files changed, 108 insertions, 44 deletions
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 37e5c42e244..e5f158d82ec 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -42,12 +42,6 @@ #define NUM_USER_SGPRS 4 #endif -static const char *scratch_rsrc_dword0_symbol = - "SCRATCH_RSRC_DWORD0"; - -static const char *scratch_rsrc_dword1_symbol = - "SCRATCH_RSRC_DWORD1"; - struct si_compute { struct si_context *ctx; @@ -183,35 +177,6 @@ static unsigned compute_num_waves_for_scratch( return scratch_waves; } -static void apply_scratch_relocs(const struct si_screen *sscreen, - const struct radeon_shader_binary *binary, - struct si_shader *shader, uint64_t scratch_va) { - unsigned i; - char *ptr; - uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff; - uint32_t scratch_rsrc_dword1 = - S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) - | S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64); - - if (!binary->reloc_count) { - return; - } - - ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL, - PIPE_TRANSFER_READ_WRITE); - for (i = 0 ; i < binary->reloc_count; i++) { - const struct radeon_shader_reloc *reloc = &binary->relocs[i]; - if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) { - util_memcpy_cpu_to_le32(ptr + reloc->offset, - &scratch_rsrc_dword0, 4); - } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { - util_memcpy_cpu_to_le32(ptr + reloc->offset, - &scratch_rsrc_dword1, 4); - } - } - sscreen->b.ws->buffer_unmap(shader->bo->cs_buf); -} - static void si_launch_grid( struct pipe_context *ctx, const uint *block_layout, const uint *grid_layout, @@ -256,7 +221,8 @@ static void si_launch_grid( #if HAVE_LLVM >= 0x0306 /* Read the config information */ - si_shader_binary_read_config(&program->binary, &program->program, pc); + si_shader_binary_read_config(sctx->screen, &program->binary, + &program->program, pc); #endif /* Upload the kernel arguments */ @@ -295,7 +261,7 @@ static void si_launch_grid( RADEON_PRIO_SHADER_RESOURCE_RW); /* Patch the shader with the scratch buffer address. */ - apply_scratch_relocs(sctx->screen, + si_shader_apply_scratch_relocs(sctx->screen, &program->binary, shader, scratch_buffer_va); } diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index cf28860a35f..d59e736b983 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -46,6 +46,12 @@ #include <errno.h> +static const char *scratch_rsrc_dword0_symbol = + "SCRATCH_RSRC_DWORD0"; + +static const char *scratch_rsrc_dword1_symbol = + "SCRATCH_RSRC_DWORD1"; + struct si_shader_output_values { LLVMValueRef values[4]; @@ -2517,7 +2523,8 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx) } } -void si_shader_binary_read_config(const struct radeon_shader_binary *binary, +void si_shader_binary_read_config(const struct si_screen *sscreen, + const struct radeon_shader_binary *binary, struct si_shader *shader, unsigned symbol_offset) { @@ -2549,6 +2556,14 @@ void si_shader_binary_read_config(const struct radeon_shader_binary *binary, case R_0286CC_SPI_PS_INPUT_ENA: shader->spi_ps_input_ena = value; break; + case R_0286E8_SPI_TMPRING_SIZE: + /* XXX: This is the maximum value allowed. I'm not sure + * how compute this for non-cs shaders. + */ + shader->scratch_waves = + 32 * sscreen->b.info.max_compute_units; + /* Fall-through */ + case R_00B860_COMPUTE_TMPRING_SIZE: /* WAVESIZE is in units of 256 dwords. */ shader->scratch_bytes_per_wave = @@ -2562,6 +2577,36 @@ void si_shader_binary_read_config(const struct radeon_shader_binary *binary, } } +void si_shader_apply_scratch_relocs(const struct si_screen *sscreen, + const struct radeon_shader_binary *binary, + struct si_shader *shader, uint64_t scratch_va) { + unsigned i; + char *ptr; + uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff; + uint32_t scratch_rsrc_dword1 = + S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) + | S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64); + + if (!binary->reloc_count) { + return; + } + + ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL, + PIPE_TRANSFER_READ_WRITE); + for (i = 0 ; i < binary->reloc_count; i++) { + const struct radeon_shader_reloc *reloc = &binary->relocs[i]; + if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) { + util_memcpy_cpu_to_le32(ptr + reloc->offset, + &scratch_rsrc_dword0, 4); + } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { + util_memcpy_cpu_to_le32(ptr + reloc->offset, + &scratch_rsrc_dword1, 4); + } + } + sscreen->b.ws->buffer_unmap(shader->bo->cs_buf); +} + + int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader, const struct radeon_shader_binary *binary) @@ -2582,7 +2627,7 @@ int si_shader_binary_read(struct si_screen *sscreen, } } - si_shader_binary_read_config(binary, shader, 0); + si_shader_binary_read_config(sscreen, binary, shader, 0); /* copy new shader */ code_size = binary->code_size + binary->rodata_size; @@ -2601,6 +2646,7 @@ int si_shader_binary_read(struct si_screen *sscreen, util_memcpy_cpu_to_le32(ptr, binary->rodata, binary->rodata_size); } + sscreen->b.ws->buffer_unmap(shader->bo->cs_buf); return 0; @@ -2621,6 +2667,25 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader, return r; } r = si_shader_binary_read(sscreen, shader, &binary); + + if (shader->scratch_bytes_per_wave > 0) { + uint64_t scratch_buffer_va; + unsigned scratch_bytes = shader->scratch_bytes_per_wave * + shader->scratch_waves; + /* It's possible for different shader variants to have + * different scratch buffer sizes. The scratch buffer sizes + * won't vary by too much, so allocating double the scratch + * space need for the first variant should be enough for the + * rest if they need it. + */ + shader->scratch_bo = (struct r600_resource*) + si_resource_create_custom(&sscreen->b.b, + PIPE_USAGE_DEFAULT, scratch_bytes * 2); + scratch_buffer_va = shader->scratch_bo->gpu_address; + si_shader_apply_scratch_relocs(sscreen, &binary, shader, + scratch_buffer_va); + } + FREE(binary.code); FREE(binary.config); FREE(binary.rodata); diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 08e344af444..c5b274ec288 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -147,6 +147,7 @@ struct si_shader { unsigned lds_size; unsigned spi_ps_input_ena; unsigned scratch_bytes_per_wave; + unsigned scratch_waves; unsigned spi_shader_col_format; unsigned spi_shader_z_format; unsigned db_shader_control; @@ -185,7 +186,11 @@ void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader); unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index); int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader, const struct radeon_shader_binary *binary); -void si_shader_binary_read_config(const struct radeon_shader_binary *binary, +void si_shader_apply_scratch_relocs(const struct si_screen *sscreen, + const struct radeon_shader_binary *binary, + struct si_shader *shader, uint64_t scratch_va); +void si_shader_binary_read_config(const struct si_screen *sscreen, + const struct radeon_shader_binary *binary, struct si_shader *shader, unsigned symbol_offset); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 817a990ee2e..b6a0f326a8b 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -33,6 +33,22 @@ #include "util/u_memory.h" #include "util/u_simple_shaders.h" +static void si_shader_setup_scratch_buffer(struct si_shader *shader, + struct si_pm4_state *pm4) { + if (shader->scratch_bytes_per_wave == 0) { + return; + } + + assert(shader->scratch_bo); + + si_pm4_set_reg(pm4, R_0286E8_SPI_TMPRING_SIZE, + S_0286E8_WAVES(shader->scratch_waves) + | S_0286E8_WAVESIZE(shader->scratch_bytes_per_wave >> 10)); + + si_pm4_add_bo(pm4, shader->scratch_bo, RADEON_USAGE_READWRITE, + RADEON_PRIO_SHADER_RESOURCE_RW); +} + static void si_shader_es(struct si_shader *shader) { struct si_pm4_state *pm4; @@ -67,7 +83,10 @@ static void si_shader_es(struct si_shader *shader) S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B328_DX10_CLAMP(shader->dx10_clamp_mode)); si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES, - S_00B32C_USER_SGPR(num_user_sgprs)); + S_00B32C_USER_SGPR(num_user_sgprs) | + S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); + + si_shader_setup_scratch_buffer(shader, pm4); } static void si_shader_gs(struct si_shader *shader) @@ -136,7 +155,10 @@ static void si_shader_gs(struct si_shader *shader) S_00B228_SGPRS((num_sgprs - 1) / 8) | S_00B228_DX10_CLAMP(shader->dx10_clamp_mode)); si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, - S_00B22C_USER_SGPR(num_user_sgprs)); + S_00B22C_USER_SGPR(num_user_sgprs) | + S_00B22C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); + + si_shader_setup_scratch_buffer(shader, pm4); } static void si_shader_vs(struct si_shader *shader) @@ -216,7 +238,8 @@ static void si_shader_vs(struct si_shader *shader) S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) | S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) | S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) | - S_00B12C_SO_EN(!!shader->selector->so.num_outputs)); + S_00B12C_SO_EN(!!shader->selector->so.num_outputs) | + S_00B12C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); if (window_space) si_pm4_set_reg(pm4, R_028818_PA_CL_VTE_CNTL, S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1)); @@ -226,6 +249,8 @@ static void si_shader_vs(struct si_shader *shader) S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) | S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) | S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1)); + + si_shader_setup_scratch_buffer(shader, pm4); } static void si_shader_ps(struct si_shader *shader) @@ -307,7 +332,10 @@ static void si_shader_ps(struct si_shader *shader) S_00B028_DX10_CLAMP(shader->dx10_clamp_mode)); si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS, S_00B02C_EXTRA_LDS_SIZE(shader->lds_size) | - S_00B02C_USER_SGPR(num_user_sgprs)); + S_00B02C_USER_SGPR(num_user_sgprs) | + S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); + + si_shader_setup_scratch_buffer(shader, pm4); } static void si_shader_init_pm4_state(struct si_shader *shader) |