summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEric Anholt <eric@anholt.net>2011-04-27 13:33:10 -0700
committerEric Anholt <eric@anholt.net>2011-06-18 16:00:45 -0700
commitc173541d9769d41a85cc899bc49699a3587df4bf (patch)
tree0b445fd0db1f9eb806b7fe48fa8ac4fced4baa8a
parent962dab948609c97c1c01fde6a27e19307948d302 (diff)
i965: Use state streaming on programs, and state base address on gen5+.
There will be a little bit of thrashing of the program cache BO as the cache warms up, but once the application is in steady state, this reduces relocations on gen5 and later. On my T420 laptop, cairogl firefox-talos-gfx performance improves 2.6% +/- 1.3% (n=6). No statistically significant performance difference on nexuiz (n=5).
-rw-r--r--src/mesa/drivers/dri/i965/brw_clip.c24
-rw-r--r--src/mesa/drivers/dri/i965/brw_clip_state.c19
-rw-r--r--src/mesa/drivers/dri/i965/brw_context.h55
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs.cpp6
-rw-r--r--src/mesa/drivers/dri/i965/brw_gs.c24
-rw-r--r--src/mesa/drivers/dri/i965/brw_gs_state.c19
-rw-r--r--src/mesa/drivers/dri/i965/brw_misc_state.c10
-rw-r--r--src/mesa/drivers/dri/i965/brw_sf.c22
-rw-r--r--src/mesa/drivers/dri/i965/brw_sf_state.c15
-rw-r--r--src/mesa/drivers/dri/i965/brw_state.h30
-rw-r--r--src/mesa/drivers/dri/i965/brw_state_cache.c164
-rw-r--r--src/mesa/drivers/dri/i965/brw_state_dump.c26
-rw-r--r--src/mesa/drivers/dri/i965/brw_state_upload.c17
-rw-r--r--src/mesa/drivers/dri/i965/brw_vs.c22
-rw-r--r--src/mesa/drivers/dri/i965/brw_vs_state.c18
-rw-r--r--src/mesa/drivers/dri/i965/brw_vtbl.c12
-rw-r--r--src/mesa/drivers/dri/i965/brw_wm.c21
-rw-r--r--src/mesa/drivers/dri/i965/brw_wm_state.c40
-rw-r--r--src/mesa/drivers/dri/i965/gen6_gs_state.c2
-rw-r--r--src/mesa/drivers/dri/i965/gen6_urb.c2
-rw-r--r--src/mesa/drivers/dri/i965/gen6_vs_state.c2
-rw-r--r--src/mesa/drivers/dri/i965/gen6_wm_state.c10
-rw-r--r--src/mesa/drivers/dri/i965/gen7_disable.c2
-rw-r--r--src/mesa/drivers/dri/i965/gen7_urb.c2
-rw-r--r--src/mesa/drivers/dri/i965/gen7_vs_state.c2
-rw-r--r--src/mesa/drivers/dri/i965/gen7_wm_state.c9
26 files changed, 277 insertions, 298 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_clip.c b/src/mesa/drivers/dri/i965/brw_clip.c
index c7d428ba48..d82206bae5 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.c
+++ b/src/mesa/drivers/dri/i965/brw_clip.c
@@ -146,15 +146,12 @@ static void compile_clip_prog( struct brw_context *brw,
printf("\n");
}
- /* Upload
- */
- drm_intel_bo_unreference(brw->clip.prog_bo);
- brw->clip.prog_bo = brw_upload_cache(&brw->cache,
- BRW_CLIP_PROG,
- &c.key, sizeof(c.key),
- program, program_size,
- &c.prog_data, sizeof(c.prog_data),
- &brw->clip.prog_data);
+ brw_upload_cache(&brw->cache,
+ BRW_CLIP_PROG,
+ &c.key, sizeof(c.key),
+ program, program_size,
+ &c.prog_data, sizeof(c.prog_data),
+ &brw->clip.prog_offset, &brw->clip.prog_data);
ralloc_free(mem_ctx);
}
@@ -271,12 +268,11 @@ static void upload_clip_prog(struct brw_context *brw)
}
}
- drm_intel_bo_unreference(brw->clip.prog_bo);
- brw->clip.prog_bo = brw_search_cache(&brw->cache, BRW_CLIP_PROG,
- &key, sizeof(key),
- &brw->clip.prog_data);
- if (brw->clip.prog_bo == NULL)
+ if (!brw_search_cache(&brw->cache, BRW_CLIP_PROG,
+ &key, sizeof(key),
+ &brw->clip.prog_offset, &brw->clip.prog_data)) {
compile_clip_prog( brw, &key );
+ }
}
diff --git a/src/mesa/drivers/dri/i965/brw_clip_state.c b/src/mesa/drivers/dri/i965/brw_clip_state.c
index 6015c8cbe9..b9efbb74c8 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_state.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_state.c
@@ -43,11 +43,15 @@ brw_prepare_clip_unit(struct brw_context *brw)
clip = brw_state_batch(brw, sizeof(*clip), 32, &brw->clip.state_offset);
memset(clip, 0, sizeof(*clip));
- /* CACHE_NEW_CLIP_PROG */
+ /* BRW_NEW_PROGRAM_CACHE | CACHE_NEW_CLIP_PROG */
clip->thread0.grf_reg_count = (ALIGN(brw->clip.prog_data->total_grf, 16) /
16 - 1);
- /* reloc */
- clip->thread0.kernel_start_pointer = brw->clip.prog_bo->offset >> 6;
+ clip->thread0.kernel_start_pointer =
+ brw_program_reloc(brw,
+ brw->clip.state_offset +
+ offsetof(struct brw_clip_unit_state, thread0),
+ brw->clip.prog_offset +
+ (clip->thread0.grf_reg_count << 1)) >> 6;
clip->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
clip->thread1.single_program_flow = 1;
@@ -110,14 +114,6 @@ brw_prepare_clip_unit(struct brw_context *brw)
clip->viewport_ymin = -1;
clip->viewport_ymax = 1;
- /* Emit clip program relocation */
- assert(brw->clip.prog_bo);
- drm_intel_bo_emit_reloc(intel->batch.bo,
- (brw->clip.state_offset +
- offsetof(struct brw_clip_unit_state, thread0)),
- brw->clip.prog_bo, clip->thread0.grf_reg_count << 1,
- I915_GEM_DOMAIN_INSTRUCTION, 0);
-
brw->state.dirty.cache |= CACHE_NEW_CLIP_UNIT;
}
@@ -125,6 +121,7 @@ const struct brw_tracked_state brw_clip_unit = {
.dirty = {
.mesa = _NEW_TRANSFORM,
.brw = (BRW_NEW_BATCH |
+ BRW_NEW_PROGRAM_CACHE |
BRW_NEW_CURBE_OFFSETS |
BRW_NEW_URB_FENCE),
.cache = CACHE_NEW_CLIP_PROG
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 621b6f8990..16b71f6b1c 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -142,7 +142,8 @@ enum brw_state_id {
BRW_STATE_NR_VS_SURFACES,
BRW_STATE_INDEX_BUFFER,
BRW_STATE_VS_CONSTBUF,
- BRW_STATE_WM_CONSTBUF
+ BRW_STATE_WM_CONSTBUF,
+ BRW_STATE_PROGRAM_CACHE,
};
#define BRW_NEW_URB_FENCE (1 << BRW_STATE_URB_FENCE)
@@ -172,6 +173,7 @@ enum brw_state_id {
#define BRW_NEW_INDEX_BUFFER (1 << BRW_STATE_INDEX_BUFFER)
#define BRW_NEW_VS_CONSTBUF (1 << BRW_STATE_VS_CONSTBUF)
#define BRW_NEW_WM_CONSTBUF (1 << BRW_STATE_WM_CONSTBUF)
+#define BRW_NEW_PROGRAM_CACHE (1 << BRW_STATE_PROGRAM_CACHE)
struct brw_state_flags {
/** State update flags signalled by mesa internals */
@@ -365,7 +367,8 @@ struct brw_cache_item {
GLuint key_size; /* for variable-sized keys */
const void *key;
- drm_intel_bo *bo;
+ uint32_t offset;
+ uint32_t size;
struct brw_cache_item *next;
};
@@ -376,14 +379,11 @@ struct brw_cache {
struct brw_context *brw;
struct brw_cache_item **items;
+ drm_intel_bo *bo;
GLuint size, n_items;
- char *name[BRW_MAX_CACHE];
-
- /* Record of the last BOs chosen for each cache_id. Used to set
- * brw->state.dirty.cache when a new cache item is chosen.
- */
- drm_intel_bo *last_bo[BRW_MAX_CACHE];
+ uint32_t next_offset;
+ bool bo_used_by_gpu;
};
@@ -634,8 +634,9 @@ struct brw_context
struct brw_vs_prog_data *prog_data;
int8_t *constant_map; /* variable array following prog_data */
- drm_intel_bo *prog_bo;
drm_intel_bo *const_bo;
+ /** Offset in the program cache to the VS program */
+ uint32_t prog_offset;
uint32_t state_offset;
/** Binding table of pointers to surf_bo entries */
@@ -651,14 +652,16 @@ struct brw_context
struct brw_gs_prog_data *prog_data;
GLboolean prog_active;
+ /** Offset in the program cache to the CLIP program pre-gen6 */
+ uint32_t prog_offset;
uint32_t state_offset;
- drm_intel_bo *prog_bo;
} gs;
struct {
struct brw_clip_prog_data *prog_data;
- drm_intel_bo *prog_bo;
+ /** Offset in the program cache to the CLIP program pre-gen6 */
+ uint32_t prog_offset;
/* Offset in the batch to the CLIP state on pre-gen6. */
uint32_t state_offset;
@@ -673,7 +676,8 @@ struct brw_context
struct {
struct brw_sf_prog_data *prog_data;
- drm_intel_bo *prog_bo;
+ /** Offset in the program cache to the CLIP program pre-gen6 */
+ uint32_t prog_offset;
uint32_t state_offset;
uint32_t vp_offset;
} sf;
@@ -700,12 +704,14 @@ struct brw_context
GLuint sampler_count;
uint32_t sampler_offset;
+ /** Offset in the program cache to the WM program */
+ uint32_t prog_offset;
+
/** Binding table of pointers to surf_bo entries */
uint32_t bind_bo_offset;
uint32_t surf_offset[BRW_WM_MAX_SURF];
uint32_t state_offset; /* offset in batchbuffer to pre-gen6 WM state */
- drm_intel_bo *prog_bo;
drm_intel_bo *const_bo; /* pull constant buffer. */
/**
* This is offset in the batch to the push constants on gen6.
@@ -717,9 +723,6 @@ struct brw_context
struct {
- /* gen4 */
- drm_intel_bo *prog_bo;
-
uint32_t state_offset;
uint32_t blend_state_offset;
uint32_t depth_stencil_state_offset;
@@ -874,6 +877,26 @@ brw_register_blocks(int reg_count)
return ALIGN(reg_count, 16) / 16 - 1;
}
+static inline uint32_t
+brw_program_reloc(struct brw_context *brw, uint32_t state_offset,
+ uint32_t prog_offset)
+{
+ struct intel_context *intel = &brw->intel;
+
+ if (intel->gen >= 5) {
+ /* Using state base address. */
+ return prog_offset;
+ }
+
+ drm_intel_bo_emit_reloc(intel->batch.bo,
+ state_offset,
+ brw->cache.bo,
+ prog_offset,
+ I915_GEM_DOMAIN_INSTRUCTION, 0);
+
+ return brw->cache.bo->offset + prog_offset;
+}
+
GLboolean brw_do_cubemap_normalize(struct exec_list *instructions);
#endif
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 7c73a8fbf0..8580c78fea 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1697,14 +1697,12 @@ brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
key.program_string_id = bfp->id;
- drm_intel_bo *old_prog_bo = brw->wm.prog_bo;
+ uint32_t old_prog_offset = brw->wm.prog_offset;
struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
- brw->wm.prog_bo = NULL;
bool success = do_wm_prog(brw, prog, bfp, &key);
- drm_intel_bo_unreference(brw->wm.prog_bo);
- brw->wm.prog_bo = old_prog_bo;
+ brw->wm.prog_offset = old_prog_offset;
brw->wm.prog_data = old_prog_data;
return success;
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 001cd62f8c..3171e97d7a 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -121,14 +121,11 @@ static void compile_gs_prog( struct brw_context *brw,
printf("\n");
}
- /* Upload
- */
- drm_intel_bo_unreference(brw->gs.prog_bo);
- brw->gs.prog_bo = brw_upload_cache(&brw->cache, BRW_GS_PROG,
- &c.key, sizeof(c.key),
- program, program_size,
- &c.prog_data, sizeof(c.prog_data),
- &brw->gs.prog_data);
+ brw_upload_cache(&brw->cache, BRW_GS_PROG,
+ &c.key, sizeof(c.key),
+ program, program_size,
+ &c.prog_data, sizeof(c.prog_data),
+ &brw->gs.prog_offset, &brw->gs.prog_data);
ralloc_free(mem_ctx);
}
@@ -189,15 +186,12 @@ static void prepare_gs_prog(struct brw_context *brw)
brw->gs.prog_active = key.need_gs_prog;
}
- drm_intel_bo_unreference(brw->gs.prog_bo);
- brw->gs.prog_bo = NULL;
-
if (brw->gs.prog_active) {
- brw->gs.prog_bo = brw_search_cache(&brw->cache, BRW_GS_PROG,
- &key, sizeof(key),
- &brw->gs.prog_data);
- if (brw->gs.prog_bo == NULL)
+ if (!brw_search_cache(&brw->cache, BRW_GS_PROG,
+ &key, sizeof(key),
+ &brw->gs.prog_offset, &brw->gs.prog_data)) {
compile_gs_prog( brw, &key );
+ }
}
}
diff --git a/src/mesa/drivers/dri/i965/brw_gs_state.c b/src/mesa/drivers/dri/i965/brw_gs_state.c
index 542874b770..bbfefcd816 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_state.c
@@ -45,12 +45,17 @@ brw_prepare_gs_unit(struct brw_context *brw)
memset(gs, 0, sizeof(*gs));
- /* CACHE_NEW_GS_PROG */
+ /* BRW_NEW_PROGRAM_CACHE | CACHE_NEW_GS_PROG */
if (brw->gs.prog_active) {
gs->thread0.grf_reg_count = (ALIGN(brw->gs.prog_data->total_grf, 16) /
16 - 1);
- /* reloc */
- gs->thread0.kernel_start_pointer = brw->gs.prog_bo->offset >> 6;
+
+ gs->thread0.kernel_start_pointer =
+ brw_program_reloc(brw,
+ brw->gs.state_offset +
+ offsetof(struct brw_gs_unit_state, thread0),
+ brw->gs.prog_offset +
+ (gs->thread0.grf_reg_count << 1)) >> 6;
gs->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
gs->thread1.single_program_flow = 1;
@@ -69,13 +74,6 @@ brw_prepare_gs_unit(struct brw_context *brw)
gs->thread4.max_threads = 1;
else
gs->thread4.max_threads = 0;
-
- /* Emit GS program relocation */
- drm_intel_bo_emit_reloc(intel->batch.bo,
- (brw->gs.state_offset +
- offsetof(struct brw_gs_unit_state, thread0)),
- brw->gs.prog_bo, gs->thread0.grf_reg_count << 1,
- I915_GEM_DOMAIN_INSTRUCTION, 0);
}
if (intel->gen == 5)
@@ -91,6 +89,7 @@ const struct brw_tracked_state brw_gs_unit = {
.dirty = {
.mesa = 0,
.brw = (BRW_NEW_BATCH |
+ BRW_NEW_PROGRAM_CACHE |
BRW_NEW_CURBE_OFFSETS |
BRW_NEW_URB_FENCE),
.cache = CACHE_NEW_GS_PROG
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 1f3b64fd53..b0f95dd66b 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -706,7 +706,9 @@ static void upload_state_base_address( struct brw_context *brw )
I915_GEM_DOMAIN_INSTRUCTION), 0, 1);
OUT_BATCH(1); /* Indirect object base address: MEDIA_OBJECT data */
- OUT_BATCH(1); /* Instruction base address: shader kernels (incl. SIP) */
+ OUT_RELOC(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+ 1); /* Instruction base address: shader kernels (incl. SIP) */
+
OUT_BATCH(1); /* General state upper bound */
OUT_BATCH(1); /* Dynamic state upper bound */
OUT_BATCH(1); /* Indirect object upper bound */
@@ -719,7 +721,8 @@ static void upload_state_base_address( struct brw_context *brw )
OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
1); /* Surface state base address */
OUT_BATCH(1); /* Indirect object base address */
- OUT_BATCH(1); /* Instruction base address */
+ OUT_RELOC(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+ 1); /* Instruction base address */
OUT_BATCH(1); /* General state upper bound */
OUT_BATCH(1); /* Indirect object upper bound */
OUT_BATCH(1); /* Instruction access upper bound */
@@ -740,7 +743,8 @@ static void upload_state_base_address( struct brw_context *brw )
const struct brw_tracked_state brw_state_base_address = {
.dirty = {
.mesa = 0,
- .brw = BRW_NEW_BATCH,
+ .brw = (BRW_NEW_BATCH |
+ BRW_NEW_PROGRAM_CACHE),
.cache = 0,
},
.emit = upload_state_base_address
diff --git a/src/mesa/drivers/dri/i965/brw_sf.c b/src/mesa/drivers/dri/i965/brw_sf.c
index c2227777cf..fca30a74aa 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.c
+++ b/src/mesa/drivers/dri/i965/brw_sf.c
@@ -120,14 +120,11 @@ static void compile_sf_prog( struct brw_context *brw,
printf("\n");
}
- /* Upload
- */
- drm_intel_bo_unreference(brw->sf.prog_bo);
- brw->sf.prog_bo = brw_upload_cache(&brw->cache, BRW_SF_PROG,
- &c.key, sizeof(c.key),
- program, program_size,
- &c.prog_data, sizeof(c.prog_data),
- &brw->sf.prog_data);
+ brw_upload_cache(&brw->cache, BRW_SF_PROG,
+ &c.key, sizeof(c.key),
+ program, program_size,
+ &c.prog_data, sizeof(c.prog_data),
+ &brw->sf.prog_offset, &brw->sf.prog_data);
ralloc_free(mem_ctx);
}
@@ -191,12 +188,11 @@ static void upload_sf_prog(struct brw_context *brw)
key.frontface_ccw = (ctx->Polygon.FrontFace == GL_CCW) ^ (ctx->DrawBuffer->Name != 0);
}
- drm_intel_bo_unreference(brw->sf.prog_bo);
- brw->sf.prog_bo = brw_search_cache(&brw->cache, BRW_SF_PROG,
- &key, sizeof(key),
- &brw->sf.prog_data);
- if (brw->sf.prog_bo == NULL)
+ if (!brw_search_cache(&brw->cache, BRW_SF_PROG,
+ &key, sizeof(key),
+ &brw->sf.prog_offset, &brw->sf.prog_data)) {
compile_sf_prog( brw, &key );
+ }
}
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index 78b22c4df3..eb3d103099 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -133,9 +133,14 @@ static void upload_sf_unit( struct brw_context *brw )
memset(sf, 0, sizeof(*sf));
- /* CACHE_NEW_SF_PROG */
+ /* BRW_NEW_PROGRAM_CACHE | CACHE_NEW_SF_PROG */
sf->thread0.grf_reg_count = ALIGN(brw->sf.prog_data->total_grf, 16) / 16 - 1;
- sf->thread0.kernel_start_pointer = brw->sf.prog_bo->offset >> 6; /* reloc */
+ sf->thread0.kernel_start_pointer =
+ brw_program_reloc(brw,
+ brw->sf.state_offset +
+ offsetof(struct brw_sf_unit_state, thread0),
+ brw->sf.prog_offset +
+ (sf->thread0.grf_reg_count << 1)) >> 6;
sf->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
@@ -282,11 +287,6 @@ static void upload_sf_unit( struct brw_context *brw )
/* STATE_PREFETCH command description describes this state as being
* something loaded through the GPE (L2 ISC), so it's INSTRUCTION domain.
*/
- /* Emit SF program relocation */
- drm_intel_bo_emit_reloc(bo, (brw->sf.state_offset +
- offsetof(struct brw_sf_unit_state, thread0)),
- brw->sf.prog_bo, sf->thread0.grf_reg_count << 1,
- I915_GEM_DOMAIN_INSTRUCTION, 0);
/* Emit SF viewport relocation */
drm_intel_bo_emit_reloc(bo, (brw->sf.state_offset +
@@ -308,6 +308,7 @@ const struct brw_tracked_state brw_sf_unit = {
_NEW_SCISSOR |
_NEW_BUFFERS),
.brw = (BRW_NEW_BATCH |
+ BRW_NEW_PROGRAM_CACHE |
BRW_NEW_URB_FENCE),
.cache = (CACHE_NEW_SF_VP |
CACHE_NEW_SF_PROG)
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 544ef7d47e..b384651d8d 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -145,21 +145,21 @@ void brw_clear_validated_bos(struct brw_context *brw);
* brw_state_cache.c
*/
-drm_intel_bo *brw_upload_cache(struct brw_cache *cache,
- enum brw_cache_id cache_id,
- const void *key,
- GLuint key_sz,
- const void *data,
- GLuint data_sz,
- const void *aux,
- GLuint aux_sz,
- void *aux_return);
-
-drm_intel_bo *brw_search_cache( struct brw_cache *cache,
- enum brw_cache_id cache_id,
- const void *key,
- GLuint key_size,
- void *aux_return);
+void brw_upload_cache(struct brw_cache *cache,
+ enum brw_cache_id cache_id,
+ const void *key,
+ GLuint key_sz,
+ const void *data,
+ GLuint data_sz,
+ const void *aux,
+ GLuint aux_sz,
+ uint32_t *out_offset, void *out_aux);
+
+bool brw_search_cache(struct brw_cache *cache,
+ enum brw_cache_id cache_id,
+ const void *key,
+ GLuint key_size,
+ uint32_t *inout_offset, void *out_aux);
void brw_state_cache_check_size( struct brw_context *brw );
void brw_init_caches( struct brw_context *brw );
diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index f13a41fa7c..d13711b19b 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -45,6 +45,7 @@
*/
#include "main/imports.h"
+#include "intel_batchbuffer.h"
#include "brw_state.h"
#define FILE_DEBUG_FLAG DEBUG_STATE
@@ -67,23 +68,6 @@ hash_key(struct brw_cache_item *item)
return hash;
}
-
-/**
- * Marks a new buffer as being chosen for the given cache id.
- */
-static void
-update_cache_last(struct brw_cache *cache, enum brw_cache_id cache_id,
- drm_intel_bo *bo)
-{
- if (bo == cache->last_bo[cache_id])
- return; /* no change */
-
- drm_intel_bo_unreference(cache->last_bo[cache_id]);
- cache->last_bo[cache_id] = bo;
- drm_intel_bo_reference(cache->last_bo[cache_id]);
- cache->brw->state.dirty.cache |= 1 << cache_id;
-}
-
static int
brw_cache_item_equals(const struct brw_cache_item *a,
const struct brw_cache_item *b)
@@ -145,12 +129,13 @@ rehash(struct brw_cache *cache)
/**
* Returns the buffer object matching cache_id and key, or NULL.
*/
-drm_intel_bo *
+bool
brw_search_cache(struct brw_cache *cache,
enum brw_cache_id cache_id,
const void *key, GLuint key_size,
- void *aux_return)
+ uint32_t *inout_offset, void *out_aux)
{
+ struct brw_context *brw = cache->brw;
struct brw_cache_item *item;
struct brw_cache_item lookup;
GLuint hash;
@@ -164,19 +149,45 @@ brw_search_cache(struct brw_cache *cache,
item = search_cache(cache, hash, &lookup);
if (item == NULL)
- return NULL;
+ return false;
- if (aux_return)
- *(void **)aux_return = (void *)((char *)item->key + item->key_size);
+ *(void **)out_aux = ((char *)item->key + item->key_size);
- update_cache_last(cache, cache_id, item->bo);
+ if (item->offset != *inout_offset) {
+ brw->state.dirty.cache |= (1 << cache_id);
+ *inout_offset = item->offset;
+ }
- drm_intel_bo_reference(item->bo);
- return item->bo;
+ return true;
}
+static void
+brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
+{
+ struct brw_context *brw = cache->brw;
+ struct intel_context *intel = &brw->intel;
+ drm_intel_bo *new_bo;
+
+ new_bo = drm_intel_bo_alloc(intel->bufmgr, "program cache", new_size, 64);
-drm_intel_bo *
+ /* Copy any existing data that needs to be saved. */
+ if (cache->next_offset != 0) {
+ drm_intel_bo_map(cache->bo, false);
+ drm_intel_bo_subdata(new_bo, 0, cache->next_offset, cache->bo->virtual);
+ drm_intel_bo_unmap(cache->bo);
+ }
+
+ drm_intel_bo_unreference(cache->bo);
+ cache->bo = new_bo;
+ cache->bo_used_by_gpu = false;
+
+ /* Since we have a new BO in place, we need to signal the units
+ * that depend on it (state base address on gen5+, or unit state before).
+ */
+ brw->state.dirty.brw |= BRW_NEW_PROGRAM_CACHE;
+}
+
+void
brw_upload_cache(struct brw_cache *cache,
enum brw_cache_id cache_id,
const void *key,
@@ -185,12 +196,12 @@ brw_upload_cache(struct brw_cache *cache,
GLuint data_size,
const void *aux,
GLuint aux_size,
- void *aux_return)
+ uint32_t *out_offset,
+ void *out_aux)
{
struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
GLuint hash;
void *tmp;
- drm_intel_bo *bo;
item->cache_id = cache_id;
item->key = key;
@@ -198,10 +209,28 @@ brw_upload_cache(struct brw_cache *cache,
hash = hash_key(item);
item->hash = hash;
- /* Create the buffer object to contain the data */
- bo = drm_intel_bo_alloc(cache->brw->intel.bufmgr,
- cache->name[cache_id], data_size, 1 << 6);
+ /* Allocate space in the cache BO for our new program. */
+ if (cache->next_offset + data_size > cache->bo->size) {
+ uint32_t new_size = cache->bo->size * 2;
+
+ while (cache->next_offset + data_size > new_size)
+ new_size *= 2;
+
+ brw_cache_new_bo(cache, new_size);
+ }
+
+ /* If we would block on writing to an in-use program BO, just
+ * recreate it.
+ */
+ if (cache->bo_used_by_gpu) {
+ brw_cache_new_bo(cache, cache->bo->size);
+ }
+
+ item->offset = cache->next_offset;
+ item->size = data_size;
+ /* Programs are always 64-byte aligned, so set up the next one now */
+ cache->next_offset = ALIGN(item->offset + data_size, 64);
/* Set up the memory containing the key and aux_data */
tmp = malloc(key_size + aux_size);
@@ -211,9 +240,6 @@ brw_upload_cache(struct brw_cache *cache,
item->key = tmp;
- item->bo = bo;
- drm_intel_bo_reference(bo);
-
if (cache->n_items > cache->size * 1.5)
rehash(cache);
@@ -222,34 +248,18 @@ brw_upload_cache(struct brw_cache *cache,
cache->items[hash] = item;
cache->n_items++;
- if (aux_return) {
- *(void **)aux_return = (void *)((char *)item->key + item->key_size);
- }
-
- DBG("upload %s: %d bytes to cache id %d\n",
- cache->name[cache_id],
- data_size, cache_id);
-
/* Copy data to the buffer */
- drm_intel_bo_subdata(bo, 0, data_size, data);
-
- update_cache_last(cache, cache_id, bo);
+ drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
- return bo;
-}
-
-static void
-brw_init_cache_id(struct brw_cache *cache,
- const char *name,
- enum brw_cache_id id)
-{
- cache->name[id] = strdup(name);
+ *out_offset = item->offset;
+ *(void **)out_aux = (void *)((char *)item->key + item->key_size);
+ cache->brw->state.dirty.cache |= 1 << cache_id;
}
-
void
brw_init_caches(struct brw_context *brw)
{
+ struct intel_context *intel = &brw->intel;
struct brw_cache *cache = &brw->cache;
cache->brw = brw;
@@ -259,36 +269,15 @@ brw_init_caches(struct brw_context *brw)
cache->items = (struct brw_cache_item **)
calloc(1, cache->size * sizeof(struct brw_cache_item));
- brw_init_cache_id(cache, "CC_VP", BRW_CC_VP);
- brw_init_cache_id(cache, "CC_UNIT", BRW_CC_UNIT);
- brw_init_cache_id(cache, "WM_PROG", BRW_WM_PROG);
- brw_init_cache_id(cache, "SAMPLER", BRW_SAMPLER);
- brw_init_cache_id(cache, "WM_UNIT", BRW_WM_UNIT);
- brw_init_cache_id(cache, "SF_PROG", BRW_SF_PROG);
- brw_init_cache_id(cache, "SF_VP", BRW_SF_VP);
-
- brw_init_cache_id(cache, "SF_UNIT", BRW_SF_UNIT);
-
- brw_init_cache_id(cache, "VS_UNIT", BRW_VS_UNIT);
-
- brw_init_cache_id(cache, "VS_PROG", BRW_VS_PROG);
-
- brw_init_cache_id(cache, "CLIP_UNIT", BRW_CLIP_UNIT);
-
- brw_init_cache_id(cache, "CLIP_PROG", BRW_CLIP_PROG);
- brw_init_cache_id(cache, "CLIP_VP", BRW_CLIP_VP);
-
- brw_init_cache_id(cache, "GS_UNIT", BRW_GS_UNIT);
-
- brw_init_cache_id(cache, "GS_PROG", BRW_GS_PROG);
- brw_init_cache_id(cache, "BLEND_STATE", BRW_BLEND_STATE);
- brw_init_cache_id(cache, "COLOR_CALC_STATE", BRW_COLOR_CALC_STATE);
- brw_init_cache_id(cache, "DEPTH_STENCIL_STATE", BRW_DEPTH_STENCIL_STATE);
+ cache->bo = drm_intel_bo_alloc(intel->bufmgr,
+ "program cache",
+ 4096, 64);
}
static void
brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
{
+ struct intel_context *intel = &brw->intel;
struct brw_cache_item *c, *next;
GLuint i;
@@ -297,7 +286,6 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
for (i = 0; i < cache->size; i++) {
for (c = cache->items[i]; c; c = next) {
next = c->next;
- drm_intel_bo_unreference(c->bo);
free((void *)c->key);
free(c);
}
@@ -306,9 +294,18 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
cache->n_items = 0;
+ /* Start putting programs into the start of the BO again, since
+ * we'll never find the old results.
+ */
+ cache->next_offset = 0;
+
+ /* We need to make sure that the programs get regenerated, since
+ * any offsets leftover in brw_context will no longer be valid.
+ */
brw->state.dirty.mesa |= ~0;
brw->state.dirty.brw |= ~0;
brw->state.dirty.cache |= ~0;
+ intel_batchbuffer_flush(intel);
}
void
@@ -325,15 +322,10 @@ brw_state_cache_check_size(struct brw_context *brw)
static void
brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
{
- GLuint i;
DBG("%s\n", __FUNCTION__);
brw_clear_cache(brw, cache);
- for (i = 0; i < BRW_MAX_CACHE; i++) {
- drm_intel_bo_unreference(cache->last_bo[i]);
- free(cache->name[i]);
- }
free(cache->items);
cache->items = NULL;
cache->size = 0;
diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c
index ff06cb3a91..7a3a88f04f 100644
--- a/src/mesa/drivers/dri/i965/brw_state_dump.c
+++ b/src/mesa/drivers/dri/i965/brw_state_dump.c
@@ -459,21 +459,19 @@ static void dump_blend_state(struct brw_context *brw)
}
-static void brw_debug_prog(const char *name, drm_intel_bo *prog)
+static void brw_debug_prog(struct brw_context *brw,
+ const char *name, uint32_t prog_offset)
{
unsigned int i;
uint32_t *data;
- if (prog == NULL)
- return;
-
- drm_intel_bo_map(prog, GL_FALSE);
+ drm_intel_bo_map(brw->cache.bo, false);
- data = prog->virtual;
+ data = brw->cache.bo->virtual + prog_offset;
- for (i = 0; i < prog->size / 4 / 4; i++) {
+ for (i = 0; i < brw->cache.bo->size / 4 / 4; i++) {
fprintf(stderr, "%8s: 0x%08x: 0x%08x 0x%08x 0x%08x 0x%08x\n",
- name, (unsigned int)prog->offset + i * 4 * 4,
+ name, (unsigned int)brw->cache.bo->offset + i * 4 * 4,
data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3]);
/* Stop at the end of the program. It'd be nice to keep track of the actual
* intended program size instead of guessing like this.
@@ -485,7 +483,7 @@ static void brw_debug_prog(const char *name, drm_intel_bo *prog)
break;
}
- drm_intel_bo_unmap(prog);
+ drm_intel_bo_unmap(brw->cache.bo);
}
@@ -518,17 +516,19 @@ void brw_debug_batch(struct intel_context *intel)
if (intel->gen < 6)
state_struct_out("VS", intel->batch.bo, brw->vs.state_offset,
sizeof(struct brw_vs_unit_state));
- brw_debug_prog("VS prog", brw->vs.prog_bo);
+ brw_debug_prog(brw, "VS prog", brw->vs.prog_offset);
if (intel->gen < 6)
state_struct_out("GS", intel->batch.bo, brw->gs.state_offset,
sizeof(struct brw_gs_unit_state));
- brw_debug_prog("GS prog", brw->gs.prog_bo);
+ if (brw->gs.prog_active) {
+ brw_debug_prog(brw, "GS prog", brw->gs.prog_offset);
+ }
if (intel->gen < 6) {
state_struct_out("SF", intel->batch.bo, brw->sf.state_offset,
sizeof(struct brw_sf_unit_state));
- brw_debug_prog("SF prog", brw->sf.prog_bo);
+ brw_debug_prog(brw, "SF prog", brw->sf.prog_offset);
}
if (intel->gen >= 7)
dump_sf_clip_viewport_state(brw);
@@ -540,7 +540,7 @@ void brw_debug_batch(struct intel_context *intel)
if (intel->gen < 6)
state_struct_out("WM", intel->batch.bo, brw->wm.state_offset,
sizeof(struct brw_wm_unit_state));
- brw_debug_prog("WM prog", brw->wm.prog_bo);
+ brw_debug_prog(brw, "WM prog", brw->wm.prog_offset);
if (intel->gen >= 6) {
dump_cc_viewport_state(brw);
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 6a4c112dcf..50ab490f33 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -47,11 +47,11 @@ static const struct brw_tracked_state *gen4_atoms[] =
&brw_check_fallback,
&brw_wm_input_sizes,
- &brw_vs_prog,
- &brw_gs_prog,
- &brw_clip_prog,
- &brw_sf_prog,
- &brw_wm_prog,
+ &brw_vs_prog, /* must do before GS prog, state base address. */
+ &brw_gs_prog, /* must do before state base address */
+ &brw_clip_prog, /* must do before state base address */
+ &brw_sf_prog, /* must do before state base address */
+ &brw_wm_prog, /* must do before state base address */
/* Once all the programs are done, we know how large urb entry
* sizes need to be and can decide if we need to change the urb
@@ -110,9 +110,9 @@ static const struct brw_tracked_state *gen6_atoms[] =
&brw_check_fallback,
&brw_wm_input_sizes,
- &brw_vs_prog,
- &brw_gs_prog,
- &brw_wm_prog,
+ &brw_vs_prog, /* must do before state base address */
+ &brw_gs_prog, /* must do before state base address */
+ &brw_wm_prog, /* must do before state base address */
&gen6_clip_vp,
&gen6_sf_vp,
@@ -365,6 +365,7 @@ static struct dirty_bit_map brw_bits[] = {
DEFINE_BIT(BRW_NEW_PRIMITIVE),
DEFINE_BIT(BRW_NEW_CONTEXT),
DEFINE_BIT(BRW_NEW_WM_INPUT_DIMENSIONS),
+ DEFINE_BIT(BRW_NEW_PROGRAM_CACHE),
DEFINE_BIT(BRW_NEW_PSP),
DEFINE_BIT(BRW_NEW_WM_SURFACES),
DEFINE_BIT(BRW_NEW_INDICES),
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 80d5e78ed0..a9ad5311fe 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -105,12 +105,11 @@ static void do_vs_prog( struct brw_context *brw,
/* constant_map */
aux_size += c.vp->program.Base.Parameters->NumParameters;
- drm_intel_bo_unreference(brw->vs.prog_bo);
- brw->vs.prog_bo = brw_upload_cache(&brw->cache, BRW_VS_PROG,
- &c.key, sizeof(c.key),
- program, program_size,
- &c.prog_data, aux_size,
- &brw->vs.prog_data);
+ brw_upload_cache(&brw->cache, BRW_VS_PROG,
+ &c.key, sizeof(c.key),
+ program, program_size,
+ &c.prog_data, aux_size,
+ &brw->vs.prog_offset, &brw->vs.prog_data);
ralloc_free(mem_ctx);
}
@@ -153,14 +152,11 @@ static void brw_upload_vs_prog(struct brw_context *brw)
}
}
- /* Make an early check for the key.
- */
- drm_intel_bo_unreference(brw->vs.prog_bo);
- brw->vs.prog_bo = brw_search_cache(&brw->cache, BRW_VS_PROG,
- &key, sizeof(key),
- &brw->vs.prog_data);
- if (brw->vs.prog_bo == NULL)
+ if (!brw_search_cache(&brw->cache, BRW_VS_PROG,
+ &key, sizeof(key),
+ &brw->vs.prog_offset, &brw->vs.prog_data)) {
do_vs_prog(brw, vp, &key);
+ }
brw->vs.constant_map = ((int8_t *)brw->vs.prog_data +
sizeof(*brw->vs.prog_data));
}
diff --git a/src/mesa/drivers/dri/i965/brw_vs_state.c b/src/mesa/drivers/dri/i965/brw_vs_state.c
index 1eee5b7e5d..185020c816 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_state.c
@@ -58,8 +58,14 @@ brw_prepare_vs_unit(struct brw_context *brw)
vs = brw_state_batch(brw, sizeof(*vs), 32, &brw->vs.state_offset);
memset(vs, 0, sizeof(*vs));
- /* CACHE_NEW_VS_PROG */
- vs->thread0.kernel_start_pointer = brw->vs.prog_bo->offset >> 6; /* reloc */
+ /* BRW_NEW_PROGRAM_CACHE | CACHE_NEW_VS_PROG */
+ vs->thread0.kernel_start_pointer =
+ brw_program_reloc(brw,
+ brw->vs.state_offset +
+ offsetof(struct brw_vs_unit_state, thread0),
+ brw->vs.prog_offset +
+ (vs->thread0.grf_reg_count << 1)) >> 6;
+
vs->thread0.grf_reg_count = ALIGN(brw->vs.prog_data->total_grf, 16) / 16 - 1;
vs->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
/* Choosing multiple program flow means that we may get 2-vertex threads,
@@ -152,13 +158,6 @@ brw_prepare_vs_unit(struct brw_context *brw)
*/
vs->vs6.vs_enable = 1;
- /* Emit VS program relocation */
- drm_intel_bo_emit_reloc(intel->batch.bo, (brw->vs.state_offset +
- offsetof(struct brw_vs_unit_state,
- thread0)),
- brw->vs.prog_bo, vs->thread0.grf_reg_count << 1,
- I915_GEM_DOMAIN_INSTRUCTION, 0);
-
brw->state.dirty.cache |= CACHE_NEW_VS_UNIT;
}
@@ -166,6 +165,7 @@ const struct brw_tracked_state brw_vs_unit = {
.dirty = {
.mesa = _NEW_TRANSFORM,
.brw = (BRW_NEW_BATCH |
+ BRW_NEW_PROGRAM_CACHE |
BRW_NEW_CURBE_OFFSETS |
BRW_NEW_NR_VS_SURFACES |
BRW_NEW_URB_FENCE),
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index 236c4d297d..0f73148262 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -69,14 +69,8 @@ static void brw_destroy_context( struct intel_context *intel )
ralloc_free(brw->wm.compile_data);
dri_bo_release(&brw->curbe.curbe_bo);
- dri_bo_release(&brw->vs.prog_bo);
dri_bo_release(&brw->vs.const_bo);
- dri_bo_release(&brw->gs.prog_bo);
- dri_bo_release(&brw->clip.prog_bo);
- dri_bo_release(&brw->sf.prog_bo);
- dri_bo_release(&brw->wm.prog_bo);
dri_bo_release(&brw->wm.const_bo);
- dri_bo_release(&brw->cc.prog_bo);
free(brw->curbe.last_buf);
free(brw->curbe.next_buf);
@@ -125,6 +119,12 @@ static void brw_new_batch( struct intel_context *intel )
brw->state.dirty.brw |= BRW_NEW_CONTEXT | BRW_NEW_BATCH;
brw->vb.nr_current_buffers = 0;
+
+ /* Mark that the current program cache BO has been used by the GPU.
+ * It will be reallocated if we need to put new programs in for the
+ * next batch.
+ */
+ brw->cache.bo_used_by_gpu = true;
}
static void brw_invalidate_state( struct intel_context *intel, GLuint new_state )
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 1aebd12df4..f1c9985290 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -273,12 +273,11 @@ bool do_wm_prog(struct brw_context *brw,
*/
program = brw_get_program(&c->func, &program_size);
- drm_intel_bo_unreference(brw->wm.prog_bo);
- brw->wm.prog_bo = brw_upload_cache(&brw->cache, BRW_WM_PROG,
- &c->key, sizeof(c->key),
- program, program_size,
- &c->prog_data, sizeof(c->prog_data),
- &brw->wm.prog_data);
+ brw_upload_cache(&brw->cache, BRW_WM_PROG,
+ &c->key, sizeof(c->key),
+ program, program_size,
+ &c->prog_data, sizeof(c->prog_data),
+ &brw->wm.prog_offset, &brw->wm.prog_data);
return true;
}
@@ -477,13 +476,9 @@ static void brw_prepare_wm_prog(struct brw_context *brw)
brw_wm_populate_key(brw, &key);
- /* Make an early check for the key.
- */
- drm_intel_bo_unreference(brw->wm.prog_bo);
- brw->wm.prog_bo = brw_search_cache(&brw->cache, BRW_WM_PROG,
- &key, sizeof(key),
- &brw->wm.prog_data);
- if (brw->wm.prog_bo == NULL) {
+ if (!brw_search_cache(&brw->cache, BRW_WM_PROG,
+ &key, sizeof(key),
+ &brw->wm.prog_offset, &brw->wm.prog_data)) {
bool success = do_wm_prog(brw, ctx->Shader.CurrentFragmentProgram, fp,
&key);
assert(success);
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index ef98f8126d..506e2bdff5 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -90,13 +90,25 @@ brw_prepare_wm_unit(struct brw_context *brw)
brw->wm.prog_data->first_curbe_grf_16);
}
- /* CACHE_NEW_WM_PROG */
+ /* BRW_NEW_PROGRAM_CACHE | CACHE_NEW_WM_PROG */
wm->thread0.grf_reg_count = brw->wm.prog_data->reg_blocks;
wm->wm9.grf_reg_count_2 = brw->wm.prog_data->reg_blocks_16;
- wm->thread0.kernel_start_pointer = brw->wm.prog_bo->offset >> 6; /* reloc */
- /* reloc */
- wm->wm9.kernel_start_pointer_2 = (brw->wm.prog_bo->offset +
- brw->wm.prog_data->prog_offset_16) >> 6;
+
+ wm->thread0.kernel_start_pointer =
+ brw_program_reloc(brw,
+ brw->wm.state_offset +
+ offsetof(struct brw_wm_unit_state, thread0),
+ brw->wm.prog_offset +
+ (wm->thread0.grf_reg_count << 1)) >> 6;
+
+ wm->wm9.kernel_start_pointer_2 =
+ brw_program_reloc(brw,
+ brw->wm.state_offset +
+ offsetof(struct brw_wm_unit_state, wm9),
+ brw->wm.prog_offset +
+ brw->wm.prog_data->prog_offset_16 +
+ (wm->wm9.grf_reg_count_2 << 1)) >> 6;
+
wm->thread1.depth_coef_urb_read_offset = 1;
wm->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
@@ -214,23 +226,6 @@ brw_prepare_wm_unit(struct brw_context *brw)
if (unlikely(INTEL_DEBUG & DEBUG_STATS) || intel->stats_wm)
wm->wm4.stats_enable = 1;
- /* Emit WM program relocation */
- drm_intel_bo_emit_reloc(intel->batch.bo,
- brw->wm.state_offset +
- offsetof(struct brw_wm_unit_state, thread0),
- brw->wm.prog_bo, wm->thread0.grf_reg_count << 1,
- I915_GEM_DOMAIN_INSTRUCTION, 0);
-
- if (brw->wm.prog_data->prog_offset_16) {
- drm_intel_bo_emit_reloc(intel->batch.bo,
- brw->wm.state_offset +
- offsetof(struct brw_wm_unit_state, wm9),
- brw->wm.prog_bo,
- ((wm->wm9.grf_reg_count_2 << 1) +
- brw->wm.prog_data->prog_offset_16),
- I915_GEM_DOMAIN_INSTRUCTION, 0);
- }
-
/* Emit scratch space relocation */
if (brw->wm.prog_data->total_scratch != 0) {
drm_intel_bo_emit_reloc(intel->batch.bo,
@@ -265,6 +260,7 @@ const struct brw_tracked_state brw_wm_unit = {
_NEW_BUFFERS),
.brw = (BRW_NEW_BATCH |
+ BRW_NEW_PROGRAM_CACHE |
BRW_NEW_FRAGMENT_PROGRAM |
BRW_NEW_CURBE_OFFSETS |
BRW_NEW_NR_WM_SURFACES),
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_state.c b/src/mesa/drivers/dri/i965/gen6_gs_state.c
index c1d0a73939..e73e7824af 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_gs_state.c
@@ -45,7 +45,7 @@ upload_gs_state(struct brw_context *brw)
ADVANCE_BATCH();
// GS should never be used on Gen6. Disable it.
- assert(brw->gs.prog_bo == NULL);
+ assert(!brw->gs.prog_active);
BEGIN_BATCH(7);
OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
OUT_BATCH(0); /* prog_bo */
diff --git a/src/mesa/drivers/dri/i965/gen6_urb.c b/src/mesa/drivers/dri/i965/gen6_urb.c
index 62645a6a30..b4105111c8 100644
--- a/src/mesa/drivers/dri/i965/gen6_urb.c
+++ b/src/mesa/drivers/dri/i965/gen6_urb.c
@@ -64,7 +64,7 @@ upload_urb(struct brw_context *brw)
assert(brw->urb.nr_vs_entries % 4 == 0);
assert(brw->urb.nr_gs_entries % 4 == 0);
/* GS requirement */
- assert(!brw->gs.prog_bo || brw->urb.vs_size < 5);
+ assert(!brw->gs.prog_active || brw->urb.vs_size < 5);
BEGIN_BATCH(3);
OUT_BATCH(_3DSTATE_URB << 16 | (3 - 2));
diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c
index b46368e36e..7838a91d8d 100644
--- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
@@ -147,7 +147,7 @@ upload_vs_state(struct brw_context *brw)
BEGIN_BATCH(6);
OUT_BATCH(_3DSTATE_VS << 16 | (6 - 2));
- OUT_RELOC(brw->vs.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+ OUT_BATCH(brw->vs.prog_offset);
OUT_BATCH((0 << GEN6_VS_SAMPLER_COUNT_SHIFT) |
GEN6_VS_FLOATING_POINT_MODE_ALT |
(brw->vs.nr_surfaces << GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index 43e651db3e..024a1d879e 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -183,7 +183,7 @@ upload_wm_state(struct brw_context *brw)
BEGIN_BATCH(9);
OUT_BATCH(_3DSTATE_WM << 16 | (9 - 2));
- OUT_RELOC(brw->wm.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+ OUT_BATCH(brw->wm.prog_offset);
OUT_BATCH(dw2);
if (brw->wm.prog_data->total_scratch) {
OUT_RELOC(brw->wm.scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
@@ -195,12 +195,8 @@ upload_wm_state(struct brw_context *brw)
OUT_BATCH(dw5);
OUT_BATCH(dw6);
OUT_BATCH(0); /* kernel 1 pointer */
- if (brw->wm.prog_data->prog_offset_16) {
- OUT_RELOC(brw->wm.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
- brw->wm.prog_data->prog_offset_16);
- } else {
- OUT_BATCH(0); /* kernel 2 pointer */
- }
+ /* kernel 2 pointer */
+ OUT_BATCH(brw->wm.prog_offset + brw->wm.prog_data->prog_offset_16);
ADVANCE_BATCH();
}
diff --git a/src/mesa/drivers/dri/i965/gen7_disable.c b/src/mesa/drivers/dri/i965/gen7_disable.c
index 4e9461739d..a44d31596b 100644
--- a/src/mesa/drivers/dri/i965/gen7_disable.c
+++ b/src/mesa/drivers/dri/i965/gen7_disable.c
@@ -31,7 +31,7 @@ disable_stages(struct brw_context *brw)
{
struct intel_context *intel = &brw->intel;
- assert(brw->gs.prog_bo == NULL);
+ assert(!brw->gs.prog_active);
/* Disable the Geometry Shader (GS) Unit */
BEGIN_BATCH(7);
diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c
index 3a614693df..b36d780ed4 100644
--- a/src/mesa/drivers/dri/i965/gen7_urb.c
+++ b/src/mesa/drivers/dri/i965/gen7_urb.c
@@ -78,7 +78,7 @@ upload_urb(struct brw_context *brw)
assert(brw->urb.nr_vs_entries % 8 == 0);
assert(brw->urb.nr_gs_entries % 8 == 0);
/* GS requirement */
- assert(!brw->gs.prog_bo);
+ assert(brw->gs.prog_active);
BEGIN_BATCH(2);
OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_VS << 16 | (2 - 2));
diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c
index ae7a1d6c35..0fad3d2fb6 100644
--- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
@@ -67,7 +67,7 @@ upload_vs_state(struct brw_context *brw)
BEGIN_BATCH(6);
OUT_BATCH(_3DSTATE_VS << 16 | (6 - 2));
- OUT_RELOC(brw->vs.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+ OUT_BATCH(brw->vs.prog_offset);
OUT_BATCH((0 << GEN6_VS_SAMPLER_COUNT_SHIFT) |
GEN6_VS_FLOATING_POINT_MODE_ALT |
(brw->vs.nr_surfaces << GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
index 6a64eb8a2d..ac6ba2fed1 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
@@ -227,18 +227,13 @@ upload_ps_state(struct brw_context *brw)
BEGIN_BATCH(8);
OUT_BATCH(_3DSTATE_PS << 16 | (8 - 2));
- OUT_RELOC(brw->wm.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+ OUT_BATCH(brw->wm.prog_offset);
OUT_BATCH(dw2);
OUT_BATCH(0); /* scratch space base offset */
OUT_BATCH(dw4);
OUT_BATCH(dw5);
OUT_BATCH(0); /* kernel 1 pointer */
- if (brw->wm.prog_data->prog_offset_16) {
- OUT_RELOC(brw->wm.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
- brw->wm.prog_data->prog_offset_16);
- } else {
- OUT_BATCH(0); /* kernel 2 pointer */
- }
+ OUT_BATCH(brw->wm.prog_offset + brw->wm.prog_data->prog_offset_16);
ADVANCE_BATCH();
}