diff options
author | Luc Verhaegen <libv@skynet.be> | 2010-03-14 01:38:43 +0100 |
---|---|---|
committer | Luc Verhaegen <libv@skynet.be> | 2010-03-14 01:38:43 +0100 |
commit | c8335894f829274c38ab5281234c276f0803d65d (patch) | |
tree | c1a9664be1d821438fb72e9ffae814e48fd4a8f0 /i965 | |
parent | 7a3ced9ea22443dcdc83b0f2e180bb7cbe8aa77c (diff) |
Import i915 and i965 dri drivers from mesa 7.3.0.7.3.0
Diffstat (limited to 'i965')
60 files changed, 2641 insertions, 2799 deletions
diff --git a/i965/Makefile.am b/i965/Makefile.am index b1b816c..4fb59be 100644 --- a/i965/Makefile.am +++ b/i965/Makefile.am @@ -5,26 +5,25 @@ I965_CFLAGS = -I../shared -I../shared/server i965_dri_la_LTLIBRARIES = i965_dri.la i965_dri_la_CFLAGS = $(AM_CFLAGS) $(DRM_CFLAGS) $(DRI_CFLAGS) $(I965_CFLAGS) i965_dri_la_LDFLAGS = -module -noprefix -avoid-version -lm -ldl\ - $(DRM_LIBS) $(DRI_LIBS) + $(DRM_LIBS) -ldrm_intel $(DRI_LIBS) i965_dri_ladir = @libdir@/dri i965_dri_la_SOURCES = \ ../shared/intel_batchbuffer.c \ ../shared/intel_blit.c \ ../shared/intel_buffer_objects.c \ ../shared/intel_buffers.c \ - ../shared/intel_bufmgr_ttm.c \ ../shared/intel_context.c \ ../shared/intel_decode.c \ ../shared/intel_depthstencil.c \ ../shared/intel_fbo.c \ - ../shared/intel_ioctl.c \ ../shared/intel_mipmap_tree.c \ ../shared/intel_regions.c \ ../shared/intel_screen.c \ ../shared/intel_span.c \ ../shared/intel_pixel.c \ - ../shared/intel_pixel_copy.c \ ../shared/intel_pixel_bitmap.c \ + ../shared/intel_pixel_copy.c \ + ../shared/intel_pixel_draw.c \ intel_state.c \ ../shared/intel_tex.c \ ../shared/intel_tex_copy.c \ @@ -56,6 +55,7 @@ i965_dri_la_SOURCES = \ brw_metaops.c \ brw_misc_state.c \ brw_program.c \ + brw_queryobj.c \ brw_sf.c \ brw_sf_emit.c \ brw_sf_state.c \ @@ -71,7 +71,6 @@ i965_dri_la_SOURCES = \ brw_vs_constval.c \ brw_vs_emit.c \ brw_vs_state.c \ - brw_vs_tnl.c \ brw_vtbl.c \ brw_wm.c \ brw_wm_debug.c \ diff --git a/i965/brw_cc.c b/i965/brw_cc.c index 9d8984f..fa8121e 100644 --- a/i965/brw_cc.c +++ b/i965/brw_cc.c @@ -34,10 +34,10 @@ #include "brw_state.h" #include "brw_defines.h" #include "brw_util.h" -#include "macros.h" -#include "enums.h" +#include "main/macros.h" +#include "main/enums.h" -static int upload_cc_vp( struct brw_context *brw ) +static void prepare_cc_vp( struct brw_context *brw ) { struct brw_cc_viewport ccv; @@ -48,7 +48,6 @@ static int upload_cc_vp( struct brw_context *brw ) dri_bo_unreference(brw->cc.vp_bo); brw->cc.vp_bo = brw_cache_data( &brw->cache, BRW_CC_VP, &ccv, NULL, 0 ); - return dri_bufmgr_check_aperture_space(brw->cc.vp_bo); } const struct brw_tracked_state brw_cc_vp = { @@ -57,7 +56,7 @@ const struct brw_tracked_state brw_cc_vp = { .brw = BRW_NEW_CONTEXT, .cache = 0 }, - .prepare = upload_cc_vp + .prepare = prepare_cc_vp }; struct brw_cc_unit_key { @@ -256,16 +255,17 @@ cc_unit_create_from_key(struct brw_context *brw, struct brw_cc_unit_key *key) NULL, NULL); /* Emit CC viewport relocation */ - dri_emit_reloc(bo, - DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, - 0, - offsetof(struct brw_cc_unit_state, cc4), - brw->cc.vp_bo); + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, + 0, + 0, + offsetof(struct brw_cc_unit_state, cc4), + brw->cc.vp_bo); return bo; } -static int prepare_cc_unit( struct brw_context *brw ) +static void prepare_cc_unit( struct brw_context *brw ) { struct brw_cc_unit_key key; @@ -279,7 +279,6 @@ static int prepare_cc_unit( struct brw_context *brw ) if (brw->cc.state_bo == NULL) brw->cc.state_bo = cc_unit_create_from_key(brw, &key); - return dri_bufmgr_check_aperture_space(brw->cc.state_bo); } const struct brw_tracked_state brw_cc_unit = { diff --git a/i965/brw_clip.c b/i965/brw_clip.c index 540108e..38d8b70 100644 --- a/i965/brw_clip.c +++ b/i965/brw_clip.c @@ -29,9 +29,9 @@ * Keith Whitwell <keith@tungstengraphics.com> */ -#include "glheader.h" -#include "macros.h" -#include "enums.h" +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" #include "intel_batchbuffer.h" @@ -131,7 +131,7 @@ static void compile_clip_prog( struct brw_context *brw, /* Calculate interpolants for triangle and line rasterization. */ -static int upload_clip_prog( struct brw_context *brw ) +static void upload_clip_prog(struct brw_context *brw) { GLcontext *ctx = &brw->intel.ctx; struct brw_clip_prog_key key; @@ -242,8 +242,6 @@ static int upload_clip_prog( struct brw_context *brw ) &brw->clip.prog_data); if (brw->clip.prog_bo == NULL) compile_clip_prog( brw, &key ); - - return dri_bufmgr_check_aperture_space(brw->clip.prog_bo); } diff --git a/i965/brw_clip_line.c b/i965/brw_clip_line.c index 0930e6a..c45d48d 100644 --- a/i965/brw_clip_line.c +++ b/i965/brw_clip_line.c @@ -29,11 +29,11 @@ * Keith Whitwell <keith@tungstengraphics.com> */ -#include "glheader.h" -#include "macros.h" -#include "enums.h" - +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" #include "shader/program.h" + #include "intel_batchbuffer.h" #include "brw_defines.h" @@ -148,7 +148,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c ) brw_clip_init_clipmask(c); /* -ve rhw workaround */ - if (!(BRW_IS_GM45(p->brw) || BRW_IS_G4X(p->brw))) { + if (!BRW_IS_G4X(p->brw)) { brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<20)); diff --git a/i965/brw_clip_point.c b/i965/brw_clip_point.c index 2346980..d17b199 100644 --- a/i965/brw_clip_point.c +++ b/i965/brw_clip_point.c @@ -29,11 +29,11 @@ * Keith Whitwell <keith@tungstengraphics.com> */ -#include "glheader.h" -#include "macros.h" -#include "enums.h" - +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" #include "shader/program.h" + #include "intel_batchbuffer.h" #include "brw_defines.h" diff --git a/i965/brw_clip_state.c b/i965/brw_clip_state.c index 2d0b24c..9b0d7ea 100644 --- a/i965/brw_clip_state.c +++ b/i965/brw_clip_state.c @@ -32,7 +32,7 @@ #include "brw_context.h" #include "brw_state.h" #include "brw_defines.h" -#include "macros.h" +#include "main/macros.h" struct brw_clip_unit_key { unsigned int total_grf; @@ -88,7 +88,21 @@ clip_unit_create_from_key(struct brw_context *brw, clip.thread4.nr_urb_entries = key->nr_urb_entries; clip.thread4.urb_entry_allocation_size = key->urb_size - 1; - clip.thread4.max_threads = 1; /* 2 threads */ + /* If we have enough clip URB entries to run two threads, do so. + */ + if (key->nr_urb_entries >= 10) { + /* Half of the URB entries go to each thread, and it has to be an + * even number. + */ + assert(key->nr_urb_entries % 2 == 0); + clip.thread4.max_threads = 2 - 1; + } else { + assert(key->nr_urb_entries >= 5); + clip.thread4.max_threads = 1 - 1; + } + + if (INTEL_DEBUG & DEBUG_SINGLE_THREAD) + clip.thread4.max_threads = 0; if (INTEL_DEBUG & DEBUG_STATS) clip.thread4.stats_enable = 1; @@ -102,7 +116,7 @@ clip_unit_create_from_key(struct brw_context *brw, clip.clip5.api_mode = BRW_CLIP_API_OGL; clip.clip5.clip_mode = key->clip_mode; - if (BRW_IS_GM45(brw) || BRW_IS_G4X(brw)) + if (BRW_IS_G4X(brw)) clip.clip5.negative_w_clip_test = 1; clip.clip6.clipper_viewport_state_ptr = 0; @@ -119,19 +133,19 @@ clip_unit_create_from_key(struct brw_context *brw, /* Emit clip program relocation */ assert(brw->clip.prog_bo); - dri_emit_reloc(bo, - DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, - clip.thread0.grf_reg_count << 1, - offsetof(struct brw_clip_unit_state, thread0), - brw->clip.prog_bo); + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, + 0, + clip.thread0.grf_reg_count << 1, + offsetof(struct brw_clip_unit_state, thread0), + brw->clip.prog_bo); return bo; } -static int upload_clip_unit( struct brw_context *brw ) +static void upload_clip_unit( struct brw_context *brw ) { struct brw_clip_unit_key key; - int ret = 0; clip_unit_populate_key(brw, &key); @@ -143,9 +157,6 @@ static int upload_clip_unit( struct brw_context *brw ) if (brw->clip.state_bo == NULL) { brw->clip.state_bo = clip_unit_create_from_key(brw, &key); } - - ret = dri_bufmgr_check_aperture_space(brw->clip.state_bo); - return ret; } const struct brw_tracked_state brw_clip_unit = { diff --git a/i965/brw_clip_tri.c b/i965/brw_clip_tri.c index 0003901..1dbba37 100644 --- a/i965/brw_clip_tri.c +++ b/i965/brw_clip_tri.c @@ -29,11 +29,11 @@ * Keith Whitwell <keith@tungstengraphics.com> */ -#include "glheader.h" -#include "macros.h" -#include "enums.h" - +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" #include "shader/program.h" + #include "intel_batchbuffer.h" #include "brw_defines.h" @@ -526,7 +526,7 @@ void brw_emit_tri_clip( struct brw_clip_compile *c ) /* if -ve rhw workaround bit is set, do cliptest */ - if (!(BRW_IS_GM45(p->brw) || BRW_IS_G4X(p->brw))) { + if (!BRW_IS_G4X(p->brw)) { brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<20)); diff --git a/i965/brw_clip_unfilled.c b/i965/brw_clip_unfilled.c index 6f20d79..d7ca517 100644 --- a/i965/brw_clip_unfilled.c +++ b/i965/brw_clip_unfilled.c @@ -29,11 +29,11 @@ * Keith Whitwell <keith@tungstengraphics.com> */ -#include "glheader.h" -#include "macros.h" -#include "enums.h" - +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" #include "shader/program.h" + #include "intel_batchbuffer.h" #include "brw_defines.h" diff --git a/i965/brw_clip_util.c b/i965/brw_clip_util.c index c32bd4e..9d3b0be 100644 --- a/i965/brw_clip_util.c +++ b/i965/brw_clip_util.c @@ -30,11 +30,11 @@ */ -#include "glheader.h" -#include "macros.h" -#include "enums.h" - +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" #include "shader/program.h" + #include "intel_batchbuffer.h" #include "brw_defines.h" diff --git a/i965/brw_context.c b/i965/brw_context.c index 1c7ad5c..d7a2bd9 100644 --- a/i965/brw_context.c +++ b/i965/brw_context.c @@ -30,11 +30,18 @@ */ +#include "main/imports.h" +#include "main/api_noop.h" +#include "main/macros.h" +#include "main/vtxfmt.h" +#include "main/simple_list.h" +#include "shader/shader_api.h" + #include "brw_context.h" #include "brw_defines.h" #include "brw_draw.h" +#include "brw_state.h" #include "brw_vs.h" -#include "imports.h" #include "intel_tex.h" #include "intel_blit.h" #include "intel_batchbuffer.h" @@ -43,10 +50,7 @@ #include "tnl/t_pipeline.h" #include "utils.h" -#include "api_noop.h" -#include "vtxfmt.h" -#include "shader/shader_api.h" /*************************************** * Mesa's Driver Functions @@ -67,6 +71,9 @@ static void brwInitDriverFunctions( struct dd_function_table *functions ) brwInitFragProgFuncs( functions ); brwInitProgFuncs( functions ); + brw_init_queryobj_functions(functions); + + functions->Viewport = intel_viewport; } @@ -122,9 +129,10 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis, TNL_CONTEXT(ctx)->Driver.RunPipeline = _tnl_run_pipeline; - ctx->Const.MaxTextureUnits = BRW_MAX_TEX_UNIT; ctx->Const.MaxTextureImageUnits = BRW_MAX_TEX_UNIT; - ctx->Const.MaxTextureCoordUnits = BRW_MAX_TEX_UNIT; + ctx->Const.MaxTextureCoordUnits = 8; /* Mesa limit */ + ctx->Const.MaxTextureUnits = MIN2(ctx->Const.MaxTextureCoordUnits, + ctx->Const.MaxTextureImageUnits); ctx->Const.MaxVertexTextureImageUnits = 0; /* no vertex shader textures */ /* Advertise the full hardware capabilities. The new memory @@ -134,8 +142,10 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis, ctx->Const.Max3DTextureLevels = 9; ctx->Const.MaxCubeTextureLevels = 12; ctx->Const.MaxTextureRectSize = (1<<11); - ctx->Const.MaxTextureUnits = BRW_MAX_TEX_UNIT; + /* if conformance mode is set, swrast can handle any size AA point */ + ctx->Const.MaxPointSizeAA = 255.0; + /* ctx->Const.MaxNativeVertexProgramTemps = 32; */ brw_init_attribs( brw ); @@ -147,11 +157,12 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis, brw->emit_state_always = 0; - ctx->FragmentProgram._MaintainTexEnvProgram = 1; + ctx->VertexProgram._MaintainTnlProgram = GL_TRUE; + ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE; - brw_draw_init( brw ); + make_empty_list(&brw->query.active_head); - brw_ProgramCacheInit( ctx ); + brw_draw_init( brw ); return GL_TRUE; } diff --git a/i965/brw_context.h b/i965/brw_context.h index 32e0554..5d3f99e 100644 --- a/i965/brw_context.h +++ b/i965/brw_context.h @@ -35,7 +35,7 @@ #include "intel_context.h" #include "brw_structs.h" -#include "imports.h" +#include "main/imports.h" /* Glossary: @@ -130,18 +130,19 @@ struct brw_context; #define BRW_NEW_CONTEXT 0x80 #define BRW_NEW_WM_INPUT_DIMENSIONS 0x100 #define BRW_NEW_INPUT_VARYING 0x200 -#define BRW_NEW_TNL_PROGRAM 0x400 #define BRW_NEW_PSP 0x800 #define BRW_NEW_METAOPS 0x1000 #define BRW_NEW_FENCE 0x2000 -#define BRW_NEW_LOCK 0x4000 +#define BRW_NEW_INDICES 0x4000 +#define BRW_NEW_VERTICES 0x8000 /** * Used for any batch entry with a relocated pointer that will be used * by any 3D rendering. */ -#define BRW_NEW_BATCH 0x8000 +#define BRW_NEW_BATCH 0x10000 /** brw->depth_region updated */ -#define BRW_NEW_DEPTH_BUFFER 0x10000 +#define BRW_NEW_DEPTH_BUFFER 0x20000 +#define BRW_NEW_NR_SURFACES 0x40000 struct brw_state_flags { /** State update flags signalled by mesa internals */ @@ -157,7 +158,6 @@ struct brw_state_flags { struct brw_vertex_program { struct gl_vertex_program program; GLuint id; - GLuint param_state; /* flags indicating state tracked by params */ }; @@ -165,7 +165,6 @@ struct brw_vertex_program { struct brw_fragment_program { struct gl_fragment_program program; GLuint id; - GLuint param_state; /* flags indicating state tracked by params */ }; @@ -239,7 +238,7 @@ struct brw_vs_ouput_sizes { }; -#define BRW_MAX_TEX_UNIT 8 +#define BRW_MAX_TEX_UNIT 16 #define BRW_WM_MAX_SURF BRW_MAX_TEX_UNIT + MAX_DRAW_BUFFERS enum brw_cache_id { @@ -332,7 +331,7 @@ struct brw_state_pointers { */ struct brw_tracked_state { struct brw_state_flags dirty; - int (*prepare)( struct brw_context *brw ); + void (*prepare)( struct brw_context *brw ); void (*emit)( struct brw_context *brw ); }; @@ -409,7 +408,22 @@ struct brw_tnl_cache { GLuint size, n_items; }; +struct brw_query_object { + struct gl_query_object Base; + /** Doubly linked list of active query objects in the context. */ + struct brw_query_object *prev, *next; + + /** Last query BO associated with this query. */ + dri_bo *bo; + /** First index in bo with query data for this object. */ + int first_index; + /** Last index in bo with query data for this object. */ + int last_index; + + /* Total count of pixels from previous BOs */ + unsigned int count; +}; struct brw_context { @@ -417,7 +431,6 @@ struct brw_context GLuint primitive; GLboolean emit_state_always; - GLboolean wrap; GLboolean tmp_fallback; GLboolean no_batch_wrap; @@ -429,6 +442,19 @@ struct brw_context GLuint nr_draw_regions; struct intel_region *draw_regions[MAX_DRAW_BUFFERS]; struct intel_region *depth_region; + + /** + * List of buffers accumulated in brw_validate_state to receive + * dri_bo_check_aperture treatment before exec, so we can know if we + * should flush the batch and try again before emitting primitives. + * + * This can be a fixed number as we only have a limited number of + * objects referenced from the batchbuffer in a primitive emit, + * consisting of the vertex buffers, pipelined state pointers, + * the CURBE, the depth buffer, and a query BO. + */ + dri_bo *validated_bos[VERT_ATTRIB_MAX + 16]; + int validated_bo_count; } state; struct brw_state_pointers attribs; @@ -450,9 +476,22 @@ struct brw_context * for changes to this state: */ struct brw_vertex_info info; + unsigned int min_index, max_index; } vb; struct { + /** + * Index buffer for this draw_prims call. + * + * Updates are signaled by BRW_NEW_INDICES. + */ + const struct _mesa_index_buffer *ib; + + dri_bo *bo; + unsigned int offset; + } ib; + + struct { /* Will be allocated on demand if needed. */ struct brw_state_pointers attribs; @@ -473,10 +512,6 @@ struct brw_context GLboolean active; } metaops; - /* Track fixed function t&l in a vertex program: - */ - struct gl_vertex_program *tnl_program; - struct brw_tnl_cache tnl_program_cache; /* Active vertex program: */ @@ -542,6 +577,11 @@ struct brw_context GLfloat *last_buf; GLuint last_bufsz; + /** + * Whether we should create a new bo instead of reusing the old one + * (if we just dispatch the batch pointing at the old one. + */ + GLboolean need_new_bo; } curbe; struct { @@ -611,7 +651,12 @@ struct brw_context dri_bo *vp_bo; } cc; - + struct { + struct brw_query_object active_head; + dri_bo *bo; + int index; + GLboolean active; + } query; /* Used to give every program string a unique id */ GLuint program_id; @@ -636,15 +681,13 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis, __DRIcontextPrivate *driContextPriv, void *sharedContextPrivate); - - /*====================================================================== - * brw_state.c + * brw_queryobj.c */ -int brw_validate_state( struct brw_context *brw ); -void brw_init_state( struct brw_context *brw ); -void brw_destroy_state( struct brw_context *brw ); - +void brw_init_queryobj_functions(struct dd_function_table *functions); +void brw_prepare_query_begin(struct brw_context *brw); +void brw_emit_query_begin(struct brw_context *brw); +void brw_emit_query_end(struct brw_context *brw); /*====================================================================== * brw_state_dump.c diff --git a/i965/brw_curbe.c b/i965/brw_curbe.c index 5ff4e29..fbf473a 100644 --- a/i965/brw_curbe.c +++ b/i965/brw_curbe.c @@ -31,10 +31,10 @@ -#include "glheader.h" -#include "context.h" -#include "macros.h" -#include "enums.h" +#include "main/glheader.h" +#include "main/context.h" +#include "main/macros.h" +#include "main/enums.h" #include "shader/prog_parameter.h" #include "shader/prog_statevars.h" #include "intel_batchbuffer.h" @@ -46,7 +46,7 @@ /* Partition the CURBE between the various users of constant values: */ -static int calculate_curbe_offsets( struct brw_context *brw ) +static void calculate_curbe_offsets( struct brw_context *brw ) { /* CACHE_NEW_WM_PROG */ GLuint nr_fp_regs = (brw->wm.prog_data->nr_params + 15) / 16; @@ -117,7 +117,6 @@ static int calculate_curbe_offsets( struct brw_context *brw ) brw->state.dirty.brw |= BRW_NEW_CURBE_OFFSETS; } - return 0; } @@ -156,19 +155,7 @@ void brw_upload_constant_buffer_state(struct brw_context *brw) assert(brw->urb.nr_cs_entries); BRW_CACHED_BATCH_STRUCT(brw, &cbs); -} - -#if 0 -const struct brw_tracked_state brw_constant_buffer_state = { - .dirty = { - .mesa = 0, - .brw = BRW_NEW_URB_FENCE, - .cache = 0 - }, - .update = brw_upload_constant_buffer_state -}; -#endif - +} static GLfloat fixed_plane[6][4] = { { 0, 0, -1, 1 }, @@ -183,7 +170,7 @@ static GLfloat fixed_plane[6][4] = { * cache mechanism, but maybe would benefit from a comparison against * the current uploaded set of constants. */ -static int prepare_constant_buffer(struct brw_context *brw) +static void prepare_constant_buffer(struct brw_context *brw) { GLcontext *ctx = &brw->intel.ctx; struct brw_vertex_program *vp = (struct brw_vertex_program *)brw->vertex_program; @@ -197,8 +184,8 @@ static int prepare_constant_buffer(struct brw_context *brw) * function will also be called whenever fp or vp changes. */ brw->curbe.tracked_state.dirty.mesa = (_NEW_TRANSFORM|_NEW_PROJECTION); - brw->curbe.tracked_state.dirty.mesa |= vp->param_state; - brw->curbe.tracked_state.dirty.mesa |= fp->param_state; + brw->curbe.tracked_state.dirty.mesa |= vp->program.Base.Parameters->StateFlags; + brw->curbe.tracked_state.dirty.mesa |= fp->program.Base.Parameters->StateFlags; if (sz == 0) { @@ -207,8 +194,8 @@ static int prepare_constant_buffer(struct brw_context *brw) brw->curbe.last_buf = NULL; brw->curbe.last_bufsz = 0; } - - return 0; + + return; } buf = (GLfloat *)malloc(bufsz); @@ -295,7 +282,8 @@ static int prepare_constant_buffer(struct brw_context *brw) brw->curbe.last_bufsz = bufsz; if (brw->curbe.curbe_bo != NULL && - brw->curbe.curbe_next_offset + bufsz > brw->curbe.curbe_bo->size) + (brw->curbe.need_new_bo || + brw->curbe.curbe_next_offset + bufsz > brw->curbe.curbe_bo->size)) { dri_bo_unreference(brw->curbe.curbe_bo); brw->curbe.curbe_bo = NULL; @@ -306,10 +294,7 @@ static int prepare_constant_buffer(struct brw_context *brw) * They're generally around 64b. */ brw->curbe.curbe_bo = dri_bo_alloc(brw->intel.bufmgr, "CURBE", - 4096, 1 << 6, - DRM_BO_FLAG_MEM_LOCAL | - DRM_BO_FLAG_CACHED | - DRM_BO_FLAG_CACHED_MAPPED); + 4096, 1 << 6); brw->curbe.curbe_next_offset = 0; } @@ -322,6 +307,7 @@ static int prepare_constant_buffer(struct brw_context *brw) dri_bo_subdata(brw->curbe.curbe_bo, brw->curbe.curbe_offset, bufsz, buf); } + brw_add_validated_bo(brw, brw->curbe.curbe_bo); /* Because this provokes an action (ie copy the constants into the * URB), it shouldn't be shortcircuited if identical to the @@ -336,9 +322,6 @@ static int prepare_constant_buffer(struct brw_context *brw) * flushes as necessary when doublebuffering of CURBEs isn't * possible. */ - - /* check aperture space for this bo */ - return dri_bufmgr_check_aperture_space(brw->curbe.curbe_bo); } @@ -353,7 +336,8 @@ static void emit_constant_buffer(struct brw_context *brw) OUT_BATCH(0); } else { OUT_BATCH((CMD_CONST_BUFFER << 16) | (1 << 8) | (2 - 2)); - OUT_RELOC(brw->curbe.curbe_bo, DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, + OUT_RELOC(brw->curbe.curbe_bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, (sz - 1) + brw->curbe.curbe_offset); } ADVANCE_BATCH(); diff --git a/i965/brw_defines.h b/i965/brw_defines.h index 92c058a..39c3225 100644 --- a/i965/brw_defines.h +++ b/i965/brw_defines.h @@ -33,69 +33,6 @@ #ifndef BRW_DEFINES_H #define BRW_DEFINES_H -/* - */ -#define MI_NOOP 0x00 -#define MI_USER_INTERRUPT 0x02 -#define MI_WAIT_FOR_EVENT 0x03 -#define MI_FLUSH 0x04 -#define MI_REPORT_HEAD 0x07 -#define MI_ARB_ON_OFF 0x08 -#define MI_BATCH_BUFFER_END 0x0A -#define MI_OVERLAY_FLIP 0x11 -#define MI_LOAD_SCAN_LINES_INCL 0x12 -#define MI_LOAD_SCAN_LINES_EXCL 0x13 -#define MI_DISPLAY_BUFFER_INFO 0x14 -#define MI_SET_CONTEXT 0x18 -#define MI_STORE_DATA_IMM 0x20 -#define MI_STORE_DATA_INDEX 0x21 -#define MI_LOAD_REGISTER_IMM 0x22 -#define MI_STORE_REGISTER_MEM 0x24 -#define MI_BATCH_BUFFER_START 0x31 - -#define MI_SYNCHRONOUS_FLIP 0x0 -#define MI_ASYNCHRONOUS_FLIP 0x1 - -#define MI_BUFFER_SECURE 0x0 -#define MI_BUFFER_NONSECURE 0x1 - -#define MI_ARBITRATE_AT_CHAIN_POINTS 0x0 -#define MI_ARBITRATE_BETWEEN_INSTS 0x1 -#define MI_NO_ARBITRATION 0x3 - -#define MI_CONDITION_CODE_WAIT_DISABLED 0x0 -#define MI_CONDITION_CODE_WAIT_0 0x1 -#define MI_CONDITION_CODE_WAIT_1 0x2 -#define MI_CONDITION_CODE_WAIT_2 0x3 -#define MI_CONDITION_CODE_WAIT_3 0x4 -#define MI_CONDITION_CODE_WAIT_4 0x5 - -#define MI_DISPLAY_PIPE_A 0x0 -#define MI_DISPLAY_PIPE_B 0x1 - -#define MI_DISPLAY_PLANE_A 0x0 -#define MI_DISPLAY_PLANE_B 0x1 -#define MI_DISPLAY_PLANE_C 0x2 - -#define MI_STANDARD_FLIP 0x0 -#define MI_ENQUEUE_FLIP_PERFORM_BASE_FRAME_NUMBER_LOAD 0x1 -#define MI_ENQUEUE_FLIP_TARGET_FRAME_NUMBER_RELATIVE 0x2 -#define MI_ENQUEUE_FLIP_ABSOLUTE_TARGET_FRAME_NUMBER 0x3 - -#define MI_PHYSICAL_ADDRESS 0x0 -#define MI_VIRTUAL_ADDRESS 0x1 - -#define MI_BUFFER_MEMORY_MAIN 0x0 -#define MI_BUFFER_MEMORY_GTT 0x2 -#define MI_BUFFER_MEMORY_PER_PROCESS_GTT 0x3 - -#define MI_FLIP_CONTINUE 0x0 -#define MI_FLIP_ON 0x1 -#define MI_FLIP_OFF 0x2 - -#define MI_UNTRUSTED_REGISTER_SPACE 0x0 -#define MI_TRUSTED_REGISTER_SPACE 0x1 - /* 3D state: */ #define _3DOP_3DSTATE_PIPELINED 0x0 @@ -119,7 +56,6 @@ #define _3DSTATE_LINE_STIPPLE 0x08 #define _3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP 0x09 #define _3DCONTROL 0x00 -#define _3DPRIMITIVE 0x00 #define PIPE_CONTROL_NOWRITE 0x00 #define PIPE_CONTROL_WRITEIMMEDIATE 0x01 @@ -862,10 +798,9 @@ #include "intel_chipset.h" -#define BRW_IS_GM45(brw) (IS_GM45_GM((brw)->intel.intelScreen->deviceID)) #define BRW_IS_G4X(brw) (IS_G4X((brw)->intel.intelScreen->deviceID)) -#define CMD_PIPELINE_SELECT(brw) ((BRW_IS_GM45(brw) || BRW_IS_G4X(brw)) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965) -#define CMD_VF_STATISTICS(brw) ((BRW_IS_GM45(brw) || BRW_IS_G4X(brw)) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965) -#define URB_SIZES(brw) ((BRW_IS_GM45(brw) || BRW_IS_G4X(brw)) ? 384 : 256) /* 512 bit unit */ +#define CMD_PIPELINE_SELECT(brw) (BRW_IS_G4X(brw) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965) +#define CMD_VF_STATISTICS(brw) (BRW_IS_G4X(brw) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965) +#define URB_SIZES(brw) (BRW_IS_G4X(brw) ? 384 : 256) /* 512 bit units */ #endif diff --git a/i965/brw_draw.c b/i965/brw_draw.c index f90c5f7..785fb78 100644 --- a/i965/brw_draw.c +++ b/i965/brw_draw.c @@ -27,11 +27,11 @@ #include <stdlib.h> -#include "glheader.h" -#include "context.h" -#include "state.h" -#include "api_validate.h" -#include "enums.h" +#include "main/glheader.h" +#include "main/context.h" +#include "main/state.h" +#include "main/api_validate.h" +#include "main/enums.h" #include "brw_draw.h" #include "brw_defines.h" @@ -39,7 +39,6 @@ #include "brw_state.h" #include "brw_fallback.h" -#include "intel_ioctl.h" #include "intel_batchbuffer.h" #include "intel_buffer_objects.h" @@ -50,7 +49,7 @@ #define FILE_DEBUG_FLAG DEBUG_BATCH -static GLuint hw_prim[GL_POLYGON+1] = { +static GLuint prim_to_hw_prim[GL_POLYGON+1] = { _3DPRIM_POINTLIST, _3DPRIM_LINELIST, _3DPRIM_LINELOOP, @@ -83,9 +82,8 @@ static const GLenum reduced_prim[GL_POLYGON+1] = { * programs be immune to the active primitive (ie. cope with all * possibilities). That may not be realistic however. */ -static GLuint brw_set_prim(struct brw_context *brw, GLenum prim, GLboolean *need_flush) +static GLuint brw_set_prim(struct brw_context *brw, GLenum prim) { - int ret; if (INTEL_DEBUG & DEBUG_PRIMS) _mesa_printf("PRIM: %s\n", _mesa_lookup_enum_by_nr(prim)); @@ -105,13 +103,9 @@ static GLuint brw_set_prim(struct brw_context *brw, GLenum prim, GLboolean *need brw->intel.reduced_primitive = reduced_prim[prim]; brw->state.dirty.brw |= BRW_NEW_REDUCED_PRIMITIVE; } - - ret = brw_validate_state(brw); - if (ret) - *need_flush = GL_TRUE; } - return hw_prim[prim]; + return prim_to_hw_prim[prim]; } @@ -126,12 +120,11 @@ static GLuint trim(GLenum prim, GLuint length) } -static void brw_emit_prim( struct brw_context *brw, - const struct _mesa_prim *prim ) - +static void brw_emit_prim(struct brw_context *brw, + const struct _mesa_prim *prim, + uint32_t hw_prim) { struct brw_3d_primitive prim_packet; - GLboolean need_flush = GL_FALSE; if (INTEL_DEBUG & DEBUG_PRIMS) _mesa_printf("PRIM: %s %d %d\n", _mesa_lookup_enum_by_nr(prim->mode), @@ -140,7 +133,7 @@ static void brw_emit_prim( struct brw_context *brw, prim_packet.header.opcode = CMD_3D_PRIM; prim_packet.header.length = sizeof(prim_packet)/4 - 2; prim_packet.header.pad = 0; - prim_packet.header.topology = brw_set_prim(brw, prim->mode, &need_flush); + prim_packet.header.topology = hw_prim; prim_packet.header.indexed = prim->indexed; prim_packet.verts_per_instance = trim(prim->mode, prim->count); @@ -149,22 +142,25 @@ static void brw_emit_prim( struct brw_context *brw, prim_packet.start_instance_location = 0; prim_packet.base_vert_location = 0; + /* Can't wrap here, since we rely on the validated state. */ + brw->no_batch_wrap = GL_TRUE; if (prim_packet.verts_per_instance) { intel_batchbuffer_data( brw->intel.batch, &prim_packet, sizeof(prim_packet), LOOP_CLIPRECTS); } - - assert(need_flush == GL_FALSE); + brw->no_batch_wrap = GL_FALSE; } static void brw_merge_inputs( struct brw_context *brw, const struct gl_client_array *arrays[]) { - struct brw_vertex_element *inputs = brw->vb.inputs; struct brw_vertex_info old = brw->vb.info; GLuint i; - memset(inputs, 0, sizeof(*inputs)); + for (i = 0; i < VERT_ATTRIB_MAX; i++) + dri_bo_unreference(brw->vb.inputs[i].bo); + + memset(&brw->vb.inputs, 0, sizeof(brw->vb.inputs)); memset(&brw->vb.info, 0, sizeof(brw->vb.info)); for (i = 0; i < VERT_ATTRIB_MAX; i++) { @@ -175,7 +171,8 @@ static void brw_merge_inputs( struct brw_context *brw, if (arrays[i]->StrideB != 0) brw->vb.info.varying |= 1 << i; - brw->vb.info.sizes[i/16] |= (inputs[i].glarray->Size - 1) << ((i%16) * 2); + brw->vb.info.sizes[i/16] |= (brw->vb.inputs[i].glarray->Size - 1) << + ((i%16) * 2); } } @@ -257,21 +254,29 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx, struct intel_context *intel = intel_context(ctx); struct brw_context *brw = brw_context(ctx); GLboolean retval = GL_FALSE; + GLboolean warn = GL_FALSE; + GLboolean first_time = GL_TRUE; GLuint i; - GLuint ib_offset; - dri_bo *ib_bo; - GLboolean force_flush = GL_FALSE; - int ret; if (ctx->NewState) _mesa_update_state( ctx ); + if (check_fallbacks(brw, prim, nr_prims)) + return GL_FALSE; + brw_validate_textures( brw ); /* Bind all inputs, derive varying and size information: */ brw_merge_inputs( brw, arrays ); - + + brw->ib.ib = ib; + brw->state.dirty.brw |= BRW_NEW_INDICES; + + brw->vb.min_index = min_index; + brw->vb.max_index = max_index; + brw->state.dirty.brw |= BRW_NEW_VERTICES; + /* Have to validate state quite late. Will rebuild tnl_program, * which depends on varying information. * @@ -281,12 +286,14 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx, LOCK_HARDWARE(intel); - if (brw->intel.numClipRects == 0) { + if (!intel->constant_cliprect && intel->driDrawable->numClipRects == 0) { UNLOCK_HARDWARE(intel); return GL_TRUE; } - { + for (i = 0; i < nr_prims; i++) { + uint32_t hw_prim; + /* Flush the batch if it's approaching full, so that we don't wrap while * we've got validated state that needs to be in the same batch as the * primitives. This fraction is just a guess (minimal full state plus @@ -294,76 +301,58 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx, * an upper bound of how much we might emit in a single * brw_try_draw_prims(). */ - flush: - if (force_flush) - brw->no_batch_wrap = GL_FALSE; + intel_batchbuffer_require_space(intel->batch, intel->batch->size / 4, + LOOP_CLIPRECTS); - if (intel->batch->ptr - intel->batch->map > intel->batch->size * 3 / 4 - /* brw_emit_prim may change the cliprect_mode to LOOP_CLIPRECTS */ - || intel->batch->cliprect_mode != LOOP_CLIPRECTS || (force_flush == GL_TRUE)) - intel_batchbuffer_flush(intel->batch); + hw_prim = brw_set_prim(brw, prim[i].mode); - force_flush = GL_FALSE; - brw->no_batch_wrap = GL_TRUE; + if (first_time || (brw->state.dirty.brw & BRW_NEW_PRIMITIVE)) { + first_time = GL_FALSE; - /* Set the first primitive early, ahead of validate_state: - */ - brw_set_prim(brw, prim[0].mode, &force_flush); + brw_validate_state(brw); - /* XXX: Need to separate validate and upload of state. - */ - ret = brw_validate_state( brw ); - if (ret) { - force_flush = GL_TRUE; - goto flush; - } - - /* Various fallback checks: - */ - if (brw->intel.Fallback) - goto out; - - if (check_fallbacks( brw, prim, nr_prims )) - goto out; - - /* need to account for index buffer and vertex buffer */ - if (ib) { - ret = brw_prepare_indices( brw, ib , &ib_bo, &ib_offset); - if (ret) { - force_flush = GL_TRUE; - goto flush; - } - } + /* Various fallback checks: */ + if (brw->intel.Fallback) + goto out; - ret = brw_prepare_vertices( brw, min_index, max_index); - if (ret < 0) - goto out; - - if (ret > 0) { - force_flush = GL_TRUE; - goto flush; + /* Check that we can fit our state in with our existing batchbuffer, or + * flush otherwise. + */ + if (dri_bufmgr_check_aperture_space(brw->state.validated_bos, + brw->state.validated_bo_count)) { + static GLboolean warned; + intel_batchbuffer_flush(intel->batch); + + /* Validate the state after we flushed the batch (which would have + * changed the set of dirty state). If we still fail to + * check_aperture, warn of what's happening, but attempt to continue + * on since it may succeed anyway, and the user would probably rather + * see a failure and a warning than a fallback. + */ + brw_validate_state(brw); + if (!warned && + dri_bufmgr_check_aperture_space(brw->state.validated_bos, + brw->state.validated_bo_count)) { + warn = GL_TRUE; + warned = GL_TRUE; + } + } + + brw_upload_state(brw); } - - /* Upload index, vertex data: - */ - if (ib) - brw_emit_indices( brw, ib, ib_bo, ib_offset); - brw_emit_vertices( brw, min_index, max_index); - - for (i = 0; i < nr_prims; i++) { - brw_emit_prim(brw, &prim[i]); - } + brw_emit_prim(brw, &prim[i], hw_prim); retval = GL_TRUE; } out: - - brw->no_batch_wrap = GL_FALSE; - UNLOCK_HARDWARE(intel); + if (warn) + fprintf(stderr, "i965: Single primitive emit potentially exceeded " + "available aperture space\n"); + if (!retval) DBG("%s failed\n", __FUNCTION__); @@ -420,7 +409,6 @@ void brw_draw_prims( GLcontext *ctx, return; } - /* Make a first attempt at drawing: */ retval = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index); @@ -433,6 +421,7 @@ void brw_draw_prims( GLcontext *ctx, _swsetup_Wakeup(ctx); _tnl_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index); } + } void brw_draw_init( struct brw_context *brw ) @@ -447,8 +436,18 @@ void brw_draw_init( struct brw_context *brw ) void brw_draw_destroy( struct brw_context *brw ) { + int i; + if (brw->vb.upload.bo != NULL) { dri_bo_unreference(brw->vb.upload.bo); brw->vb.upload.bo = NULL; } + + for (i = 0; i < VERT_ATTRIB_MAX; i++) { + dri_bo_unreference(brw->vb.inputs[i].bo); + brw->vb.inputs[i].bo = NULL; + } + + dri_bo_unreference(brw->ib.bo); + brw->ib.bo = NULL; } diff --git a/i965/brw_draw.h b/i965/brw_draw.h index b354740..9aebbdb 100644 --- a/i965/brw_draw.h +++ b/i965/brw_draw.h @@ -28,10 +28,9 @@ #ifndef BRW_DRAW_H #define BRW_DRAW_H -#include "mtypes.h" /* for GLcontext... */ +#include "main/mtypes.h" /* for GLcontext... */ #include "vbo/vbo.h" -#include "dri_bufmgr.h" struct brw_context; @@ -51,27 +50,4 @@ void brw_draw_destroy( struct brw_context *brw ); void brw_init_current_values(GLcontext *ctx, struct gl_client_array *arrays); - -/* brw_draw_upload.c - */ -int brw_prepare_indices( struct brw_context *brw, - const struct _mesa_index_buffer *index_buffer, - dri_bo **bo_return, - GLuint *offset_return); - -void brw_emit_indices( struct brw_context *brw, - const struct _mesa_index_buffer *index_buffer, - dri_bo *bo, - GLuint offset); - -int brw_prepare_vertices( struct brw_context *brw, - GLuint min_index, - GLuint max_index ); - -void brw_emit_vertices( struct brw_context *brw, - GLuint min_index, - GLuint max_index ); - - - #endif diff --git a/i965/brw_draw_upload.c b/i965/brw_draw_upload.c index 7946ffd..73d6dea 100644 --- a/i965/brw_draw_upload.c +++ b/i965/brw_draw_upload.c @@ -27,11 +27,11 @@ #include <stdlib.h> -#include "glheader.h" -#include "context.h" -#include "state.h" -#include "api_validate.h" -#include "enums.h" +#include "main/glheader.h" +#include "main/context.h" +#include "main/state.h" +#include "main/api_validate.h" +#include "main/enums.h" #include "brw_draw.h" #include "brw_defines.h" @@ -39,7 +39,6 @@ #include "brw_state.h" #include "brw_fallback.h" -#include "intel_ioctl.h" #include "intel_batchbuffer.h" #include "intel_buffer_objects.h" #include "intel_tex.h" @@ -228,10 +227,7 @@ static void wrap_buffers( struct brw_context *brw, if (brw->vb.upload.bo != NULL) dri_bo_unreference(brw->vb.upload.bo); brw->vb.upload.bo = dri_bo_alloc(brw->intel.bufmgr, "temporary VBO", - size, 1, - DRM_BO_FLAG_MEM_LOCAL | - DRM_BO_FLAG_CACHED | - DRM_BO_FLAG_CACHED_MAPPED); + size, 1); /* Set the internal VBO\ to no-backing-store. We only use them as a * temporary within a brw_try_draw_prims while the lock is held. @@ -254,10 +250,10 @@ static void get_space( struct brw_context *brw, wrap_buffers(brw, size); } + assert(*bo_return == NULL); dri_bo_reference(brw->vb.upload.bo); *bo_return = brw->vb.upload.bo; *offset_return = brw->vb.upload.offset; - brw->vb.upload.offset += size; } @@ -304,9 +300,7 @@ copy_array_to_vbo_array( struct brw_context *brw, } } -int brw_prepare_vertices( struct brw_context *brw, - GLuint min_index, - GLuint max_index ) +static void brw_prepare_vertices(struct brw_context *brw) { GLcontext *ctx = &brw->intel.ctx; struct intel_context *intel = intel_context(ctx); @@ -314,7 +308,8 @@ int brw_prepare_vertices( struct brw_context *brw, GLuint i; const unsigned char *ptr = NULL; GLuint interleave = 0; - int ret = 0; + unsigned int min_index = brw->vb.min_index; + unsigned int max_index = brw->vb.max_index; struct brw_vertex_element *enabled[VERT_ATTRIB_MAX]; GLuint nr_enabled = 0; @@ -342,8 +337,10 @@ int brw_prepare_vertices( struct brw_context *brw, * cases with > 17 vertex attributes enabled, so it probably * isn't an issue at this point. */ - if (nr_enabled >= BRW_VEP_MAX) - return -1; + if (nr_enabled >= BRW_VEP_MAX) { + intel->Fallback = 1; + return; + } for (i = 0; i < nr_enabled; i++) { struct brw_vertex_element *input = enabled[i]; @@ -356,22 +353,31 @@ int brw_prepare_vertices( struct brw_context *brw, intel_buffer_object(input->glarray->BufferObj); /* Named buffer object: Just reference its contents directly. */ + dri_bo_unreference(input->bo); input->bo = intel_bufferobj_buffer(intel, intel_buffer, INTEL_READ); dri_bo_reference(input->bo); input->offset = (unsigned long)input->glarray->Ptr; input->stride = input->glarray->StrideB; - - ret |= dri_bufmgr_check_aperture_space(input->bo); } else { + if (input->bo != NULL) { + /* Already-uploaded vertex data is present from a previous + * prepare_vertices, but we had to re-validate state due to + * check_aperture failing and a new batch being produced. + */ + continue; + } + /* Queue the buffer object up to be uploaded in the next pass, * when we've decided if we're doing interleaved or not. */ if (i == 0) { /* Position array not properly enabled: */ - if (input->glarray->StrideB == 0) - return -1; + if (input->glarray->StrideB == 0) { + intel->Fallback = 1; + return; + } interleave = input->glarray->StrideB; ptr = input->glarray->Ptr; @@ -403,7 +409,6 @@ int brw_prepare_vertices( struct brw_context *brw, */ copy_array_to_vbo_array(brw, upload[0], interleave); - ret |= dri_bufmgr_check_aperture_space(upload[0]->bo); for (i = 1; i < nr_uploads; i++) { /* Then, just point upload[i] at upload[0]'s buffer. */ upload[i]->stride = interleave; @@ -417,23 +422,19 @@ int brw_prepare_vertices( struct brw_context *brw, /* Upload non-interleaved arrays */ for (i = 0; i < nr_uploads; i++) { copy_array_to_vbo_array(brw, upload[i], upload[i]->element_size); - if (upload[i]->bo) { - ret |= dri_bufmgr_check_aperture_space(upload[i]->bo); - } } } + brw_prepare_query_begin(brw); - if (ret) - return 1; - + for (i = 0; i < nr_enabled; i++) { + struct brw_vertex_element *input = enabled[i]; - return 0; + brw_add_validated_bo(brw, input->bo); + } } -void brw_emit_vertices( struct brw_context *brw, - GLuint min_index, - GLuint max_index ) +static void brw_emit_vertices(struct brw_context *brw) { GLcontext *ctx = &brw->intel.ctx; struct intel_context *intel = intel_context(ctx); @@ -451,6 +452,7 @@ void brw_emit_vertices( struct brw_context *brw, enabled[nr_enabled++] = input; } + brw_emit_query_begin(brw); /* Now emit VB and VEP state packets. * @@ -469,16 +471,10 @@ void brw_emit_vertices( struct brw_context *brw, BRW_VB0_ACCESS_VERTEXDATA | (input->stride << BRW_VB0_PITCH_SHIFT)); OUT_RELOC(input->bo, - DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, + I915_GEM_DOMAIN_VERTEX, 0, input->offset); - OUT_BATCH(max_index); + OUT_BATCH(brw->vb.max_index); OUT_BATCH(0); /* Instance data step rate */ - - /* Unreference the buffer so it can get freed, now that we won't - * touch it any more. - */ - dri_bo_unreference(input->bo); - input->bo = NULL; } ADVANCE_BATCH(); @@ -515,18 +511,31 @@ void brw_emit_vertices( struct brw_context *brw, ADVANCE_BATCH(); } -int brw_prepare_indices( struct brw_context *brw, - const struct _mesa_index_buffer *index_buffer, - dri_bo **bo_return, - GLuint *offset_return) +const struct brw_tracked_state brw_vertices = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_BATCH | BRW_NEW_VERTICES, + .cache = 0, + }, + .prepare = brw_prepare_vertices, + .emit = brw_emit_vertices, +}; + +static void brw_prepare_indices(struct brw_context *brw) { GLcontext *ctx = &brw->intel.ctx; struct intel_context *intel = &brw->intel; - GLuint ib_size = get_size(index_buffer->type) * index_buffer->count; - dri_bo *bo; - struct gl_buffer_object *bufferobj = index_buffer->obj; - GLuint offset = (GLuint)index_buffer->ptr; - int ret; + const struct _mesa_index_buffer *index_buffer = brw->ib.ib; + GLuint ib_size; + dri_bo *bo = NULL; + struct gl_buffer_object *bufferobj; + GLuint offset; + + if (index_buffer == NULL) + return; + + ib_size = get_size(index_buffer->type) * index_buffer->count; + bufferobj = index_buffer->obj;; /* Turn into a proper VBO: */ @@ -540,6 +549,8 @@ int brw_prepare_indices( struct brw_context *brw, */ dri_bo_subdata(bo, offset, ib_size, index_buffer->ptr); } else { + offset = (GLuint)index_buffer->ptr; + /* If the index buffer isn't aligned to its element size, we have to * rebase it into a temporary. */ @@ -562,19 +573,24 @@ int brw_prepare_indices( struct brw_context *brw, } } - *bo_return = bo; - *offset_return = offset; - ret = dri_bufmgr_check_aperture_space(bo); - return ret; + dri_bo_unreference(brw->ib.bo); + brw->ib.bo = bo; + brw->ib.offset = offset; + + brw_add_validated_bo(brw, brw->ib.bo); } -void brw_emit_indices(struct brw_context *brw, - const struct _mesa_index_buffer *index_buffer, - dri_bo *bo, - GLuint offset) +static void brw_emit_indices(struct brw_context *brw) { struct intel_context *intel = &brw->intel; - GLuint ib_size = get_size(index_buffer->type) * index_buffer->count; + const struct _mesa_index_buffer *index_buffer = brw->ib.ib; + GLuint ib_size; + + if (index_buffer == NULL) + return; + + ib_size = get_size(index_buffer->type) * index_buffer->count; + /* Emit the indexbuffer packet: */ { @@ -590,13 +606,23 @@ void brw_emit_indices(struct brw_context *brw, BEGIN_BATCH(4, IGNORE_CLIPRECTS); OUT_BATCH( ib.header.dword ); - OUT_RELOC( bo, DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, offset); - OUT_RELOC( bo, DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, - offset + ib_size); + OUT_RELOC(brw->ib.bo, + I915_GEM_DOMAIN_VERTEX, 0, + brw->ib.offset); + OUT_RELOC(brw->ib.bo, + I915_GEM_DOMAIN_VERTEX, 0, + brw->ib.offset + ib_size); OUT_BATCH( 0 ); ADVANCE_BATCH(); - - dri_bo_unreference(bo); } } +const struct brw_tracked_state brw_indices = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_BATCH | BRW_NEW_INDICES, + .cache = 0, + }, + .prepare = brw_prepare_indices, + .emit = brw_emit_indices, +}; diff --git a/i965/brw_eu.h b/i965/brw_eu.h index 207b8b7..9e2b39a 100644 --- a/i965/brw_eu.h +++ b/i965/brw_eu.h @@ -65,7 +65,7 @@ struct brw_reg GLuint abs:1; /* source only */ GLuint vstride:4; /* source only */ GLuint width:3; /* src only, align1 only */ - GLuint hstride:2; /* src only, align1 only */ + GLuint hstride:2; /* align1 only */ GLuint address_mode:1; /* relative addressing, hopefully! */ GLuint pad0:1; @@ -129,17 +129,28 @@ static INLINE int type_sz( GLuint type ) } } +/** + * Construct a brw_reg. + * \param file one of the BRW_x_REGISTER_FILE values + * \param nr register number/index + * \param subnr register sub number + * \param type one of BRW_REGISTER_TYPE_x + * \param vstride one of BRW_VERTICAL_STRIDE_x + * \param width one of BRW_WIDTH_x + * \param hstride one of BRW_HORIZONTAL_STRIDE_x + * \param swizzle one of BRW_SWIZZLE_x + * \param writemask WRITEMASK_X/Y/Z/W bitfield + */ static INLINE struct brw_reg brw_reg( GLuint file, - GLuint nr, - GLuint subnr, - GLuint type, - GLuint vstride, - GLuint width, - GLuint hstride, - GLuint swizzle, - GLuint writemask) -{ - + GLuint nr, + GLuint subnr, + GLuint type, + GLuint vstride, + GLuint width, + GLuint hstride, + GLuint swizzle, + GLuint writemask ) +{ struct brw_reg reg; reg.type = type; reg.file = file; @@ -166,6 +177,7 @@ static INLINE struct brw_reg brw_reg( GLuint file, return reg; } +/** Construct float[16] register */ static INLINE struct brw_reg brw_vec16_reg( GLuint file, GLuint nr, GLuint subnr ) @@ -181,6 +193,7 @@ static INLINE struct brw_reg brw_vec16_reg( GLuint file, WRITEMASK_XYZW); } +/** Construct float[8] register */ static INLINE struct brw_reg brw_vec8_reg( GLuint file, GLuint nr, GLuint subnr ) @@ -196,7 +209,7 @@ static INLINE struct brw_reg brw_vec8_reg( GLuint file, WRITEMASK_XYZW); } - +/** Construct float[4] register */ static INLINE struct brw_reg brw_vec4_reg( GLuint file, GLuint nr, GLuint subnr ) @@ -212,7 +225,7 @@ static INLINE struct brw_reg brw_vec4_reg( GLuint file, WRITEMASK_XYZW); } - +/** Construct float[2] register */ static INLINE struct brw_reg brw_vec2_reg( GLuint file, GLuint nr, GLuint subnr ) @@ -228,6 +241,7 @@ static INLINE struct brw_reg brw_vec2_reg( GLuint file, WRITEMASK_XY); } +/** Construct float[1] register */ static INLINE struct brw_reg brw_vec1_reg( GLuint file, GLuint nr, GLuint subnr ) @@ -277,6 +291,7 @@ static INLINE struct brw_reg byte_offset( struct brw_reg reg, } +/** Construct unsigned word[16] register */ static INLINE struct brw_reg brw_uw16_reg( GLuint file, GLuint nr, GLuint subnr ) @@ -284,6 +299,7 @@ static INLINE struct brw_reg brw_uw16_reg( GLuint file, return suboffset(retype(brw_vec16_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr); } +/** Construct unsigned word[8] register */ static INLINE struct brw_reg brw_uw8_reg( GLuint file, GLuint nr, GLuint subnr ) @@ -291,6 +307,7 @@ static INLINE struct brw_reg brw_uw8_reg( GLuint file, return suboffset(retype(brw_vec8_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr); } +/** Construct unsigned word[1] register */ static INLINE struct brw_reg brw_uw1_reg( GLuint file, GLuint nr, GLuint subnr ) @@ -311,6 +328,7 @@ static INLINE struct brw_reg brw_imm_reg( GLuint type ) 0); } +/** Construct float immediate register */ static INLINE struct brw_reg brw_imm_f( GLfloat f ) { struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_F); @@ -318,6 +336,7 @@ static INLINE struct brw_reg brw_imm_f( GLfloat f ) return imm; } +/** Construct integer immediate register */ static INLINE struct brw_reg brw_imm_d( GLint d ) { struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_D); @@ -325,6 +344,7 @@ static INLINE struct brw_reg brw_imm_d( GLint d ) return imm; } +/** Construct uint immediate register */ static INLINE struct brw_reg brw_imm_ud( GLuint ud ) { struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UD); @@ -332,6 +352,7 @@ static INLINE struct brw_reg brw_imm_ud( GLuint ud ) return imm; } +/** Construct ushort immediate register */ static INLINE struct brw_reg brw_imm_uw( GLushort uw ) { struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UW); @@ -339,6 +360,7 @@ static INLINE struct brw_reg brw_imm_uw( GLushort uw ) return imm; } +/** Construct short immediate register */ static INLINE struct brw_reg brw_imm_w( GLshort w ) { struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_W); @@ -350,8 +372,7 @@ static INLINE struct brw_reg brw_imm_w( GLshort w ) * numbers alias with _V and _VF below: */ -/* Vector of eight signed half-byte values: - */ +/** Construct vector of eight signed half-byte values */ static INLINE struct brw_reg brw_imm_v( GLuint v ) { struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_V); @@ -362,8 +383,7 @@ static INLINE struct brw_reg brw_imm_v( GLuint v ) return imm; } -/* Vector of four 8-bit float values: - */ +/** Construct vector of four 8-bit float values */ static INLINE struct brw_reg brw_imm_vf( GLuint v ) { struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF); @@ -400,38 +420,43 @@ static INLINE struct brw_reg brw_address( struct brw_reg reg ) return brw_imm_uw(reg.nr * REG_SIZE + reg.subnr); } - -static INLINE struct brw_reg brw_vec1_grf( GLuint nr, - GLuint subnr ) +/** Construct float[1] general-purpose register */ +static INLINE struct brw_reg brw_vec1_grf( GLuint nr, GLuint subnr ) { return brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); } -static INLINE struct brw_reg brw_vec8_grf( GLuint nr, - GLuint subnr ) +/** Construct float[2] general-purpose register */ +static INLINE struct brw_reg brw_vec2_grf( GLuint nr, GLuint subnr ) { - return brw_vec8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); + return brw_vec2_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); } -static INLINE struct brw_reg brw_vec4_grf( GLuint nr, - GLuint subnr ) +/** Construct float[4] general-purpose register */ +static INLINE struct brw_reg brw_vec4_grf( GLuint nr, GLuint subnr ) { return brw_vec4_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); } - -static INLINE struct brw_reg brw_vec2_grf( GLuint nr, - GLuint subnr ) +/** Construct float[8] general-purpose register */ +static INLINE struct brw_reg brw_vec8_grf( GLuint nr, GLuint subnr ) { - return brw_vec2_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); + return brw_vec8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); } -static INLINE struct brw_reg brw_uw8_grf( GLuint nr, - GLuint subnr ) + +static INLINE struct brw_reg brw_uw8_grf( GLuint nr, GLuint subnr ) { return brw_uw8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); } +static INLINE struct brw_reg brw_uw16_grf( GLuint nr, GLuint subnr ) +{ + return brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + + +/** Construct null register (usually used for setting condition codes) */ static INLINE struct brw_reg brw_null_reg( void ) { return brw_vec8_reg(BRW_ARCHITECTURE_REGISTER_FILE, @@ -518,13 +543,13 @@ static INLINE struct brw_reg stride( struct brw_reg reg, GLuint width, GLuint hstride ) { - reg.vstride = cvt(vstride); reg.width = cvt(width) - 1; reg.hstride = cvt(hstride); return reg; } + static INLINE struct brw_reg vec16( struct brw_reg reg ) { return stride(reg, 16,16,1); @@ -550,6 +575,7 @@ static INLINE struct brw_reg vec1( struct brw_reg reg ) return stride(reg, 0,1,0); } + static INLINE struct brw_reg get_element( struct brw_reg reg, GLuint elt ) { return vec1(suboffset(reg, elt)); @@ -681,7 +707,7 @@ static INLINE struct brw_indirect brw_indirect( GLuint addr_subnr, GLint offset static INLINE struct brw_instruction *current_insn( struct brw_compile *p) { - return &p->store[p->nr_insn]; + return &p->store[p->nr_insn]; } void brw_pop_insn_state( struct brw_compile *p ); @@ -727,6 +753,7 @@ ALU2(ADD) ALU2(MUL) ALU1(FRC) ALU1(RNDD) +ALU1(RNDZ) ALU2(MAC) ALU2(MACH) ALU1(LZD) diff --git a/i965/brw_eu_debug.c b/i965/brw_eu_debug.c index 2dff1ad..91dbbd5 100644 --- a/i965/brw_eu_debug.c +++ b/i965/brw_eu_debug.c @@ -30,9 +30,9 @@ */ -#include "mtypes.h" +#include "main/mtypes.h" +#include "main/imports.h" #include "brw_eu.h" -#include "imports.h" void brw_print_reg( struct brw_reg hwreg ) { diff --git a/i965/brw_eu_emit.c b/i965/brw_eu_emit.c index 0bfbec9..4e099b5 100644 --- a/i965/brw_eu_emit.c +++ b/i965/brw_eu_emit.c @@ -64,7 +64,9 @@ static void brw_set_dest( struct brw_instruction *insn, if (insn->header.access_mode == BRW_ALIGN_1) { insn->bits1.da1.dest_subreg_nr = dest.subnr; - insn->bits1.da1.dest_horiz_stride = BRW_HORIZONTAL_STRIDE_1; + if (dest.hstride == BRW_HORIZONTAL_STRIDE_0) + dest.hstride = BRW_HORIZONTAL_STRIDE_1; + insn->bits1.da1.dest_horiz_stride = dest.hstride; } else { insn->bits1.da16.dest_subreg_nr = dest.subnr / 16; @@ -78,7 +80,9 @@ static void brw_set_dest( struct brw_instruction *insn, */ if (insn->header.access_mode == BRW_ALIGN_1) { insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset; - insn->bits1.ia1.dest_horiz_stride = BRW_HORIZONTAL_STRIDE_1; + if (dest.hstride == BRW_HORIZONTAL_STRIDE_0) + dest.hstride = BRW_HORIZONTAL_STRIDE_1; + insn->bits1.ia1.dest_horiz_stride = dest.hstride; } else { insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset; @@ -329,14 +333,14 @@ static void brw_set_sampler_message(struct brw_context *brw, { brw_set_src1(insn, brw_imm_d(0)); - if (BRW_IS_GM45(brw) || BRW_IS_G4X(brw)) { - insn->bits3.sampler_gm45_g4x.binding_table_index = binding_table_index; - insn->bits3.sampler_gm45_g4x.sampler = sampler; - insn->bits3.sampler_gm45_g4x.msg_type = msg_type; - insn->bits3.sampler_gm45_g4x.response_length = response_length; - insn->bits3.sampler_gm45_g4x.msg_length = msg_length; - insn->bits3.sampler_gm45_g4x.end_of_thread = eot; - insn->bits3.sampler_gm45_g4x.msg_target = BRW_MESSAGE_TARGET_SAMPLER; + if (BRW_IS_G4X(brw)) { + insn->bits3.sampler_g4x.binding_table_index = binding_table_index; + insn->bits3.sampler_g4x.sampler = sampler; + insn->bits3.sampler_g4x.msg_type = msg_type; + insn->bits3.sampler_g4x.response_length = response_length; + insn->bits3.sampler_g4x.msg_length = msg_length; + insn->bits3.sampler_g4x.end_of_thread = eot; + insn->bits3.sampler_g4x.msg_target = BRW_MESSAGE_TARGET_SAMPLER; } else { insn->bits3.sampler.binding_table_index = binding_table_index; insn->bits3.sampler.sampler = sampler; @@ -435,6 +439,7 @@ ALU2(ADD) ALU2(MUL) ALU1(FRC) ALU1(RNDD) +ALU1(RNDZ) ALU2(MAC) ALU2(MACH) ALU1(LZD) diff --git a/i965/brw_fallback.c b/i965/brw_fallback.c index 8a8fb50..4ea660a 100644 --- a/i965/brw_fallback.c +++ b/i965/brw_fallback.c @@ -25,19 +25,20 @@ * **************************************************************************/ +#include "main/glheader.h" +#include "main/context.h" +#include "main/enums.h" +#include "main/imports.h" +#include "main/macros.h" +#include "main/mtypes.h" + #include "swrast_setup/swrast_setup.h" #include "swrast/swrast.h" #include "tnl/tnl.h" -#include "context.h" #include "brw_context.h" #include "brw_fallback.h" -#include "glheader.h" -#include "enums.h" -#include "glapi.h" -#include "imports.h" -#include "macros.h" -#include "mtypes.h" +#include "glapi/glapi.h" #define FILE_DEBUG_FLAG DEBUG_FALLBACKS @@ -73,10 +74,7 @@ static GLboolean do_check_fallback(struct brw_context *brw) if (texUnit->_ReallyEnabled) { struct intel_texture_object *intelObj = intel_texture_object(texUnit->_Current); struct gl_texture_image *texImage = intelObj->base.Image[0][intelObj->firstLevel]; - if (texImage->Border || - ((texImage->_BaseFormat == GL_DEPTH_COMPONENT) && - ((texImage->TexObject->WrapS == GL_CLAMP_TO_BORDER) || - (texImage->TexObject->WrapT == GL_CLAMP_TO_BORDER)))) { + if (texImage->Border) { DBG("FALLBACK: texture border\n"); return GL_TRUE; } @@ -95,10 +93,9 @@ static GLboolean do_check_fallback(struct brw_context *brw) return GL_FALSE; } -static int check_fallback(struct brw_context *brw) +static void check_fallback(struct brw_context *brw) { brw->intel.Fallback = do_check_fallback(brw); - return 0; } const struct brw_tracked_state brw_check_fallback = { diff --git a/i965/brw_fallback.h b/i965/brw_fallback.h index 684a46c..50dcdac 100644 --- a/i965/brw_fallback.h +++ b/i965/brw_fallback.h @@ -28,7 +28,7 @@ #ifndef BRW_FALLBACK_H #define BRW_FALLBACK_H -#include "mtypes.h" /* for GLcontext... */ +#include "main/mtypes.h" /* for GLcontext... */ struct brw_context; struct vbo_prim; diff --git a/i965/brw_gs.c b/i965/brw_gs.c index 9419315..a8b74a0 100644 --- a/i965/brw_gs.c +++ b/i965/brw_gs.c @@ -29,9 +29,9 @@ * Keith Whitwell <keith@tungstengraphics.com> */ -#include "glheader.h" -#include "macros.h" -#include "enums.h" +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" #include "intel_batchbuffer.h" @@ -162,10 +162,9 @@ static void populate_key( struct brw_context *brw, /* Calculate interpolants for triangle and line rasterization. */ -static int prepare_gs_prog( struct brw_context *brw ) +static void prepare_gs_prog(struct brw_context *brw) { struct brw_gs_prog_key key; - int ret = 0; /* Populate the key: */ populate_key(brw, &key); @@ -183,11 +182,7 @@ static int prepare_gs_prog( struct brw_context *brw ) &brw->gs.prog_data); if (brw->gs.prog_bo == NULL) compile_gs_prog( brw, &key ); - - ret |= dri_bufmgr_check_aperture_space(brw->gs.prog_bo); } - - return ret; } diff --git a/i965/brw_gs_emit.c b/i965/brw_gs_emit.c index 9abb94d..22e0d25 100644 --- a/i965/brw_gs_emit.c +++ b/i965/brw_gs_emit.c @@ -30,9 +30,9 @@ */ -#include "glheader.h" -#include "macros.h" -#include "enums.h" +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" #include "shader/program.h" #include "intel_batchbuffer.h" diff --git a/i965/brw_gs_state.c b/i965/brw_gs_state.c index f1f9e01..27023cf 100644 --- a/i965/brw_gs_state.c +++ b/i965/brw_gs_state.c @@ -34,7 +34,7 @@ #include "brw_context.h" #include "brw_state.h" #include "brw_defines.h" -#include "macros.h" +#include "main/macros.h" struct brw_gs_unit_key { unsigned int total_grf; @@ -106,17 +106,17 @@ gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key) if (key->prog_active) { /* Emit GS program relocation */ - dri_emit_reloc(bo, - DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, - gs.thread0.grf_reg_count << 1, - offsetof(struct brw_gs_unit_state, thread0), - brw->gs.prog_bo); + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + gs.thread0.grf_reg_count << 1, + offsetof(struct brw_gs_unit_state, thread0), + brw->gs.prog_bo); } return bo; } -static int prepare_gs_unit( struct brw_context *brw ) +static void prepare_gs_unit(struct brw_context *brw) { struct brw_gs_unit_key key; @@ -130,7 +130,6 @@ static int prepare_gs_unit( struct brw_context *brw ) if (brw->gs.state_bo == NULL) { brw->gs.state_bo = gs_unit_create_from_key(brw, &key); } - return dri_bufmgr_check_aperture_space(brw->gs.state_bo); } const struct brw_tracked_state brw_gs_unit = { diff --git a/i965/brw_metaops.c b/i965/brw_metaops.c index 252a899..41bfa2e 100644 --- a/i965/brw_metaops.c +++ b/i965/brw_metaops.c @@ -32,9 +32,9 @@ -#include "glheader.h" -#include "context.h" -#include "macros.h" +#include "main/glheader.h" +#include "main/context.h" +#include "main/macros.h" #include "shader/arbprogparse.h" diff --git a/i965/brw_misc_state.c b/i965/brw_misc_state.c index 62df259..627705f 100644 --- a/i965/brw_misc_state.c +++ b/i965/brw_misc_state.c @@ -71,6 +71,38 @@ const struct brw_tracked_state brw_blend_constant_color = { .emit = upload_blend_constant_color }; +/* Constant single cliprect for framebuffer object or DRI2 drawing */ +static void upload_drawing_rect(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + GLcontext *ctx = &intel->ctx; + + if (!intel->constant_cliprect) + return; + + BEGIN_BATCH(4, NO_LOOP_CLIPRECTS); + OUT_BATCH(_3DSTATE_DRAWRECT_INFO_I965); + OUT_BATCH(0); /* xmin, ymin */ + OUT_BATCH(((ctx->DrawBuffer->Width - 1) & 0xffff) | + ((ctx->DrawBuffer->Height - 1) << 16)); + OUT_BATCH(0); + ADVANCE_BATCH(); +} + +const struct brw_tracked_state brw_drawing_rect = { + .dirty = { + .mesa = _NEW_BUFFERS, + .brw = 0, + .cache = 0 + }, + .emit = upload_drawing_rect +}; + +static void prepare_binding_table_pointers(struct brw_context *brw) +{ + brw_add_validated_bo(brw, brw->wm.bind_bo); +} + /** * Upload the binding table pointers, which point each stage's array of surface * state pointers. @@ -88,7 +120,9 @@ static void upload_binding_table_pointers(struct brw_context *brw) OUT_BATCH(0); /* gs */ OUT_BATCH(0); /* clip */ OUT_BATCH(0); /* sf */ - OUT_RELOC(brw->wm.bind_bo, DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, 0); + OUT_RELOC(brw->wm.bind_bo, + I915_GEM_DOMAIN_SAMPLER, 0, + 0); ADVANCE_BATCH(); } @@ -98,6 +132,7 @@ const struct brw_tracked_state brw_binding_table_pointers = { .brw = BRW_NEW_BATCH, .cache = CACHE_NEW_SURF_BIND, }, + .prepare = prepare_binding_table_pointers, .emit = upload_binding_table_pointers, }; @@ -114,40 +149,32 @@ static void upload_pipelined_state_pointers(struct brw_context *brw ) BEGIN_BATCH(7, IGNORE_CLIPRECTS); OUT_BATCH(CMD_PIPELINED_STATE_POINTERS << 16 | (7 - 2)); - OUT_RELOC(brw->vs.state_bo, DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, 0); + OUT_RELOC(brw->vs.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); if (brw->gs.prog_active) - OUT_RELOC(brw->gs.state_bo, DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, 1); + OUT_RELOC(brw->gs.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1); else OUT_BATCH(0); if (!brw->metaops.active) - OUT_RELOC(brw->clip.state_bo, DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, 1); + OUT_RELOC(brw->clip.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1); else OUT_BATCH(0); - OUT_RELOC(brw->sf.state_bo, DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, 0); - OUT_RELOC(brw->wm.state_bo, DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, 0); - OUT_RELOC(brw->cc.state_bo, DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, 0); + OUT_RELOC(brw->sf.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); + OUT_RELOC(brw->wm.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); + OUT_RELOC(brw->cc.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); ADVANCE_BATCH(); brw->state.dirty.brw |= BRW_NEW_PSP; } -#if 0 -/* Combined into brw_psp_urb_cbs */ -const struct brw_tracked_state brw_pipelined_state_pointers = { - .dirty = { - .mesa = 0, - .brw = BRW_NEW_METAOPS | BRW_NEW_BATCH, - .cache = (CACHE_NEW_VS_UNIT | - CACHE_NEW_GS_UNIT | - CACHE_NEW_GS_PROG | - CACHE_NEW_CLIP_UNIT | - CACHE_NEW_SF_UNIT | - CACHE_NEW_WM_UNIT | - CACHE_NEW_CC_UNIT) - }, - .emit = upload_pipelined_state_pointers -}; -#endif + +static void prepare_psp_urb_cbs(struct brw_context *brw) +{ + brw_add_validated_bo(brw, brw->vs.state_bo); + brw_add_validated_bo(brw, brw->gs.state_bo); + brw_add_validated_bo(brw, brw->clip.state_bo); + brw_add_validated_bo(brw, brw->wm.state_bo); + brw_add_validated_bo(brw, brw->cc.state_bo); +} static void upload_psp_urb_cbs(struct brw_context *brw ) { @@ -156,7 +183,6 @@ static void upload_psp_urb_cbs(struct brw_context *brw ) brw_upload_constant_buffer_state(brw); } - const struct brw_tracked_state brw_psp_urb_cbs = { .dirty = { .mesa = 0, @@ -169,30 +195,23 @@ const struct brw_tracked_state brw_psp_urb_cbs = { CACHE_NEW_WM_UNIT | CACHE_NEW_CC_UNIT) }, + .prepare = prepare_psp_urb_cbs, .emit = upload_psp_urb_cbs, }; -/** - * Upload the depthbuffer offset and format. - * - * We have to do this per state validation as we need to emit the relocation - * in the batch buffer. - */ - -static int prepare_depthbuffer(struct brw_context *brw) +static void prepare_depthbuffer(struct brw_context *brw) { struct intel_region *region = brw->state.depth_region; - if (!region || !region->buffer) - return 0; - return dri_bufmgr_check_aperture_space(region->buffer); + if (region != NULL) + brw_add_validated_bo(brw, region->buffer); } static void emit_depthbuffer(struct brw_context *brw) { struct intel_context *intel = &brw->intel; struct intel_region *region = brw->state.depth_region; - unsigned int len = (BRW_IS_GM45(brw) || BRW_IS_G4X(brw)) ? sizeof(struct brw_depthbuffer_gm45_g4x) / 4 : sizeof(struct brw_depthbuffer) / 4; + unsigned int len = BRW_IS_G4X(brw) ? 6 : 5; if (region == NULL) { BEGIN_BATCH(len, IGNORE_CLIPRECTS); @@ -203,7 +222,7 @@ static void emit_depthbuffer(struct brw_context *brw) OUT_BATCH(0); OUT_BATCH(0); - if (BRW_IS_GM45(brw) || BRW_IS_G4X(brw)) + if (BRW_IS_G4X(brw)) OUT_BATCH(0); ADVANCE_BATCH(); @@ -230,16 +249,17 @@ static void emit_depthbuffer(struct brw_context *brw) OUT_BATCH(((region->pitch * region->cpp) - 1) | (format << 18) | (BRW_TILEWALK_YMAJOR << 26) | - (region->tiled << 27) | + ((region->tiling != I915_TILING_NONE) << 27) | (BRW_SURFACE_2D << 29)); OUT_RELOC(region->buffer, - DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE, 0); + I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, + 0); OUT_BATCH((BRW_SURFACE_MIPMAPLAYOUT_BELOW << 1) | ((region->pitch - 1) << 6) | ((region->height - 1) << 19)); OUT_BATCH(0); - if (BRW_IS_GM45(brw) || BRW_IS_G4X(brw)) + if (BRW_IS_G4X(brw)) OUT_BATCH(0); ADVANCE_BATCH(); @@ -324,7 +344,7 @@ static void upload_aa_line_parameters(struct brw_context *brw) { struct brw_aa_line_parameters balp; - if (!(BRW_IS_GM45(brw) || BRW_IS_G4X(brw))) + if (!BRW_IS_G4X(brw)) return; /* use legacy aa line coverage computation */ @@ -380,40 +400,6 @@ const struct brw_tracked_state brw_line_stipple = { }; - -/*********************************************************************** - * Misc constant state packets - */ - -static void upload_pipe_control(struct brw_context *brw) -{ - struct brw_pipe_control pc; - - return; - - memset(&pc, 0, sizeof(pc)); - - pc.header.opcode = CMD_PIPE_CONTROL; - pc.header.length = sizeof(pc)/4 - 2; - pc.header.post_sync_operation = PIPE_CONTROL_NOWRITE; - - pc.header.instruction_state_cache_flush_enable = 1; - - pc.bits1.dest_addr_type = PIPE_CONTROL_GTTWRITE_GLOBAL; - - BRW_BATCH_STRUCT(brw, &pc); -} - -const struct brw_tracked_state brw_pipe_control = { - .dirty = { - .mesa = 0, - .brw = BRW_NEW_BATCH, - .cache = 0 - }, - .emit = upload_pipe_control -}; - - /*********************************************************************** * Misc invarient state packets */ diff --git a/i965/brw_program.c b/i965/brw_program.c index c38610b..0c86911 100644 --- a/i965/brw_program.c +++ b/i965/brw_program.c @@ -111,13 +111,18 @@ static void brwProgramStringNotify( GLcontext *ctx, struct gl_program *prog ) { if (target == GL_FRAGMENT_PROGRAM_ARB) { + struct gl_fragment_program *fprog = (struct gl_fragment_program *) prog; struct brw_context *brw = brw_context(ctx); struct brw_fragment_program *p = (struct brw_fragment_program *)prog; struct brw_fragment_program *fp = (struct brw_fragment_program *)brw->fragment_program; + if (fprog->FogOption) { + _mesa_append_fog_code(ctx, fprog); + fprog->FogOption = GL_NONE; + } + if (p == fp) brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM; p->id = brw->program_id++; - p->param_state = p->program.Base.Parameters->StateFlags; } else if (target == GL_VERTEX_PROGRAM_ARB) { struct brw_context *brw = brw_context(ctx); @@ -129,7 +134,6 @@ static void brwProgramStringNotify( GLcontext *ctx, _mesa_insert_mvp_code(ctx, &p->program); } p->id = brw->program_id++; - p->param_state = p->program.Base.Parameters->StateFlags; /* Also tell tnl about it: */ diff --git a/i965/brw_queryobj.c b/i965/brw_queryobj.c new file mode 100644 index 0000000..cb9169e --- /dev/null +++ b/i965/brw_queryobj.c @@ -0,0 +1,259 @@ +/* + * Copyright © 2008 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <eric@anholt.net> + * + */ + +/** @file support for ARB_query_object + * + * ARB_query_object is implemented by using the PIPE_CONTROL command to stall + * execution on the completion of previous depth tests, and write the + * current PS_DEPTH_COUNT to a buffer object. + * + * We use before and after counts when drawing during a query so that + * we don't pick up other clients' query data in ours. To reduce overhead, + * a single BO is used to record the query data for all active queries at + * once. This also gives us a simple bound on how much batchbuffer space is + * required for handling queries, so that we can be sure that we won't + * have to emit a batchbuffer without getting the ending PS_DEPTH_COUNT. + */ +#include "main/simple_list.h" +#include "main/imports.h" + +#include "brw_context.h" +#include "brw_state.h" +#include "intel_batchbuffer.h" +#include "intel_reg.h" + +/** Waits on the query object's BO and totals the results for this query */ +static void +brw_queryobj_get_results(struct brw_query_object *query) +{ + int i; + uint64_t *results; + + if (query->bo == NULL) + return; + + /* Map and count the pixels from the current query BO */ + dri_bo_map(query->bo, GL_FALSE); + results = query->bo->virtual; + for (i = query->first_index; i <= query->last_index; i++) { + query->Base.Result += results[i * 2 + 1] - results[i * 2]; + } + dri_bo_unmap(query->bo); + + dri_bo_unreference(query->bo); + query->bo = NULL; +} + +static struct gl_query_object * +brw_new_query_object(GLcontext *ctx, GLuint id) +{ + struct brw_query_object *query; + + query = _mesa_calloc(sizeof(struct brw_query_object)); + + query->Base.Id = id; + query->Base.Result = 0; + query->Base.Active = GL_FALSE; + query->Base.Ready = GL_TRUE; + + return &query->Base; +} + +static void +brw_delete_query(GLcontext *ctx, struct gl_query_object *q) +{ + struct brw_query_object *query = (struct brw_query_object *)q; + + dri_bo_unreference(query->bo); + _mesa_free(query); +} + +static void +brw_begin_query(GLcontext *ctx, struct gl_query_object *q) +{ + struct brw_context *brw = brw_context(ctx); + struct intel_context *intel = intel_context(ctx); + struct brw_query_object *query = (struct brw_query_object *)q; + + /* Reset our driver's tracking of query state. */ + dri_bo_unreference(query->bo); + query->bo = NULL; + query->first_index = -1; + query->last_index = -1; + + insert_at_head(&brw->query.active_head, query); + intel->stats_wm++; +} + +/** + * Begin the ARB_occlusion_query query on a query object. + */ +static void +brw_end_query(GLcontext *ctx, struct gl_query_object *q) +{ + struct brw_context *brw = brw_context(ctx); + struct intel_context *intel = intel_context(ctx); + struct brw_query_object *query = (struct brw_query_object *)q; + + /* Flush the batchbuffer in case it has writes to our query BO. + * Have later queries write to a new query BO so that further rendering + * doesn't delay the collection of our results. + */ + if (query->bo) { + brw_emit_query_end(brw); + intel_batchbuffer_flush(intel->batch); + + dri_bo_unreference(brw->query.bo); + brw->query.bo = NULL; + } + + remove_from_list(query); + + intel->stats_wm--; +} + +static void brw_wait_query(GLcontext *ctx, struct gl_query_object *q) +{ + struct brw_query_object *query = (struct brw_query_object *)q; + + brw_queryobj_get_results(query); + query->Base.Ready = GL_TRUE; +} + +static void brw_check_query(GLcontext *ctx, struct gl_query_object *q) +{ + /* XXX: Need to expose dri_bo_is_idle from bufmgr. */ +#if 0 + struct brw_query_object *query = (struct brw_query_object *)q; + + if (dri_bo_is_idle(query->bo)) { + brw_queryobj_get_results(query); + query->Base.Ready = GL_TRUE; + } +#else + brw_wait_query(ctx, q); +#endif +} + +/** Called to set up the query BO and account for its aperture space */ +void +brw_prepare_query_begin(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + + /* Skip if we're not doing any queries. */ + if (is_empty_list(&brw->query.active_head)) + return; + + /* Get a new query BO if we're going to need it. */ + if (brw->query.bo == NULL || + brw->query.index * 2 + 1 >= 4096 / sizeof(uint64_t)) { + dri_bo_unreference(brw->query.bo); + brw->query.bo = NULL; + + brw->query.bo = dri_bo_alloc(intel->bufmgr, "query", 4096, 1); + brw->query.index = 0; + } + + brw_add_validated_bo(brw, brw->query.bo); +} + +/** Called just before primitive drawing to get a beginning PS_DEPTH_COUNT. */ +void +brw_emit_query_begin(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + struct brw_query_object *query; + + /* Skip if we're not doing any queries, or we've emitted the start. */ + if (brw->query.active || is_empty_list(&brw->query.active_head)) + return; + + BEGIN_BATCH(4, IGNORE_CLIPRECTS); + OUT_BATCH(_3DSTATE_PIPE_CONTROL | + PIPE_CONTROL_DEPTH_STALL | + PIPE_CONTROL_WRITE_DEPTH_COUNT); + /* This object could be mapped cacheable, but we don't have an exposed + * mechanism to support that. Since it's going uncached, tell GEM that + * we're writing to it. The usual clflush should be all that's required + * to pick up the results. + */ + OUT_RELOC(brw->query.bo, + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, + PIPE_CONTROL_GLOBAL_GTT_WRITE | + ((brw->query.index * 2) * sizeof(uint64_t))); + OUT_BATCH(0); + OUT_BATCH(0); + ADVANCE_BATCH(); + + foreach(query, &brw->query.active_head) { + if (query->bo != brw->query.bo) { + if (query->bo != NULL) + brw_queryobj_get_results(query); + dri_bo_reference(brw->query.bo); + query->bo = brw->query.bo; + query->first_index = brw->query.index; + } + query->last_index = brw->query.index; + } + brw->query.active = GL_TRUE; +} + +/** Called at batchbuffer flush to get an ending PS_DEPTH_COUNT */ +void +brw_emit_query_end(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + + if (!brw->query.active) + return; + + BEGIN_BATCH(4, IGNORE_CLIPRECTS); + OUT_BATCH(_3DSTATE_PIPE_CONTROL | + PIPE_CONTROL_DEPTH_STALL | + PIPE_CONTROL_WRITE_DEPTH_COUNT); + OUT_RELOC(brw->query.bo, + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, + PIPE_CONTROL_GLOBAL_GTT_WRITE | + ((brw->query.index * 2 + 1) * sizeof(uint64_t))); + OUT_BATCH(0); + OUT_BATCH(0); + ADVANCE_BATCH(); + + brw->query.active = GL_FALSE; + brw->query.index++; +} + +void brw_init_queryobj_functions(struct dd_function_table *functions) +{ + functions->NewQueryObject = brw_new_query_object; + functions->DeleteQuery = brw_delete_query; + functions->BeginQuery = brw_begin_query; + functions->EndQuery = brw_end_query; + functions->CheckQuery = brw_check_query; + functions->WaitQuery = brw_wait_query; +} diff --git a/i965/brw_sf.c b/i965/brw_sf.c index 0b61748..1a11d54 100644 --- a/i965/brw_sf.c +++ b/i965/brw_sf.c @@ -30,9 +30,9 @@ */ -#include "glheader.h" -#include "macros.h" -#include "enums.h" +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" #include "intel_batchbuffer.h" @@ -73,10 +73,12 @@ static void compile_sf_prog( struct brw_context *brw, c.attr_to_idx[i] = idx; c.idx_to_attr[idx] = i; if (i >= VERT_RESULT_TEX0 && i <= VERT_RESULT_TEX7) { - c.point_attrs[i].CoordReplace = - brw->attribs.Point->CoordReplace[i - VERT_RESULT_TEX0]; - } else - c.point_attrs[i].CoordReplace = GL_FALSE; + c.point_attrs[i].CoordReplace = + brw->attribs.Point->CoordReplace[i - VERT_RESULT_TEX0]; + } + else { + c.point_attrs[i].CoordReplace = GL_FALSE; + } idx++; } @@ -106,7 +108,6 @@ static void compile_sf_prog( struct brw_context *brw, assert(0); return; } - /* get the program */ @@ -125,7 +126,7 @@ static void compile_sf_prog( struct brw_context *brw, /* Calculate interpolants for triangle and line rasterization. */ -static int upload_sf_prog( struct brw_context *brw ) +static void upload_sf_prog(struct brw_context *brw) { struct brw_sf_prog_key key; @@ -174,7 +175,6 @@ static int upload_sf_prog( struct brw_context *brw ) &brw->sf.prog_data); if (brw->sf.prog_bo == NULL) compile_sf_prog( brw, &key ); - return dri_bufmgr_check_aperture_space(brw->sf.prog_bo); } diff --git a/i965/brw_sf_emit.c b/i965/brw_sf_emit.c index 6fba8c8..ffdb0ae 100644 --- a/i965/brw_sf_emit.c +++ b/i965/brw_sf_emit.c @@ -30,9 +30,9 @@ */ -#include "glheader.h" -#include "macros.h" -#include "enums.h" +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" #include "intel_batchbuffer.h" diff --git a/i965/brw_sf_state.c b/i965/brw_sf_state.c index 24388b7..242b704 100644 --- a/i965/brw_sf_state.c +++ b/i965/brw_sf_state.c @@ -34,30 +34,23 @@ #include "brw_context.h" #include "brw_state.h" #include "brw_defines.h" -#include "macros.h" +#include "main/macros.h" #include "intel_fbo.h" -static int upload_sf_vp(struct brw_context *brw) +static void upload_sf_vp(struct brw_context *brw) { GLcontext *ctx = &brw->intel.ctx; const GLfloat depth_scale = 1.0F / ctx->DrawBuffer->_DepthMaxF; struct brw_sf_viewport sfv; - struct intel_renderbuffer *irb = - intel_renderbuffer(ctx->DrawBuffer->_ColorDrawBuffers[0]); GLfloat y_scale, y_bias; memset(&sfv, 0, sizeof(sfv)); - if (ctx->DrawBuffer->Name) { - /* User-created FBO */ - if (irb && !irb->RenderToTexture) { - y_scale = -1.0; - y_bias = ctx->DrawBuffer->Height; - } else { - y_scale = 1.0; - y_bias = 0; - } - } else { + if (intel_rendering_to_texture(ctx)) { + y_scale = 1.0; + y_bias = 0; + } + else { y_scale = -1.0; y_bias = ctx->DrawBuffer->Height; } @@ -98,8 +91,6 @@ static int upload_sf_vp(struct brw_context *brw) dri_bo_unreference(brw->sf.vp_bo); brw->sf.vp_bo = brw_cache_data( &brw->cache, BRW_SF_VP, &sfv, NULL, 0 ); - - return dri_bufmgr_check_aperture_space(brw->sf.vp_bo); } const struct brw_tracked_state brw_sf_vp = { @@ -122,6 +113,7 @@ struct brw_sf_unit_key { GLboolean scissor, line_smooth, point_sprite, point_attenuated; float line_width; float point_size; + GLboolean render_to_texture; }; static void @@ -152,6 +144,8 @@ sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key) key->point_sprite = brw->attribs.Point->PointSprite; key->point_size = brw->attribs.Point->Size; key->point_attenuated = brw->attribs.Point->_Attenuated; + + key->render_to_texture = intel_rendering_to_texture(&brw->intel.ctx); } static dri_bo * @@ -174,7 +168,8 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key, sf.thread4.nr_urb_entries = key->nr_urb_entries; sf.thread4.urb_entry_allocation_size = key->sfsize - 1; - sf.thread4.max_threads = MIN2(12, key->nr_urb_entries / 2) - 1; + /* Each SF thread produces 1 PUE, and there can be up to 24 threads */ + sf.thread4.max_threads = MIN2(24, key->nr_urb_entries) - 1; if (INTEL_DEBUG & DEBUG_SINGLE_THREAD) sf.thread4.max_threads = 0; @@ -197,6 +192,11 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key, else sf.sf5.front_winding = BRW_FRONTWINDING_CW; + /* The viewport is inverted for rendering to texture, and that inverts + * polygon front/back orientation. + */ + sf.sf5.front_winding ^= key->render_to_texture; + switch (key->cull_face) { case GL_FRONT: sf.sf6.cull_mode = BRW_CULLMODE_FRONT; @@ -230,7 +230,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key, /* XXX clamp max depends on AA vs. non-AA */ sf.sf7.sprite_point = key->point_sprite; - sf.sf7.point_size = CLAMP(nearbyint(key->point_size), 1, 255) * (1<<3); + sf.sf7.point_size = CLAMP(rint(key->point_size), 1, 255) * (1<<3); sf.sf7.use_point_size_state = !key->point_attenuated; sf.sf7.aa_line_distance_mode = 0; @@ -253,27 +253,26 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key, NULL, NULL); /* Emit SF program relocation */ - dri_emit_reloc(bo, - DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, - sf.thread0.grf_reg_count << 1, - offsetof(struct brw_sf_unit_state, thread0), - brw->sf.prog_bo); + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + sf.thread0.grf_reg_count << 1, + offsetof(struct brw_sf_unit_state, thread0), + brw->sf.prog_bo); /* Emit SF viewport relocation */ - dri_emit_reloc(bo, - DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, - sf.sf5.front_winding | (sf.sf5.viewport_transform << 1), - offsetof(struct brw_sf_unit_state, sf5), - brw->sf.vp_bo); + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + sf.sf5.front_winding | (sf.sf5.viewport_transform << 1), + offsetof(struct brw_sf_unit_state, sf5), + brw->sf.vp_bo); return bo; } -static int upload_sf_unit( struct brw_context *brw ) +static void upload_sf_unit( struct brw_context *brw ) { struct brw_sf_unit_key key; dri_bo *reloc_bufs[2]; - int ret = 0; sf_unit_populate_key(brw, &key); @@ -288,15 +287,6 @@ static int upload_sf_unit( struct brw_context *brw ) if (brw->sf.state_bo == NULL) { brw->sf.state_bo = sf_unit_create_from_key(brw, &key, reloc_bufs); } - - if (reloc_bufs[0]) - ret |= dri_bufmgr_check_aperture_space(reloc_bufs[0]); - - if (reloc_bufs[1]) - ret |= dri_bufmgr_check_aperture_space(reloc_bufs[1]); - - ret |= dri_bufmgr_check_aperture_space(brw->sf.state_bo); - return ret; } const struct brw_tracked_state brw_sf_unit = { diff --git a/i965/brw_state.h b/i965/brw_state.h index d1fca05..df839c5 100644 --- a/i965/brw_state.h +++ b/i965/brw_state.h @@ -35,6 +35,16 @@ #include "brw_context.h" +static inline void +brw_add_validated_bo(struct brw_context *brw, dri_bo *bo) +{ + assert(brw->state.validated_bo_count < ARRAY_SIZE(brw->state.validated_bos)); + + if (bo != NULL) { + dri_bo_reference(bo); + brw->state.validated_bos[brw->state.validated_bo_count++] = bo; + } +}; const struct brw_tracked_state brw_blend_constant_color; const struct brw_tracked_state brw_cc_unit; @@ -73,13 +83,23 @@ const struct brw_tracked_state brw_wm_unit; const struct brw_tracked_state brw_psp_urb_cbs; -const struct brw_tracked_state brw_active_vertprog; -const struct brw_tracked_state brw_tnl_vertprog; const struct brw_tracked_state brw_pipe_control; const struct brw_tracked_state brw_clear_surface_cache; const struct brw_tracked_state brw_clear_batch_cache; +const struct brw_tracked_state brw_drawing_rect; +const struct brw_tracked_state brw_indices; +const struct brw_tracked_state brw_vertices; + +/*********************************************************************** + * brw_state.c + */ +void brw_validate_state(struct brw_context *brw); +void brw_upload_state(struct brw_context *brw); +void brw_init_state(struct brw_context *brw); +void brw_destroy_state(struct brw_context *brw); + /*********************************************************************** * brw_state_cache.c */ diff --git a/i965/brw_state_batch.c b/i965/brw_state_batch.c index 77e2736..dc87859 100644 --- a/i965/brw_state_batch.c +++ b/i965/brw_state_batch.c @@ -33,7 +33,7 @@ #include "brw_state.h" #include "intel_batchbuffer.h" -#include "imports.h" +#include "main/imports.h" @@ -97,8 +97,6 @@ void brw_clear_batch_cache_flush( struct brw_context *brw ) { clear_batch_cache(brw); - brw->wrap = 0; - /* brw_do_flush(brw, BRW_FLUSH_STATE_CACHE|BRW_FLUSH_READ_CACHE); */ brw->state.dirty.mesa |= ~0; diff --git a/i965/brw_state_cache.c b/i965/brw_state_cache.c index d617650..d5b5166 100644 --- a/i965/brw_state_cache.c +++ b/i965/brw_state_cache.c @@ -58,7 +58,7 @@ #include "brw_state.h" #include "intel_batchbuffer.h" -#include "imports.h" +#include "main/imports.h" /* XXX: Fixme - have to include these to get the sizes of the prog_key * structs: @@ -214,10 +214,7 @@ brw_upload_cache( struct brw_cache *cache, /* Create the buffer object to contain the data */ bo = dri_bo_alloc(cache->brw->intel.bufmgr, - cache->name[cache_id], data_size, 1 << 6, - DRM_BO_FLAG_MEM_LOCAL | - DRM_BO_FLAG_CACHED | - DRM_BO_FLAG_CACHED_MAPPED); + cache->name[cache_id], data_size, 1 << 6); /* Set up the memory containing the key, aux_data, and reloc_bufs */ @@ -500,9 +497,10 @@ void brw_destroy_cache( struct brw_context *brw ) GLuint i; brw_clear_cache(brw); - for (i = 0; i < BRW_MAX_CACHE; i++) + for (i = 0; i < BRW_MAX_CACHE; i++) { + dri_bo_unreference(brw->cache.last_bo[i]); free(brw->cache.name[i]); - + } free(brw->cache.items); brw->cache.items = NULL; brw->cache.size = 0; diff --git a/i965/brw_state_dump.c b/i965/brw_state_dump.c index 3a93f9f..b28c57c 100644 --- a/i965/brw_state_dump.c +++ b/i965/brw_state_dump.c @@ -25,7 +25,7 @@ * */ -#include "mtypes.h" +#include "main/mtypes.h" #include "brw_context.h" #include "brw_state.h" diff --git a/i965/brw_state_upload.c b/i965/brw_state_upload.c index 3b2ccd4..4845859 100644 --- a/i965/brw_state_upload.c +++ b/i965/brw_state_upload.c @@ -33,7 +33,6 @@ #include "brw_context.h" #include "brw_state.h" -#include "dri_bufmgr.h" #include "intel_batchbuffer.h" /* This is used to initialize brw->state.atoms[]. We could use this @@ -46,8 +45,6 @@ const struct brw_tracked_state *atoms[] = { &brw_check_fallback, - &brw_tnl_vertprog, - &brw_active_vertprog, &brw_wm_input_sizes, &brw_vs_prog, &brw_gs_prog, @@ -80,7 +77,6 @@ const struct brw_tracked_state *atoms[] = */ &brw_invarient_state, &brw_state_base_address, - &brw_pipe_control, &brw_binding_table_pointers, &brw_blend_constant_color, @@ -102,6 +98,9 @@ const struct brw_tracked_state *atoms[] = &brw_psp_urb_cbs, #endif + &brw_drawing_rect, + &brw_indices, + &brw_vertices, NULL, /* brw_constant_buffer */ }; @@ -169,48 +168,172 @@ static void xor_states( struct brw_state_flags *result, result->cache = a->cache ^ b->cache; } +static void +brw_clear_validated_bos(struct brw_context *brw) +{ + int i; + + /* Clear the last round of validated bos */ + for (i = 0; i < brw->state.validated_bo_count; i++) { + dri_bo_unreference(brw->state.validated_bos[i]); + brw->state.validated_bos[i] = NULL; + } + brw->state.validated_bo_count = 0; +} + +struct dirty_bit_map { + uint32_t bit; + char *name; + uint32_t count; +}; + +#define DEFINE_BIT(name) {name, #name, 0} + +static struct dirty_bit_map mesa_bits[] = { + DEFINE_BIT(_NEW_MODELVIEW), + DEFINE_BIT(_NEW_PROJECTION), + DEFINE_BIT(_NEW_TEXTURE_MATRIX), + DEFINE_BIT(_NEW_COLOR_MATRIX), + DEFINE_BIT(_NEW_ACCUM), + DEFINE_BIT(_NEW_COLOR), + DEFINE_BIT(_NEW_DEPTH), + DEFINE_BIT(_NEW_EVAL), + DEFINE_BIT(_NEW_FOG), + DEFINE_BIT(_NEW_HINT), + DEFINE_BIT(_NEW_LIGHT), + DEFINE_BIT(_NEW_LINE), + DEFINE_BIT(_NEW_PIXEL), + DEFINE_BIT(_NEW_POINT), + DEFINE_BIT(_NEW_POLYGON), + DEFINE_BIT(_NEW_POLYGONSTIPPLE), + DEFINE_BIT(_NEW_SCISSOR), + DEFINE_BIT(_NEW_STENCIL), + DEFINE_BIT(_NEW_TEXTURE), + DEFINE_BIT(_NEW_TRANSFORM), + DEFINE_BIT(_NEW_VIEWPORT), + DEFINE_BIT(_NEW_PACKUNPACK), + DEFINE_BIT(_NEW_ARRAY), + DEFINE_BIT(_NEW_RENDERMODE), + DEFINE_BIT(_NEW_BUFFERS), + DEFINE_BIT(_NEW_MULTISAMPLE), + DEFINE_BIT(_NEW_TRACK_MATRIX), + DEFINE_BIT(_NEW_PROGRAM), + {0, 0, 0} +}; + +static struct dirty_bit_map brw_bits[] = { + DEFINE_BIT(BRW_NEW_URB_FENCE), + DEFINE_BIT(BRW_NEW_FRAGMENT_PROGRAM), + DEFINE_BIT(BRW_NEW_VERTEX_PROGRAM), + DEFINE_BIT(BRW_NEW_INPUT_DIMENSIONS), + DEFINE_BIT(BRW_NEW_CURBE_OFFSETS), + DEFINE_BIT(BRW_NEW_REDUCED_PRIMITIVE), + DEFINE_BIT(BRW_NEW_PRIMITIVE), + DEFINE_BIT(BRW_NEW_CONTEXT), + DEFINE_BIT(BRW_NEW_WM_INPUT_DIMENSIONS), + DEFINE_BIT(BRW_NEW_INPUT_VARYING), + DEFINE_BIT(BRW_NEW_PSP), + DEFINE_BIT(BRW_NEW_METAOPS), + DEFINE_BIT(BRW_NEW_FENCE), + DEFINE_BIT(BRW_NEW_INDICES), + DEFINE_BIT(BRW_NEW_VERTICES), + DEFINE_BIT(BRW_NEW_BATCH), + DEFINE_BIT(BRW_NEW_DEPTH_BUFFER), + {0, 0, 0} +}; + +static struct dirty_bit_map cache_bits[] = { + DEFINE_BIT(CACHE_NEW_CC_VP), + DEFINE_BIT(CACHE_NEW_CC_UNIT), + DEFINE_BIT(CACHE_NEW_WM_PROG), + DEFINE_BIT(CACHE_NEW_SAMPLER_DEFAULT_COLOR), + DEFINE_BIT(CACHE_NEW_SAMPLER), + DEFINE_BIT(CACHE_NEW_WM_UNIT), + DEFINE_BIT(CACHE_NEW_SF_PROG), + DEFINE_BIT(CACHE_NEW_SF_VP), + DEFINE_BIT(CACHE_NEW_SF_UNIT), + DEFINE_BIT(CACHE_NEW_VS_UNIT), + DEFINE_BIT(CACHE_NEW_VS_PROG), + DEFINE_BIT(CACHE_NEW_GS_UNIT), + DEFINE_BIT(CACHE_NEW_GS_PROG), + DEFINE_BIT(CACHE_NEW_CLIP_VP), + DEFINE_BIT(CACHE_NEW_CLIP_UNIT), + DEFINE_BIT(CACHE_NEW_CLIP_PROG), + DEFINE_BIT(CACHE_NEW_SURFACE), + DEFINE_BIT(CACHE_NEW_SURF_BIND), + {0, 0, 0} +}; + + +static void +brw_update_dirty_count(struct dirty_bit_map *bit_map, int32_t bits) +{ + int i; + + for (i = 0; i < 32; i++) { + if (bit_map[i].bit == 0) + return; + + if (bit_map[i].bit & bits) + bit_map[i].count++; + } +} + +static void +brw_print_dirty_count(struct dirty_bit_map *bit_map, int32_t bits) +{ + int i; + + for (i = 0; i < 32; i++) { + if (bit_map[i].bit == 0) + return; + + fprintf(stderr, "0x%08x: %12d (%s)\n", + bit_map[i].bit, bit_map[i].count, bit_map[i].name); + } +} /*********************************************************************** * Emit all state: */ -int brw_validate_state( struct brw_context *brw ) +void brw_validate_state( struct brw_context *brw ) { + struct intel_context *intel = &brw->intel; struct brw_state_flags *state = &brw->state.dirty; - GLuint i, ret, count; + GLuint i; + + brw_clear_validated_bos(brw); state->mesa |= brw->intel.NewGLState; brw->intel.NewGLState = 0; - if (brw->wrap) - state->brw |= BRW_NEW_CONTEXT; + brw_add_validated_bo(brw, intel->batch->buf); if (brw->emit_state_always) { state->mesa |= ~0; state->brw |= ~0; } - /* texenv program needs to notify us somehow when this happens: - * Some confusion about which state flag should represent this change. - */ if (brw->fragment_program != brw->attribs.FragmentProgram->_Current) { brw->fragment_program = brw->attribs.FragmentProgram->_Current; - brw->state.dirty.mesa |= _NEW_PROGRAM; brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM; } + if (brw->vertex_program != brw->attribs.VertexProgram->_Current) { + brw->vertex_program = brw->attribs.VertexProgram->_Current; + brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM; + } if (state->mesa == 0 && state->cache == 0 && state->brw == 0) - return 0; + return; if (brw->state.dirty.brw & BRW_NEW_CONTEXT) brw_clear_batch_cache_flush(brw); brw->intel.Fallback = 0; - count = 0; - /* do prepare stage for all atoms */ for (i = 0; i < Elements(atoms); i++) { const struct brw_tracked_state *atom = brw->state.atoms[i]; @@ -220,15 +343,20 @@ int brw_validate_state( struct brw_context *brw ) if (check_state(state, &atom->dirty)) { if (atom->prepare) { - ret = atom->prepare(brw); - if (ret) - return ret; + atom->prepare(brw); } } } +} + - if (brw->intel.Fallback) - return 0; +void brw_upload_state(struct brw_context *brw) +{ + struct brw_state_flags *state = &brw->state.dirty; + int i; + static int dirty_count = 0; + + brw_clear_validated_bos(brw); if (INTEL_DEBUG) { /* Debug version which enforces various sanity checks on the @@ -251,8 +379,9 @@ int brw_validate_state( struct brw_context *brw ) break; if (check_state(state, &atom->dirty)) { - if (atom->emit) + if (atom->emit) { atom->emit( brw ); + } } accumulate_state(&examined, &atom->dirty); @@ -274,13 +403,25 @@ int brw_validate_state( struct brw_context *brw ) break; if (check_state(state, &atom->dirty)) { - if (atom->emit) + if (atom->emit) { atom->emit( brw ); + } } } } + if (INTEL_DEBUG & DEBUG_STATE) { + brw_update_dirty_count(mesa_bits, state->mesa); + brw_update_dirty_count(brw_bits, state->brw); + brw_update_dirty_count(cache_bits, state->cache); + if (dirty_count++ % 1000 == 0) { + brw_print_dirty_count(mesa_bits, state->mesa); + brw_print_dirty_count(brw_bits, state->brw); + brw_print_dirty_count(cache_bits, state->cache); + fprintf(stderr, "\n"); + } + } + if (!brw->intel.Fallback) memset(state, 0, sizeof(*state)); - return 0; } diff --git a/i965/brw_structs.h b/i965/brw_structs.h index ec865c9..4e577d0 100644 --- a/i965/brw_structs.h +++ b/i965/brw_structs.h @@ -175,7 +175,7 @@ struct brw_depthbuffer } dword4; }; -struct brw_depthbuffer_gm45_g4x +struct brw_depthbuffer_g4x { union header_union header; @@ -1405,7 +1405,7 @@ struct brw_instruction GLuint msg_target:4; GLuint pad1:3; GLuint end_of_thread:1; - } sampler_gm45_g4x; + } sampler_g4x; struct brw_urb_immediate urb; diff --git a/i965/brw_tex.c b/i965/brw_tex.c index 258c626..0bb6f17 100644 --- a/i965/brw_tex.c +++ b/i965/brw_tex.c @@ -30,19 +30,19 @@ */ -#include "glheader.h" -#include "mtypes.h" -#include "imports.h" -#include "simple_list.h" -#include "enums.h" -#include "image.h" -#include "teximage.h" -#include "texstore.h" -#include "texformat.h" +#include "main/glheader.h" +#include "main/mtypes.h" +#include "main/imports.h" +#include "main/simple_list.h" +#include "main/enums.h" +#include "main/image.h" +#include "main/teximage.h" +#include "main/texstore.h" +#include "main/texformat.h" + #include "texmem.h" #include "intel_context.h" -#include "intel_ioctl.h" #include "intel_regions.h" #include "intel_tex.h" #include "brw_context.h" diff --git a/i965/brw_tex_layout.c b/i965/brw_tex_layout.c index e437c41..51a617f 100644 --- a/i965/brw_tex_layout.c +++ b/i965/brw_tex_layout.c @@ -36,7 +36,7 @@ #include "intel_mipmap_tree.h" #include "intel_tex_layout.h" #include "intel_context.h" -#include "macros.h" +#include "main/macros.h" #define FILE_DEBUG_FLAG DEBUG_MIPTREE diff --git a/i965/brw_urb.c b/i965/brw_urb.c index c423dbe..7673dd3 100644 --- a/i965/brw_urb.c +++ b/i965/brw_urb.c @@ -42,7 +42,44 @@ #define SF 3 #define CS 4 -/* XXX: Are the min_entry_size numbers useful? +/** @file brw_urb.c + * + * Manages the division of the URB space between the various fixed-function + * units. + * + * See the Thread Initiation Management section of the GEN4 B-Spec, and + * the individual *_STATE structures for restrictions on numbers of + * entries and threads. + */ + +/* + * Generally, a unit requires a min_nr_entries based on how many entries + * it produces before the downstream unit gets unblocked and can use and + * dereference some of its handles. + * + * The SF unit preallocates a PUE at the start of thread dispatch, and only + * uses that one. So it requires one entry per thread. + * + * For CLIP, the SF unit will hold the previous primitive while the + * next is getting assembled, meaning that linestrips require 3 CLIP VUEs + * (vertices) to ensure continued processing, trifans require 4, and tristrips + * require 5. There can be 1 or 2 threads, and each has the same requirement. + * + * GS has the same requirement as CLIP, but it never handles tristrips, + * so we can lower the minimum to 4 for the POLYGONs (trifans) it produces. + * We only run it single-threaded. + * + * For VS, the number of entries may be 8, 12, 16, or 32 (or 64 on G4X). + * Each thread processes 2 preallocated VUEs (vertices) at a time, and they + * get streamed down as soon as threads processing earlier vertices get + * theirs accepted. + * + * Each unit will take the number of URB entries we give it (based on the + * entry size calculated in brw_vs_emit.c for VUEs, brw_sf_emit.c for PUEs, + * and brw_curbe.c for the CURBEs) and decide its maximum number of + * threads it can support based on that. in brw_*_state.c. + * + * XXX: Are the min_entry_size numbers useful? * XXX: Verify min_nr_entries, esp for VS. * XXX: Verify SF min_entry_size. */ @@ -54,7 +91,7 @@ static const struct { } limits[CS+1] = { { 16, 32, 1, 5 }, /* vs */ { 4, 8, 1, 5 }, /* gs */ - { 6, 8, 1, 5 }, /* clp */ + { 5, 10, 1, 5 }, /* clp */ { 1, 8, 1, 12 }, /* sf */ { 1, 4, 1, 32 } /* cs */ }; @@ -74,7 +111,7 @@ static GLboolean check_urb_layout( struct brw_context *brw ) /* Most minimal update, forces re-emit of URB fence packet after GS * unit turned on/off. */ -static int recalculate_urb_fence( struct brw_context *brw ) +static void recalculate_urb_fence( struct brw_context *brw ) { GLuint csize = brw->curbe.total_size; GLuint vsize = brw->vs.prog_data->urb_entry_size; @@ -92,9 +129,9 @@ static int recalculate_urb_fence( struct brw_context *brw ) if (brw->urb.vsize < vsize || brw->urb.sfsize < sfsize || brw->urb.csize < csize || - (brw->urb.constrained && (brw->urb.vsize > brw->urb.vsize || - brw->urb.sfsize > brw->urb.sfsize || - brw->urb.csize > brw->urb.csize))) { + (brw->urb.constrained && (brw->urb.vsize > vsize || + brw->urb.sfsize > sfsize || + brw->urb.csize > csize))) { brw->urb.csize = csize; @@ -114,6 +151,10 @@ static int recalculate_urb_fence( struct brw_context *brw ) brw->urb.nr_sf_entries = limits[SF].min_nr_entries; brw->urb.nr_cs_entries = limits[CS].min_nr_entries; + /* Mark us as operating with constrained nr_entries, so that next + * time we recalculate we'll resize the fences in the hope of + * escaping constrained mode and getting back to normal performance. + */ brw->urb.constrained = 1; if (!check_urb_layout(brw)) { @@ -142,7 +183,6 @@ static int recalculate_urb_fence( struct brw_context *brw ) brw->state.dirty.brw |= BRW_NEW_URB_FENCE; } - return 0; } @@ -187,15 +227,3 @@ void brw_upload_urb_fence(struct brw_context *brw) BRW_BATCH_STRUCT(brw, &uf); } - - -#if 0 -const struct brw_tracked_state brw_urb_fence = { - .dirty = { - .mesa = 0, - .brw = BRW_NEW_URB_FENCE | BRW_NEW_PSP, - .cache = 0 - }, - .update = brw_upload_urb_fence -}; -#endif diff --git a/i965/brw_util.c b/i965/brw_util.c index d8d35c5..ce21aa4 100644 --- a/i965/brw_util.c +++ b/i965/brw_util.c @@ -30,7 +30,7 @@ */ -#include "mtypes.h" +#include "main/mtypes.h" #include "shader/prog_parameter.h" #include "brw_util.h" #include "brw_defines.h" diff --git a/i965/brw_util.h b/i965/brw_util.h index bd6cc0a..33e7cd8 100644 --- a/i965/brw_util.h +++ b/i965/brw_util.h @@ -33,7 +33,7 @@ #ifndef BRW_UTIL_H #define BRW_UTIL_H -#include "mtypes.h" +#include "main/mtypes.h" extern GLuint brw_count_bits( GLuint val ); extern GLuint brw_parameter_list_state_flags(struct gl_program_parameter_list *paramList); diff --git a/i965/brw_vs.c b/i965/brw_vs.c index f89b0e1..1db7cee 100644 --- a/i965/brw_vs.c +++ b/i965/brw_vs.c @@ -83,7 +83,7 @@ static void do_vs_prog( struct brw_context *brw, } -static int brw_upload_vs_prog( struct brw_context *brw ) +static void brw_upload_vs_prog(struct brw_context *brw) { struct brw_vs_prog_key key; struct brw_vertex_program *vp = @@ -115,7 +115,6 @@ static int brw_upload_vs_prog( struct brw_context *brw ) &brw->vs.prog_data); if (brw->vs.prog_bo == NULL) do_vs_prog(brw, vp, &key); - return dri_bufmgr_check_aperture_space(brw->vs.prog_bo); } diff --git a/i965/brw_vs.h b/i965/brw_vs.h index 41a33ff..22388ec 100644 --- a/i965/brw_vs.h +++ b/i965/brw_vs.h @@ -80,8 +80,4 @@ struct brw_vs_compile { void brw_vs_emit( struct brw_vs_compile *c ); - -void brw_ProgramCacheDestroy( GLcontext *ctx ); -void brw_ProgramCacheInit( GLcontext *ctx ); - #endif diff --git a/i965/brw_vs_constval.c b/i965/brw_vs_constval.c index a0106b8..6fbac02 100644 --- a/i965/brw_vs_constval.c +++ b/i965/brw_vs_constval.c @@ -30,7 +30,7 @@ */ -#include "macros.h" +#include "main/macros.h" #include "brw_context.h" #include "brw_vs.h" @@ -166,7 +166,7 @@ static GLuint get_input_size(struct brw_context *brw, /* Calculate sizes of vertex program outputs. Size is the largest * component index which might vary from [0,0,0,1] */ -static int calc_wm_input_sizes( struct brw_context *brw ) +static void calc_wm_input_sizes( struct brw_context *brw ) { /* BRW_NEW_VERTEX_PROGRAM */ struct brw_vertex_program *vp = @@ -210,7 +210,6 @@ static int calc_wm_input_sizes( struct brw_context *brw ) memcpy(brw->wm.input_size_masks, t.size_masks, sizeof(t.size_masks)); brw->state.dirty.brw |= BRW_NEW_WM_INPUT_DIMENSIONS; } - return 0; } const struct brw_tracked_state brw_wm_input_sizes = { diff --git a/i965/brw_vs_emit.c b/i965/brw_vs_emit.c index 8759826..174331a 100644 --- a/i965/brw_vs_emit.c +++ b/i965/brw_vs_emit.c @@ -73,8 +73,6 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) c->prog_data.curb_read_length = reg - 1; - - /* Allocate input regs: */ c->nr_inputs = 0; @@ -84,8 +82,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0); reg++; } - } - + } /* Allocate outputs: TODO: could organize the non-position outputs * to go straight into message regs. @@ -196,6 +193,7 @@ static void unalias1( struct brw_vs_compile *c, struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask); func(c, tmp, arg0); brw_MOV(p, dst, tmp); + release_tmp(c, tmp); } else { func(c, dst, arg0); @@ -217,12 +215,38 @@ static void unalias2( struct brw_vs_compile *c, struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask); func(c, tmp, arg0, arg1); brw_MOV(p, dst, tmp); + release_tmp(c, tmp); } else { func(c, dst, arg0, arg1); } } +static void unalias3( struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1, + struct brw_reg arg2, + void (*func)( struct brw_vs_compile *, + struct brw_reg, + struct brw_reg, + struct brw_reg, + struct brw_reg )) +{ + if ((dst.file == arg0.file && dst.nr == arg0.nr) || + (dst.file == arg1.file && dst.nr == arg1.nr) || + (dst.file == arg2.file && dst.nr == arg2.nr)) { + struct brw_compile *p = &c->func; + struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask); + func(c, tmp, arg0, arg1, arg2); + brw_MOV(p, dst, tmp); + release_tmp(c, tmp); + } + else { + func(c, dst, arg0, arg1, arg2); + } +} + static void emit_sop( struct brw_compile *p, struct brw_reg dst, struct brw_reg arg0, @@ -339,6 +363,7 @@ static void emit_math1( struct brw_vs_compile *c, } } + static void emit_math2( struct brw_vs_compile *c, GLuint function, struct brw_reg dst, @@ -370,7 +395,6 @@ static void emit_math2( struct brw_vs_compile *c, release_tmp(c, tmp); } } - static void emit_exp_noalias( struct brw_vs_compile *c, @@ -420,7 +444,7 @@ static void emit_exp_noalias( struct brw_vs_compile *c, BRW_MATH_FUNCTION_EXP, brw_writemask(dst, WRITEMASK_Z), brw_swizzle1(arg0, 0), - BRW_MATH_PRECISION_PARTIAL); + BRW_MATH_PRECISION_FULL); } if (dst.dw1.bits.writemask & WRITEMASK_W) { @@ -521,8 +545,6 @@ static void emit_log_noalias( struct brw_vs_compile *c, } - - /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1 */ static void emit_dst_noalias( struct brw_vs_compile *c, @@ -544,6 +566,7 @@ static void emit_dst_noalias( struct brw_vs_compile *c, brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1); } + static void emit_xpd( struct brw_compile *p, struct brw_reg dst, struct brw_reg t, @@ -554,7 +577,6 @@ static void emit_xpd( struct brw_compile *p, } - static void emit_lit_noalias( struct brw_vs_compile *c, struct brw_reg dst, struct brw_reg arg0 ) @@ -595,8 +617,42 @@ static void emit_lit_noalias( struct brw_vs_compile *c, brw_ENDIF(p, if_insn); } +static void emit_lrp_noalias(struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1, + struct brw_reg arg2) +{ + struct brw_compile *p = &c->func; + + brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0)); + brw_MUL(p, brw_null_reg(), dst, arg2); + brw_MAC(p, dst, arg0, arg1); +} + +/** 3 or 4-component vector normalization */ +static void emit_nrm( struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0, + int num_comps) +{ + struct brw_compile *p = &c->func; + struct brw_reg tmp = get_tmp(c); + + /* tmp = dot(arg0, arg0) */ + if (num_comps == 3) + brw_DP3(p, tmp, arg0, arg0); + else + brw_DP4(p, tmp, arg0, arg0); + + /* tmp = 1 / sqrt(tmp) */ + emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL); + /* dst = arg0 * tmp */ + brw_MUL(p, dst, arg0, tmp); + release_tmp(c, tmp); +} /* TODO: relative addressing! @@ -634,7 +690,6 @@ static struct brw_reg get_reg( struct brw_vs_compile *c, } - static struct brw_reg deref( struct brw_vs_compile *c, struct brw_reg arg, GLint offset) @@ -728,8 +783,6 @@ static struct brw_reg get_dst( struct brw_vs_compile *c, } - - static void emit_swz( struct brw_vs_compile *c, struct brw_reg dst, struct prog_src_register src ) @@ -801,8 +854,8 @@ static void emit_swz( struct brw_vs_compile *c, } - -/* Post-vertex-program processing. Send the results to the URB. +/** + * Post-vertex-program processing. Send the results to the URB. */ static void emit_vertex_write( struct brw_vs_compile *c) { @@ -817,9 +870,7 @@ static void emit_vertex_write( struct brw_vs_compile *c) get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG)); } - - /* Build ndc coords? TODO: Shortcircuit when w is known to be one. - */ + /* Build ndc coords */ if (!c->key.know_w_is_one) { ndc = get_tmp(c); emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL); @@ -829,12 +880,12 @@ static void emit_vertex_write( struct brw_vs_compile *c) ndc = pos; } - /* This includes the workaround for -ve rhw, so is no longer an - * optional step: + /* Update the header for point size, user clipping flags, and -ve rhw + * workaround. */ if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) || c->key.nr_userclip || - !c->key.know_w_is_one) + (!BRW_IS_G4X(p->brw) && !c->key.know_w_is_one)) { struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); GLuint i; @@ -849,7 +900,6 @@ static void emit_vertex_write( struct brw_vs_compile *c) brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8)); } - for (i = 0; i < c->key.nr_userclip; i++) { brw_set_conditionalmod(p, BRW_CONDITIONAL_L); brw_DP4(p, brw_null_reg(), pos, c->userplane[i]); @@ -857,7 +907,6 @@ static void emit_vertex_write( struct brw_vs_compile *c) brw_set_predicate_control(p, BRW_PREDICATE_NONE); } - /* i965 clipping workaround: * 1) Test for -ve rhw * 2) If set, @@ -867,7 +916,7 @@ static void emit_vertex_write( struct brw_vs_compile *c) * Later, clipping will detect ucp[6] and ensure the primitive is * clipped against all fixed planes. */ - if (!(BRW_IS_GM45(p->brw) || BRW_IS_G4X(p->brw)) && !c->key.know_w_is_one) { + if (!BRW_IS_G4X(p->brw) && !c->key.know_w_is_one) { brw_CMP(p, vec8(brw_null_reg()), BRW_CONDITIONAL_L, @@ -889,14 +938,12 @@ static void emit_vertex_write( struct brw_vs_compile *c) brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0)); } - /* Emit the (interleaved) headers for the two vertices - an 8-reg * of zeros followed by two sets of NDC coordinates: */ brw_set_access_mode(p, BRW_ALIGN_1); brw_MOV(p, offset(m0, 2), ndc); brw_MOV(p, offset(m0, 3), pos); - brw_urb_WRITE(p, brw_null_reg(), /* dest */ @@ -910,9 +957,9 @@ static void emit_vertex_write( struct brw_vs_compile *c) 1, /* writes complete */ 0, /* urb destination offset */ BRW_URB_SWIZZLE_INTERLEAVE); - } + static void post_vs_emit( struct brw_vs_compile *c, struct brw_instruction *end_inst ) { @@ -959,7 +1006,7 @@ void brw_vs_emit(struct brw_vs_compile *c ) GLuint file; if (INTEL_DEBUG & DEBUG_VS) { - _mesa_printf("\n\n\nvs-emit:\n"); + _mesa_printf("vs-emit:\n"); _mesa_print_program(&c->vp->program.Base); _mesa_printf("\n"); } @@ -1017,6 +1064,11 @@ void brw_vs_emit(struct brw_vs_compile *c ) else dst = get_dst(c, inst->DstReg); + if (inst->SaturateMode != SATURATE_OFF) { + _mesa_problem(NULL, "Unsupported saturate %d in vertex shader", + inst->SaturateMode); + } + switch (inst->Opcode) { case OPCODE_ABS: brw_MOV(p, dst, brw_abs(args[0])); @@ -1024,6 +1076,9 @@ void brw_vs_emit(struct brw_vs_compile *c ) case OPCODE_ADD: brw_ADD(p, dst, args[0], args[1]); break; + case OPCODE_COS: + emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL); + break; case OPCODE_DP3: brw_DP3(p, dst, args[0], args[1]); break; @@ -1033,6 +1088,12 @@ void brw_vs_emit(struct brw_vs_compile *c ) case OPCODE_DPH: brw_DPH(p, dst, args[0], args[1]); break; + case OPCODE_NRM3: + emit_nrm(c, dst, args[0], 3); + break; + case OPCODE_NRM4: + emit_nrm(c, dst, args[0], 4); + break; case OPCODE_DST: unalias2(c, dst, args[0], args[1], emit_dst_noalias); break; @@ -1060,6 +1121,9 @@ void brw_vs_emit(struct brw_vs_compile *c ) case OPCODE_LIT: unalias1(c, dst, args[0], emit_lit_noalias); break; + case OPCODE_LRP: + unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias); + break; case OPCODE_MAD: brw_MOV(p, brw_acc_reg(), args[2]); brw_MAC(p, dst, args[0], args[1]); @@ -1089,6 +1153,9 @@ void brw_vs_emit(struct brw_vs_compile *c ) case OPCODE_SEQ: emit_seq(p, dst, args[0], args[1]); break; + case OPCODE_SIN: + emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL); + break; case OPCODE_SNE: emit_sne(p, dst, args[0], args[1]); break; @@ -1097,7 +1164,7 @@ void brw_vs_emit(struct brw_vs_compile *c ) break; case OPCODE_SGT: emit_sgt(p, dst, args[0], args[1]); - break; + break; case OPCODE_SLT: emit_slt(p, dst, args[0], args[1]); break; @@ -1113,6 +1180,10 @@ void brw_vs_emit(struct brw_vs_compile *c ) */ emit_swz(c, dst, inst->SrcReg[0] ); break; + case OPCODE_TRUNC: + /* round toward zero */ + brw_RNDZ(p, dst, args[0]); + break; case OPCODE_XPD: emit_xpd(p, dst, args[0], args[1]); break; @@ -1131,7 +1202,7 @@ void brw_vs_emit(struct brw_vs_compile *c ) brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16)); brw_set_predicate_control_flag_value(p, 0xff); - break; + break; case OPCODE_CAL: brw_set_access_mode(p, BRW_ALIGN_1); brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16)); @@ -1140,7 +1211,7 @@ void brw_vs_emit(struct brw_vs_compile *c ) get_addr_reg(stack_index), brw_imm_d(4)); inst->Data = &p->store[p->nr_insn]; brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16)); - break; + break; case OPCODE_RET: brw_ADD(p, get_addr_reg(stack_index), get_addr_reg(stack_index), brw_imm_d(-4)); @@ -1149,14 +1220,17 @@ void brw_vs_emit(struct brw_vs_compile *c ) brw_set_access_mode(p, BRW_ALIGN_16); case OPCODE_END: brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16)); - break; + break; case OPCODE_PRINT: case OPCODE_BGNSUB: case OPCODE_ENDSUB: + /* no-op instructions */ break; default: - _mesa_printf("Unsupport opcode %d in vertex shader\n", inst->Opcode); - break; + _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader", + inst->Opcode, inst->Opcode < MAX_OPCODE ? + _mesa_opcode_string(inst->Opcode) : + "unknown"); } if ((inst->DstReg.File == PROGRAM_OUTPUT) diff --git a/i965/brw_vs_state.c b/i965/brw_vs_state.c index 2a64f3d..9425816 100644 --- a/i965/brw_vs_state.c +++ b/i965/brw_vs_state.c @@ -34,7 +34,7 @@ #include "brw_context.h" #include "brw_state.h" #include "brw_defines.h" -#include "macros.h" +#include "main/macros.h" struct brw_vs_unit_key { unsigned int total_grf; @@ -77,12 +77,19 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key) { struct brw_vs_unit_state vs; dri_bo *bo; + int chipset_max_threads; memset(&vs, 0, sizeof(vs)); vs.thread0.kernel_start_pointer = brw->vs.prog_bo->offset >> 6; /* reloc */ vs.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1; vs.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; + /* Choosing multiple program flow means that we may get 2-vertex threads, + * which will have the channel mask for dwords 4-7 enabled in the thread, + * and those dwords will be written to the second URB handle when we + * brw_urb_WRITE() results. + */ + vs.thread1.single_program_flow = 0; vs.thread3.urb_entry_read_length = key->urb_entry_read_length; vs.thread3.const_urb_entry_read_length = key->curb_entry_read_length; vs.thread3.dispatch_grf_start_reg = 1; @@ -91,8 +98,13 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key) vs.thread4.nr_urb_entries = key->nr_urb_entries; vs.thread4.urb_entry_allocation_size = key->urb_size - 1; - vs.thread4.max_threads = MIN2(MAX2(0, (key->nr_urb_entries - 6) / 2 - 1), - 15); + + if (BRW_IS_G4X(brw)) + chipset_max_threads = 32; + else + chipset_max_threads = 16; + vs.thread4.max_threads = CLAMP(key->nr_urb_entries / 2, + 1, chipset_max_threads) - 1; if (INTEL_DEBUG & DEBUG_SINGLE_THREAD) vs.thread4.max_threads = 0; @@ -115,16 +127,16 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key) NULL, NULL); /* Emit VS program relocation */ - dri_emit_reloc(bo, - DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, - vs.thread0.grf_reg_count << 1, - offsetof(struct brw_vs_unit_state, thread0), - brw->vs.prog_bo); + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + vs.thread0.grf_reg_count << 1, + offsetof(struct brw_vs_unit_state, thread0), + brw->vs.prog_bo); return bo; } -static int prepare_vs_unit( struct brw_context *brw ) +static void prepare_vs_unit(struct brw_context *brw) { struct brw_vs_unit_key key; @@ -138,7 +150,6 @@ static int prepare_vs_unit( struct brw_context *brw ) if (brw->vs.state_bo == NULL) { brw->vs.state_bo = vs_unit_create_from_key(brw, &key); } - return dri_bufmgr_check_aperture_space(brw->vs.state_bo); } const struct brw_tracked_state brw_vs_unit = { diff --git a/i965/brw_vs_tnl.c b/i965/brw_vs_tnl.c deleted file mode 100644 index e409620..0000000 --- a/i965/brw_vs_tnl.c +++ /dev/null @@ -1,1709 +0,0 @@ -/* - * Mesa 3-D graphics library - * Version: 6.3 - * - * Copyright (C) 2005 Tungsten Graphics All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * TUNGSTEN GRAPHICS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN - * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * \file t_vp_build.c - * Create a vertex program to execute the current fixed function T&L pipeline. - * \author Keith Whitwell - */ - - -#include "glheader.h" -#include "macros.h" -#include "enums.h" -#include "shader/prog_parameter.h" -#include "shader/prog_print.h" -#include "brw_vs.h" -#include "brw_state.h" - - -struct state_key { - unsigned light_global_enabled:1; - unsigned light_local_viewer:1; - unsigned light_twoside:1; - unsigned light_color_material:1; - unsigned light_color_material_mask:12; - unsigned light_material_mask:12; - unsigned normalize:1; - unsigned rescale_normals:1; - unsigned fog_source_is_depth:1; - unsigned tnl_do_vertex_fog:1; - unsigned separate_specular:1; - unsigned fog_option:2; - unsigned point_attenuated:1; - unsigned texture_enabled_global:1; - unsigned fragprog_inputs_read:12; - - struct { - unsigned light_enabled:1; - unsigned light_eyepos3_is_zero:1; - unsigned light_spotcutoff_is_180:1; - unsigned light_attenuated:1; - unsigned texunit_really_enabled:1; - unsigned texmat_enabled:1; - unsigned texgen_enabled:4; - unsigned texgen_mode0:4; - unsigned texgen_mode1:4; - unsigned texgen_mode2:4; - unsigned texgen_mode3:4; - } unit[8]; -}; - - - -#define FOG_NONE 0 -#define FOG_LINEAR 1 -#define FOG_EXP 2 -#define FOG_EXP2 3 - -static GLuint translate_fog_mode( GLenum mode ) -{ - switch (mode) { - case GL_LINEAR: return FOG_LINEAR; - case GL_EXP: return FOG_EXP; - case GL_EXP2: return FOG_EXP2; - default: return FOG_NONE; - } -} - -#define TXG_NONE 0 -#define TXG_OBJ_LINEAR 1 -#define TXG_EYE_LINEAR 2 -#define TXG_SPHERE_MAP 3 -#define TXG_REFLECTION_MAP 4 -#define TXG_NORMAL_MAP 5 - -static GLuint translate_texgen( GLboolean enabled, GLenum mode ) -{ - if (!enabled) - return TXG_NONE; - - switch (mode) { - case GL_OBJECT_LINEAR: return TXG_OBJ_LINEAR; - case GL_EYE_LINEAR: return TXG_EYE_LINEAR; - case GL_SPHERE_MAP: return TXG_SPHERE_MAP; - case GL_REFLECTION_MAP_NV: return TXG_REFLECTION_MAP; - case GL_NORMAL_MAP_NV: return TXG_NORMAL_MAP; - default: return TXG_NONE; - } -} - -static void make_state_key( GLcontext *ctx, struct state_key *key ) -{ - struct brw_context *brw = brw_context(ctx); - const struct gl_fragment_program *fp = brw->fragment_program; - GLuint i; - - /* This now relies on texenvprogram.c being active: - */ - assert(fp); - - memset(key, 0, sizeof(*key)); - - /* BRW_NEW_FRAGMENT_PROGRAM */ - key->fragprog_inputs_read = fp->Base.InputsRead; - - /* _NEW_LIGHT */ - key->separate_specular = (brw->attribs.Light->Model.ColorControl == - GL_SEPARATE_SPECULAR_COLOR); - - /* _NEW_LIGHT */ - if (brw->attribs.Light->Enabled) { - key->light_global_enabled = 1; - - if (brw->attribs.Light->Model.LocalViewer) - key->light_local_viewer = 1; - - if (brw->attribs.Light->Model.TwoSide) - key->light_twoside = 1; - - if (brw->attribs.Light->ColorMaterialEnabled) { - key->light_color_material = 1; - key->light_color_material_mask = brw->attribs.Light->ColorMaterialBitmask; - } - - /* BRW_NEW_INPUT_VARYING */ - - /* For these programs, material values are stuffed into the - * generic slots: - */ - for (i = 0 ; i < MAT_ATTRIB_MAX ; i++) - if (brw->vb.info.varying & (1<<(VERT_ATTRIB_GENERIC0 + i))) - key->light_material_mask |= 1<<i; - - for (i = 0; i < MAX_LIGHTS; i++) { - struct gl_light *light = &brw->attribs.Light->Light[i]; - - if (light->Enabled) { - key->unit[i].light_enabled = 1; - - if (light->EyePosition[3] == 0.0) - key->unit[i].light_eyepos3_is_zero = 1; - - if (light->SpotCutoff == 180.0) - key->unit[i].light_spotcutoff_is_180 = 1; - - if (light->ConstantAttenuation != 1.0 || - light->LinearAttenuation != 0.0 || - light->QuadraticAttenuation != 0.0) - key->unit[i].light_attenuated = 1; - } - } - } - - /* _NEW_TRANSFORM */ - if (brw->attribs.Transform->Normalize) - key->normalize = 1; - - if (brw->attribs.Transform->RescaleNormals) - key->rescale_normals = 1; - - /* BRW_NEW_FRAGMENT_PROGRAM */ - key->fog_option = translate_fog_mode(fp->FogOption); - if (key->fog_option) - key->fragprog_inputs_read |= FRAG_BIT_FOGC; - - /* _NEW_FOG */ - if (brw->attribs.Fog->FogCoordinateSource == GL_FRAGMENT_DEPTH_EXT) - key->fog_source_is_depth = 1; - - /* _NEW_HINT, ??? */ - if (1) - key->tnl_do_vertex_fog = 1; - - /* _NEW_POINT */ - if (brw->attribs.Point->_Attenuated) - key->point_attenuated = 1; - - /* _NEW_TEXTURE */ - if (brw->attribs.Texture->_TexGenEnabled || - brw->attribs.Texture->_TexMatEnabled || - brw->attribs.Texture->_EnabledUnits) - key->texture_enabled_global = 1; - - for (i = 0; i < MAX_TEXTURE_UNITS; i++) { - struct gl_texture_unit *texUnit = &brw->attribs.Texture->Unit[i]; - - if (texUnit->_ReallyEnabled) - key->unit[i].texunit_really_enabled = 1; - - if (brw->attribs.Texture->_TexMatEnabled & ENABLE_TEXMAT(i)) - key->unit[i].texmat_enabled = 1; - - if (texUnit->TexGenEnabled) { - key->unit[i].texgen_enabled = 1; - - key->unit[i].texgen_mode0 = - translate_texgen( texUnit->TexGenEnabled & (1<<0), - texUnit->GenModeS ); - key->unit[i].texgen_mode1 = - translate_texgen( texUnit->TexGenEnabled & (1<<1), - texUnit->GenModeT ); - key->unit[i].texgen_mode2 = - translate_texgen( texUnit->TexGenEnabled & (1<<2), - texUnit->GenModeR ); - key->unit[i].texgen_mode3 = - translate_texgen( texUnit->TexGenEnabled & (1<<3), - texUnit->GenModeQ ); - } - } -} - - - -/* Very useful debugging tool - produces annotated listing of - * generated program with line/function references for each - * instruction back into this file: - */ -#define DISASSEM 0 - -/* Should be tunable by the driver - do we want to do matrix - * multiplications with DP4's or with MUL/MAD's? SSE works better - * with the latter, drivers may differ. - */ -#define PREFER_DP4 1 - - -/* Use uregs to represent registers internally, translate to Mesa's - * expected formats on emit. - * - * NOTE: These are passed by value extensively in this file rather - * than as usual by pointer reference. If this disturbs you, try - * remembering they are just 32bits in size. - * - * GCC is smart enough to deal with these dword-sized structures in - * much the same way as if I had defined them as dwords and was using - * macros to access and set the fields. This is much nicer and easier - * to evolve. - */ -struct ureg { - GLuint file:4; - GLint idx:8; /* relative addressing may be negative */ - GLuint negate:1; - GLuint swz:12; - GLuint pad:7; -}; - - -struct tnl_program { - const struct state_key *state; - struct gl_vertex_program *program; - - GLuint nr_instructions; - GLuint temp_in_use; - GLuint temp_reserved; - - struct ureg eye_position; - struct ureg eye_position_normalized; - struct ureg eye_normal; - struct ureg identity; - - GLuint materials; - GLuint color_materials; -}; - - -const static struct ureg undef = { - PROGRAM_UNDEFINED, - ~0, - 0, - 0, - 0 -}; - -/* Local shorthand: - */ -#define X SWIZZLE_X -#define Y SWIZZLE_Y -#define Z SWIZZLE_Z -#define W SWIZZLE_W - - -/* Construct a ureg: - */ -static struct ureg make_ureg(GLuint file, GLint idx) -{ - struct ureg reg; - reg.file = file; - reg.idx = idx; - reg.negate = 0; - reg.swz = SWIZZLE_NOOP; - reg.pad = 0; - return reg; -} - - - -static struct ureg ureg_negate( struct ureg reg ) -{ - reg.negate ^= 1; - return reg; -} - - -static struct ureg swizzle( struct ureg reg, int x, int y, int z, int w ) -{ - reg.swz = MAKE_SWIZZLE4(GET_SWZ(reg.swz, x), - GET_SWZ(reg.swz, y), - GET_SWZ(reg.swz, z), - GET_SWZ(reg.swz, w)); - - return reg; -} - -static struct ureg swizzle1( struct ureg reg, int x ) -{ - return swizzle(reg, x, x, x, x); -} - -static struct ureg get_temp( struct tnl_program *p ) -{ - int bit = ffs( ~p->temp_in_use ); - if (!bit) { - fprintf(stderr, "%s: out of temporaries\n", __FILE__); - assert(0); - } - - if (bit > p->program->Base.NumTemporaries) - p->program->Base.NumTemporaries = bit; - - p->temp_in_use |= 1<<(bit-1); - return make_ureg(PROGRAM_TEMPORARY, bit-1); -} - -static struct ureg reserve_temp( struct tnl_program *p ) -{ - struct ureg temp = get_temp( p ); - p->temp_reserved |= 1<<temp.idx; - return temp; -} - -static void release_temp( struct tnl_program *p, struct ureg reg ) -{ - if (reg.file == PROGRAM_TEMPORARY) { - p->temp_in_use &= ~(1<<reg.idx); - p->temp_in_use |= p->temp_reserved; /* can't release reserved temps */ - } -} - -static void release_temps( struct tnl_program *p ) -{ - p->temp_in_use = p->temp_reserved; -} - - - -static struct ureg register_input( struct tnl_program *p, GLuint input ) -{ - assert(input < 32); - - p->program->Base.InputsRead |= (1<<input); - return make_ureg(PROGRAM_INPUT, input); -} - -static struct ureg register_output( struct tnl_program *p, GLuint output ) -{ - p->program->Base.OutputsWritten |= (1<<output); - return make_ureg(PROGRAM_OUTPUT, output); -} - -static struct ureg register_const4f( struct tnl_program *p, - GLfloat s0, - GLfloat s1, - GLfloat s2, - GLfloat s3) -{ - GLfloat values[4]; - GLint idx; - GLuint swizzle; - values[0] = s0; - values[1] = s1; - values[2] = s2; - values[3] = s3; - idx = _mesa_add_unnamed_constant( p->program->Base.Parameters, values, 4, - &swizzle); - assert(swizzle == SWIZZLE_NOOP); /* Need to handle swizzle in reg setup */ - return make_ureg(PROGRAM_STATE_VAR, idx); -} - -#define register_const1f(p, s0) register_const4f(p, s0, 0, 0, 1) -#define register_scalar_const(p, s0) register_const4f(p, s0, s0, s0, s0) -#define register_const2f(p, s0, s1) register_const4f(p, s0, s1, 0, 1) -#define register_const3f(p, s0, s1, s2) register_const4f(p, s0, s1, s2, 1) - -static GLboolean is_undef( struct ureg reg ) -{ - return reg.file == PROGRAM_UNDEFINED; -} - -static struct ureg get_identity_param( struct tnl_program *p ) -{ - if (is_undef(p->identity)) - p->identity = register_const4f(p, 0,0,0,1); - - return p->identity; -} - -static struct ureg register_param5( struct tnl_program *p, - GLint s0, - GLint s1, - GLint s2, - GLint s3, - GLint s4) -{ - gl_state_index tokens[STATE_LENGTH]; - GLint idx; - tokens[0] = s0; - tokens[1] = s1; - tokens[2] = s2; - tokens[3] = s3; - tokens[4] = s4; - idx = _mesa_add_state_reference( p->program->Base.Parameters, tokens ); - return make_ureg(PROGRAM_STATE_VAR, idx); -} - - -#define register_param1(p,s0) register_param5(p,s0,0,0,0,0) -#define register_param2(p,s0,s1) register_param5(p,s0,s1,0,0,0) -#define register_param3(p,s0,s1,s2) register_param5(p,s0,s1,s2,0,0) -#define register_param4(p,s0,s1,s2,s3) register_param5(p,s0,s1,s2,s3,0) - - -static void register_matrix_param5( struct tnl_program *p, - GLint s0, /* matrix name */ - GLint s1, /* texture matrix number */ - GLint s2, /* first row */ - GLint s3, /* last row */ - GLint s4, /* modifier */ - struct ureg *matrix ) -{ - GLint i; - - /* This is a bit sad as the support is there to pull the whole - * matrix out in one go: - */ - for (i = 0; i <= s3 - s2; i++) - matrix[i] = register_param5( p, s0, s1, i, i, s4 ); -} - - -static void emit_arg( struct prog_src_register *src, - struct ureg reg ) -{ - src->File = reg.file; - src->Index = reg.idx; - src->Swizzle = reg.swz; - src->RelAddr = 0; - src->NegateBase = reg.negate; - src->Abs = 0; - src->NegateAbs = 0; -} - -static void emit_dst( struct prog_dst_register *dst, - struct ureg reg, GLuint mask ) -{ - dst->File = reg.file; - dst->Index = reg.idx; - /* allow zero as a shorthand for xyzw */ - dst->WriteMask = mask ? mask : WRITEMASK_XYZW; - dst->CondMask = 0; - dst->CondSwizzle = 0; - dst->CondSrc = 0; - dst->pad = 0; -} - -static void debug_insn( struct prog_instruction *inst, const char *fn, - GLuint line ) -{ - if (DISASSEM) { - static const char *last_fn; - - if (fn != last_fn) { - last_fn = fn; - _mesa_printf("%s:\n", fn); - } - - _mesa_printf("%d:\t", line); - _mesa_print_instruction(inst); - } -} - - -static void emit_op3fn(struct tnl_program *p, - GLuint op, - struct ureg dest, - GLuint mask, - struct ureg src0, - struct ureg src1, - struct ureg src2, - const char *fn, - GLuint line) -{ - GLuint nr = p->program->Base.NumInstructions++; - - if (nr >= p->nr_instructions) { - int new_nr_instructions = p->nr_instructions * 2; - - p->program->Base.Instructions = - _mesa_realloc(p->program->Base.Instructions, - sizeof(struct prog_instruction) * p->nr_instructions, - sizeof(struct prog_instruction) * new_nr_instructions); - p->nr_instructions = new_nr_instructions; - } - - { - struct prog_instruction *inst = &p->program->Base.Instructions[nr]; - memset(inst, 0, sizeof(*inst)); - inst->Opcode = op; - inst->StringPos = 0; - inst->Data = 0; - - emit_arg( &inst->SrcReg[0], src0 ); - emit_arg( &inst->SrcReg[1], src1 ); - emit_arg( &inst->SrcReg[2], src2 ); - - emit_dst( &inst->DstReg, dest, mask ); - - debug_insn(inst, fn, line); - } -} - - - -#define emit_op3(p, op, dst, mask, src0, src1, src2) \ - emit_op3fn(p, op, dst, mask, src0, src1, src2, __FUNCTION__, __LINE__) - -#define emit_op2(p, op, dst, mask, src0, src1) \ - emit_op3fn(p, op, dst, mask, src0, src1, undef, __FUNCTION__, __LINE__) - -#define emit_op1(p, op, dst, mask, src0) \ - emit_op3fn(p, op, dst, mask, src0, undef, undef, __FUNCTION__, __LINE__) - - -static struct ureg make_temp( struct tnl_program *p, struct ureg reg ) -{ - if (reg.file == PROGRAM_TEMPORARY && - !(p->temp_reserved & (1<<reg.idx))) - return reg; - else { - struct ureg temp = get_temp(p); - emit_op1(p, OPCODE_MOV, temp, 0, reg); - return temp; - } -} - - -/* Currently no tracking performed of input/output/register size or - * active elements. Could be used to reduce these operations, as - * could the matrix type. - */ -static void emit_matrix_transform_vec4( struct tnl_program *p, - struct ureg dest, - const struct ureg *mat, - struct ureg src) -{ - emit_op2(p, OPCODE_DP4, dest, WRITEMASK_X, src, mat[0]); - emit_op2(p, OPCODE_DP4, dest, WRITEMASK_Y, src, mat[1]); - emit_op2(p, OPCODE_DP4, dest, WRITEMASK_Z, src, mat[2]); - emit_op2(p, OPCODE_DP4, dest, WRITEMASK_W, src, mat[3]); -} - -/* This version is much easier to implement if writemasks are not - * supported natively on the target or (like SSE), the target doesn't - * have a clean/obvious dotproduct implementation. - */ -static void emit_transpose_matrix_transform_vec4( struct tnl_program *p, - struct ureg dest, - const struct ureg *mat, - struct ureg src) -{ - struct ureg tmp; - - if (dest.file != PROGRAM_TEMPORARY) - tmp = get_temp(p); - else - tmp = dest; - - emit_op2(p, OPCODE_MUL, tmp, 0, swizzle1(src,X), mat[0]); - emit_op3(p, OPCODE_MAD, tmp, 0, swizzle1(src,Y), mat[1], tmp); - emit_op3(p, OPCODE_MAD, tmp, 0, swizzle1(src,Z), mat[2], tmp); - emit_op3(p, OPCODE_MAD, dest, 0, swizzle1(src,W), mat[3], tmp); - - if (dest.file != PROGRAM_TEMPORARY) - release_temp(p, tmp); -} - -static void emit_matrix_transform_vec3( struct tnl_program *p, - struct ureg dest, - const struct ureg *mat, - struct ureg src) -{ - emit_op2(p, OPCODE_DP3, dest, WRITEMASK_X, src, mat[0]); - emit_op2(p, OPCODE_DP3, dest, WRITEMASK_Y, src, mat[1]); - emit_op2(p, OPCODE_DP3, dest, WRITEMASK_Z, src, mat[2]); -} - - -static void emit_normalize_vec3( struct tnl_program *p, - struct ureg dest, - struct ureg src ) -{ - emit_op2(p, OPCODE_DP3, dest, WRITEMASK_W, src, src); - emit_op1(p, OPCODE_RSQ, dest, WRITEMASK_W, swizzle1(dest,W)); - emit_op2(p, OPCODE_MUL, dest, WRITEMASK_XYZ, src, swizzle1(dest,W)); -} - -static void emit_passthrough( struct tnl_program *p, - GLuint input, - GLuint output ) -{ - struct ureg out = register_output(p, output); - emit_op1(p, OPCODE_MOV, out, 0, register_input(p, input)); -} - -static struct ureg get_eye_position( struct tnl_program *p ) -{ - if (is_undef(p->eye_position)) { - struct ureg pos = register_input( p, VERT_ATTRIB_POS ); - struct ureg modelview[4]; - - p->eye_position = reserve_temp(p); - - if (PREFER_DP4) { - register_matrix_param5( p, STATE_MODELVIEW_MATRIX, 0, 0, 3, - 0, modelview ); - - emit_matrix_transform_vec4(p, p->eye_position, modelview, pos); - } - else { - register_matrix_param5( p, STATE_MODELVIEW_MATRIX, 0, 0, 3, - STATE_MATRIX_TRANSPOSE, modelview ); - - emit_transpose_matrix_transform_vec4(p, p->eye_position, modelview, pos); - } - } - - return p->eye_position; -} - - -#if 0 -static struct ureg get_eye_z( struct tnl_program *p ) -{ - if (!is_undef(p->eye_position)) { - return swizzle1(p->eye_position, Z); - } - else if (!is_undef(p->eye_z)) { - struct ureg pos = register_input( p, BRW_ATTRIB_POS ); - struct ureg modelview2; - - p->eye_z = reserve_temp(p); - - register_matrix_param6( p, STATE_MATRIX, STATE_MODELVIEW, 0, 2, 1, - STATE_MATRIX, &modelview2 ); - - emit_matrix_transform_vec4(p, p->eye_position, modelview, pos); - emit_op2(p, OPCODE_DP4, p->eye_z, WRITEMASK_Z, pos, modelview2); - } - - return swizzle1(p->eye_z, Z) -} -#endif - - - -static struct ureg get_eye_position_normalized( struct tnl_program *p ) -{ - if (is_undef(p->eye_position_normalized)) { - struct ureg eye = get_eye_position(p); - p->eye_position_normalized = reserve_temp(p); - emit_normalize_vec3(p, p->eye_position_normalized, eye); - } - - return p->eye_position_normalized; -} - - -static struct ureg get_eye_normal( struct tnl_program *p ) -{ - if (is_undef(p->eye_normal)) { - struct ureg normal = register_input(p, VERT_ATTRIB_NORMAL ); - struct ureg mvinv[3]; - - register_matrix_param5( p, STATE_MODELVIEW_MATRIX, 0, 0, 2, - STATE_MATRIX_INVTRANS, mvinv ); - - p->eye_normal = reserve_temp(p); - - /* Transform to eye space: - */ - emit_matrix_transform_vec3( p, p->eye_normal, mvinv, normal ); - - /* Normalize/Rescale: - */ - if (p->state->normalize) { - emit_normalize_vec3( p, p->eye_normal, p->eye_normal ); - } - else if (p->state->rescale_normals) { - struct ureg rescale = register_param2(p, STATE_INTERNAL, - STATE_NORMAL_SCALE); - - emit_op2( p, OPCODE_MUL, p->eye_normal, 0, p->eye_normal, - swizzle1(rescale, X)); - } - } - - return p->eye_normal; -} - - - -static void build_hpos( struct tnl_program *p ) -{ - struct ureg pos = register_input( p, VERT_ATTRIB_POS ); - struct ureg hpos = register_output( p, VERT_RESULT_HPOS ); - struct ureg mvp[4]; - - if (PREFER_DP4) { - register_matrix_param5( p, STATE_MVP_MATRIX, 0, 0, 3, - 0, mvp ); - emit_matrix_transform_vec4( p, hpos, mvp, pos ); - } - else { - register_matrix_param5( p, STATE_MVP_MATRIX, 0, 0, 3, - STATE_MATRIX_TRANSPOSE, mvp ); - emit_transpose_matrix_transform_vec4( p, hpos, mvp, pos ); - } -} - - -static GLuint material_attrib( GLuint side, GLuint property ) -{ - return (property - STATE_AMBIENT) * 2 + side; -} - -/* Get a bitmask of which material values vary on a per-vertex basis. - */ -static void set_material_flags( struct tnl_program *p ) -{ - p->color_materials = 0; - p->materials = 0; - - if (p->state->light_color_material) { - p->materials = - p->color_materials = p->state->light_color_material_mask; - } - - p->materials |= p->state->light_material_mask; -} - - -static struct ureg get_material( struct tnl_program *p, GLuint side, - GLuint property ) -{ - GLuint attrib = material_attrib(side, property); - - if (p->color_materials & (1<<attrib)) - return register_input(p, VERT_ATTRIB_COLOR0); - else if (p->materials & (1<<attrib)) - return register_input( p, attrib + _TNL_ATTRIB_MAT_FRONT_AMBIENT ); - else - return register_param3( p, STATE_MATERIAL, side, property ); -} - -#define SCENE_COLOR_BITS(side) ((MAT_BIT_FRONT_EMISSION | \ - MAT_BIT_FRONT_AMBIENT | \ - MAT_BIT_FRONT_DIFFUSE) << (side)) - -/* Either return a precalculated constant value or emit code to - * calculate these values dynamically in the case where material calls - * are present between begin/end pairs. - * - * Probably want to shift this to the program compilation phase - if - * we always emitted the calculation here, a smart compiler could - * detect that it was constant (given a certain set of inputs), and - * lift it out of the main loop. That way the programs created here - * would be independent of the vertex_buffer details. - */ -static struct ureg get_scenecolor( struct tnl_program *p, GLuint side ) -{ - if (p->materials & SCENE_COLOR_BITS(side)) { - struct ureg lm_ambient = register_param1(p, STATE_LIGHTMODEL_AMBIENT); - struct ureg material_emission = get_material(p, side, STATE_EMISSION); - struct ureg material_ambient = get_material(p, side, STATE_AMBIENT); - struct ureg material_diffuse = get_material(p, side, STATE_DIFFUSE); - struct ureg tmp = make_temp(p, material_diffuse); - emit_op3(p, OPCODE_MAD, tmp, WRITEMASK_XYZ, lm_ambient, - material_ambient, material_emission); - return tmp; - } - else - return register_param2( p, STATE_LIGHTMODEL_SCENECOLOR, side ); -} - - -static struct ureg get_lightprod( struct tnl_program *p, GLuint light, - GLuint side, GLuint property ) -{ - GLuint attrib = material_attrib(side, property); - if (p->materials & (1<<attrib)) { - struct ureg light_value = - register_param3(p, STATE_LIGHT, light, property); - struct ureg material_value = get_material(p, side, property); - struct ureg tmp = get_temp(p); - emit_op2(p, OPCODE_MUL, tmp, 0, light_value, material_value); - return tmp; - } - else - return register_param4(p, STATE_LIGHTPROD, light, side, property); -} - -static struct ureg calculate_light_attenuation( struct tnl_program *p, - GLuint i, - struct ureg VPpli, - struct ureg dist ) -{ - struct ureg attenuation = register_param3(p, STATE_LIGHT, i, - STATE_ATTENUATION); - struct ureg att = get_temp(p); - - /* Calculate spot attenuation: - */ - if (!p->state->unit[i].light_spotcutoff_is_180) { - struct ureg spot_dir_norm = register_param3(p, STATE_INTERNAL, - STATE_SPOT_DIR_NORMALIZED, i); - struct ureg spot = get_temp(p); - struct ureg slt = get_temp(p); - - emit_op2(p, OPCODE_DP3, spot, 0, ureg_negate(VPpli), spot_dir_norm); - emit_op2(p, OPCODE_SLT, slt, 0, swizzle1(spot_dir_norm,W), spot); - emit_op2(p, OPCODE_POW, spot, 0, spot, swizzle1(attenuation, W)); - emit_op2(p, OPCODE_MUL, att, 0, slt, spot); - - release_temp(p, spot); - release_temp(p, slt); - } - - /* Calculate distance attenuation: - */ - if (p->state->unit[i].light_attenuated) { - - /* 1/d,d,d,1/d */ - emit_op1(p, OPCODE_RCP, dist, WRITEMASK_YZ, dist); - /* 1,d,d*d,1/d */ - emit_op2(p, OPCODE_MUL, dist, WRITEMASK_XZ, dist, swizzle1(dist,Y)); - /* 1/dist-atten */ - emit_op2(p, OPCODE_DP3, dist, 0, attenuation, dist); - - if (!p->state->unit[i].light_spotcutoff_is_180) { - /* dist-atten */ - emit_op1(p, OPCODE_RCP, dist, 0, dist); - /* spot-atten * dist-atten */ - emit_op2(p, OPCODE_MUL, att, 0, dist, att); - } else { - /* dist-atten */ - emit_op1(p, OPCODE_RCP, att, 0, dist); - } - } - - return att; -} - - - - - -/* Need to add some addtional parameters to allow lighting in object - * space - STATE_SPOT_DIRECTION and STATE_HALF_VECTOR implicitly assume eye - * space lighting. - */ -static void build_lighting( struct tnl_program *p ) -{ - const GLboolean twoside = p->state->light_twoside; - const GLboolean separate = p->state->separate_specular; - GLuint nr_lights = 0, count = 0; - struct ureg normal = get_eye_normal(p); - struct ureg lit = get_temp(p); - struct ureg dots = get_temp(p); - struct ureg _col0 = undef, _col1 = undef; - struct ureg _bfc0 = undef, _bfc1 = undef; - GLuint i; - - for (i = 0; i < MAX_LIGHTS; i++) - if (p->state->unit[i].light_enabled) - nr_lights++; - - set_material_flags(p); - - { - struct ureg shininess = get_material(p, 0, STATE_SHININESS); - emit_op1(p, OPCODE_MOV, dots, WRITEMASK_W, swizzle1(shininess,X)); - release_temp(p, shininess); - - _col0 = make_temp(p, get_scenecolor(p, 0)); - if (separate) - _col1 = make_temp(p, get_identity_param(p)); - else - _col1 = _col0; - - } - - if (twoside) { - struct ureg shininess = get_material(p, 1, STATE_SHININESS); - emit_op1(p, OPCODE_MOV, dots, WRITEMASK_Z, - ureg_negate(swizzle1(shininess,X))); - release_temp(p, shininess); - - _bfc0 = make_temp(p, get_scenecolor(p, 1)); - if (separate) - _bfc1 = make_temp(p, get_identity_param(p)); - else - _bfc1 = _bfc0; - } - - - /* If no lights, still need to emit the scenecolor. - */ - /* KW: changed to do this always - v1.17 "Fix lighting alpha result"? - */ - if (p->state->fragprog_inputs_read & FRAG_BIT_COL0) - { - struct ureg res0 = register_output( p, VERT_RESULT_COL0 ); - emit_op1(p, OPCODE_MOV, res0, 0, _col0); - - if (twoside) { - struct ureg res0 = register_output( p, VERT_RESULT_BFC0 ); - emit_op1(p, OPCODE_MOV, res0, 0, _bfc0); - } - } - - if (separate && (p->state->fragprog_inputs_read & FRAG_BIT_COL1)) { - - struct ureg res1 = register_output( p, VERT_RESULT_COL1 ); - emit_op1(p, OPCODE_MOV, res1, 0, _col1); - - if (twoside) { - struct ureg res1 = register_output( p, VERT_RESULT_BFC1 ); - emit_op1(p, OPCODE_MOV, res1, 0, _bfc1); - } - } - - if (nr_lights == 0) { - release_temps(p); - return; - } - - - for (i = 0; i < MAX_LIGHTS; i++) { - if (p->state->unit[i].light_enabled) { - struct ureg half = undef; - struct ureg att = undef, VPpli = undef; - - count++; - - if (p->state->unit[i].light_eyepos3_is_zero) { - /* Can used precomputed constants in this case. - * Attenuation never applies to infinite lights. - */ - VPpli = register_param3(p, STATE_LIGHT, i, - STATE_POSITION_NORMALIZED); - if (p->state->light_local_viewer) { - struct ureg eye_hat = get_eye_position_normalized(p); - half = get_temp(p); - emit_op2(p, OPCODE_SUB, half, 0, VPpli, eye_hat); - emit_normalize_vec3(p, half, half); - } else { - half = register_param3(p, STATE_LIGHT, i, STATE_HALF_VECTOR); - } - } - else { - struct ureg Ppli = register_param3(p, STATE_LIGHT, i, - STATE_POSITION); - struct ureg V = get_eye_position(p); - struct ureg dist = get_temp(p); - struct ureg tmpPpli = get_temp(p); - - VPpli = get_temp(p); - half = get_temp(p); - - /* In homogeneous object coordinates - */ - emit_op1(p, OPCODE_RCP, dist, 0, swizzle1(Ppli, W)); - emit_op2(p, OPCODE_MUL, tmpPpli, 0, Ppli, dist); - - /* Calulate VPpli vector - */ - emit_op2(p, OPCODE_SUB, VPpli, 0, tmpPpli, V); - - /* Normalize VPpli. The dist value also used in - * attenuation below. - */ - emit_op2(p, OPCODE_DP3, dist, 0, VPpli, VPpli); - emit_op1(p, OPCODE_RSQ, dist, 0, dist); - emit_op2(p, OPCODE_MUL, VPpli, 0, VPpli, dist); - - - /* Calculate attenuation: - */ - if (!p->state->unit[i].light_spotcutoff_is_180 || - p->state->unit[i].light_attenuated) { - att = calculate_light_attenuation(p, i, VPpli, dist); - } - - - /* Calculate viewer direction, or use infinite viewer: - */ - if (p->state->light_local_viewer) { - struct ureg eye_hat = get_eye_position_normalized(p); - emit_op2(p, OPCODE_SUB, half, 0, VPpli, eye_hat); - } - else { - struct ureg z_dir = swizzle(get_identity_param(p),X,Y,W,Z); - emit_op2(p, OPCODE_ADD, half, 0, VPpli, z_dir); - } - - emit_normalize_vec3(p, half, half); - - release_temp(p, dist); - release_temp(p, tmpPpli); - } - - /* Calculate dot products: - */ - emit_op2(p, OPCODE_DP3, dots, WRITEMASK_X, normal, VPpli); - emit_op2(p, OPCODE_DP3, dots, WRITEMASK_Y, normal, half); - - - /* Front face lighting: - */ - { - struct ureg ambient = get_lightprod(p, i, 0, STATE_AMBIENT); - struct ureg diffuse = get_lightprod(p, i, 0, STATE_DIFFUSE); - struct ureg specular = get_lightprod(p, i, 0, STATE_SPECULAR); - struct ureg res0, res1; - GLuint mask0, mask1; - - emit_op1(p, OPCODE_LIT, lit, 0, dots); - - if (!is_undef(att)) - emit_op2(p, OPCODE_MUL, lit, 0, lit, att); - - - mask0 = 0; - mask1 = 0; - res0 = _col0; - res1 = _col1; - - if (count == nr_lights) { - if (separate) { - mask0 = WRITEMASK_XYZ; - mask1 = WRITEMASK_XYZ; - - if (p->state->fragprog_inputs_read & FRAG_BIT_COL0) - res0 = register_output( p, VERT_RESULT_COL0 ); - - if (p->state->fragprog_inputs_read & FRAG_BIT_COL1) - res1 = register_output( p, VERT_RESULT_COL1 ); - } - else { - mask1 = WRITEMASK_XYZ; - - if (p->state->fragprog_inputs_read & FRAG_BIT_COL0) - res1 = register_output( p, VERT_RESULT_COL0 ); - } - } - - emit_op3(p, OPCODE_MAD, _col0, 0, swizzle1(lit,X), ambient, _col0); - emit_op3(p, OPCODE_MAD, res0, mask0, swizzle1(lit,Y), diffuse, _col0); - emit_op3(p, OPCODE_MAD, res1, mask1, swizzle1(lit,Z), specular, _col1); - - release_temp(p, ambient); - release_temp(p, diffuse); - release_temp(p, specular); - } - - /* Back face lighting: - */ - if (twoside) { - struct ureg ambient = get_lightprod(p, i, 1, STATE_AMBIENT); - struct ureg diffuse = get_lightprod(p, i, 1, STATE_DIFFUSE); - struct ureg specular = get_lightprod(p, i, 1, STATE_SPECULAR); - struct ureg res0, res1; - GLuint mask0, mask1; - - emit_op1(p, OPCODE_LIT, lit, 0, ureg_negate(swizzle(dots,X,Y,W,Z))); - - if (!is_undef(att)) - emit_op2(p, OPCODE_MUL, lit, 0, lit, att); - - mask0 = 0; - mask1 = 0; - res0 = _bfc0; - res1 = _bfc1; - - if (count == nr_lights) { - if (separate) { - mask0 = WRITEMASK_XYZ; - mask1 = WRITEMASK_XYZ; - if (p->state->fragprog_inputs_read & FRAG_BIT_COL0) - res0 = register_output( p, VERT_RESULT_BFC0 ); - - if (p->state->fragprog_inputs_read & FRAG_BIT_COL1) - res1 = register_output( p, VERT_RESULT_BFC1 ); - } - else { - mask1 = WRITEMASK_XYZ; - - if (p->state->fragprog_inputs_read & FRAG_BIT_COL0) - res1 = register_output( p, VERT_RESULT_BFC0 ); - } - } - - emit_op3(p, OPCODE_MAD, _bfc0, 0, swizzle1(lit,X), ambient, _bfc0); - emit_op3(p, OPCODE_MAD, res0, mask0, swizzle1(lit,Y), diffuse, _bfc0); - emit_op3(p, OPCODE_MAD, res1, mask1, swizzle1(lit,Z), specular, _bfc1); - - release_temp(p, ambient); - release_temp(p, diffuse); - release_temp(p, specular); - } - - release_temp(p, half); - release_temp(p, VPpli); - release_temp(p, att); - } - } - - release_temps( p ); -} - - -static void build_fog( struct tnl_program *p ) -{ - struct ureg fog = register_output(p, VERT_RESULT_FOGC); - struct ureg input; - GLuint useabs = p->state->fog_source_is_depth && p->state->fog_option && - (p->state->fog_option != FOG_EXP2); - - if (p->state->fog_source_is_depth) { - input = swizzle1(get_eye_position(p), Z); - } - else { - input = swizzle1(register_input(p, VERT_ATTRIB_FOG), X); - if (p->state->fog_option && - p->state->tnl_do_vertex_fog) - input = swizzle1(register_input(p, VERT_ATTRIB_FOG), X); - else - input = register_input(p, VERT_ATTRIB_FOG); - } - - if (p->state->fog_option && - p->state->tnl_do_vertex_fog) { - struct ureg params = register_param2(p, STATE_INTERNAL, - STATE_FOG_PARAMS_OPTIMIZED); - struct ureg tmp = get_temp(p); - struct ureg id = get_identity_param(p); - - emit_op1(p, OPCODE_MOV, fog, 0, id); - - if (useabs) { - emit_op1(p, OPCODE_ABS, tmp, 0, input); - } - - switch (p->state->fog_option) { - case FOG_LINEAR: { - emit_op3(p, OPCODE_MAD, tmp, 0, useabs ? tmp : input, - swizzle1(params,X), swizzle1(params,Y)); - emit_op2(p, OPCODE_MAX, tmp, 0, tmp, swizzle1(id,X)); /* saturate */ - emit_op2(p, OPCODE_MIN, fog, WRITEMASK_X, tmp, swizzle1(id,W)); - break; - } - case FOG_EXP: - emit_op2(p, OPCODE_MUL, tmp, 0, useabs ? tmp : input, - swizzle1(params,Z)); - emit_op1(p, OPCODE_EX2, fog, WRITEMASK_X, ureg_negate(tmp)); - break; - case FOG_EXP2: - emit_op2(p, OPCODE_MUL, tmp, 0, input, swizzle1(params,W)); - emit_op2(p, OPCODE_MUL, tmp, 0, tmp, tmp); - emit_op1(p, OPCODE_EX2, fog, WRITEMASK_X, ureg_negate(tmp)); - break; - } - - release_temp(p, tmp); - } - else { - /* results = incoming fog coords (compute fog per-fragment later) - * - * KW: Is it really necessary to do anything in this case? - */ - emit_op1(p, useabs ? OPCODE_ABS : OPCODE_MOV, fog, 0, input); - } -} - -static void build_reflect_texgen( struct tnl_program *p, - struct ureg dest, - GLuint writemask ) -{ - struct ureg normal = get_eye_normal(p); - struct ureg eye_hat = get_eye_position_normalized(p); - struct ureg tmp = get_temp(p); - - /* n.u */ - emit_op2(p, OPCODE_DP3, tmp, 0, normal, eye_hat); - /* 2n.u */ - emit_op2(p, OPCODE_ADD, tmp, 0, tmp, tmp); - /* (-2n.u)n + u */ - emit_op3(p, OPCODE_MAD, dest, writemask, ureg_negate(tmp), normal, eye_hat); - - release_temp(p, tmp); -} - -static void build_sphere_texgen( struct tnl_program *p, - struct ureg dest, - GLuint writemask ) -{ - struct ureg normal = get_eye_normal(p); - struct ureg eye_hat = get_eye_position_normalized(p); - struct ureg tmp = get_temp(p); - struct ureg half = register_scalar_const(p, .5); - struct ureg r = get_temp(p); - struct ureg inv_m = get_temp(p); - struct ureg id = get_identity_param(p); - - /* Could share the above calculations, but it would be - * a fairly odd state for someone to set (both sphere and - * reflection active for different texture coordinate - * components. Of course - if two texture units enable - * reflect and/or sphere, things start to tilt in favour - * of seperating this out: - */ - - /* n.u */ - emit_op2(p, OPCODE_DP3, tmp, 0, normal, eye_hat); - /* 2n.u */ - emit_op2(p, OPCODE_ADD, tmp, 0, tmp, tmp); - /* (-2n.u)n + u */ - emit_op3(p, OPCODE_MAD, r, 0, ureg_negate(tmp), normal, eye_hat); - /* r + 0,0,1 */ - emit_op2(p, OPCODE_ADD, tmp, 0, r, swizzle(id,X,Y,W,Z)); - /* rx^2 + ry^2 + (rz+1)^2 */ - emit_op2(p, OPCODE_DP3, tmp, 0, tmp, tmp); - /* 2/m */ - emit_op1(p, OPCODE_RSQ, tmp, 0, tmp); - /* 1/m */ - emit_op2(p, OPCODE_MUL, inv_m, 0, tmp, half); - /* r/m + 1/2 */ - emit_op3(p, OPCODE_MAD, dest, writemask, r, inv_m, half); - - release_temp(p, tmp); - release_temp(p, r); - release_temp(p, inv_m); -} - - -static void build_texture_transform( struct tnl_program *p ) -{ - GLuint i, j; - - for (i = 0; i < MAX_TEXTURE_UNITS; i++) { - - if (!(p->state->fragprog_inputs_read & (FRAG_BIT_TEX0<<i))) - continue; - - if (p->state->unit[i].texgen_enabled || - p->state->unit[i].texmat_enabled) { - - GLuint texmat_enabled = p->state->unit[i].texmat_enabled; - struct ureg out = register_output(p, VERT_RESULT_TEX0 + i); - struct ureg out_texgen = undef; - - if (p->state->unit[i].texgen_enabled) { - GLuint copy_mask = 0; - GLuint sphere_mask = 0; - GLuint reflect_mask = 0; - GLuint normal_mask = 0; - GLuint modes[4]; - - if (texmat_enabled) - out_texgen = get_temp(p); - else - out_texgen = out; - - modes[0] = p->state->unit[i].texgen_mode0; - modes[1] = p->state->unit[i].texgen_mode1; - modes[2] = p->state->unit[i].texgen_mode2; - modes[3] = p->state->unit[i].texgen_mode3; - - for (j = 0; j < 4; j++) { - switch (modes[j]) { - case TXG_OBJ_LINEAR: { - struct ureg obj = register_input(p, VERT_ATTRIB_POS); - struct ureg plane = - register_param3(p, STATE_TEXGEN, i, - STATE_TEXGEN_OBJECT_S + j); - - emit_op2(p, OPCODE_DP4, out_texgen, WRITEMASK_X << j, - obj, plane ); - break; - } - case TXG_EYE_LINEAR: { - struct ureg eye = get_eye_position(p); - struct ureg plane = - register_param3(p, STATE_TEXGEN, i, - STATE_TEXGEN_EYE_S + j); - - emit_op2(p, OPCODE_DP4, out_texgen, WRITEMASK_X << j, - eye, plane ); - break; - } - case TXG_SPHERE_MAP: - sphere_mask |= WRITEMASK_X << j; - break; - case TXG_REFLECTION_MAP: - reflect_mask |= WRITEMASK_X << j; - break; - case TXG_NORMAL_MAP: - normal_mask |= WRITEMASK_X << j; - break; - case TXG_NONE: - copy_mask |= WRITEMASK_X << j; - } - - } - - - if (sphere_mask) { - build_sphere_texgen(p, out_texgen, sphere_mask); - } - - if (reflect_mask) { - build_reflect_texgen(p, out_texgen, reflect_mask); - } - - if (normal_mask) { - struct ureg normal = get_eye_normal(p); - emit_op1(p, OPCODE_MOV, out_texgen, normal_mask, normal ); - } - - if (copy_mask) { - struct ureg in = register_input(p, VERT_ATTRIB_TEX0+i); - emit_op1(p, OPCODE_MOV, out_texgen, copy_mask, in ); - } - } - - if (texmat_enabled) { - struct ureg texmat[4]; - struct ureg in = (!is_undef(out_texgen) ? - out_texgen : - register_input(p, VERT_ATTRIB_TEX0+i)); - if (PREFER_DP4) { - register_matrix_param5( p, STATE_TEXTURE_MATRIX, i, 0, 3, - 0, texmat ); - emit_matrix_transform_vec4( p, out, texmat, in ); - } - else { - register_matrix_param5( p, STATE_TEXTURE_MATRIX, i, 0, 3, - STATE_MATRIX_TRANSPOSE, texmat ); - emit_transpose_matrix_transform_vec4( p, out, texmat, in ); - } - } - - release_temps(p); - } - else { - emit_passthrough(p, VERT_ATTRIB_TEX0+i, VERT_RESULT_TEX0+i); - } - } -} - - -/* Seems like it could be tighter: - */ -static void build_pointsize( struct tnl_program *p ) -{ - struct ureg eye = get_eye_position(p); - struct ureg state_size = register_param1(p, STATE_POINT_SIZE); - struct ureg state_attenuation = register_param1(p, STATE_POINT_ATTENUATION); - struct ureg out = register_output(p, VERT_RESULT_PSIZ); - struct ureg ut = get_temp(p); - - /* 1, Z, Z * Z, 1 */ - emit_op1(p, OPCODE_MOV, ut, WRITEMASK_XW, swizzle1(get_identity_param(p), W)); - emit_op1(p, OPCODE_ABS, ut, WRITEMASK_YZ, swizzle1(eye, Z)); - emit_op2(p, OPCODE_MUL, ut, WRITEMASK_Z, ut, ut); - - - /* p1 + p2 * dist + p3 * dist * dist, 0 */ - emit_op2(p, OPCODE_DP3, ut, WRITEMASK_X, ut, state_attenuation); - - /* 1 / sqrt(factor) */ - emit_op1(p, OPCODE_RSQ, ut, WRITEMASK_X, ut ); - - /* ut = pointSize / factor */ - emit_op2(p, OPCODE_MUL, ut, WRITEMASK_X, ut, state_size); - - /* Clamp to min/max - state_size.[yz] - */ - emit_op2(p, OPCODE_MAX, ut, WRITEMASK_X, ut, swizzle1(state_size, Y)); - emit_op2(p, OPCODE_MIN, out, 0, swizzle1(ut, X), swizzle1(state_size, Z)); - - release_temp(p, ut); -} - -static void build_tnl_program( struct tnl_program *p ) -{ - /* Emit the program, starting with modelviewproject: - */ - build_hpos(p); - - /* Lighting calculations: - */ - if (p->state->fragprog_inputs_read & (FRAG_BIT_COL0|FRAG_BIT_COL1)) { - if (p->state->light_global_enabled) - build_lighting(p); - else { - if (p->state->fragprog_inputs_read & FRAG_BIT_COL0) - emit_passthrough(p, VERT_ATTRIB_COLOR0, VERT_RESULT_COL0); - - if (p->state->fragprog_inputs_read & FRAG_BIT_COL1) - emit_passthrough(p, VERT_ATTRIB_COLOR1, VERT_RESULT_COL1); - } - } - - if ((p->state->fragprog_inputs_read & FRAG_BIT_FOGC) || - p->state->fog_option != FOG_NONE) - build_fog(p); - - if (p->state->fragprog_inputs_read & FRAG_BITS_TEX_ANY) - build_texture_transform(p); - - if (p->state->point_attenuated) - build_pointsize(p); - - /* Finish up: - */ - emit_op1(p, OPCODE_END, undef, 0, undef); - - /* Disassemble: - */ - if (DISASSEM) { - _mesa_printf ("\n"); - } -} - - -static void build_new_tnl_program( const struct state_key *key, - struct gl_vertex_program *program, - GLuint max_temps) -{ - struct tnl_program p; - - _mesa_memset(&p, 0, sizeof(p)); - p.state = key; - p.program = program; - p.eye_position = undef; - p.eye_position_normalized = undef; - p.eye_normal = undef; - p.identity = undef; - p.temp_in_use = 0; - p.nr_instructions = 16; - - if (max_temps >= sizeof(int) * 8) - p.temp_reserved = 0; - else - p.temp_reserved = ~((1<<max_temps)-1); - - p.program->Base.Instructions = - _mesa_malloc(sizeof(struct prog_instruction) * p.nr_instructions); - p.program->Base.String = 0; - p.program->Base.NumInstructions = - p.program->Base.NumTemporaries = - p.program->Base.NumParameters = - p.program->Base.NumAttributes = p.program->Base.NumAddressRegs = 0; - p.program->Base.Parameters = _mesa_new_parameter_list(); - p.program->Base.InputsRead = 0; - p.program->Base.OutputsWritten = 0; - - build_tnl_program( &p ); -} - -static void *search_cache( struct brw_tnl_cache *cache, - GLuint hash, - const void *key, - GLuint keysize) -{ - struct brw_tnl_cache_item *c; - - for (c = cache->items[hash % cache->size]; c; c = c->next) { - if (c->hash == hash && memcmp(c->key, key, keysize) == 0) - return c->data; - } - - return NULL; -} - -static void rehash( struct brw_tnl_cache *cache ) -{ - struct brw_tnl_cache_item **items; - struct brw_tnl_cache_item *c, *next; - GLuint size, i; - - size = cache->size * 3; - items = (struct brw_tnl_cache_item**) _mesa_malloc(size * sizeof(*items)); - _mesa_memset(items, 0, size * sizeof(*items)); - - for (i = 0; i < cache->size; i++) - for (c = cache->items[i]; c; c = next) { - next = c->next; - c->next = items[c->hash % size]; - items[c->hash % size] = c; - } - - FREE(cache->items); - cache->items = items; - cache->size = size; -} - -static void cache_item( struct brw_tnl_cache *cache, - GLuint hash, - const struct state_key *key, - void *data ) -{ - struct brw_tnl_cache_item *c = MALLOC(sizeof(*c)); - c->hash = hash; - - c->key = malloc(sizeof(*key)); - memcpy(c->key, key, sizeof(*key)); - - c->data = data; - - if (++cache->n_items > cache->size * 1.5) - rehash(cache); - - c->next = cache->items[hash % cache->size]; - cache->items[hash % cache->size] = c; -} - - -static GLuint hash_key( struct state_key *key ) -{ - GLuint *ikey = (GLuint *)key; - GLuint hash = 0, i; - - /* I'm sure this can be improved on, but speed is important: - */ - for (i = 0; i < sizeof(*key)/sizeof(GLuint); i++) - hash += ikey[i]; - - return hash; -} - -static int prepare_tnl_program( struct brw_context *brw ) -{ - GLcontext *ctx = &brw->intel.ctx; - struct state_key key; - GLuint hash; - struct gl_vertex_program *old = brw->tnl_program; - - /* _NEW_PROGRAM */ - if (brw->attribs.VertexProgram->_Current) - return 0; - - /* Grab all the relevent state and put it in a single structure: - */ - make_state_key(ctx, &key); - hash = hash_key(&key); - - /* Look for an already-prepared program for this state: - */ - brw->tnl_program = (struct gl_vertex_program *) - search_cache( &brw->tnl_program_cache, hash, &key, sizeof(key) ); - - /* OK, we'll have to build a new one: - */ - if (!brw->tnl_program) { - brw->tnl_program = (struct gl_vertex_program *) - ctx->Driver.NewProgram(ctx, GL_VERTEX_PROGRAM_ARB, 0); - - build_new_tnl_program( &key, brw->tnl_program, -/* ctx->Const.MaxVertexProgramTemps */ - 32 - ); - - if (ctx->Driver.ProgramStringNotify) - ctx->Driver.ProgramStringNotify( ctx, GL_VERTEX_PROGRAM_ARB, - &brw->tnl_program->Base ); - - cache_item( &brw->tnl_program_cache, - hash, &key, brw->tnl_program ); - } - - if (old != brw->tnl_program) - brw->state.dirty.brw |= BRW_NEW_TNL_PROGRAM; - return 0; -} - -/* Note: See brw_draw.c - the vertex program must not rely on - * brw->primitive or brw->reduced_prim. - */ -const struct brw_tracked_state brw_tnl_vertprog = { - .dirty = { - .mesa = (_NEW_PROGRAM | - _NEW_LIGHT | - _NEW_TRANSFORM | - _NEW_FOG | - _NEW_HINT | - _NEW_POINT | - _NEW_TEXTURE | - _NEW_TEXTURE_MATRIX), - .brw = (BRW_NEW_FRAGMENT_PROGRAM | - BRW_NEW_INPUT_VARYING), - .cache = 0 - }, - .prepare = prepare_tnl_program -}; - - - - -static int prepare_active_vertprog( struct brw_context *brw ) -{ - const struct gl_vertex_program *prev = brw->vertex_program; - - /* NEW_PROGRAM */ - if (brw->attribs.VertexProgram->_Current) { - brw->vertex_program = brw->attribs.VertexProgram->_Current; - } - else { - /* BRW_NEW_TNL_PROGRAM */ - brw->vertex_program = brw->tnl_program; - } - - if (brw->vertex_program != prev) - brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM; - - return 0; -} - - - -const struct brw_tracked_state brw_active_vertprog = { - .dirty = { - .mesa = _NEW_PROGRAM, - .brw = BRW_NEW_TNL_PROGRAM, - .cache = 0 - }, - .prepare = prepare_active_vertprog -}; - - -void brw_ProgramCacheInit( GLcontext *ctx ) -{ - struct brw_context *brw = brw_context(ctx); - - brw->tnl_program_cache.size = 17; - brw->tnl_program_cache.n_items = 0; - brw->tnl_program_cache.items = (struct brw_tnl_cache_item **) - _mesa_calloc(brw->tnl_program_cache.size * - sizeof(struct brw_tnl_cache_item)); -} - -void brw_ProgramCacheDestroy( GLcontext *ctx ) -{ - struct brw_context *brw = brw_context(ctx); - struct brw_tnl_cache_item *c, *next; - GLuint i; - - for (i = 0; i < brw->tnl_program_cache.size; i++) - for (c = brw->tnl_program_cache.items[i]; c; c = next) { - next = c->next; - FREE(c->key); - FREE(c->data); - FREE(c); - } - - FREE(brw->tnl_program_cache.items); -} diff --git a/i965/brw_vtbl.c b/i965/brw_vtbl.c index 31e96a2..f7293ef 100644 --- a/i965/brw_vtbl.c +++ b/i965/brw_vtbl.c @@ -32,11 +32,11 @@ -#include "glheader.h" -#include "mtypes.h" -#include "imports.h" -#include "macros.h" -#include "colormac.h" +#include "main/glheader.h" +#include "main/mtypes.h" +#include "main/imports.h" +#include "main/macros.h" +#include "main/colormac.h" #include "intel_batchbuffer.h" #include "intel_regions.h" @@ -51,20 +51,52 @@ #include "brw_vs.h" #include <stdarg.h> +static void +dri_bo_release(dri_bo **bo) +{ + dri_bo_unreference(*bo); + *bo = NULL; +} /* called from intelDestroyContext() */ static void brw_destroy_context( struct intel_context *intel ) { - GLcontext *ctx = &intel->ctx; struct brw_context *brw = brw_context(&intel->ctx); + int i; brw_destroy_metaops(brw); brw_destroy_state(brw); brw_draw_destroy( brw ); - brw_ProgramCacheDestroy( ctx ); brw_FrameBufferTexDestroy( brw ); + + for (i = 0; i < brw->state.nr_draw_regions; i++) + intel_region_release(&brw->state.draw_regions[i]); + brw->state.nr_draw_regions = 0; + intel_region_release(&brw->state.depth_region); + + dri_bo_release(&brw->curbe.curbe_bo); + dri_bo_release(&brw->vs.prog_bo); + dri_bo_release(&brw->vs.state_bo); + dri_bo_release(&brw->gs.prog_bo); + dri_bo_release(&brw->gs.state_bo); + dri_bo_release(&brw->clip.prog_bo); + dri_bo_release(&brw->clip.state_bo); + dri_bo_release(&brw->clip.vp_bo); + dri_bo_release(&brw->sf.prog_bo); + dri_bo_release(&brw->sf.state_bo); + dri_bo_release(&brw->sf.vp_bo); + for (i = 0; i < BRW_MAX_TEX_UNIT; i++) + dri_bo_release(&brw->wm.sdc_bo[i]); + dri_bo_release(&brw->wm.bind_bo); + for (i = 0; i < BRW_WM_MAX_SURF; i++) + dri_bo_release(&brw->wm.surf_bo[i]); + dri_bo_release(&brw->wm.prog_bo); + dri_bo_release(&brw->wm.state_bo); + dri_bo_release(&brw->cc.prog_bo); + dri_bo_release(&brw->cc.state_bo); + dri_bo_release(&brw->cc.vp_bo); } /* called from intelDrawBuffer() @@ -87,6 +119,15 @@ static void brw_set_draw_region( struct intel_context *intel, brw->state.nr_draw_regions = num_regions; } +/* called from intel_batchbuffer_flush and children before sending a + * batchbuffer off. + */ +static void brw_finish_batch(struct intel_context *intel) +{ + struct brw_context *brw = brw_context(&intel->ctx); + + brw_emit_query_end(brw); +} /* called from intelFlushBatchLocked */ @@ -97,8 +138,7 @@ static void brw_new_batch( struct intel_context *intel ) /* Check that we didn't just wrap our batchbuffer at a bad time. */ assert(!brw->no_batch_wrap); - dri_bo_unreference(brw->curbe.curbe_bo); - brw->curbe.curbe_bo = NULL; + brw->curbe.need_new_bo = GL_TRUE; /* Mark all context state as needing to be re-emitted. * This is probably not as severe as on 915, since almost all of our state @@ -131,8 +171,6 @@ static void brw_note_unlock( struct intel_context *intel ) struct brw_context *brw = brw_context(&intel->ctx); brw_state_cache_check_size(brw); - - brw_context(&intel->ctx)->state.dirty.brw |= BRW_NEW_LOCK; } @@ -186,6 +224,7 @@ void brwInitVtbl( struct brw_context *brw ) brw->intel.vtbl.note_fence = brw_note_fence; brw->intel.vtbl.note_unlock = brw_note_unlock; brw->intel.vtbl.new_batch = brw_new_batch; + brw->intel.vtbl.finish_batch = brw_finish_batch; brw->intel.vtbl.destroy = brw_destroy_context; brw->intel.vtbl.set_draw_region = brw_set_draw_region; brw->intel.vtbl.flush_cmd = brw_flush_cmd; diff --git a/i965/brw_wm.c b/i965/brw_wm.c index a470a25..c50b0d2 100644 --- a/i965/brw_wm.c +++ b/i965/brw_wm.c @@ -36,63 +36,24 @@ #include "brw_state.h" +/** Return number of src args for given instruction */ GLuint brw_wm_nr_args( GLuint opcode ) { switch (opcode) { - case WM_PIXELXY: - case OPCODE_ABS: - case OPCODE_FLR: - case OPCODE_FRC: - case OPCODE_SWZ: - case OPCODE_MOV: - case OPCODE_COS: - case OPCODE_EX2: - case OPCODE_LG2: - case OPCODE_RCP: - case OPCODE_RSQ: - case OPCODE_SIN: - case OPCODE_SCS: - case OPCODE_TEX: - case OPCODE_TXB: - case OPCODE_TXP: - case OPCODE_KIL: - case OPCODE_LIT: - case WM_CINTERP: - case WM_WPOSXY: + case WM_CINTERP: + case WM_WPOSXY: return 1; - - case OPCODE_POW: - case OPCODE_SUB: - case OPCODE_SGE: - case OPCODE_SGT: - case OPCODE_SLE: - case OPCODE_SLT: - case OPCODE_SEQ: - case OPCODE_SNE: - case OPCODE_ADD: - case OPCODE_MAX: - case OPCODE_MIN: - case OPCODE_MUL: - case OPCODE_XPD: - case OPCODE_DP3: - case OPCODE_DP4: - case OPCODE_DPH: - case OPCODE_DST: - case WM_LINTERP: + case WM_LINTERP: case WM_DELTAXY: case WM_PIXELW: return 2; - case WM_FB_WRITE: - case WM_PINTERP: - case OPCODE_MAD: - case OPCODE_CMP: - case OPCODE_LRP: + case WM_PINTERP: return 3; - default: - return 0; + assert(opcode < MAX_OPCODE); + return _mesa_num_inst_src_regs(opcode); } } @@ -175,6 +136,9 @@ static void do_wm_prog( struct brw_context *brw, */ brw_wm_emit(c); } + if (INTEL_DEBUG & DEBUG_WM) + fprintf(stderr, "\n"); + /* get the program */ program = brw_get_program(&c->func, &program_size); @@ -230,12 +194,6 @@ static void brw_wm_populate_key( struct brw_context *brw, lookup |= IZ_STENCIL_WRITE_ENABLE_BIT; } - /* XXX: when should this be disabled? - */ - if (1) - lookup |= IZ_EARLY_DEPTH_TEST_BIT; - - line_aa = AA_NEVER; /* _NEW_LINE, _NEW_POLYGON, BRW_NEW_REDUCED_PRIMITIVE */ @@ -322,7 +280,7 @@ static void brw_wm_populate_key( struct brw_context *brw, } -static int brw_prepare_wm_prog( struct brw_context *brw ) +static void brw_prepare_wm_prog(struct brw_context *brw) { struct brw_wm_prog_key key; struct brw_fragment_program *fp = (struct brw_fragment_program *) @@ -339,8 +297,6 @@ static int brw_prepare_wm_prog( struct brw_context *brw ) &brw->wm.prog_data); if (brw->wm.prog_bo == NULL) do_wm_prog(brw, fp, &key); - - return dri_bufmgr_check_aperture_space(brw->wm.prog_bo); } diff --git a/i965/brw_wm.h b/i965/brw_wm.h index 297617e..ded0796 100644 --- a/i965/brw_wm.h +++ b/i965/brw_wm.h @@ -49,8 +49,7 @@ #define IZ_DEPTH_TEST_ENABLE_BIT 0x8 #define IZ_STENCIL_WRITE_ENABLE_BIT 0x10 #define IZ_STENCIL_TEST_ENABLE_BIT 0x20 -#define IZ_EARLY_DEPTH_TEST_BIT 0x40 -#define IZ_BIT_MAX 0x80 +#define IZ_BIT_MAX 0x40 #define AA_NEVER 0 #define AA_SOMETIMES 1 @@ -157,6 +156,7 @@ struct brw_wm_instruction { #define BRW_WM_MAX_PARAM 256 #define BRW_WM_MAX_CONST 256 #define BRW_WM_MAX_KILLS MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS +#define BRW_WM_MAX_SUBROUTINE 16 @@ -246,7 +246,10 @@ struct brw_wm_compile { struct brw_reg stack; struct brw_reg emit_mask_reg; GLuint reg_index; + GLuint tmp_regs[BRW_WM_MAX_GRF]; GLuint tmp_index; + GLuint tmp_max; + GLuint subroutines[BRW_WM_MAX_SUBROUTINE]; }; diff --git a/i965/brw_wm_debug.c b/i965/brw_wm_debug.c index f31d097..8f07f89 100644 --- a/i965/brw_wm_debug.c +++ b/i965/brw_wm_debug.c @@ -163,9 +163,9 @@ void brw_wm_print_program( struct brw_wm_compile *c, { GLuint insn; - _mesa_printf("\n\n\n%s:\n", stage); + _mesa_printf("%s:\n", stage); for (insn = 0; insn < c->nr_insns; insn++) brw_wm_print_insn(c, &c->instruction[insn]); - _mesa_printf("\n\n\n"); + _mesa_printf("\n"); } diff --git a/i965/brw_wm_emit.c b/i965/brw_wm_emit.c index 9b919b9..b5050a3 100644 --- a/i965/brw_wm_emit.c +++ b/i965/brw_wm_emit.c @@ -30,7 +30,7 @@ */ -#include "macros.h" +#include "main/macros.h" #include "brw_context.h" #include "brw_wm.h" @@ -194,7 +194,7 @@ static void emit_linterp( struct brw_compile *p, interp[2] = brw_vec1_grf(nr+1, 0); interp[3] = brw_vec1_grf(nr+1, 4); - for(i = 0; i < 4; i++ ) { + for (i = 0; i < 4; i++) { if (mask & (1<<i)) { brw_LINE(p, brw_null_reg(), interp[i], deltas[0]); brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]); @@ -219,42 +219,40 @@ static void emit_pinterp( struct brw_compile *p, interp[2] = brw_vec1_grf(nr+1, 0); interp[3] = brw_vec1_grf(nr+1, 4); - for(i = 0; i < 4; i++ ) { + for (i = 0; i < 4; i++) { if (mask & (1<<i)) { brw_LINE(p, brw_null_reg(), interp[i], deltas[0]); brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]); } } - for(i = 0; i < 4; i++ ) { + for (i = 0; i < 4; i++) { if (mask & (1<<i)) { brw_MUL(p, dst[i], dst[i], w[3]); } } } + static void emit_cinterp( struct brw_compile *p, const struct brw_reg *dst, GLuint mask, const struct brw_reg *arg0 ) { - struct brw_reg interp[4]; - GLuint nr = arg0[0].nr; - GLuint i; - - interp[0] = brw_vec1_grf(nr, 0); - interp[1] = brw_vec1_grf(nr, 4); - interp[2] = brw_vec1_grf(nr+1, 0); - interp[3] = brw_vec1_grf(nr+1, 4); - - for(i = 0; i < 4; i++ ) { - if (mask & (1<<i)) { - brw_MOV(p, dst[i], suboffset(interp[i],3)); /* TODO: optimize away like other moves */ - } - } -} - + struct brw_reg interp[4]; + GLuint nr = arg0[0].nr; + GLuint i; + interp[0] = brw_vec1_grf(nr, 0); + interp[1] = brw_vec1_grf(nr, 4); + interp[2] = brw_vec1_grf(nr+1, 0); + interp[3] = brw_vec1_grf(nr+1, 4); + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + brw_MOV(p, dst[i], suboffset(interp[i],3)); /* TODO: optimize away like other moves */ + } + } +} static void emit_alu1( struct brw_compile *p, @@ -280,6 +278,7 @@ static void emit_alu1( struct brw_compile *p, brw_set_saturate(p, 0); } + static void emit_alu2( struct brw_compile *p, struct brw_instruction *(*func)(struct brw_compile *, struct brw_reg, @@ -351,6 +350,7 @@ static void emit_lrp( struct brw_compile *p, } } } + static void emit_sop( struct brw_compile *p, const struct brw_reg *dst, GLuint mask, @@ -376,7 +376,7 @@ static void emit_slt( struct brw_compile *p, const struct brw_reg *arg0, const struct brw_reg *arg1 ) { - emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1); + emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1); } static void emit_sle( struct brw_compile *p, @@ -385,7 +385,7 @@ static void emit_sle( struct brw_compile *p, const struct brw_reg *arg0, const struct brw_reg *arg1 ) { - emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1); + emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1); } static void emit_sgt( struct brw_compile *p, @@ -394,7 +394,7 @@ static void emit_sgt( struct brw_compile *p, const struct brw_reg *arg0, const struct brw_reg *arg1 ) { - emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1); + emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1); } static void emit_sge( struct brw_compile *p, @@ -403,7 +403,7 @@ static void emit_sge( struct brw_compile *p, const struct brw_reg *arg0, const struct brw_reg *arg1 ) { - emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1); + emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1); } static void emit_seq( struct brw_compile *p, @@ -412,7 +412,7 @@ static void emit_seq( struct brw_compile *p, const struct brw_reg *arg0, const struct brw_reg *arg1 ) { - emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1); + emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1); } static void emit_sne( struct brw_compile *p, @@ -421,7 +421,7 @@ static void emit_sne( struct brw_compile *p, const struct brw_reg *arg0, const struct brw_reg *arg1 ) { - emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1); + emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1); } static void emit_cmp( struct brw_compile *p, @@ -505,7 +505,7 @@ static void emit_dp3( struct brw_compile *p, const struct brw_reg *arg1 ) { if (!(mask & WRITEMASK_XYZW)) - return; /* Do not emit dead code*/ + return; /* Do not emit dead code */ assert((mask & WRITEMASK_XYZW) == WRITEMASK_X); @@ -525,7 +525,7 @@ static void emit_dp4( struct brw_compile *p, const struct brw_reg *arg1 ) { if (!(mask & WRITEMASK_XYZW)) - return; /* Do not emit dead code*/ + return; /* Do not emit dead code */ assert((mask & WRITEMASK_XYZW) == WRITEMASK_X); @@ -546,7 +546,7 @@ static void emit_dph( struct brw_compile *p, const struct brw_reg *arg1 ) { if (!(mask & WRITEMASK_XYZW)) - return; /* Do not emit dead code*/ + return; /* Do not emit dead code */ assert((mask & WRITEMASK_XYZW) == WRITEMASK_X); @@ -592,7 +592,7 @@ static void emit_math1( struct brw_compile *p, const struct brw_reg *arg0 ) { if (!(mask & WRITEMASK_XYZW)) - return; /* Do not emit dead code*/ + return; /* Do not emit dead code */ //assert((mask & WRITEMASK_XYZW) == WRITEMASK_X || // function == BRW_MATH_FUNCTION_SINCOS); @@ -619,7 +619,7 @@ static void emit_math2( struct brw_compile *p, const struct brw_reg *arg1) { if (!(mask & WRITEMASK_XYZW)) - return; /* Do not emit dead code*/ + return; /* Do not emit dead code */ assert((mask & WRITEMASK_XYZW) == WRITEMASK_X); @@ -760,7 +760,6 @@ static void emit_txb( struct brw_wm_compile *c, brw_MOV(p, brw_message_reg(8), arg[3]); msgLength = 9; - brw_SAMPLE(p, retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW), 1, @@ -772,7 +771,6 @@ static void emit_txb( struct brw_wm_compile *c, 8, /* responseLength */ msgLength, 0); - } @@ -823,7 +821,6 @@ static void emit_kil( struct brw_wm_compile *c, struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); GLuint i; - /* XXX - usually won't need 4 compares! */ for (i = 0; i < 4; i++) { @@ -836,6 +833,7 @@ static void emit_kil( struct brw_wm_compile *c, } } + static void fire_fb_write( struct brw_wm_compile *c, GLuint base_reg, GLuint nr, @@ -869,6 +867,7 @@ static void fire_fb_write( struct brw_wm_compile *c, eot); } + static void emit_aa( struct brw_wm_compile *c, struct brw_reg *arg1, GLuint reg ) @@ -962,7 +961,6 @@ static void emit_fb_write( struct brw_wm_compile *c, nr += 2; } - if (!c->key.runtime_check_aads_emit) { if (c->key.aa_dest_stencil_reg) emit_aa(c, arg1, 2); @@ -996,8 +994,6 @@ static void emit_fb_write( struct brw_wm_compile *c, } - - /* Post-fragment-program processing. Send the results to the * framebuffer. */ @@ -1022,6 +1018,7 @@ static void emit_spill( struct brw_wm_compile *c, slot); } + static void emit_unspill( struct brw_wm_compile *c, struct brw_reg reg, GLuint slot ) @@ -1047,7 +1044,6 @@ static void emit_unspill( struct brw_wm_compile *c, } - /** * Retrieve upto 4 GEN4 register pairs for the given wm reg: */ @@ -1073,6 +1069,7 @@ static void get_argument_regs( struct brw_wm_compile *c, } } + static void spill_values( struct brw_wm_compile *c, struct brw_wm_value *values, GLuint nr ) @@ -1085,7 +1082,6 @@ static void spill_values( struct brw_wm_compile *c, } - /* Emit the fragment program instructions here. */ void brw_wm_emit( struct brw_wm_compile *c ) @@ -1176,7 +1172,7 @@ void brw_wm_emit( struct brw_wm_compile *c ) emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]); break; - case OPCODE_DP3: /* */ + case OPCODE_DP3: emit_dp3(p, dst, dst_flags, args[0], args[1]); break; @@ -1188,7 +1184,7 @@ void brw_wm_emit( struct brw_wm_compile *c ) emit_dph(p, dst, dst_flags, args[0], args[1]); break; - case OPCODE_LRP: /* */ + case OPCODE_LRP: emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]); break; @@ -1302,8 +1298,10 @@ void brw_wm_emit( struct brw_wm_compile *c ) break; default: - _mesa_printf("unsupport opcode %d in fragment program\n", - inst->opcode); + _mesa_printf("Unsupported opcode %i (%s) in fragment shader\n", + inst->opcode, inst->opcode < MAX_OPCODE ? + _mesa_opcode_string(inst->opcode) : + "unknown"); } for (i = 0; i < 4; i++) @@ -1313,8 +1311,3 @@ void brw_wm_emit( struct brw_wm_compile *c ) inst->dst[i]->spill_slot); } } - - - - - diff --git a/i965/brw_wm_fp.c b/i965/brw_wm_fp.c index bc933fe..6df2c95 100644 --- a/i965/brw_wm_fp.c +++ b/i965/brw_wm_fp.c @@ -30,9 +30,9 @@ */ -#include "glheader.h" -#include "macros.h" -#include "enums.h" +#include "main/glheader.h" +#include "main/macros.h" +#include "main/enums.h" #include "brw_context.h" #include "brw_wm.h" #include "brw_util.h" @@ -122,10 +122,11 @@ static struct prog_dst_register dst_reg(GLuint file, GLuint idx) reg.File = file; reg.Index = idx; reg.WriteMask = WRITEMASK_XYZW; + reg.RelAddr = 0; reg.CondMask = 0; reg.CondSwizzle = 0; - reg.pad = 0; reg.CondSrc = 0; + reg.pad = 0; return reg; } @@ -426,10 +427,6 @@ static struct prog_src_register search_or_add_param5(struct brw_wm_compile *c, idx = _mesa_add_state_reference( paramList, tokens ); - /* Recalculate state dependency: - */ - c->fp->param_state = paramList->StateFlags; - return src_reg(PROGRAM_STATE_VAR, idx); } @@ -814,62 +811,11 @@ static void precalc_txp( struct brw_wm_compile *c, - - -/*********************************************************************** - * Add instructions to perform fog blending - */ - -static void fog_blend( struct brw_wm_compile *c, - struct prog_src_register fog_factor ) -{ - struct prog_dst_register outcolor = dst_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLR); - struct prog_src_register fogcolor = search_or_add_param5( c, STATE_FOG_COLOR, 0,0,0,0 ); - - /* color.xyz = LRP fog_factor.xxxx, output_color, fog_color */ - - emit_op(c, - OPCODE_LRP, - dst_mask(outcolor, WRITEMASK_XYZ), - 0, 0, 0, - fog_factor, - src_reg_from_dst(outcolor), - fogcolor); -} - - - -/* This one is simple - just take the interpolated fog coordinate and - * use it as the fog blend factor. - */ -static void fog_interpolated( struct brw_wm_compile *c ) -{ - struct prog_src_register fogc = src_reg(PROGRAM_INPUT, FRAG_ATTRIB_FOGC); - - if (!(c->fp_interp_emitted & (1<<FRAG_ATTRIB_FOGC))) - emit_interp(c, FRAG_ATTRIB_FOGC); - - fog_blend( c, src_swizzle1(fogc, GET_SWZ(fogc.Swizzle,X))); -} - -static void emit_fog( struct brw_wm_compile *c ) -{ - if (!c->fp->program.FogOption) - return; - - if (1) - fog_interpolated( c ); - else { - /* TODO: per-pixel fog */ - assert(0); - } -} - static void emit_fb_write( struct brw_wm_compile *c ) { - struct prog_src_register outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLR); struct prog_src_register payload_r0_depth = src_reg(PROGRAM_PAYLOAD, PAYLOAD_DEPTH); struct prog_src_register outdepth = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DEPR); + struct prog_src_register outcolor; GLuint i; struct prog_instruction *inst, *last_inst; @@ -893,7 +839,14 @@ static void emit_fb_write( struct brw_wm_compile *c ) } } last_inst->Sampler |= 1; //eot - }else { + } + else { + /* if gl_FragData[0] is written, use it, else use gl_FragColor */ + if (c->fp->program.Base.OutputsWritten & (1 << FRAG_RESULT_DATA0)) + outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DATA0); + else + outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLR); + inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(),0), 0, 0, 0, outcolor, payload_r0_depth, outdepth); inst->Sampler = 1|(0<<1); @@ -960,7 +913,7 @@ void brw_wm_pass_fp( struct brw_wm_compile *c ) GLuint insn; if (INTEL_DEBUG & DEBUG_WM) { - _mesa_printf("\n\n\npre-fp:\n"); + _mesa_printf("pre-fp:\n"); _mesa_print_program(&fp->program.Base); _mesa_printf("\n"); } @@ -1055,7 +1008,6 @@ void brw_wm_pass_fp( struct brw_wm_compile *c ) emit_ddy(c, inst); break; case OPCODE_END: - emit_fog(c); emit_fb_write(c); break; case OPCODE_PRINT: @@ -1068,7 +1020,7 @@ void brw_wm_pass_fp( struct brw_wm_compile *c ) } if (INTEL_DEBUG & DEBUG_WM) { - _mesa_printf("\n\n\npass_fp:\n"); + _mesa_printf("pass_fp:\n"); print_insns( c->prog_instructions, c->nr_fp_insns ); _mesa_printf("\n"); } diff --git a/i965/brw_wm_glsl.c b/i965/brw_wm_glsl.c index 8dce40f..8fd776a 100644 --- a/i965/brw_wm_glsl.c +++ b/i965/brw_wm_glsl.c @@ -1,9 +1,13 @@ -#include "macros.h" +#include "main/macros.h" #include "shader/prog_parameter.h" #include "brw_context.h" #include "brw_eu.h" #include "brw_wm.h" +enum _subroutine { + SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4 +}; + /* Only guess, need a flag in gl_fragment_program later */ GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp) { @@ -12,13 +16,17 @@ GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp) struct prog_instruction *inst = &fp->Base.Instructions[i]; switch (inst->Opcode) { case OPCODE_IF: - case OPCODE_INT: + case OPCODE_TRUNC: case OPCODE_ENDIF: case OPCODE_CAL: case OPCODE_BRK: case OPCODE_RET: case OPCODE_DDX: case OPCODE_DDY: + case OPCODE_NOISE1: + case OPCODE_NOISE2: + case OPCODE_NOISE3: + case OPCODE_NOISE4: case OPCODE_BGNLOOP: return GL_TRUE; default: @@ -47,13 +55,26 @@ static int get_scalar_dst_index(struct prog_instruction *inst) static struct brw_reg alloc_tmp(struct brw_wm_compile *c) { struct brw_reg reg; - reg = brw_vec8_grf(c->tmp_index--, 0); + if(c->tmp_index == c->tmp_max) + c->tmp_regs[ c->tmp_max++ ] = c->reg_index++; + + reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0); return reg; } -static void release_tmps(struct brw_wm_compile *c) +static int mark_tmps(struct brw_wm_compile *c) +{ + return c->tmp_index; +} + +static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index ) { - c->tmp_index = 127; + return brw_vec8_grf( c->tmp_regs[ index ], 0 ); +} + +static void release_tmps(struct brw_wm_compile *c, int mark) +{ + c->tmp_index = mark; } static struct brw_reg @@ -155,6 +176,68 @@ static struct brw_reg get_src_reg(struct brw_wm_compile *c, src->NegateBase, src->Abs); } +/* Subroutines are minimal support for resusable instruction sequences. + They are implemented as simply as possible to minimise overhead: there + is no explicit support for communication between the caller and callee + other than saving the return address in a temporary register, nor is + there any automatic local storage. This implies that great care is + required before attempting reentrancy or any kind of nested + subroutine invocations. */ +static void invoke_subroutine( struct brw_wm_compile *c, + enum _subroutine subroutine, + void (*emit)( struct brw_wm_compile * ) ) +{ + struct brw_compile *p = &c->func; + + assert( subroutine < BRW_WM_MAX_SUBROUTINE ); + + if( c->subroutines[ subroutine ] ) { + /* subroutine previously emitted: reuse existing instructions */ + + int mark = mark_tmps( c ); + struct brw_reg return_address = retype( alloc_tmp( c ), + BRW_REGISTER_TYPE_UD ); + int here = p->nr_insn; + + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) ); + + brw_ADD( p, brw_ip_reg(), brw_ip_reg(), + brw_imm_d( ( c->subroutines[ subroutine ] - + here - 1 ) << 4 ) ); + brw_pop_insn_state(p); + + release_tmps( c, mark ); + } else { + /* previously unused subroutine: emit, and mark for later reuse */ + + int mark = mark_tmps( c ); + struct brw_reg return_address = retype( alloc_tmp( c ), + BRW_REGISTER_TYPE_UD ); + struct brw_instruction *calc; + int base = p->nr_insn; + + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); + calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) ); + brw_pop_insn_state(p); + + c->subroutines[ subroutine ] = p->nr_insn; + + emit( c ); + + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_MOV( p, brw_ip_reg(), return_address ); + brw_pop_insn_state(p); + + brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) ); + + release_tmps( c, mark ); + } +} + static void emit_abs( struct brw_wm_compile *c, struct prog_instruction *inst) { @@ -172,7 +255,7 @@ static void emit_abs( struct brw_wm_compile *c, brw_set_saturate(p, 0); } -static void emit_int( struct brw_wm_compile *c, +static void emit_trunc( struct brw_wm_compile *c, struct prog_instruction *inst) { int i; @@ -184,7 +267,7 @@ static void emit_int( struct brw_wm_compile *c, struct brw_reg src, dst; dst = get_dst_reg(c, inst, i, 1) ; src = get_src_reg(c, &inst->SrcReg[0], i, 1); - brw_RNDD(p, dst, src); + brw_RNDZ(p, dst, src); } } brw_set_saturate(p, 0); @@ -540,7 +623,7 @@ static void emit_dph(struct brw_wm_compile *c, brw_MAC(p, brw_null_reg(), src0[1], src1[1]); brw_MAC(p, dst, src0[2], src1[2]); brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); - brw_ADD(p, dst, src0[3], src1[3]); + brw_ADD(p, dst, dst, src1[3]); brw_set_saturate(p, 0); } @@ -778,6 +861,7 @@ static void emit_lrp(struct brw_wm_compile *c, GLuint mask = inst->DstReg.WriteMask; struct brw_reg dst, tmp1, tmp2, src0, src1, src2; int i; + int mark = mark_tmps(c); for (i = 0; i < 4; i++) { if (mask & (1<<i)) { dst = get_dst_reg(c, inst, i, 1); @@ -804,19 +888,23 @@ static void emit_lrp(struct brw_wm_compile *c, brw_MAC(p, dst, src0, tmp1); brw_set_saturate(p, 0); } - release_tmps(c); + release_tmps(c, mark); } } +/** + * For GLSL shaders, this KIL will be unconditional. + * It may be contained inside an IF/ENDIF structure of course. + */ static void emit_kil(struct brw_wm_compile *c) { - struct brw_compile *p = &c->func; - struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); - brw_push_insn_state(p); - brw_set_mask_control(p, BRW_MASK_DISABLE); - brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK - brw_AND(p, depth, c->emit_mask_reg, depth); - brw_pop_insn_state(p); + struct brw_compile *p = &c->func; + struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK + brw_AND(p, depth, c->emit_mask_reg, depth); + brw_pop_insn_state(p); } static void emit_mad(struct brw_wm_compile *c, @@ -957,6 +1045,1055 @@ static void emit_ddy(struct brw_wm_compile *c, brw_set_saturate(p, 0); } +static __inline struct brw_reg high_words( struct brw_reg reg ) +{ + return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ), + 0, 8, 2 ); +} + +static __inline struct brw_reg low_words( struct brw_reg reg ) +{ + return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 ); +} + +static __inline struct brw_reg even_bytes( struct brw_reg reg ) +{ + return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 ); +} + +static __inline struct brw_reg odd_bytes( struct brw_reg reg ) +{ + return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ), + 0, 16, 2 ); +} + +/* One-, two- and three-dimensional Perlin noise, similar to the description + in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */ +static void noise1_sub( struct brw_wm_compile *c ) { + + struct brw_compile *p = &c->func; + struct brw_reg param, + x0, x1, /* gradients at each end */ + t, tmp[ 2 ], /* float temporaries */ + itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */ + int i; + int mark = mark_tmps( c ); + + x0 = alloc_tmp( c ); + x1 = alloc_tmp( c ); + t = alloc_tmp( c ); + tmp[ 0 ] = alloc_tmp( c ); + tmp[ 1 ] = alloc_tmp( c ); + itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD ); + itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD ); + itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD ); + itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD ); + itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD ); + + param = lookup_tmp( c, mark - 2 ); + + brw_set_access_mode( p, BRW_ALIGN_1 ); + + brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */ + + /* Arrange the two end coordinates into scalars (itmp0/itmp1) to + be hashed. Also compute the remainder (offset within the unit + length), interleaved to reduce register dependency penalties. */ + brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param ); + brw_FRC( p, param, param ); + brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) ); + brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */ + brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */ + + /* We're now ready to perform the hashing. The two hashes are + interleaved for performance. The hash function used is + designed to rapidly achieve avalanche and require only 32x16 + bit multiplication, and 16-bit swizzles (which we get for + free). We can't use immediate operands in the multiplies, + because immediates are permitted only in src1 and the 16-bit + factor is permitted only in src0. */ + for( i = 0; i < 2; i++ ) + brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] ); + for( i = 0; i < 2; i++ ) + brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ), + high_words( itmp[ i ] ) ); + for( i = 0; i < 2; i++ ) + brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] ); + for( i = 0; i < 2; i++ ) + brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ), + high_words( itmp[ i ] ) ); + for( i = 0; i < 2; i++ ) + brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] ); + for( i = 0; i < 2; i++ ) + brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ), + high_words( itmp[ i ] ) ); + + /* Now we want to initialise the two gradients based on the + hashes. Format conversion from signed integer to float leaves + everything scaled too high by a factor of pow( 2, 31 ), but + we correct for that right at the end. */ + brw_ADD( p, t, param, brw_imm_f( -1.0 ) ); + brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) ); + brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) ); + + brw_MUL( p, x0, x0, param ); + brw_MUL( p, x1, x1, t ); + + /* We interpolate between the gradients using the polynomial + 6t^5 - 15t^4 + 10t^3 (Perlin). */ + brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) ); + brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) ); + brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param ); + brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) ); + brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param ); + brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the + pipeline */ + brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param ); + brw_MUL( p, param, tmp[ 0 ], param ); + brw_MUL( p, x1, x1, param ); + brw_ADD( p, x0, x0, x1 ); + /* scale by pow( 2, -30 ), to compensate for the format conversion + above and an extra factor of 2 so that a single gradient covers + the [-1,1] range */ + brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) ); + + release_tmps( c, mark ); +} + +static void emit_noise1( struct brw_wm_compile *c, + struct prog_instruction *inst ) +{ + struct brw_compile *p = &c->func; + struct brw_reg src, param, dst; + GLuint mask = inst->DstReg.WriteMask; + int i; + int mark = mark_tmps( c ); + + assert( mark == 0 ); + + src = get_src_reg( c, inst->SrcReg, 0, 1 ); + + param = alloc_tmp( c ); + + brw_MOV( p, param, src ); + + invoke_subroutine( c, SUB_NOISE1, noise1_sub ); + + /* Fill in the result: */ + brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE ); + for (i = 0 ; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i, 1); + brw_MOV( p, dst, param ); + } + } + if( inst->SaturateMode == SATURATE_ZERO_ONE ) + brw_set_saturate( p, 0 ); + + release_tmps( c, mark ); +} + +static void noise2_sub( struct brw_wm_compile *c ) { + + struct brw_compile *p = &c->func; + struct brw_reg param0, param1, + x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */ + t, tmp[ 4 ], /* float temporaries */ + itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */ + int i; + int mark = mark_tmps( c ); + + x0y0 = alloc_tmp( c ); + x0y1 = alloc_tmp( c ); + x1y0 = alloc_tmp( c ); + x1y1 = alloc_tmp( c ); + t = alloc_tmp( c ); + for( i = 0; i < 4; i++ ) { + tmp[ i ] = alloc_tmp( c ); + itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD ); + } + itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD ); + itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD ); + itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD ); + + param0 = lookup_tmp( c, mark - 3 ); + param1 = lookup_tmp( c, mark - 2 ); + + brw_set_access_mode( p, BRW_ALIGN_1 ); + + /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to + be hashed. Also compute the remainders (offsets within the unit + square), interleaved to reduce register dependency penalties. */ + brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 ); + brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 ); + brw_FRC( p, param0, param0 ); + brw_FRC( p, param1, param1 ); + brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */ + brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ), + low_words( itmp[ 1 ] ) ); + brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */ + brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */ + brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) ); + brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) ); + brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) ); + + /* We're now ready to perform the hashing. The four hashes are + interleaved for performance. The hash function used is + designed to rapidly achieve avalanche and require only 32x16 + bit multiplication, and 16-bit swizzles (which we get for + free). We can't use immediate operands in the multiplies, + because immediates are permitted only in src1 and the 16-bit + factor is permitted only in src0. */ + for( i = 0; i < 4; i++ ) + brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] ); + for( i = 0; i < 4; i++ ) + brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ), + high_words( itmp[ i ] ) ); + for( i = 0; i < 4; i++ ) + brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] ); + for( i = 0; i < 4; i++ ) + brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ), + high_words( itmp[ i ] ) ); + for( i = 0; i < 4; i++ ) + brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] ); + for( i = 0; i < 4; i++ ) + brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ), + high_words( itmp[ i ] ) ); + + /* Now we want to initialise the four gradients based on the + hashes. Format conversion from signed integer to float leaves + everything scaled too high by a factor of pow( 2, 15 ), but + we correct for that right at the end. */ + brw_ADD( p, t, param0, brw_imm_f( -1.0 ) ); + brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) ); + brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) ); + brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) ); + brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) ); + + brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) ); + brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) ); + brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) ); + brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) ); + + brw_MUL( p, x1y0, x1y0, t ); + brw_MUL( p, x1y1, x1y1, t ); + brw_ADD( p, t, param1, brw_imm_f( -1.0 ) ); + brw_MUL( p, x0y0, x0y0, param0 ); + brw_MUL( p, x0y1, x0y1, param0 ); + + brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 ); + brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 ); + brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t ); + brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t ); + + brw_ADD( p, x0y0, x0y0, tmp[ 0 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 2 ] ); + brw_ADD( p, x0y1, x0y1, tmp[ 1 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 3 ] ); + + /* We interpolate between the gradients using the polynomial + 6t^5 - 15t^4 + 10t^3 (Perlin). */ + brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) ); + brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) ); + brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) ); + brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) ); + brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 ); + brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 ); + brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the + pipeline */ + brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) ); + brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) ); + brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 ); + brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 ); + brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the + pipeline */ + brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 ); + brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 ); + brw_MUL( p, param0, tmp[ 0 ], param0 ); + brw_MUL( p, param1, tmp[ 1 ], param1 ); + + /* Here we interpolate in the y dimension... */ + brw_MUL( p, x0y1, x0y1, param1 ); + brw_MUL( p, x1y1, x1y1, param1 ); + brw_ADD( p, x0y0, x0y0, x0y1 ); + brw_ADD( p, x1y0, x1y0, x1y1 ); + + /* And now in x. There are horrible register dependencies here, + but we have nothing else to do. */ + brw_ADD( p, x1y0, x1y0, negate( x0y0 ) ); + brw_MUL( p, x1y0, x1y0, param0 ); + brw_ADD( p, x0y0, x0y0, x1y0 ); + + /* scale by pow( 2, -15 ), as described above */ + brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) ); + + release_tmps( c, mark ); +} + +static void emit_noise2( struct brw_wm_compile *c, + struct prog_instruction *inst ) +{ + struct brw_compile *p = &c->func; + struct brw_reg src0, src1, param0, param1, dst; + GLuint mask = inst->DstReg.WriteMask; + int i; + int mark = mark_tmps( c ); + + assert( mark == 0 ); + + src0 = get_src_reg( c, inst->SrcReg, 0, 1 ); + src1 = get_src_reg( c, inst->SrcReg, 1, 1 ); + + param0 = alloc_tmp( c ); + param1 = alloc_tmp( c ); + + brw_MOV( p, param0, src0 ); + brw_MOV( p, param1, src1 ); + + invoke_subroutine( c, SUB_NOISE2, noise2_sub ); + + /* Fill in the result: */ + brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE ); + for (i = 0 ; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i, 1); + brw_MOV( p, dst, param0 ); + } + } + if( inst->SaturateMode == SATURATE_ZERO_ONE ) + brw_set_saturate( p, 0 ); + + release_tmps( c, mark ); +} + +/* The three-dimensional case is much like the one- and two- versions above, + but since the number of corners is rapidly growing we now pack 16 16-bit + hashes into each register to extract more parallelism from the EUs. */ +static void noise3_sub( struct brw_wm_compile *c ) { + + struct brw_compile *p = &c->func; + struct brw_reg param0, param1, param2, + x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */ + xi, yi, zi, /* interpolation coefficients */ + t, tmp[ 8 ], /* float temporaries */ + itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */ + wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */ + int i; + int mark = mark_tmps( c ); + + x0y0 = alloc_tmp( c ); + x0y1 = alloc_tmp( c ); + x1y0 = alloc_tmp( c ); + x1y1 = alloc_tmp( c ); + xi = alloc_tmp( c ); + yi = alloc_tmp( c ); + zi = alloc_tmp( c ); + t = alloc_tmp( c ); + for( i = 0; i < 8; i++ ) { + tmp[ i ] = alloc_tmp( c ); + itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD ); + wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 ); + } + + param0 = lookup_tmp( c, mark - 4 ); + param1 = lookup_tmp( c, mark - 3 ); + param2 = lookup_tmp( c, mark - 2 ); + + brw_set_access_mode( p, BRW_ALIGN_1 ); + + /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to + be hashed. Also compute the remainders (offsets within the unit + cube), interleaved to reduce register dependency penalties. */ + brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 ); + brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 ); + brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 ); + brw_FRC( p, param0, param0 ); + brw_FRC( p, param1, param1 ); + brw_FRC( p, param2, param2 ); + /* Since we now have only 16 bits of precision in the hash, we must + be more careful about thorough mixing to maintain entropy as we + squash the input vector into a small scalar. */ + brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) ); + brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) ); + brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ), + brw_imm_uw( 0x9B93 ) ); + brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ), + brw_imm_uw( 0xBC8F ) ); + + /* Temporarily disable the execution mask while we work with ExecSize=16 + channels (the mask is set for ExecSize=8 and is probably incorrect). + Although this might cause execution of unwanted channels, the code + writes only to temporary registers and has no side effects, so + disabling the mask is harmless. */ + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) ); + brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) ); + brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) ); + + /* We're now ready to perform the hashing. The eight hashes are + interleaved for performance. The hash function used is + designed to rapidly achieve avalanche and require only 16x16 + bit multiplication, and 8-bit swizzles (which we get for + free). */ + for( i = 0; i < 4; i++ ) + brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) ); + for( i = 0; i < 4; i++ ) + brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ), + odd_bytes( wtmp[ i ] ) ); + for( i = 0; i < 4; i++ ) + brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) ); + for( i = 0; i < 4; i++ ) + brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ), + odd_bytes( wtmp[ i ] ) ); + brw_pop_insn_state( p ); + + /* Now we want to initialise the four rear gradients based on the + hashes. Format conversion from signed integer to float leaves + everything scaled too high by a factor of pow( 2, 15 ), but + we correct for that right at the end. */ + /* x component */ + brw_ADD( p, t, param0, brw_imm_f( -1.0 ) ); + brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) ); + brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) ); + brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) ); + brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) ); + + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) ); + brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) ); + brw_pop_insn_state( p ); + + brw_MUL( p, x1y0, x1y0, t ); + brw_MUL( p, x1y1, x1y1, t ); + brw_ADD( p, t, param1, brw_imm_f( -1.0 ) ); + brw_MUL( p, x0y0, x0y0, param0 ); + brw_MUL( p, x0y1, x0y1, param0 ); + + /* y component */ + brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) ); + brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) ); + brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) ); + brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) ); + + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) ); + brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) ); + brw_pop_insn_state( p ); + + brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); + brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); + brw_ADD( p, t, param0, brw_imm_f( -1.0 ) ); + brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 ); + brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 ); + + brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); + brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); + + /* z component */ + brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) ); + brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) ); + brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) ); + brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) ); + + brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 ); + brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 ); + brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 ); + brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 ); + + brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); + brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); + + /* We interpolate between the gradients using the polynomial + 6t^5 - 15t^4 + 10t^3 (Perlin). */ + brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) ); + brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) ); + brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) ); + brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) ); + brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) ); + brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) ); + brw_MUL( p, xi, xi, param0 ); + brw_MUL( p, yi, yi, param1 ); + brw_MUL( p, zi, zi, param2 ); + brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) ); + brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) ); + brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) ); + brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */ + brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */ + brw_MUL( p, xi, xi, param0 ); + brw_MUL( p, yi, yi, param1 ); + brw_MUL( p, zi, zi, param2 ); + brw_MUL( p, xi, xi, param0 ); + brw_MUL( p, yi, yi, param1 ); + brw_MUL( p, zi, zi, param2 ); + brw_MUL( p, xi, xi, param0 ); + brw_MUL( p, yi, yi, param1 ); + brw_MUL( p, zi, zi, param2 ); + + /* Here we interpolate in the y dimension... */ + brw_MUL( p, x0y1, x0y1, yi ); + brw_MUL( p, x1y1, x1y1, yi ); + brw_ADD( p, x0y0, x0y0, x0y1 ); + brw_ADD( p, x1y0, x1y0, x1y1 ); + + /* And now in x. Leave the result in tmp[ 0 ] (see below)... */ + brw_ADD( p, x1y0, x1y0, negate( x0y0 ) ); + brw_MUL( p, x1y0, x1y0, xi ); + brw_ADD( p, tmp[ 0 ], x0y0, x1y0 ); + + /* Now do the same thing for the front four gradients... */ + /* x component */ + brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) ); + brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) ); + brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) ); + brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) ); + + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) ); + brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) ); + brw_pop_insn_state( p ); + + brw_MUL( p, x1y0, x1y0, t ); + brw_MUL( p, x1y1, x1y1, t ); + brw_ADD( p, t, param1, brw_imm_f( -1.0 ) ); + brw_MUL( p, x0y0, x0y0, param0 ); + brw_MUL( p, x0y1, x0y1, param0 ); + + /* y component */ + brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) ); + brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) ); + brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) ); + brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) ); + + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) ); + brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) ); + brw_pop_insn_state( p ); + + brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); + brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); + brw_ADD( p, t, param2, brw_imm_f( -1.0 ) ); + brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 ); + brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 ); + + brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); + brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); + + /* z component */ + brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) ); + brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) ); + brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) ); + brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) ); + + brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t ); + brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); + brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t ); + brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); + + brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); + brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); + + /* The interpolation coefficients are still around from last time, so + again interpolate in the y dimension... */ + brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); + brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); + brw_MUL( p, x0y1, x0y1, yi ); + brw_MUL( p, x1y1, x1y1, yi ); + brw_ADD( p, x0y0, x0y0, x0y1 ); + brw_ADD( p, x1y0, x1y0, x1y1 ); + + /* And now in x. The rear face is in tmp[ 0 ] (see above), so this + time put the front face in tmp[ 1 ] and we're nearly there... */ + brw_ADD( p, x1y0, x1y0, negate( x0y0 ) ); + brw_MUL( p, x1y0, x1y0, xi ); + brw_ADD( p, tmp[ 1 ], x0y0, x1y0 ); + + /* The final interpolation, in the z dimension: */ + brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) ); + brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi ); + brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] ); + + /* scale by pow( 2, -15 ), as described above */ + brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) ); + + release_tmps( c, mark ); +} + +static void emit_noise3( struct brw_wm_compile *c, + struct prog_instruction *inst ) +{ + struct brw_compile *p = &c->func; + struct brw_reg src0, src1, src2, param0, param1, param2, dst; + GLuint mask = inst->DstReg.WriteMask; + int i; + int mark = mark_tmps( c ); + + assert( mark == 0 ); + + src0 = get_src_reg( c, inst->SrcReg, 0, 1 ); + src1 = get_src_reg( c, inst->SrcReg, 1, 1 ); + src2 = get_src_reg( c, inst->SrcReg, 2, 1 ); + + param0 = alloc_tmp( c ); + param1 = alloc_tmp( c ); + param2 = alloc_tmp( c ); + + brw_MOV( p, param0, src0 ); + brw_MOV( p, param1, src1 ); + brw_MOV( p, param2, src2 ); + + invoke_subroutine( c, SUB_NOISE3, noise3_sub ); + + /* Fill in the result: */ + brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE ); + for (i = 0 ; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i, 1); + brw_MOV( p, dst, param0 ); + } + } + if( inst->SaturateMode == SATURATE_ZERO_ONE ) + brw_set_saturate( p, 0 ); + + release_tmps( c, mark ); +} + +/* For the four-dimensional case, the little micro-optimisation benefits + we obtain by unrolling all the loops aren't worth the massive bloat it + now causes. Instead, we loop twice around performing a similar operation + to noise3, once for the w=0 cube and once for the w=1, with a bit more + code to glue it all together. */ +static void noise4_sub( struct brw_wm_compile *c ) { + + struct brw_compile *p = &c->func; + struct brw_reg param[ 4 ], + x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */ + w0, /* noise for the w=0 cube */ + floors[ 2 ], /* integer coordinates of base corner of hypercube */ + interp[ 4 ], /* interpolation coefficients */ + t, tmp[ 8 ], /* float temporaries */ + itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */ + wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */ + int i, j; + int mark = mark_tmps( c ); + GLuint loop, origin; + + x0y0 = alloc_tmp( c ); + x0y1 = alloc_tmp( c ); + x1y0 = alloc_tmp( c ); + x1y1 = alloc_tmp( c ); + t = alloc_tmp( c ); + w0 = alloc_tmp( c ); + floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD ); + floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD ); + + for( i = 0; i < 4; i++ ) { + param[ i ] = lookup_tmp( c, mark - 5 + i ); + interp[ i ] = alloc_tmp( c ); + } + + for( i = 0; i < 8; i++ ) { + tmp[ i ] = alloc_tmp( c ); + itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD ); + wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 ); + } + + brw_set_access_mode( p, BRW_ALIGN_1 ); + + /* We only want 16 bits of precision from the integral part of each + co-ordinate, but unfortunately the RNDD semantics would saturate + at 16 bits if we performed the operation directly to a 16-bit + destination. Therefore, we round to 32-bit temporaries where + appropriate, and then store only the lower 16 bits. */ + brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] ); + brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] ); + brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] ); + brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] ); + brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) ); + brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) ); + + /* Modify the flag register here, because the side effect is useful + later (see below). We know for certain that all flags will be + cleared, since the FRC instruction cannot possibly generate + negative results. Even for exceptional inputs (infinities, denormals, + NaNs), the architecture guarantees that the L conditional is false. */ + brw_set_conditionalmod( p, BRW_CONDITIONAL_L ); + brw_FRC( p, param[ 0 ], param[ 0 ] ); + brw_set_predicate_control( p, BRW_PREDICATE_NONE ); + for( i = 1; i < 4; i++ ) + brw_FRC( p, param[ i ], param[ i ] ); + + /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first + of all. */ + for( i = 0; i < 4; i++ ) + brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) ); + for( i = 0; i < 4; i++ ) + brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) ); + for( i = 0; i < 4; i++ ) + brw_MUL( p, interp[ i ], interp[ i ], param[ i ] ); + for( i = 0; i < 4; i++ ) + brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) ); + for( j = 0; j < 3; j++ ) + for( i = 0; i < 4; i++ ) + brw_MUL( p, interp[ i ], interp[ i ], param[ i ] ); + + /* Mark the current address, as it will be a jump destination. The + following code will be executed twice: first, with the flag + register clear indicating the w=0 case, and second with flags + set for w=1. */ + loop = p->nr_insn; + + /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to + be hashed. Since we have only 16 bits of precision in the hash, we + must be careful about thorough mixing to maintain entropy as we + squash the input vector into a small scalar. */ + brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ), + brw_imm_uw( 0xBC8F ) ); + brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ), + brw_imm_uw( 0xD0BD ) ); + brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ), + brw_imm_uw( 0x9B93 ) ); + brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ), + brw_imm_uw( 0xA359 ) ); + brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ), + brw_imm_uw( 0xBC8F ) ); + + /* Temporarily disable the execution mask while we work with ExecSize=16 + channels (the mask is set for ExecSize=8 and is probably incorrect). + Although this might cause execution of unwanted channels, the code + writes only to temporary registers and has no side effects, so + disabling the mask is harmless. */ + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) ); + brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) ); + brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) ); + + /* We're now ready to perform the hashing. The eight hashes are + interleaved for performance. The hash function used is + designed to rapidly achieve avalanche and require only 16x16 + bit multiplication, and 8-bit swizzles (which we get for + free). */ + for( i = 0; i < 4; i++ ) + brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) ); + for( i = 0; i < 4; i++ ) + brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ), + odd_bytes( wtmp[ i ] ) ); + for( i = 0; i < 4; i++ ) + brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) ); + for( i = 0; i < 4; i++ ) + brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ), + odd_bytes( wtmp[ i ] ) ); + brw_pop_insn_state( p ); + + /* Now we want to initialise the four rear gradients based on the + hashes. Format conversion from signed integer to float leaves + everything scaled too high by a factor of pow( 2, 15 ), but + we correct for that right at the end. */ + /* x component */ + brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) ); + brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) ); + brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) ); + brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) ); + brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) ); + + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) ); + brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) ); + brw_pop_insn_state( p ); + + brw_MUL( p, x1y0, x1y0, t ); + brw_MUL( p, x1y1, x1y1, t ); + brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) ); + brw_MUL( p, x0y0, x0y0, param[ 0 ] ); + brw_MUL( p, x0y1, x0y1, param[ 0 ] ); + + /* y component */ + brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) ); + brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) ); + brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) ); + brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) ); + + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) ); + brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) ); + brw_pop_insn_state( p ); + + brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); + brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); + /* prepare t for the w component (used below): w the first time through + the loop; w - 1 the second time) */ + brw_set_predicate_control( p, BRW_PREDICATE_NORMAL ); + brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) ); + p->current->header.predicate_inverse = 1; + brw_MOV( p, t, param[ 3 ] ); + p->current->header.predicate_inverse = 0; + brw_set_predicate_control( p, BRW_PREDICATE_NONE ); + brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] ); + brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] ); + + brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); + brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); + + /* z component */ + brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) ); + brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) ); + brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) ); + brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) ); + + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) ); + brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) ); + brw_pop_insn_state( p ); + + brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] ); + brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] ); + brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] ); + brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] ); + + brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); + brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); + + /* w component */ + brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) ); + brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) ); + brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) ); + brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) ); + + brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t ); + brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); + brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t ); + brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); + brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) ); + + brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); + brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); + + /* Here we interpolate in the y dimension... */ + brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); + brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); + brw_MUL( p, x0y1, x0y1, interp[ 1 ] ); + brw_MUL( p, x1y1, x1y1, interp[ 1 ] ); + brw_ADD( p, x0y0, x0y0, x0y1 ); + brw_ADD( p, x1y0, x1y0, x1y1 ); + + /* And now in x. Leave the result in tmp[ 0 ] (see below)... */ + brw_ADD( p, x1y0, x1y0, negate( x0y0 ) ); + brw_MUL( p, x1y0, x1y0, interp[ 0 ] ); + brw_ADD( p, tmp[ 0 ], x0y0, x1y0 ); + + /* Now do the same thing for the front four gradients... */ + /* x component */ + brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) ); + brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) ); + brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) ); + brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) ); + + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) ); + brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) ); + brw_pop_insn_state( p ); + + brw_MUL( p, x1y0, x1y0, t ); + brw_MUL( p, x1y1, x1y1, t ); + brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) ); + brw_MUL( p, x0y0, x0y0, param[ 0 ] ); + brw_MUL( p, x0y1, x0y1, param[ 0 ] ); + + /* y component */ + brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) ); + brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) ); + brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) ); + brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) ); + + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) ); + brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) ); + brw_pop_insn_state( p ); + + brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); + brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); + brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) ); + brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] ); + brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] ); + + brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); + brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); + + /* z component */ + brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) ); + brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) ); + brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) ); + brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) ); + + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) ); + brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) ); + brw_pop_insn_state( p ); + + brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t ); + brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); + brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t ); + brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); + /* prepare t for the w component (used below): w the first time through + the loop; w - 1 the second time) */ + brw_set_predicate_control( p, BRW_PREDICATE_NORMAL ); + brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) ); + p->current->header.predicate_inverse = 1; + brw_MOV( p, t, param[ 3 ] ); + p->current->header.predicate_inverse = 0; + brw_set_predicate_control( p, BRW_PREDICATE_NONE ); + + brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); + brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); + + /* w component */ + brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) ); + brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) ); + brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) ); + brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) ); + + brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t ); + brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); + brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t ); + brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); + + brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); + brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); + brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); + brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); + + /* Interpolate in the y dimension: */ + brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); + brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); + brw_MUL( p, x0y1, x0y1, interp[ 1 ] ); + brw_MUL( p, x1y1, x1y1, interp[ 1 ] ); + brw_ADD( p, x0y0, x0y0, x0y1 ); + brw_ADD( p, x1y0, x1y0, x1y1 ); + + /* And now in x. The rear face is in tmp[ 0 ] (see above), so this + time put the front face in tmp[ 1 ] and we're nearly there... */ + brw_ADD( p, x1y0, x1y0, negate( x0y0 ) ); + brw_MUL( p, x1y0, x1y0, interp[ 0 ] ); + brw_ADD( p, tmp[ 1 ], x0y0, x1y0 ); + + /* Another interpolation, in the z dimension: */ + brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) ); + brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] ); + brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] ); + + /* Exit the loop if we've computed both cubes... */ + origin = p->nr_insn; + brw_push_insn_state( p ); + brw_set_predicate_control( p, BRW_PREDICATE_NORMAL ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) ); + brw_pop_insn_state( p ); + + /* Save the result for the w=0 case, and increment the w coordinate: */ + brw_MOV( p, w0, tmp[ 0 ] ); + brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ), + brw_imm_uw( 1 ) ); + + /* Loop around for the other cube. Explicitly set the flag register + (unfortunately we must spend an extra instruction to do this: we + can't rely on a side effect of the previous MOV or ADD because + conditional modifiers which are normally true might be false in + exceptional circumstances, e.g. given a NaN input; the add to + brw_ip_reg() is not suitable because the IP is not an 8-vector). */ + brw_push_insn_state( p ); + brw_set_mask_control( p, BRW_MASK_DISABLE ); + brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) ); + brw_ADD( p, brw_ip_reg(), brw_ip_reg(), + brw_imm_d( ( loop - p->nr_insn ) << 4 ) ); + brw_pop_insn_state( p ); + + /* Patch the previous conditional branch now that we know the + destination address. */ + brw_set_src1( p->store + origin, + brw_imm_d( ( p->nr_insn - origin ) << 4 ) ); + + /* The very last interpolation. */ + brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) ); + brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] ); + brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 ); + + /* scale by pow( 2, -15 ), as described above */ + brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) ); + + release_tmps( c, mark ); +} + +static void emit_noise4( struct brw_wm_compile *c, + struct prog_instruction *inst ) +{ + struct brw_compile *p = &c->func; + struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst; + GLuint mask = inst->DstReg.WriteMask; + int i; + int mark = mark_tmps( c ); + + assert( mark == 0 ); + + src0 = get_src_reg( c, inst->SrcReg, 0, 1 ); + src1 = get_src_reg( c, inst->SrcReg, 1, 1 ); + src2 = get_src_reg( c, inst->SrcReg, 2, 1 ); + src3 = get_src_reg( c, inst->SrcReg, 3, 1 ); + + param0 = alloc_tmp( c ); + param1 = alloc_tmp( c ); + param2 = alloc_tmp( c ); + param3 = alloc_tmp( c ); + + brw_MOV( p, param0, src0 ); + brw_MOV( p, param1, src1 ); + brw_MOV( p, param2, src2 ); + brw_MOV( p, param3, src3 ); + + invoke_subroutine( c, SUB_NOISE4, noise4_sub ); + + /* Fill in the result: */ + brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE ); + for (i = 0 ; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i, 1); + brw_MOV( p, dst, param0 ); + } + } + if( inst->SaturateMode == SATURATE_ZERO_ONE ) + brw_set_saturate( p, 0 ); + + release_tmps( c, mark ); +} + static void emit_wpos_xy(struct brw_wm_compile *c, struct prog_instruction *inst) { @@ -1201,8 +2338,8 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c) case OPCODE_LRP: emit_lrp(c, inst); break; - case OPCODE_INT: - emit_int(c, inst); + case OPCODE_TRUNC: + emit_trunc(c, inst); break; case OPCODE_MOV: emit_mov(c, inst); @@ -1276,6 +2413,18 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c) case OPCODE_MAD: emit_mad(c, inst); break; + case OPCODE_NOISE1: + emit_noise1(c, inst); + break; + case OPCODE_NOISE2: + emit_noise2(c, inst); + break; + case OPCODE_NOISE3: + emit_noise3(c, inst); + break; + case OPCODE_NOISE4: + emit_noise4(c, inst); + break; case OPCODE_TEX: emit_tex(c, inst); break; @@ -1368,7 +2517,6 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c) void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c) { brw_wm_pass_fp(c); - c->tmp_index = 127; brw_wm_emit_glsl(brw, c); c->prog_data.total_grf = c->reg_index; c->prog_data.total_scratch = 0; diff --git a/i965/brw_wm_iz.c b/i965/brw_wm_iz.c index ec2b976..bd60ac9 100644 --- a/i965/brw_wm_iz.c +++ b/i965/brw_wm_iz.c @@ -30,7 +30,7 @@ */ -#include "mtypes.h" +#include "main/mtypes.h" #include "brw_wm.h" @@ -52,70 +52,6 @@ const struct { { { P, 0, 0, 0, 0 }, { P, 0, 0, 0, 0 }, - { C, 0, 1, 0, 0 }, - { C, 0, 1, 0, 0 }, - { C, 1, 1, 0, 0 }, - { C, 1, 1, 0, 0 }, - { C, 0, 1, 0, 0 }, - { C, 0, 1, 0, 0 }, - { C, 1, 1, 1, 0 }, - { C, 1, 1, 1, 0 }, - { C, 0, 1, 1, 0 }, - { C, 0, 1, 1, 0 }, - { C, 1, 1, 1, 0 }, - { C, 1, 1, 1, 0 }, - { C, 0, 1, 1, 0 }, - { C, 0, 1, 1, 0 }, - { P, 0, 0, 0, 0 }, - { P, 0, 0, 0, 0 }, - { C, 0, 1, 0, 0 }, - { C, 0, 1, 0, 0 }, - { C, 1, 1, 0, 0 }, - { C, 1, 1, 0, 0 }, - { C, 0, 1, 0, 0 }, - { C, 0, 1, 0, 0 }, - { C, 1, 1, 1, 0 }, - { C, 1, 1, 1, 0 }, - { C, 0, 1, 1, 0 }, - { C, 0, 1, 1, 0 }, - { C, 1, 1, 1, 0 }, - { C, 1, 1, 1, 0 }, - { C, 0, 1, 1, 0 }, - { C, 0, 1, 1, 0 }, - { C, 0, 0, 0, 1 }, - { C, 0, 0, 0, 1 }, - { C, 0, 1, 0, 1 }, - { C, 0, 1, 0, 1 }, - { C, 1, 1, 0, 1 }, - { C, 1, 1, 0, 1 }, - { C, 0, 1, 0, 1 }, - { C, 0, 1, 0, 1 }, - { C, 1, 1, 1, 1 }, - { C, 1, 1, 1, 1 }, - { C, 0, 1, 1, 1 }, - { C, 0, 1, 1, 1 }, - { C, 1, 1, 1, 1 }, - { C, 1, 1, 1, 1 }, - { C, 0, 1, 1, 1 }, - { C, 0, 1, 1, 1 }, - { C, 0, 0, 0, 1 }, - { C, 0, 0, 0, 1 }, - { C, 0, 1, 0, 1 }, - { C, 0, 1, 0, 1 }, - { C, 1, 1, 0, 1 }, - { C, 1, 1, 0, 1 }, - { C, 0, 1, 0, 1 }, - { C, 0, 1, 0, 1 }, - { C, 1, 1, 1, 1 }, - { C, 1, 1, 1, 1 }, - { C, 0, 1, 1, 1 }, - { C, 0, 1, 1, 1 }, - { C, 1, 1, 1, 1 }, - { C, 1, 1, 1, 1 }, - { C, 0, 1, 1, 1 }, - { C, 0, 1, 1, 1 }, - { P, 0, 0, 0, 0 }, - { P, 0, 0, 0, 0 }, { P, 0, 0, 0, 0 }, { P, 0, 0, 0, 0 }, { P, 0, 0, 0, 0 }, diff --git a/i965/brw_wm_sampler_state.c b/i965/brw_wm_sampler_state.c index d40332e..8c9cb78 100644 --- a/i965/brw_wm_sampler_state.c +++ b/i965/brw_wm_sampler_state.c @@ -34,7 +34,7 @@ #include "brw_state.h" #include "brw_defines.h" -#include "macros.h" +#include "main/macros.h" @@ -229,6 +229,9 @@ brw_wm_sampler_populate_key(struct brw_context *brw, struct wm_sampler_entry *entry = &key->sampler[unit]; struct gl_texture_unit *texUnit = &brw->attribs.Texture->Unit[unit]; struct gl_texture_object *texObj = texUnit->_Current; + struct intel_texture_object *intelObj = intel_texture_object(texObj); + struct gl_texture_image *firstImage = + texObj->Image[0][intelObj->firstLevel]; entry->wrap_r = texObj->WrapR; entry->wrap_s = texObj->WrapS; @@ -241,11 +244,25 @@ brw_wm_sampler_populate_key(struct brw_context *brw, entry->minfilter = texObj->MinFilter; entry->magfilter = texObj->MagFilter; entry->comparemode = texObj->CompareMode; - entry->comparefunc = texObj->CompareFunc; + entry->comparefunc = texObj->CompareFunc; dri_bo_unreference(brw->wm.sdc_bo[unit]); - brw->wm.sdc_bo[unit] = upload_default_color(brw, texObj->BorderColor); - + if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) { + float bordercolor[4] = { + texObj->BorderColor[0], + texObj->BorderColor[0], + texObj->BorderColor[0], + texObj->BorderColor[0] + }; + /* GL specs that border color for depth textures is taken from the + * R channel, while the hardware uses A. Spam R into all the + * channels for safety. + */ + brw->wm.sdc_bo[unit] = upload_default_color(brw, bordercolor); + } else { + brw->wm.sdc_bo[unit] = upload_default_color(brw, + texObj->BorderColor); + } key->sampler_count = unit + 1; } } @@ -255,11 +272,10 @@ brw_wm_sampler_populate_key(struct brw_context *brw, * complicates various things. However, this is still too confusing - * FIXME: simplify all the different new texture state flags. */ -static int upload_wm_samplers( struct brw_context *brw ) +static void upload_wm_samplers( struct brw_context *brw ) { struct wm_sampler_key key; int i; - int ret = 0; brw_wm_sampler_populate_key(brw, &key); @@ -271,7 +287,7 @@ static int upload_wm_samplers( struct brw_context *brw ) dri_bo_unreference(brw->wm.sampler_bo); brw->wm.sampler_bo = NULL; if (brw->wm.sampler_count == 0) - return 0; + return; brw->wm.sampler_bo = brw_search_cache(&brw->cache, BRW_SAMPLER, &key, sizeof(key), @@ -304,19 +320,14 @@ static int upload_wm_samplers( struct brw_context *brw ) if (!brw->attribs.Texture->Unit[i]._ReallyEnabled) continue; - ret |= dri_bufmgr_check_aperture_space(brw->wm.sdc_bo[i]); - dri_emit_reloc(brw->wm.sampler_bo, - DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, - 0, - i * sizeof(struct brw_sampler_state) + - offsetof(struct brw_sampler_state, ss2), - brw->wm.sdc_bo[i]); + dri_bo_emit_reloc(brw->wm.sampler_bo, + I915_GEM_DOMAIN_SAMPLER, 0, + 0, + i * sizeof(struct brw_sampler_state) + + offsetof(struct brw_sampler_state, ss2), + brw->wm.sdc_bo[i]); } } - - ret |= dri_bufmgr_check_aperture_space(brw->wm.sampler_bo); - return ret; - } const struct brw_tracked_state brw_wm_samplers = { diff --git a/i965/brw_wm_state.c b/i965/brw_wm_state.c index f4da0f2..5302405 100644 --- a/i965/brw_wm_state.c +++ b/i965/brw_wm_state.c @@ -34,7 +34,6 @@ #include "brw_context.h" #include "brw_state.h" #include "brw_defines.h" -#include "dri_bufmgr.h" #include "brw_wm.h" /*********************************************************************** @@ -68,8 +67,13 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key) if (INTEL_DEBUG & DEBUG_SINGLE_THREAD) key->max_threads = 1; - else - key->max_threads = 32; + else { + /* WM maximum threads is number of EUs times number of threads per EU. */ + if (BRW_IS_G4X(brw)) + key->max_threads = 10 * 5; + else + key->max_threads = 8 * 4; + } /* CACHE_NEW_WM_PROG */ key->total_grf = brw->wm.prog_data->total_grf; @@ -84,7 +88,7 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key) /* BRW_NEW_CURBE_OFFSETS */ key->curbe_offset = brw->curbe.wm_start; - /* CACHE_NEW_SURFACE */ + /* BRW_NEW_NR_SURFACEs */ key->nr_surfaces = brw->wm.nr_surfaces; /* CACHE_NEW_SAMPLER */ @@ -199,40 +203,39 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key, NULL, NULL); /* Emit WM program relocation */ - dri_emit_reloc(bo, - DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, - wm.thread0.grf_reg_count << 1, - offsetof(struct brw_wm_unit_state, thread0), - brw->wm.prog_bo); + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + wm.thread0.grf_reg_count << 1, + offsetof(struct brw_wm_unit_state, thread0), + brw->wm.prog_bo); /* Emit scratch space relocation */ if (key->total_scratch != 0) { - dri_emit_reloc(bo, - DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE, - wm.thread2.per_thread_scratch_space, - offsetof(struct brw_wm_unit_state, thread2), - brw->wm.scratch_buffer); + dri_bo_emit_reloc(bo, + 0, 0, + wm.thread2.per_thread_scratch_space, + offsetof(struct brw_wm_unit_state, thread2), + brw->wm.scratch_buffer); } /* Emit sampler state relocation */ if (key->sampler_count != 0) { - dri_emit_reloc(bo, - DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, - wm.wm4.stats_enable | (wm.wm4.sampler_count << 2), - offsetof(struct brw_wm_unit_state, wm4), - brw->wm.sampler_bo); + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + wm.wm4.stats_enable | (wm.wm4.sampler_count << 2), + offsetof(struct brw_wm_unit_state, wm4), + brw->wm.sampler_bo); } return bo; } -static int upload_wm_unit( struct brw_context *brw ) +static void upload_wm_unit( struct brw_context *brw ) { struct intel_context *intel = &brw->intel; struct brw_wm_unit_key key; dri_bo *reloc_bufs[3]; - int ret = 0, i; wm_unit_populate_key(brw, &key); /* Allocate the necessary scratch space if we haven't already. Don't @@ -251,7 +254,7 @@ static int upload_wm_unit( struct brw_context *brw ) brw->wm.scratch_buffer = dri_bo_alloc(intel->bufmgr, "wm scratch", total, - 4096, DRM_BO_FLAG_MEM_TT); + 4096); } } @@ -267,12 +270,6 @@ static int upload_wm_unit( struct brw_context *brw ) if (brw->wm.state_bo == NULL) { brw->wm.state_bo = wm_unit_create_from_key(brw, &key, reloc_bufs); } - - for (i = 0; i < 3; i++) - if (reloc_bufs[i]) - ret |= dri_bufmgr_check_aperture_space(reloc_bufs[i]); - ret |= dri_bufmgr_check_aperture_space(brw->wm.state_bo); - return ret; } const struct brw_tracked_state brw_wm_unit = { @@ -284,10 +281,9 @@ const struct brw_tracked_state brw_wm_unit = { .brw = (BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_CURBE_OFFSETS | - BRW_NEW_LOCK), + BRW_NEW_NR_SURFACES), - .cache = (CACHE_NEW_SURFACE | - CACHE_NEW_WM_PROG | + .cache = (CACHE_NEW_WM_PROG | CACHE_NEW_SAMPLER) }, .prepare = upload_wm_unit, diff --git a/i965/brw_wm_surface_state.c b/i965/brw_wm_surface_state.c index 37c6b52..06e71e6 100644 --- a/i965/brw_wm_surface_state.c +++ b/i965/brw_wm_surface_state.c @@ -30,9 +30,9 @@ */ -#include "mtypes.h" -#include "texformat.h" -#include "texstore.h" +#include "main/mtypes.h" +#include "main/texformat.h" +#include "main/texstore.h" #include "intel_mipmap_tree.h" #include "intel_batchbuffer.h" @@ -154,10 +154,29 @@ struct brw_wm_surface_key { GLint first_level, last_level; GLint width, height, depth; GLint pitch, cpp; - GLboolean tiled; + uint32_t tiling; GLuint offset; }; +static void +brw_set_surface_tiling(struct brw_surface_state *surf, uint32_t tiling) +{ + switch (tiling) { + case I915_TILING_NONE: + surf->ss3.tiled_surface = 0; + surf->ss3.tile_walk = 0; + break; + case I915_TILING_X: + surf->ss3.tiled_surface = 1; + surf->ss3.tile_walk = BRW_TILEWALK_XMAJOR; + break; + case I915_TILING_Y: + surf->ss3.tiled_surface = 1; + surf->ss3.tile_walk = BRW_TILEWALK_YMAJOR; + break; + } +} + static dri_bo * brw_create_texture_surface( struct brw_context *brw, struct brw_wm_surface_key *key ) @@ -173,28 +192,32 @@ brw_create_texture_surface( struct brw_context *brw, if (key->bo) surf.ss0.surface_format = translate_tex_format(key->format, key->depthmode); else { - switch(key->depth) { - case 32: surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM; break; - default: - case 24: surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8X8_UNORM; break; - case 16: surf.ss0.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM; break; - } + switch (key->depth) { + case 32: + surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM; + break; + default: + case 24: + surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8X8_UNORM; + break; + case 16: + surf.ss0.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM; + break; + } } /* This is ok for all textures with channel width 8bit or less: */ /* surf.ss0.data_return_format = BRW_SURFACERETURNFORMAT_S1; */ if (key->bo) - surf.ss1.base_addr = key->bo->offset; /* reloc */ + surf.ss1.base_addr = key->bo->offset; /* reloc */ else - surf.ss1.base_addr = key->offset; + surf.ss1.base_addr = key->offset; surf.ss2.mip_count = key->last_level - key->first_level; surf.ss2.width = key->width - 1; surf.ss2.height = key->height - 1; - - surf.ss3.tile_walk = BRW_TILEWALK_XMAJOR; - surf.ss3.tiled_surface = key->tiled; + brw_set_surface_tiling(&surf, key->tiling); surf.ss3.pitch = (key->pitch * key->cpp) - 1; surf.ss3.depth = key->depth - 1; @@ -214,18 +237,19 @@ brw_create_texture_surface( struct brw_context *brw, &key->bo, key->bo ? 1 : 0, &surf, sizeof(surf), NULL, NULL); + if (key->bo) { /* Emit relocation to surface contents */ - dri_emit_reloc(bo, - DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, - 0, - offsetof(struct brw_surface_state, ss1), - key->bo); + dri_bo_emit_reloc(bo, + I915_GEM_DOMAIN_SAMPLER, 0, + 0, + offsetof(struct brw_surface_state, ss1), + key->bo); } return bo; } -static int +static void brw_update_texture_surface( GLcontext *ctx, GLuint unit ) { struct brw_context *brw = brw_context(ctx); @@ -233,7 +257,6 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit ) struct intel_texture_object *intelObj = intel_texture_object(tObj); struct gl_texture_image *firstImage = tObj->Image[0][intelObj->firstLevel]; struct brw_wm_surface_key key; - int ret = 0; memset(&key, 0, sizeof(key)); @@ -248,7 +271,6 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit ) key.depth = firstImage->Depth; key.bo = intelObj->mt->region->buffer; key.offset = 0; - ret |= dri_bufmgr_check_aperture_space(key.bo); } key.target = tObj->Target; @@ -258,7 +280,7 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit ) key.width = firstImage->Width; key.height = firstImage->Height; key.cpp = intelObj->mt->cpp; - key.tiled = intelObj->mt->region->tiled; + key.tiling = intelObj->mt->region->tiling; dri_bo_unreference(brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS]); brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS] = brw_search_cache(&brw->cache, BRW_SS_SURFACE, @@ -268,9 +290,6 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit ) if (brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS] == NULL) { brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS] = brw_create_texture_surface(brw, &key); } - - ret |= dri_bufmgr_check_aperture_space(brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS]); - return ret; } /** @@ -278,18 +297,18 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit ) * While it is only used for the front/back buffer currently, it should be * usable for further buffers when doing ARB_draw_buffer support. */ -static int +static void brw_update_region_surface(struct brw_context *brw, struct intel_region *region, unsigned int unit, GLboolean cached) { dri_bo *region_bo = NULL; - int ret = 0; struct { unsigned int surface_type; unsigned int surface_format; unsigned int width, height, cpp; GLubyte color_mask[4]; - GLboolean tiled, color_blend; + GLboolean color_blend; + uint32_t tiling; } key; memset(&key, 0, sizeof(key)); @@ -302,16 +321,14 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region, key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM; else key.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM; - key.tiled = region->tiled; + key.tiling = region->tiling; key.width = region->pitch; /* XXX: not really! */ key.height = region->height; key.cpp = region->cpp; - - ret |= dri_bufmgr_check_aperture_space(region->buffer); } else { key.surface_type = BRW_SURFACE_NULL; key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM; - key.tiled = 0; + key.tiling = 0; key.width = 1; key.height = 1; key.cpp = 4; @@ -341,8 +358,7 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region, surf.ss2.width = key.width - 1; surf.ss2.height = key.height - 1; - surf.ss3.tile_walk = BRW_TILEWALK_XMAJOR; - surf.ss3.tiled_surface = key.tiled; + brw_set_surface_tiling(&surf, key.tiling); surf.ss3.pitch = (key.width * key.cpp) - 1; /* _NEW_COLOR */ @@ -359,19 +375,19 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region, &surf, sizeof(surf), NULL, NULL); if (region_bo != NULL) { - dri_emit_reloc(brw->wm.surf_bo[unit], - DRM_BO_FLAG_MEM_TT | - DRM_BO_FLAG_READ | - DRM_BO_FLAG_WRITE, - 0, - offsetof(struct brw_surface_state, ss1), - region_bo); + /* We might sample from it, and we might render to it, so flag + * them both. We might be able to figure out from other state + * a more restrictive relocation to emit. + */ + dri_bo_emit_reloc(brw->wm.surf_bo[unit], + I915_GEM_DOMAIN_RENDER | + I915_GEM_DOMAIN_SAMPLER, + I915_GEM_DOMAIN_RENDER, + 0, + offsetof(struct brw_surface_state, ss1), + region_bo); } } - - ret |= dri_bufmgr_check_aperture_space(brw->wm.surf_bo[unit]); - - return ret; } @@ -409,13 +425,11 @@ brw_wm_get_binding_table(struct brw_context *brw) /* Emit binding table relocations to surface state */ for (i = 0; i < BRW_WM_MAX_SURF; i++) { if (brw->wm.surf_bo[i] != NULL) { - dri_emit_reloc(bind_bo, - DRM_BO_FLAG_MEM_TT | - DRM_BO_FLAG_READ | - DRM_BO_FLAG_WRITE, - 0, - i * sizeof(GLuint), - brw->wm.surf_bo[i]); + dri_bo_emit_reloc(bind_bo, + I915_GEM_DOMAIN_INSTRUCTION, 0, + 0, + i * sizeof(GLuint), + brw->wm.surf_bo[i]); } } @@ -425,25 +439,23 @@ brw_wm_get_binding_table(struct brw_context *brw) return bind_bo; } -static int prepare_wm_surfaces(struct brw_context *brw ) +static void prepare_wm_surfaces(struct brw_context *brw ) { GLcontext *ctx = &brw->intel.ctx; struct intel_context *intel = &brw->intel; - GLuint i, ret; + GLuint i; + int old_nr_surfaces; if (brw->state.nr_draw_regions > 1) { for (i = 0; i < brw->state.nr_draw_regions; i++) { - ret = brw_update_region_surface(brw, brw->state.draw_regions[i], i, - GL_FALSE); - if (ret) - return ret; + brw_update_region_surface(brw, brw->state.draw_regions[i], i, + GL_FALSE); } }else { - ret = brw_update_region_surface(brw, brw->state.draw_regions[0], 0, GL_TRUE); - if (ret) - return ret; + brw_update_region_surface(brw, brw->state.draw_regions[0], 0, GL_TRUE); } + old_nr_surfaces = brw->wm.nr_surfaces; brw->wm.nr_surfaces = MAX_DRAW_BUFFERS; for (i = 0; i < BRW_MAX_TEX_UNIT; i++) { @@ -457,11 +469,8 @@ static int prepare_wm_surfaces(struct brw_context *brw ) dri_bo_reference(brw->wm.surf_bo[i+MAX_DRAW_BUFFERS]); brw->wm.nr_surfaces = i + MAX_DRAW_BUFFERS + 1; } else { - ret = brw_update_texture_surface(ctx, i); + brw_update_texture_surface(ctx, i); brw->wm.nr_surfaces = i + MAX_DRAW_BUFFERS + 1; - - if (ret) - return ret; } } else { dri_bo_unreference(brw->wm.surf_bo[i+MAX_DRAW_BUFFERS]); @@ -473,7 +482,8 @@ static int prepare_wm_surfaces(struct brw_context *brw ) dri_bo_unreference(brw->wm.bind_bo); brw->wm.bind_bo = brw_wm_get_binding_table(brw); - return dri_bufmgr_check_aperture_space(brw->wm.bind_bo); + if (brw->wm.nr_surfaces != old_nr_surfaces) + brw->state.dirty.brw |= BRW_NEW_NR_SURFACES; } diff --git a/i965/intel_state.c b/i965/intel_state.c index 0fba5a7..67ef5f7 100644 --- a/i965/intel_state.c +++ b/i965/intel_state.c @@ -26,12 +26,12 @@ **************************************************************************/ -#include "glheader.h" -#include "context.h" -#include "macros.h" -#include "enums.h" -#include "colormac.h" -#include "dd.h" +#include "main/glheader.h" +#include "main/context.h" +#include "main/macros.h" +#include "main/enums.h" +#include "main/colormac.h" +#include "main/dd.h" #include "intel_screen.h" #include "intel_context.h" |