| author    | Luc Verhaegen <libv@skynet.be>           | 2010-03-16 20:19:03 +0100 |
|-----------|------------------------------------------|---------------------------|
| committer | Luc Verhaegen <libv@skynet.be>           | 2010-03-16 20:19:03 +0100 |
| commit    | 54a8e9cc3988d908b5b846a752679127cacefd3b | (patch)                   |
| tree      | ee289172f13f246903151b12db7808b7cb0651e8 | /i965                     |
| parent    | 6b263d1e2c599ddcc0ce4c84425dbc1ac8de020c | (diff)                    |
Import i915 and i965 dri drivers from mesa 7.8-rc1.
Diffstat (limited to 'i965')
66 files changed, 3143 insertions, 891 deletions
diff --git a/i965/Makefile.am b/i965/Makefile.am index 8a61dab..d9c82d5 100644 --- a/i965/Makefile.am +++ b/i965/Makefile.am @@ -27,7 +27,6 @@ i965_dri_la_SOURCES = \ ../shared/intel_pixel_draw.c \ ../shared/intel_pixel_read.c \ ../shared/intel_state.c \ - ../shared/intel_swapbuffers.c \ ../shared/intel_syncobj.c \ ../shared/intel_tex.c \ ../shared/intel_tex_copy.c \ @@ -88,4 +87,15 @@ i965_dri_la_SOURCES = \ brw_wm_pass2.c \ brw_wm_sampler_state.c \ brw_wm_state.c \ - brw_wm_surface_state.c + brw_wm_surface_state.c \ + gen6_cc.c \ + gen6_clip_state.c \ + gen6_depthstencil.c \ + gen6_gs_state.c \ + gen6_sampler_state.c \ + gen6_scissor_state.c \ + gen6_sf_state.c \ + gen6_urb.c \ + gen6_viewport_state.c \ + gen6_vs_state.c \ + gen6_wm_state.c diff --git a/i965/brw_cc.c b/i965/brw_cc.c index bac1c3a..fa2d394 100644 --- a/i965/brw_cc.c +++ b/i965/brw_cc.c @@ -34,9 +34,7 @@ #include "brw_state.h" #include "brw_defines.h" #include "brw_util.h" -#include "intel_fbo.h" #include "main/macros.h" -#include "main/enums.h" static void prepare_cc_vp( struct brw_context *brw ) { @@ -295,8 +293,7 @@ cc_unit_create_from_key(struct brw_context *brw, struct brw_cc_unit_key *key) bo = brw_upload_cache(&brw->cache, BRW_CC_UNIT, key, sizeof(*key), &brw->cc.vp_bo, 1, - &cc, sizeof(cc), - NULL, NULL); + &cc, sizeof(cc)); /* Emit CC viewport relocation */ dri_bo_emit_reloc(bo, diff --git a/i965/brw_clip.c b/i965/brw_clip.c index dbd10a5..d3275c7 100644 --- a/i965/brw_clip.c +++ b/i965/brw_clip.c @@ -50,6 +50,7 @@ static void compile_clip_prog( struct brw_context *brw, struct brw_clip_prog_key *key ) { + struct intel_context *intel = &brw->intel; struct brw_clip_compile c; const GLuint *program; GLuint program_size; @@ -65,14 +66,13 @@ static void compile_clip_prog( struct brw_context *brw, c.func.single_program_flow = 1; c.key = *key; - c.need_ff_sync = BRW_IS_IGDNG(brw); /* Need to locate the two positions present in vertex + header. * These are currently hardcoded: */ c.header_position_offset = ATTR_SIZE; - if (BRW_IS_IGDNG(brw)) + if (intel->is_ironlake) delta = 3 * REG_SIZE; else delta = REG_SIZE; @@ -85,7 +85,7 @@ static void compile_clip_prog( struct brw_context *brw, c.nr_attrs = brw_count_bits(c.key.attrs); - if (BRW_IS_IGDNG(brw)) + if (intel->is_ironlake) c.nr_regs = (c.nr_attrs + 1) / 2 + 3; /* are vertices packed, or reg-aligned? */ else c.nr_regs = (c.nr_attrs + 1) / 2 + 1; /* are vertices packed, or reg-aligned? */ @@ -130,20 +130,22 @@ static void compile_clip_prog( struct brw_context *brw, /* Upload */ dri_bo_unreference(brw->clip.prog_bo); - brw->clip.prog_bo = brw_upload_cache( &brw->cache, - BRW_CLIP_PROG, - &c.key, sizeof(c.key), - NULL, 0, - program, program_size, - &c.prog_data, - &brw->clip.prog_data ); + brw->clip.prog_bo = brw_upload_cache_with_auxdata(&brw->cache, + BRW_CLIP_PROG, + &c.key, sizeof(c.key), + NULL, 0, + program, program_size, + &c.prog_data, + sizeof(c.prog_data), + &brw->clip.prog_data); } /* Calculate interpolants for triangle and line rasterization. 
*/ static void upload_clip_prog(struct brw_context *brw) { - GLcontext *ctx = &brw->intel.ctx; + struct intel_context *intel = &brw->intel; + GLcontext *ctx = &intel->ctx; struct brw_clip_prog_key key; memset(&key, 0, sizeof(key)); @@ -160,7 +162,7 @@ static void upload_clip_prog(struct brw_context *brw) /* _NEW_TRANSFORM */ key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled); - if (BRW_IS_IGDNG(brw)) + if (intel->is_ironlake) key.clip_mode = BRW_CLIPMODE_KERNEL_CLIP; else key.clip_mode = BRW_CLIPMODE_NORMAL; diff --git a/i965/brw_clip.h b/i965/brw_clip.h index 1c68255..d71bac7 100644 --- a/i965/brw_clip.h +++ b/i965/brw_clip.h @@ -118,7 +118,6 @@ struct brw_clip_compile { GLuint header_position_offset; GLuint offset[VERT_ATTRIB_MAX]; - GLboolean need_ff_sync; }; #define ATTR_SIZE (4*4) diff --git a/i965/brw_clip_line.c b/i965/brw_clip_line.c index fa9648f..ceb62a3 100644 --- a/i965/brw_clip_line.c +++ b/i965/brw_clip_line.c @@ -39,13 +39,13 @@ #include "brw_defines.h" #include "brw_context.h" #include "brw_eu.h" -#include "brw_util.h" #include "brw_clip.h" static void brw_clip_line_alloc_regs( struct brw_clip_compile *c ) { + struct intel_context *intel = &c->func.brw->intel; GLuint i = 0,j; /* Register usage is static, precompute here: @@ -85,7 +85,7 @@ static void brw_clip_line_alloc_regs( struct brw_clip_compile *c ) i++; } - if (c->need_ff_sync) { + if (intel->needs_ff_sync) { c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD); i++; } @@ -126,6 +126,7 @@ static void brw_clip_line_alloc_regs( struct brw_clip_compile *c ) static void clip_and_emit_line( struct brw_clip_compile *c ) { struct brw_compile *p = &c->func; + struct brw_context *brw = p->brw; struct brw_indirect vtx0 = brw_indirect(0, 0); struct brw_indirect vtx1 = brw_indirect(1, 0); struct brw_indirect newvtx0 = brw_indirect(2, 0); @@ -152,7 +153,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c ) brw_clip_init_clipmask(c); /* -ve rhw workaround */ - if (BRW_IS_965(p->brw)) { + if (brw->has_negative_rhw_bug) { brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<20)); @@ -189,7 +190,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c ) * Both can be negative on GM965/G965 due to RHW workaround * if so, this object should be rejected. 
*/ - if (BRW_IS_965(p->brw)) { + if (brw->has_negative_rhw_bug) { brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE, c->reg.dp0, brw_imm_f(0.0)); is_neg2 = brw_IF(p, BRW_EXECUTE_1); { @@ -214,7 +215,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c ) /* If both are positive, do nothing */ /* Only on GM965/G965 */ - if (BRW_IS_965(p->brw)) { + if (brw->has_negative_rhw_bug) { brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0)); is_neg2 = brw_IF(p, BRW_EXECUTE_1); } @@ -229,7 +230,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c ) brw_set_predicate_control(p, BRW_PREDICATE_NONE); } - if (BRW_IS_965(p->brw)) { + if (brw->has_negative_rhw_bug) { brw_ENDIF(p, is_neg2); } } diff --git a/i965/brw_clip_point.c b/i965/brw_clip_point.c index 8458f61..7f47634 100644 --- a/i965/brw_clip_point.c +++ b/i965/brw_clip_point.c @@ -39,7 +39,6 @@ #include "brw_defines.h" #include "brw_context.h" #include "brw_eu.h" -#include "brw_util.h" #include "brw_clip.h" diff --git a/i965/brw_clip_state.c b/i965/brw_clip_state.c index 234b374..424c9a1 100644 --- a/i965/brw_clip_state.c +++ b/i965/brw_clip_state.c @@ -32,7 +32,6 @@ #include "brw_context.h" #include "brw_state.h" #include "brw_defines.h" -#include "main/macros.h" struct brw_clip_unit_key { unsigned int total_grf; @@ -74,6 +73,7 @@ static dri_bo * clip_unit_create_from_key(struct brw_context *brw, struct brw_clip_unit_key *key) { + struct intel_context *intel = &brw->intel; struct brw_clip_unit_state clip; dri_bo *bo; @@ -105,7 +105,7 @@ clip_unit_create_from_key(struct brw_context *brw, /* Although up to 16 concurrent Clip threads are allowed on IGDNG, * only 2 threads can output VUEs at a time. */ - if (BRW_IS_IGDNG(brw)) + if (intel->is_ironlake) clip.thread4.max_threads = 16 - 1; else clip.thread4.max_threads = 2 - 1; @@ -130,7 +130,7 @@ clip_unit_create_from_key(struct brw_context *brw, clip.clip5.api_mode = BRW_CLIP_API_OGL; clip.clip5.clip_mode = key->clip_mode; - if (BRW_IS_G4X(brw)) + if (intel->is_g4x) clip.clip5.negative_w_clip_test = 1; clip.clip6.clipper_viewport_state_ptr = 0; @@ -142,8 +142,7 @@ clip_unit_create_from_key(struct brw_context *brw, bo = brw_upload_cache(&brw->cache, BRW_CLIP_UNIT, key, sizeof(*key), &brw->clip.prog_bo, 1, - &clip, sizeof(clip), - NULL, NULL); + &clip, sizeof(clip)); /* Emit clip program relocation */ assert(brw->clip.prog_bo); diff --git a/i965/brw_clip_tri.c b/i965/brw_clip_tri.c index cf79224..815211a 100644 --- a/i965/brw_clip_tri.c +++ b/i965/brw_clip_tri.c @@ -39,7 +39,6 @@ #include "brw_defines.h" #include "brw_context.h" #include "brw_eu.h" -#include "brw_util.h" #include "brw_clip.h" static void release_tmps( struct brw_clip_compile *c ) @@ -51,6 +50,7 @@ static void release_tmps( struct brw_clip_compile *c ) void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, GLuint nr_verts ) { + struct intel_context *intel = &c->func.brw->intel; GLuint i = 0,j; /* Register usage is static, precompute here: @@ -78,7 +78,7 @@ void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, for (j = 0; j < 3; j++) { GLuint delta = c->nr_attrs*16 + 32; - if (BRW_IS_IGDNG(c->func.brw)) + if (intel->is_ironlake) delta = c->nr_attrs * 16 + 32 * 3; brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0)); @@ -119,7 +119,7 @@ void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, i++; } - if (c->need_ff_sync) { + if (intel->needs_ff_sync) { c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD); i++; } @@ -571,6 +571,7 @@ void 
brw_emit_tri_clip( struct brw_clip_compile *c ) { struct brw_instruction *neg_rhw; struct brw_compile *p = &c->func; + struct brw_context *brw = p->brw; brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6); brw_clip_tri_init_vertices(c); brw_clip_init_clipmask(c); @@ -578,7 +579,7 @@ void brw_emit_tri_clip( struct brw_clip_compile *c ) /* if -ve rhw workaround bit is set, do cliptest */ - if (BRW_IS_965(p->brw)) { + if (brw->has_negative_rhw_bug) { brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<20)); diff --git a/i965/brw_clip_unfilled.c b/i965/brw_clip_unfilled.c index ad1bfa4..f36d22f 100644 --- a/i965/brw_clip_unfilled.c +++ b/i965/brw_clip_unfilled.c @@ -39,7 +39,6 @@ #include "brw_defines.h" #include "brw_context.h" #include "brw_eu.h" -#include "brw_util.h" #include "brw_clip.h" diff --git a/i965/brw_clip_util.c b/i965/brw_clip_util.c index 5a73abd..14bc889 100644 --- a/i965/brw_clip_util.c +++ b/i965/brw_clip_util.c @@ -40,7 +40,6 @@ #include "brw_defines.h" #include "brw_context.h" #include "brw_eu.h" -#include "brw_util.h" #include "brw_clip.h" @@ -135,6 +134,7 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c, GLboolean force_edgeflag) { struct brw_compile *p = &c->func; + struct intel_context *intel = &p->brw->intel; struct brw_reg tmp = get_tmp(c); GLuint i; @@ -142,7 +142,7 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c, */ /* * After CLIP stage, only first 256 bits of the VUE are read - * back on IGDNG, so needn't change it + * back on Ironlake, so needn't change it */ brw_copy_indirect_to_indirect(p, dest_ptr, v0_ptr, 1); @@ -151,7 +151,7 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c, for (i = 0; i < c->nr_attrs; i++) { GLuint delta = i*16 + 32; - if (BRW_IS_IGDNG(p->brw)) + if (intel->is_ironlake) delta = i * 16 + 32 * 3; if (delta == c->offset[VERT_RESULT_EDGE]) { @@ -185,7 +185,7 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c, if (i & 1) { GLuint delta = i*16 + 32; - if (BRW_IS_IGDNG(p->brw)) + if (intel->is_ironlake) delta = i * 16 + 32 * 3; brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0)); @@ -359,7 +359,9 @@ void brw_clip_init_clipmask( struct brw_clip_compile *c ) void brw_clip_ff_sync(struct brw_clip_compile *c) { - if (c->need_ff_sync) { + struct intel_context *intel = &c->func.brw->intel; + + if (intel->needs_ff_sync) { struct brw_compile *p = &c->func; struct brw_instruction *need_ff_sync; @@ -388,7 +390,9 @@ void brw_clip_ff_sync(struct brw_clip_compile *c) void brw_clip_init_ff_sync(struct brw_clip_compile *c) { - if (c->need_ff_sync) { + struct intel_context *intel = &c->func.brw->intel; + + if (intel->needs_ff_sync) { struct brw_compile *p = &c->func; brw_MOV(p, c->reg.ff_sync, brw_imm_ud(0)); diff --git a/i965/brw_context.c b/i965/brw_context.c index aaa2d80..a512896 100644 --- a/i965/brw_context.c +++ b/i965/brw_context.c @@ -33,7 +33,6 @@ #include "main/imports.h" #include "main/api_noop.h" #include "main/macros.h" -/* #include "main/vtxfmt.h" */ #include "main/simple_list.h" #include "shader/shader_api.h" @@ -41,16 +40,9 @@ #include "brw_defines.h" #include "brw_draw.h" #include "brw_state.h" -#include "brw_vs.h" -#include "intel_tex.h" -#include "intel_blit.h" -#include "intel_batchbuffer.h" -#include "intel_pixel.h" #include "intel_span.h" #include "tnl/t_pipeline.h" -#include "utils.h" - /*************************************** * Mesa's Driver Functions @@ -77,7 +69,7 @@ static void brwInitDriverFunctions( struct dd_function_table 
*functions ) } GLboolean brwCreateContext( const __GLcontextModes *mesaVis, - __DRIcontextPrivate *driContextPriv, + __DRIcontext *driContextPriv, void *sharedContextPrivate) { struct dd_function_table functions; @@ -86,7 +78,7 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis, GLcontext *ctx = &intel->ctx; if (!brw) { - _mesa_printf("%s: failed to alloc context\n", __FUNCTION__); + printf("%s: failed to alloc context\n", __FUNCTION__); return GL_FALSE; } @@ -95,7 +87,7 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis, if (!intelInitContext( intel, mesaVis, driContextPriv, sharedContextPrivate, &functions )) { - _mesa_printf("%s: failed to init intel context\n", __FUNCTION__); + printf("%s: failed to init intel context\n", __FUNCTION__); FREE(brw); return GL_FALSE; } @@ -111,6 +103,9 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis, ctx->Const.MaxTextureUnits = MIN2(ctx->Const.MaxTextureCoordUnits, ctx->Const.MaxTextureImageUnits); ctx->Const.MaxVertexTextureImageUnits = 0; /* no vertex shader textures */ + ctx->Const.MaxCombinedTextureImageUnits = + ctx->Const.MaxVertexTextureImageUnits + + ctx->Const.MaxTextureImageUnits; /* Mesa limits textures to 4kx4k; it would be nice to fix that someday */ @@ -155,6 +150,38 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis, MIN2(ctx->Const.FragmentProgram.MaxNativeParameters, ctx->Const.FragmentProgram.MaxEnvParams); + if (intel->is_ironlake || intel->is_g4x || intel->gen >= 6) { + brw->CMD_VF_STATISTICS = CMD_VF_STATISTICS_GM45; + brw->CMD_PIPELINE_SELECT = CMD_PIPELINE_SELECT_GM45; + brw->has_surface_tile_offset = GL_TRUE; + brw->has_compr4 = GL_TRUE; + brw->has_aa_line_parameters = GL_TRUE; + } else { + brw->CMD_VF_STATISTICS = CMD_VF_STATISTICS_965; + brw->CMD_PIPELINE_SELECT = CMD_PIPELINE_SELECT_965; + } + + /* WM maximum threads is number of EUs times number of threads per EU. */ + if (intel->is_ironlake) { + brw->urb.size = 1024; + brw->vs_max_threads = 72; + brw->wm_max_threads = 12 * 6; + } else if (intel->is_g4x) { + brw->urb.size = 384; + brw->vs_max_threads = 32; + brw->wm_max_threads = 10 * 5; + } else if (intel->gen < 6) { + brw->urb.size = 256; + brw->vs_max_threads = 16; + brw->wm_max_threads = 8 * 4; + brw->has_negative_rhw_bug = GL_TRUE; + } + + if (INTEL_DEBUG & DEBUG_SINGLE_THREAD) { + brw->vs_max_threads = 1; + brw->wm_max_threads = 1; + } + brw_init_state( brw ); brw->state.dirty.mesa = ~0; diff --git a/i965/brw_context.h b/i965/brw_context.h index fded47a..d6fc37e 100644 --- a/i965/brw_context.h +++ b/i965/brw_context.h @@ -131,7 +131,6 @@ struct brw_context; #define BRW_NEW_WM_INPUT_DIMENSIONS 0x100 #define BRW_NEW_PSP 0x800 #define BRW_NEW_WM_SURFACES 0x1000 -#define BRW_NEW_FENCE 0x2000 #define BRW_NEW_INDICES 0x4000 #define BRW_NEW_VERTICES 0x8000 /** @@ -172,8 +171,8 @@ struct brw_fragment_program { GLuint id; /**< serial no. 
to identify frag progs, never re-used */ GLboolean isGLSL; /**< really, any IF/LOOP/CONT/BREAK instructions */ - dri_bo *const_buffer; /** Program constant buffer/surface */ GLboolean use_const_buffer; + dri_bo *const_buffer; /** Program constant buffer/surface */ /** for debugging, which texture units are referenced */ GLbitfield tex_units_used; @@ -283,6 +282,9 @@ struct brw_vs_ouput_sizes { enum brw_cache_id { + BRW_BLEND_STATE, + BRW_DEPTH_STENCIL_STATE, + BRW_COLOR_CALC_STATE, BRW_CC_VP, BRW_CC_UNIT, BRW_WM_PROG, @@ -291,7 +293,7 @@ enum brw_cache_id { BRW_WM_UNIT, BRW_SF_PROG, BRW_SF_VP, - BRW_SF_UNIT, + BRW_SF_UNIT, /* scissor state on gen6 */ BRW_VS_UNIT, BRW_VS_PROG, BRW_GS_UNIT, @@ -332,7 +334,6 @@ struct brw_cache { struct brw_cache_item **items; GLuint size, n_items; - GLuint aux_size[BRW_MAX_CACHE]; char *name[BRW_MAX_CACHE]; /* Record of the last BOs chosen for each cache_id. Used to set @@ -356,6 +357,9 @@ struct brw_tracked_state { /* Flags for brw->state.cache. */ +#define CACHE_NEW_BLEND_STATE (1<<BRW_BLEND_STATE) +#define CACHE_NEW_DEPTH_STENCIL_STATE (1<<BRW_DEPTH_STENCIL_STATE) +#define CACHE_NEW_COLOR_CALC_STATE (1<<BRW_COLOR_CALC_STATE) #define CACHE_NEW_CC_VP (1<<BRW_CC_VP) #define CACHE_NEW_CC_UNIT (1<<BRW_CC_UNIT) #define CACHE_NEW_WM_PROG (1<<BRW_WM_PROG) @@ -438,8 +442,11 @@ struct brw_context GLuint primitive; GLboolean emit_state_always; - GLboolean no_batch_wrap; - + GLboolean has_surface_tile_offset; + GLboolean has_compr4; + GLboolean has_negative_rhw_bug; + GLboolean has_aa_line_parameters; +; struct { struct brw_state_flags dirty; @@ -515,6 +522,12 @@ struct brw_context */ GLuint next_free_page; + /* hw-dependent 3DSTATE_VF_STATISTICS opcode */ + uint32_t CMD_VF_STATISTICS; + /* hw-dependent 3DSTATE_PIPELINE_SELECT opcode */ + uint32_t CMD_PIPELINE_SELECT; + int vs_max_threads; + int wm_max_threads; /* BRW_NEW_URB_ALLOCATIONS: */ @@ -531,7 +544,8 @@ struct brw_context GLuint nr_sf_entries; GLuint nr_cs_entries; -/* GLuint vs_size; */ + /* gen6 */ + GLuint vs_size; /* GLuint gs_size; */ /* GLuint clip_size; */ /* GLuint sf_size; */ @@ -542,6 +556,7 @@ struct brw_context GLuint clip_start; GLuint sf_start; GLuint cs_start; + GLuint size; /* Hardware URB size, in KB. */ } urb; @@ -564,15 +579,11 @@ struct brw_context GLfloat *last_buf; GLuint last_bufsz; - /** - * Whether we should create a new bo instead of reusing the old one - * (if we just dispatch the batch pointing at the old one. 
- */ - GLboolean need_new_bo; } curbe; struct { struct brw_vs_prog_data *prog_data; + int8_t *constant_map; /* variable array following prog_data */ dri_bo *prog_bo; dri_bo *state_bo; @@ -639,9 +650,16 @@ struct brw_context struct { + /* gen4 */ dri_bo *prog_bo; - dri_bo *state_bo; dri_bo *vp_bo; + + /* gen6 */ + dri_bo *blend_state_bo; + dri_bo *depth_stencil_state_bo; + dri_bo *color_calc_state_bo; + + dri_bo *state_bo; } cc; struct { @@ -669,7 +687,7 @@ void brwInitVtbl( struct brw_context *brw ); * brw_context.c */ GLboolean brwCreateContext( const __GLcontextModes *mesaVis, - __DRIcontextPrivate *driContextPriv, + __DRIcontext *driContextPriv, void *sharedContextPrivate); /*====================================================================== diff --git a/i965/brw_curbe.c b/i965/brw_curbe.c index aadcfbe..4e78b08 100644 --- a/i965/brw_curbe.c +++ b/i965/brw_curbe.c @@ -114,13 +114,13 @@ static void calculate_curbe_offsets( struct brw_context *brw ) brw->curbe.total_size = reg; if (0) - _mesa_printf("curbe wm %d+%d clip %d+%d vs %d+%d\n", - brw->curbe.wm_start, - brw->curbe.wm_size, - brw->curbe.clip_start, - brw->curbe.clip_size, - brw->curbe.vs_start, - brw->curbe.vs_size ); + printf("curbe wm %d+%d clip %d+%d vs %d+%d\n", + brw->curbe.wm_start, + brw->curbe.wm_size, + brw->curbe.clip_start, + brw->curbe.clip_size, + brw->curbe.vs_start, + brw->curbe.vs_size ); brw->state.dirty.brw |= BRW_NEW_CURBE_OFFSETS; } @@ -198,7 +198,7 @@ static void prepare_constant_buffer(struct brw_context *brw) return; } - buf = (GLfloat *) _mesa_calloc(bufsz); + buf = (GLfloat *) calloc(1, bufsz); /* fragment shader constants */ if (brw->curbe.wm_size) { @@ -256,25 +256,36 @@ static void prepare_constant_buffer(struct brw_context *brw) */ _mesa_load_state_parameters(ctx, vp->program.Base.Parameters); - /* XXX just use a memcpy here */ - for (i = 0; i < nr; i++) { - const GLfloat *value = vp->program.Base.Parameters->ParameterValues[i]; - buf[offset + i * 4 + 0] = value[0]; - buf[offset + i * 4 + 1] = value[1]; - buf[offset + i * 4 + 2] = value[2]; - buf[offset + i * 4 + 3] = value[3]; + if (vp->use_const_buffer) { + /* Load the subset of push constants that will get used when + * we also have a pull constant buffer. + */ + for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) { + if (brw->vs.constant_map[i] != -1) { + assert(brw->vs.constant_map[i] <= nr); + memcpy(buf + offset + brw->vs.constant_map[i] * 4, + vp->program.Base.Parameters->ParameterValues[i], + 4 * sizeof(float)); + } + } + } else { + for (i = 0; i < nr; i++) { + memcpy(buf + offset + i * 4, + vp->program.Base.Parameters->ParameterValues[i], + 4 * sizeof(float)); + } } } if (0) { for (i = 0; i < sz*16; i+=4) - _mesa_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4, - buf[i+0], buf[i+1], buf[i+2], buf[i+3]); + printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4, + buf[i+0], buf[i+1], buf[i+2], buf[i+3]); - _mesa_printf("last_buf %p buf %p sz %d/%d cmp %d\n", - brw->curbe.last_buf, buf, - bufsz, brw->curbe.last_bufsz, - brw->curbe.last_buf ? memcmp(buf, brw->curbe.last_buf, bufsz) : -1); + printf("last_buf %p buf %p sz %d/%d cmp %d\n", + brw->curbe.last_buf, buf, + bufsz, brw->curbe.last_bufsz, + brw->curbe.last_buf ? 
memcmp(buf, brw->curbe.last_buf, bufsz) : -1); } if (brw->curbe.curbe_bo != NULL && @@ -282,20 +293,20 @@ static void prepare_constant_buffer(struct brw_context *brw) bufsz == brw->curbe.last_bufsz && memcmp(buf, brw->curbe.last_buf, bufsz) == 0) { /* constants have not changed */ - _mesa_free(buf); + free(buf); } else { /* constants have changed */ if (brw->curbe.last_buf) - _mesa_free(brw->curbe.last_buf); + free(brw->curbe.last_buf); brw->curbe.last_buf = buf; brw->curbe.last_bufsz = bufsz; if (brw->curbe.curbe_bo != NULL && - (brw->curbe.need_new_bo || - brw->curbe.curbe_next_offset + bufsz > brw->curbe.curbe_bo->size)) + brw->curbe.curbe_next_offset + bufsz > brw->curbe.curbe_bo->size) { + drm_intel_gem_bo_unmap_gtt(brw->curbe.curbe_bo); dri_bo_unreference(brw->curbe.curbe_bo); brw->curbe.curbe_bo = NULL; } @@ -307,6 +318,7 @@ static void prepare_constant_buffer(struct brw_context *brw) brw->curbe.curbe_bo = dri_bo_alloc(brw->intel.bufmgr, "CURBE", 4096, 1 << 6); brw->curbe.curbe_next_offset = 0; + drm_intel_gem_bo_map_gtt(brw->curbe.curbe_bo); } brw->curbe.curbe_offset = brw->curbe.curbe_next_offset; @@ -315,7 +327,9 @@ static void prepare_constant_buffer(struct brw_context *brw) /* Copy data to the buffer: */ - dri_bo_subdata(brw->curbe.curbe_bo, brw->curbe.curbe_offset, bufsz, buf); + memcpy(brw->curbe.curbe_bo->virtual + brw->curbe.curbe_offset, + buf, + bufsz); } brw_add_validated_bo(brw, brw->curbe.curbe_bo); @@ -340,7 +354,7 @@ static void emit_constant_buffer(struct brw_context *brw) struct intel_context *intel = &brw->intel; GLuint sz = brw->curbe.total_size; - BEGIN_BATCH(2, IGNORE_CLIPRECTS); + BEGIN_BATCH(2); if (sz == 0) { OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2)); OUT_BATCH(0); diff --git a/i965/brw_defines.h b/i965/brw_defines.h index c19510b..bb1b5f5 100644 --- a/i965/brw_defines.h +++ b/i965/brw_defines.h @@ -530,6 +530,7 @@ #define BRW_OPCODE_POP 47 #define BRW_OPCODE_WAIT 48 #define BRW_OPCODE_SEND 49 +#define BRW_OPCODE_MATH 56 #define BRW_OPCODE_ADD 64 #define BRW_OPCODE_MUL 65 #define BRW_OPCODE_AVG 66 @@ -727,7 +728,8 @@ #define BRW_MATH_FUNCTION_SIN 6 /* was 7 */ #define BRW_MATH_FUNCTION_COS 7 /* was 8 */ #define BRW_MATH_FUNCTION_SINCOS 8 /* was 6 */ -#define BRW_MATH_FUNCTION_TAN 9 +#define BRW_MATH_FUNCTION_TAN 9 /* gen4 */ +#define BRW_MATH_FUNCTION_FDIV 9 /* gen6+ */ #define BRW_MATH_FUNCTION_POW 10 #define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER 11 #define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT 12 @@ -778,17 +780,33 @@ #define CMD_PIPELINED_STATE_POINTERS 0x7800 #define CMD_BINDING_TABLE_PTRS 0x7801 +# define GEN6_BINDING_TABLE_MODIFY_VS (1 << 8) +# define GEN6_BINDING_TABLE_MODIFY_GS (1 << 9) +# define GEN6_BINDING_TABLE_MODIFY_PS (1 << 10) + +#define CMD_3D_SAMPLER_STATE_POINTERS 0x7802 /* SNB+ */ +# define PS_SAMPLER_STATE_CHANGE (1 << 12) +# define GS_SAMPLER_STATE_CHANGE (1 << 9) +# define VS_SAMPLER_STATE_CHANGE (1 << 8) +/* DW1: VS */ +/* DW2: GS */ +/* DW3: PS */ #define CMD_VERTEX_BUFFER 0x7808 # define BRW_VB0_INDEX_SHIFT 27 +# define GEN6_VB0_INDEX_SHIFT 26 # define BRW_VB0_ACCESS_VERTEXDATA (0 << 26) # define BRW_VB0_ACCESS_INSTANCEDATA (1 << 26) +# define GEN6_VB0_ACCESS_VERTEXDATA (0 << 20) +# define GEN6_VB0_ACCESS_INSTANCEDATA (1 << 20) # define BRW_VB0_PITCH_SHIFT 0 #define CMD_VERTEX_ELEMENT 0x7809 # define BRW_VE0_INDEX_SHIFT 27 +# define GEN6_VE0_INDEX_SHIFT 26 # define BRW_VE0_FORMAT_SHIFT 16 # define BRW_VE0_VALID (1 << 26) +# define GEN6_VE0_VALID (1 << 25) # define BRW_VE0_SRC_OFFSET_SHIFT 0 # define 
BRW_VE1_COMPONENT_NOSTORE 0 # define BRW_VE1_COMPONENT_STORE_SRC 1 @@ -805,8 +823,219 @@ # define BRW_VE1_DST_OFFSET_SHIFT 0 #define CMD_INDEX_BUFFER 0x780a -#define CMD_VF_STATISTICS_965 0x780b +#define CMD_VF_STATISTICS_965 0x780b #define CMD_VF_STATISTICS_GM45 0x680b +#define CMD_3D_CC_STATE_POINTERS 0x780e /* GEN6+ */ + +#define CMD_URB 0x7805 /* GEN6+ */ +# define GEN6_URB_VS_SIZE_SHIFT 16 +# define GEN6_URB_VS_ENTRIES_SHIFT 0 +# define GEN6_URB_GS_SIZE_SHIFT 8 +# define GEN6_URB_GS_ENTRIES_SHIFT 0 + +#define CMD_VIEWPORT_STATE_POINTERS 0x780d /* GEN6+ */ +# define GEN6_CC_VIEWPORT_MODIFY (1 << 12) +# define GEN6_SF_VIEWPORT_MODIFY (1 << 11) +# define GEN6_CLIP_VIEWPORT_MODIFY (1 << 10) + +#define CMD_3D_SCISSOR_STATE_POINTERS 0x780f /* GEN6+ */ + +#define CMD_3D_VS_STATE 0x7810 /* GEN6+ */ +/* DW2 */ +# define GEN6_VS_SPF_MODE (1 << 31) +# define GEN6_VS_VECTOR_MASK_ENABLE (1 << 30) +# define GEN6_VS_SAMPLER_COUNT_SHIFT 27 +# define GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT 18 +/* DW4 */ +# define GEN6_VS_DISPATCH_START_GRF_SHIFT 20 +# define GEN6_VS_URB_READ_LENGTH_SHIFT 11 +# define GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT 4 +/* DW5 */ +# define GEN6_VS_MAX_THREADS_SHIFT 25 +# define GEN6_VS_STATISTICS_ENABLE (1 << 10) +# define GEN6_VS_CACHE_DISABLE (1 << 1) +# define GEN6_VS_ENABLE (1 << 0) + +#define CMD_3D_GS_STATE 0x7811 /* GEN6+ */ +/* DW2 */ +# define GEN6_GS_SPF_MODE (1 << 31) +# define GEN6_GS_VECTOR_MASK_ENABLE (1 << 30) +# define GEN6_GS_SAMPLER_COUNT_SHIFT 27 +# define GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT 18 +/* DW4 */ +# define GEN6_GS_URB_READ_LENGTH_SHIFT 11 +# define GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT 4 +# define GEN6_GS_DISPATCH_START_GRF_SHIFT 0 +/* DW5 */ +# define GEN6_GS_MAX_THREADS_SHIFT 25 +# define GEN6_GS_STATISTICS_ENABLE (1 << 10) +# define GEN6_GS_SO_STATISTICS_ENABLE (1 << 9) +# define GEN6_GS_RENDERING_ENABLE (1 << 8) +/* DW6 */ +# define GEN6_GS_ENABLE (1 << 15) + +#define CMD_3D_CLIP_STATE 0x7812 /* GEN6+ */ +/* DW1 */ +# define GEN6_CLIP_STATISTICS_ENABLE (1 << 10) +/* DW2 */ +# define GEN6_CLIP_ENABLE (1 << 31) +# define GEN6_CLIP_API_OGL (0 << 30) +# define GEN6_CLIP_API_D3D (1 << 30) +# define GEN6_CLIP_XY_TEST (1 << 28) +# define GEN6_CLIP_Z_TEST (1 << 27) +# define GEN6_CLIP_GB_TEST (1 << 26) +# define GEN6_CLIP_MODE_NORMAL (0 << 13) +# define GEN6_CLIP_MODE_REJECT_ALL (3 << 13) +# define GEN6_CLIP_MODE_ACCEPT_ALL (4 << 13) +# define GEN6_CLIP_PERSPECTIVE_DIVIDE_DISABLE (1 << 9) +# define GEN6_CLIP_BARYCENTRIC_ENABLE (1 << 8) +# define GEN6_CLIP_TRI_PROVOKE_SHIFT 4 +# define GEN6_CLIP_LINE_PROVOKE_SHIFT 2 +# define GEN6_CLIP_TRIFAN_PROVOKE_SHIFT 0 +/* DW3 */ +# define GEN6_CLIP_MIN_POINT_WIDTH_SHIFT 17 +# define GEN6_CLIP_MAX_POINT_WIDTH_SHIFT 6 + +#define CMD_3D_SF_STATE 0x7813 /* GEN6+ */ +/* DW1 */ +# define GEN6_SF_NUM_OUTPUTS_SHIFT 22 +# define GEN6_SF_SWIZZLE_ENABLE (1 << 21) +# define GEN6_SF_POINT_SPRITE_LOWERLEFT (1 << 20) +# define GEN6_SF_URB_ENTRY_READ_LENGTH_SHIFT 11 +# define GEN6_SF_URB_ENTRY_READ_OFFSET_SHIFT 4 +/* DW2 */ +# define GEN6_SF_LEGACY_GLOBAL_DEPTH_BIAS (1 << 11) +# define GEN6_SF_STATISTICS_ENABLE (1 << 10) +# define GEN6_SF_GLOBAL_DEPTH_OFFSET_SOLID (1 << 9) +# define GEN6_SF_GLOBAL_DEPTH_OFFSET_WIREFRAME (1 << 8) +# define GEN6_SF_GLOBAL_DEPTH_OFFSET_POINT (1 << 7) +# define GEN6_SF_FRONT_SOLID (0 << 5) +# define GEN6_SF_FRONT_WIREFRAME (1 << 5) +# define GEN6_SF_FRONT_POINT (2 << 5) +# define GEN6_SF_BACK_SOLID (0 << 3) +# define GEN6_SF_BACK_WIREFRAME (1 << 3) +# define GEN6_SF_BACK_POINT (2 << 3) +# define 
GEN6_SF_VIEWPORT_TRANSFORM_ENABLE (1 << 1) +# define GEN6_SF_WINDING_CCW (1 << 0) +/* DW3 */ +# define GEN6_SF_LINE_AA_ENABLE (1 << 31) +# define GEN6_SF_CULL_BOTH (0 << 29) +# define GEN6_SF_CULL_NONE (1 << 29) +# define GEN6_SF_CULL_FRONT (2 << 29) +# define GEN6_SF_CULL_BACK (3 << 29) +# define GEN6_SF_LINE_WIDTH_SHIFT 18 /* U3.7 */ +# define GEN6_SF_LINE_END_CAP_WIDTH_0_5 (0 << 16) +# define GEN6_SF_LINE_END_CAP_WIDTH_1_0 (1 << 16) +# define GEN6_SF_LINE_END_CAP_WIDTH_2_0 (2 << 16) +# define GEN6_SF_LINE_END_CAP_WIDTH_4_0 (3 << 16) +# define GEN6_SF_SCISSOR_ENABLE (1 << 11) +# define GEN6_SF_MSRAST_OFF_PIXEL (0 << 8) +# define GEN6_SF_MSRAST_OFF_PATTERN (1 << 8) +# define GEN6_SF_MSRAST_ON_PIXEL (2 << 8) +# define GEN6_SF_MSRAST_ON_PATTERN (3 << 8) +/* DW4 */ +# define GEN6_SF_TRI_PROVOKE_SHIFT 29 +# define GEN6_SF_LINE_PROVOKE_SHIFT 27 +# define GEN6_SF_TRIFAN_PROVOKE_SHIFT 25 +# define GEN6_SF_LINE_AA_MODE_MANHATTAN (0 << 14) +# define GEN6_SF_LINE_AA_MODE_TRUE (1 << 14) +# define GEN6_SF_VERTEX_SUBPIXEL_8BITS (0 << 12) +# define GEN6_SF_VERTEX_SUBPIXEL_4BITS (1 << 12) +# define GEN6_SF_USE_STATE_POINT_WIDTH (1 << 11) +# define GEN6_SF_POINT_WIDTH_SHIFT 0 /* U8.3 */ +/* DW5: depth offset constant */ +/* DW6: depth offset scale */ +/* DW7: depth offset clamp */ +/* DW8 */ +# define ATTRIBUTE_1_OVERRIDE_W (1 << 31) +# define ATTRIBUTE_1_OVERRIDE_Z (1 << 30) +# define ATTRIBUTE_1_OVERRIDE_Y (1 << 29) +# define ATTRIBUTE_1_OVERRIDE_X (1 << 28) +# define ATTRIBUTE_1_CONST_SOURCE_SHIFT 25 +# define ATTRIBUTE_1_SWIZZLE_SHIFT 22 +# define ATTRIBUTE_1_SOURCE_SHIFT 16 +# define ATTRIBUTE_0_OVERRIDE_W (1 << 15) +# define ATTRIBUTE_0_OVERRIDE_Z (1 << 14) +# define ATTRIBUTE_0_OVERRIDE_Y (1 << 13) +# define ATTRIBUTE_0_OVERRIDE_X (1 << 12) +# define ATTRIBUTE_0_CONST_SOURCE_SHIFT 9 +# define ATTRIBUTE_0_SWIZZLE_SHIFT 6 +# define ATTRIBUTE_0_SOURCE_SHIFT 0 +/* DW16: Point sprite texture coordinate enables */ +/* DW17: Constant interpolation enables */ +/* DW18: attr 0-7 wrap shortest enables */ +/* DW19: attr 8-16 wrap shortest enables */ + +#define CMD_3D_WM_STATE 0x7814 /* GEN6+ */ +/* DW1: kernel pointer */ +/* DW2 */ +# define GEN6_WM_SPF_MODE (1 << 31) +# define GEN6_WM_VECTOR_MASK_ENABLE (1 << 30) +# define GEN6_WM_SAMPLER_COUNT_SHIFT 27 +# define GEN6_WM_BINDING_TABLE_ENTRY_COUNT_SHIFT 18 +/* DW3: scratch space */ +/* DW4 */ +# define GEN6_WM_STATISTICS_ENABLE (1 << 31) +# define GEN6_WM_DEPTH_CLEAR (1 << 30) +# define GEN6_WM_DEPTH_RESOLVE (1 << 28) +# define GEN6_WM_HIERARCHICAL_DEPTH_RESOLVE (1 << 27) +# define GEN6_WM_DISPATCH_START_GRF_SHIFT_0 16 +# define GEN6_WM_DISPATCH_START_GRF_SHIFT_1 8 +# define GEN6_WM_DISPATCH_START_GRF_SHIFT_2 0 +/* DW5 */ +# define GEN6_WM_MAX_THREADS_SHIFT 25 +# define GEN6_WM_KILL_ENABLE (1 << 22) +# define GEN6_WM_COMPUTED_DEPTH (1 << 21) +# define GEN6_WM_USES_SOURCE_DEPTH (1 << 20) +# define GEN6_WM_DISPATCH_ENABLE (1 << 19) +# define GEN6_WM_LINE_END_CAP_AA_WIDTH_0_5 (0 << 16) +# define GEN6_WM_LINE_END_CAP_AA_WIDTH_1_0 (1 << 16) +# define GEN6_WM_LINE_END_CAP_AA_WIDTH_2_0 (2 << 16) +# define GEN6_WM_LINE_END_CAP_AA_WIDTH_4_0 (3 << 16) +# define GEN6_WM_LINE_AA_WIDTH_0_5 (0 << 14) +# define GEN6_WM_LINE_AA_WIDTH_1_0 (1 << 14) +# define GEN6_WM_LINE_AA_WIDTH_2_0 (2 << 14) +# define GEN6_WM_LINE_AA_WIDTH_4_0 (3 << 14) +# define GEN6_WM_POLYGON_STIPPLE_ENABLE (1 << 13) +# define GEN6_WM_LINE_STIPPLE_ENABLE (1 << 12) +# define GEN6_WM_OMASK_TO_RENDER_TARGET (1 << 9) +# define GEN6_WM_USES_SOURCE_W (1 << 8) +# define GEN6_WM_DUAL_SOURCE_BLEND_ENABLE (1 << 
7) +# define GEN6_WM_32_DISPATCH_ENABLE (1 << 2) +# define GEN6_WM_16_DISPATCH_ENABLE (1 << 1) +# define GEN6_WM_8_DISPATCH_ENABLE (1 << 0) +/* DW6 */ +# define GEN6_WM_NUM_SF_OUTPUTS_SHIFT 20 +# define GEN6_WM_POSOFFSET_NONE (0 << 18) +# define GEN6_WM_POSOFFSET_CENTROID (2 << 18) +# define GEN6_WM_POSOFFSET_SAMPLE (3 << 18) +# define GEN6_WM_POSITION_ZW_PIXEL (0 << 16) +# define GEN6_WM_POSITION_ZW_CENTROID (2 << 16) +# define GEN6_WM_POSITION_ZW_SAMPLE (3 << 16) +# define GEN6_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC (1 << 15) +# define GEN6_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC (1 << 14) +# define GEN6_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC (1 << 13) +# define GEN6_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC (1 << 12) +# define GEN6_WM_PERSPECTIVE_CENTROID_BARYCENTRIC (1 << 11) +# define GEN6_WM_PERSPECTIVE_PIXEL_BARYCENTRIC (1 << 10) +# define GEN6_WM_POINT_RASTRULE_UPPER_RIGHT (1 << 9) +# define GEN6_WM_MSRAST_OFF_PIXEL (0 << 1) +# define GEN6_WM_MSRAST_OFF_PATTERN (1 << 1) +# define GEN6_WM_MSRAST_ON_PIXEL (2 << 1) +# define GEN6_WM_MSRAST_ON_PATTERN (3 << 1) +# define GEN6_WM_MSDISPMODE_PERPIXEL (1 << 0) +/* DW7: kernel 1 pointer */ +/* DW8: kernel 2 pointer */ + +#define CMD_3D_CONSTANT_VS_STATE 0x7815 /* GEN6+ */ +#define CMD_3D_CONSTANT_GS_STATE 0x7816 /* GEN6+ */ +#define CMD_3D_CONSTANT_PS_STATE 0x7817 /* GEN6+ */ +# define GEN6_CONSTANT_BUFFER_3_ENABLE (1 << 15) +# define GEN6_CONSTANT_BUFFER_2_ENABLE (1 << 14) +# define GEN6_CONSTANT_BUFFER_1_ENABLE (1 << 13) +# define GEN6_CONSTANT_BUFFER_0_ENABLE (1 << 12) + +#define CMD_3D_SAMPLE_MASK 0x7818 /* GEN6+ */ #define CMD_DRAW_RECT 0x7900 #define CMD_BLEND_CONSTANT_COLOR 0x7901 @@ -818,6 +1047,25 @@ #define CMD_GLOBAL_DEPTH_OFFSET_CLAMP 0x7909 #define CMD_AA_LINE_PARAMETERS 0x790a +#define CMD_GS_SVB_INDEX 0x790b /* CTG+ */ +/* DW1 */ +# define SVB_INDEX_SHIFT 29 +# define SVB_LOAD_INTERNAL_VERTEX_COUNT (1 << 0) /* SNB+ */ +/* DW2: SVB index */ +/* DW3: SVB maximum index */ + +#define CMD_3D_MULTISAMPLE 0x790d /* SNB+ */ +/* DW1 */ +# define MS_PIXEL_LOCATION_CENTER (0 << 4) +# define MS_PIXEL_LOCATION_UPPER_LEFT (1 << 4) +# define MS_NUMSAMPLES_1 (0 << 1) +# define MS_NUMSAMPLES_4 (2 << 1) +# define MS_NUMSAMPLES_8 (3 << 1) + +#define CMD_3D_CLEAR_PARAMS 0x7910 /* ILK+ */ +# define DEPTH_CLEAR_VALID (1 << 15) +/* DW1: depth clear value */ + #define CMD_PIPE_CONTROL 0x7a00 #define CMD_3D_PRIM 0x7b00 @@ -832,12 +1080,4 @@ #include "intel_chipset.h" -#define BRW_IS_G4X(brw) (IS_G4X((brw)->intel.intelScreen->deviceID)) -#define BRW_IS_IGDNG(brw) (IS_IGDNG((brw)->intel.intelScreen->deviceID)) -#define BRW_IS_965(brw) (!(BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))) -#define CMD_PIPELINE_SELECT(brw) ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965) -#define CMD_VF_STATISTICS(brw) ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965) -#define URB_SIZES(brw) (BRW_IS_IGDNG(brw) ? 1024 : \ - (BRW_IS_G4X(brw) ? 384 : 256)) /* 512 bit units */ - #endif diff --git a/i965/brw_disasm.c b/i965/brw_disasm.c index 9fef230..a8f6b99 100644 --- a/i965/brw_disasm.c +++ b/i965/brw_disasm.c @@ -239,7 +239,7 @@ char *imm_encoding[8] = { [2] = "UW", [3] = "W", [5] = "VF", - [5] = "V", + [6] = "V", [7] = "F" }; @@ -365,6 +365,7 @@ static int format (FILE *f, char *format, ...) 
va_start (args, format); vsnprintf (buf, sizeof (buf) - 1, format, args); + va_end (args); string (f, buf); return 0; } diff --git a/i965/brw_draw.c b/i965/brw_draw.c index 8bcb608..e348d46 100644 --- a/i965/brw_draw.c +++ b/i965/brw_draw.c @@ -39,10 +39,8 @@ #include "brw_defines.h" #include "brw_context.h" #include "brw_state.h" -#include "brw_fallback.h" #include "intel_batchbuffer.h" -#include "intel_buffer_objects.h" #define FILE_DEBUG_FLAG DEBUG_BATCH @@ -84,7 +82,7 @@ static GLuint brw_set_prim(struct brw_context *brw, GLenum prim) GLcontext *ctx = &brw->intel.ctx; if (INTEL_DEBUG & DEBUG_PRIMS) - _mesa_printf("PRIM: %s\n", _mesa_lookup_enum_by_nr(prim)); + printf("PRIM: %s\n", _mesa_lookup_enum_by_nr(prim)); /* Slight optimization to avoid the GS program when not needed: */ @@ -127,7 +125,7 @@ static void brw_emit_prim(struct brw_context *brw, struct intel_context *intel = &brw->intel; if (INTEL_DEBUG & DEBUG_PRIMS) - _mesa_printf("PRIM: %s %d %d\n", _mesa_lookup_enum_by_nr(prim->mode), + printf("PRIM: %s %d %d\n", _mesa_lookup_enum_by_nr(prim->mode), prim->start, prim->count); prim_packet.header.opcode = CMD_3D_PRIM; @@ -145,7 +143,7 @@ static void brw_emit_prim(struct brw_context *brw, prim_packet.base_vert_location = prim->basevertex; /* Can't wrap here, since we rely on the validated state. */ - brw->no_batch_wrap = GL_TRUE; + intel->no_batch_wrap = GL_TRUE; /* If we're set to always flush, do it before and after the primitive emit. * We want to catch both missed flushes that hurt instruction/state cache @@ -157,13 +155,13 @@ static void brw_emit_prim(struct brw_context *brw, } if (prim_packet.verts_per_instance) { intel_batchbuffer_data( brw->intel.batch, &prim_packet, - sizeof(prim_packet), LOOP_CLIPRECTS); + sizeof(prim_packet)); } if (intel->always_flush_cache) { intel_batchbuffer_emit_mi_flush(intel->batch); } - brw->no_batch_wrap = GL_FALSE; + intel->no_batch_wrap = GL_FALSE; } static void brw_merge_inputs( struct brw_context *brw, @@ -339,12 +337,7 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx, * so can't access it earlier. */ - LOCK_HARDWARE(intel); - - if (!intel->constant_cliprect && intel->driDrawable->numClipRects == 0) { - UNLOCK_HARDWARE(intel); - return GL_TRUE; - } + intel_prepare_render(intel); for (i = 0; i < nr_prims; i++) { uint32_t hw_prim; @@ -356,8 +349,7 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx, * an upper bound of how much we might emit in a single * brw_try_draw_prims(). 
*/ - intel_batchbuffer_require_space(intel->batch, intel->batch->size / 4, - LOOP_CLIPRECTS); + intel_batchbuffer_require_space(intel->batch, intel->batch->size / 4); hw_prim = brw_set_prim(brw, prim[i].mode); @@ -404,7 +396,6 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx, if (intel->always_flush_batch) intel_batchbuffer_flush(intel->batch); out: - UNLOCK_HARDWARE(intel); brw_state_cache_check_size(brw); diff --git a/i965/brw_draw_upload.c b/i965/brw_draw_upload.c index ee684f6..71a4357 100644 --- a/i965/brw_draw_upload.c +++ b/i965/brw_draw_upload.c @@ -29,19 +29,15 @@ #include "main/glheader.h" #include "main/bufferobj.h" #include "main/context.h" -#include "main/state.h" -/* #include "main/api_validate.h" */ #include "main/enums.h" #include "brw_draw.h" #include "brw_defines.h" #include "brw_context.h" #include "brw_state.h" -#include "brw_fallback.h" #include "intel_batchbuffer.h" #include "intel_buffer_objects.h" -#include "intel_tex.h" static GLuint double_types[5] = { 0, @@ -59,6 +55,14 @@ static GLuint float_types[5] = { BRW_SURFACEFORMAT_R32G32B32A32_FLOAT }; +static GLuint half_float_types[5] = { + 0, + BRW_SURFACEFORMAT_R16_FLOAT, + BRW_SURFACEFORMAT_R16G16_FLOAT, + 0, /* can't seem to render this one */ + BRW_SURFACEFORMAT_R16G16B16A16_FLOAT +}; + static GLuint uint_types_norm[5] = { 0, BRW_SURFACEFORMAT_R32_UNORM, @@ -165,13 +169,14 @@ static GLuint get_surface_type( GLenum type, GLuint size, GLenum format, GLboolean normalized ) { if (INTEL_DEBUG & DEBUG_VERTS) - _mesa_printf("type %s size %d normalized %d\n", + printf("type %s size %d normalized %d\n", _mesa_lookup_enum_by_nr(type), size, normalized); if (normalized) { switch (type) { case GL_DOUBLE: return double_types[size]; case GL_FLOAT: return float_types[size]; + case GL_HALF_FLOAT: return half_float_types[size]; case GL_INT: return int_types_norm[size]; case GL_SHORT: return short_types_norm[size]; case GL_BYTE: return byte_types_norm[size]; @@ -194,6 +199,7 @@ static GLuint get_surface_type( GLenum type, GLuint size, switch (type) { case GL_DOUBLE: return double_types[size]; case GL_FLOAT: return float_types[size]; + case GL_HALF_FLOAT: return half_float_types[size]; case GL_INT: return int_types_scale[size]; case GL_SHORT: return short_types_scale[size]; case GL_BYTE: return byte_types_scale[size]; @@ -211,6 +217,7 @@ static GLuint get_size( GLenum type ) switch (type) { case GL_DOUBLE: return sizeof(GLdouble); case GL_FLOAT: return sizeof(GLfloat); + case GL_HALF_FLOAT: return sizeof(GLhalfARB); case GL_INT: return sizeof(GLint); case GL_SHORT: return sizeof(GLshort); case GL_BYTE: return sizeof(GLbyte); @@ -243,14 +250,6 @@ static void wrap_buffers( struct brw_context *brw, dri_bo_unreference(brw->vb.upload.bo); brw->vb.upload.bo = dri_bo_alloc(brw->intel.bufmgr, "temporary VBO", size, 1); - - /* Set the internal VBO\ to no-backing-store. We only use them as a - * temporary within a brw_try_draw_prims while the lock is held. 
- */ - /* DON'T DO THIS AS IF WE HAVE TO RE-ORG MEMORY WE NEED SOMEWHERE WITH - FAKE TO PUSH THIS STUFF */ -// if (!brw->intel.ttm) -// dri_bo_fake_disable_backing_store(brw->vb.upload.bo, NULL, NULL); } static void get_space( struct brw_context *brw, @@ -277,7 +276,6 @@ copy_array_to_vbo_array( struct brw_context *brw, struct brw_vertex_element *element, GLuint dst_stride) { - struct intel_context *intel = &brw->intel; GLuint size = element->count * dst_stride; get_space(brw, size, &element->bo, &element->offset); @@ -290,52 +288,26 @@ copy_array_to_vbo_array( struct brw_context *brw, } if (dst_stride == element->glarray->StrideB) { - if (intel->intelScreen->kernel_exec_fencing) { - drm_intel_gem_bo_map_gtt(element->bo); - memcpy((char *)element->bo->virtual + element->offset, - element->glarray->Ptr, size); - drm_intel_gem_bo_unmap_gtt(element->bo); - } else { - dri_bo_subdata(element->bo, - element->offset, - size, - element->glarray->Ptr); - } + drm_intel_gem_bo_map_gtt(element->bo); + memcpy((char *)element->bo->virtual + element->offset, + element->glarray->Ptr, size); + drm_intel_gem_bo_unmap_gtt(element->bo); } else { char *dest; const unsigned char *src = element->glarray->Ptr; int i; - if (intel->intelScreen->kernel_exec_fencing) { - drm_intel_gem_bo_map_gtt(element->bo); - dest = element->bo->virtual; - dest += element->offset; - - for (i = 0; i < element->count; i++) { - memcpy(dest, src, dst_stride); - src += element->glarray->StrideB; - dest += dst_stride; - } - - drm_intel_gem_bo_unmap_gtt(element->bo); - } else { - void *data; - - data = _mesa_malloc(dst_stride * element->count); - dest = data; - for (i = 0; i < element->count; i++) { - memcpy(dest, src, dst_stride); - src += element->glarray->StrideB; - dest += dst_stride; - } - - dri_bo_subdata(element->bo, - element->offset, - size, - data); + drm_intel_gem_bo_map_gtt(element->bo); + dest = element->bo->virtual; + dest += element->offset; - _mesa_free(data); + for (i = 0; i < element->count; i++) { + memcpy(dest, src, dst_stride); + src += element->glarray->StrideB; + dest += dst_stride; } + + drm_intel_gem_bo_unmap_gtt(element->bo); } } @@ -356,7 +328,7 @@ static void brw_prepare_vertices(struct brw_context *brw) /* First build an array of pointers to ve's in vb.inputs_read */ if (0) - _mesa_printf("%s %d..%d\n", __FUNCTION__, min_index, max_index); + printf("%s %d..%d\n", __FUNCTION__, min_index, max_index); /* Accumulate the list of enabled arrays. */ brw->vb.nr_enabled = 0; @@ -502,12 +474,19 @@ static void brw_emit_vertices(struct brw_context *brw) * a VE loads from them. 
*/ if (brw->vb.nr_enabled == 0) { - BEGIN_BATCH(3, IGNORE_CLIPRECTS); + BEGIN_BATCH(3); OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | 1); - OUT_BATCH((0 << BRW_VE0_INDEX_SHIFT) | - BRW_VE0_VALID | - (BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) | - (0 << BRW_VE0_SRC_OFFSET_SHIFT)); + if (IS_GEN6(intel->intelScreen->deviceID)) { + OUT_BATCH((0 << GEN6_VE0_INDEX_SHIFT) | + GEN6_VE0_VALID | + (BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) | + (0 << BRW_VE0_SRC_OFFSET_SHIFT)); + } else { + OUT_BATCH((0 << BRW_VE0_INDEX_SHIFT) | + BRW_VE0_VALID | + (BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) | + (0 << BRW_VE0_SRC_OFFSET_SHIFT)); + } OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) | @@ -522,20 +501,28 @@ static void brw_emit_vertices(struct brw_context *brw) * are interleaved or from the same VBO. TBD if this makes a * performance difference. */ - BEGIN_BATCH(1 + brw->vb.nr_enabled * 4, IGNORE_CLIPRECTS); + BEGIN_BATCH(1 + brw->vb.nr_enabled * 4); OUT_BATCH((CMD_VERTEX_BUFFER << 16) | ((1 + brw->vb.nr_enabled * 4) - 2)); for (i = 0; i < brw->vb.nr_enabled; i++) { struct brw_vertex_element *input = brw->vb.enabled[i]; + uint32_t dw0; + + if (intel->gen >= 6) { + dw0 = GEN6_VB0_ACCESS_VERTEXDATA | + (i << GEN6_VB0_INDEX_SHIFT); + } else { + dw0 = BRW_VB0_ACCESS_VERTEXDATA | + (i << BRW_VB0_INDEX_SHIFT); + } - OUT_BATCH((i << BRW_VB0_INDEX_SHIFT) | - BRW_VB0_ACCESS_VERTEXDATA | + OUT_BATCH(dw0 | (input->stride << BRW_VB0_PITCH_SHIFT)); OUT_RELOC(input->bo, I915_GEM_DOMAIN_VERTEX, 0, input->offset); - if (BRW_IS_IGDNG(brw)) { + if (intel->is_ironlake || intel->gen >= 6) { OUT_RELOC(input->bo, I915_GEM_DOMAIN_VERTEX, 0, input->bo->size - 1); @@ -545,7 +532,7 @@ static void brw_emit_vertices(struct brw_context *brw) } ADVANCE_BATCH(); - BEGIN_BATCH(1 + brw->vb.nr_enabled * 2, IGNORE_CLIPRECTS); + BEGIN_BATCH(1 + brw->vb.nr_enabled * 2); OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | ((1 + brw->vb.nr_enabled * 2) - 2)); for (i = 0; i < brw->vb.nr_enabled; i++) { struct brw_vertex_element *input = brw->vb.enabled[i]; @@ -566,12 +553,19 @@ static void brw_emit_vertices(struct brw_context *brw) break; } - OUT_BATCH((i << BRW_VE0_INDEX_SHIFT) | - BRW_VE0_VALID | - (format << BRW_VE0_FORMAT_SHIFT) | - (0 << BRW_VE0_SRC_OFFSET_SHIFT)); + if (IS_GEN6(intel->intelScreen->deviceID)) { + OUT_BATCH((i << GEN6_VE0_INDEX_SHIFT) | + GEN6_VE0_VALID | + (format << BRW_VE0_FORMAT_SHIFT) | + (0 << BRW_VE0_SRC_OFFSET_SHIFT)); + } else { + OUT_BATCH((i << BRW_VE0_INDEX_SHIFT) | + BRW_VE0_VALID | + (format << BRW_VE0_FORMAT_SHIFT) | + (0 << BRW_VE0_SRC_OFFSET_SHIFT)); + } - if (BRW_IS_IGDNG(brw)) + if (intel->is_ironlake || intel->gen >= 6) OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) | (comp1 << BRW_VE1_COMPONENT_1_SHIFT) | (comp2 << BRW_VE1_COMPONENT_2_SHIFT) | @@ -625,13 +619,9 @@ static void brw_prepare_indices(struct brw_context *brw) /* Straight upload */ - if (intel->intelScreen->kernel_exec_fencing) { - drm_intel_gem_bo_map_gtt(bo); - memcpy((char *)bo->virtual + offset, index_buffer->ptr, ib_size); - drm_intel_gem_bo_unmap_gtt(bo); - } else { - dri_bo_subdata(bo, offset, ib_size, index_buffer->ptr); - } + drm_intel_gem_bo_map_gtt(bo); + memcpy((char *)bo->virtual + offset, index_buffer->ptr, ib_size); + drm_intel_gem_bo_unmap_gtt(bo); } else { offset = (GLuint) (unsigned long) index_buffer->ptr; brw->ib.start_vertex_offset = 0; @@ -712,7 
+702,7 @@ static void brw_emit_index_buffer(struct brw_context *brw) ib.header.bits.index_format = get_index_type(index_buffer->type); ib.header.bits.cut_index_enable = 0; - BEGIN_BATCH(4, IGNORE_CLIPRECTS); + BEGIN_BATCH(4); OUT_BATCH( ib.header.dword ); OUT_RELOC(brw->ib.bo, I915_GEM_DOMAIN_VERTEX, 0, diff --git a/i965/brw_eu.c b/i965/brw_eu.c index 1df5613..4e7c122 100644 --- a/i965/brw_eu.c +++ b/i965/brw_eu.c @@ -237,7 +237,7 @@ brw_resolve_cals(struct brw_compile *c) struct brw_glsl_call *call, *next; for (call = c->first_call; call; call = next) { next = call->next; - _mesa_free(call); + free(call); } c->first_call = NULL; } @@ -247,7 +247,7 @@ brw_resolve_cals(struct brw_compile *c) struct brw_glsl_label *label, *next; for (label = c->first_label; label; label = next) { next = label->next; - _mesa_free(label); + free(label); } c->first_label = NULL; } diff --git a/i965/brw_eu_debug.c b/i965/brw_eu_debug.c index 29f3f6d..99453af 100644 --- a/i965/brw_eu_debug.c +++ b/i965/brw_eu_debug.c @@ -54,9 +54,9 @@ void brw_print_reg( struct brw_reg hwreg ) "f" }; - _mesa_printf("%s%s", - hwreg.abs ? "abs/" : "", - hwreg.negate ? "-" : ""); + printf("%s%s", + hwreg.abs ? "abs/" : "", + hwreg.negate ? "-" : ""); if (hwreg.file == BRW_GENERAL_REGISTER_FILE && hwreg.nr % 2 == 0 && @@ -66,7 +66,7 @@ void brw_print_reg( struct brw_reg hwreg ) hwreg.hstride == BRW_HORIZONTAL_STRIDE_1 && hwreg.type == BRW_REGISTER_TYPE_F) { /* vector register */ - _mesa_printf("vec%d", hwreg.nr); + printf("vec%d", hwreg.nr); } else if (hwreg.file == BRW_GENERAL_REGISTER_FILE && hwreg.vstride == BRW_VERTICAL_STRIDE_0 && @@ -74,13 +74,13 @@ void brw_print_reg( struct brw_reg hwreg ) hwreg.hstride == BRW_HORIZONTAL_STRIDE_0 && hwreg.type == BRW_REGISTER_TYPE_F) { /* "scalar" register */ - _mesa_printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4); + printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4); } else if (hwreg.file == BRW_IMMEDIATE_VALUE) { - _mesa_printf("imm %f", hwreg.dw1.f); + printf("imm %f", hwreg.dw1.f); } else { - _mesa_printf("%s%d.%d<%d;%d,%d>:%s", + printf("%s%d.%d<%d;%d,%d>:%s", file[hwreg.file], hwreg.nr, hwreg.subnr / type_sz(hwreg.type), diff --git a/i965/brw_eu_emit.c b/i965/brw_eu_emit.c index 7ceabba..f69d529 100644 --- a/i965/brw_eu_emit.c +++ b/i965/brw_eu_emit.c @@ -102,8 +102,6 @@ static void brw_set_dest( struct brw_instruction *insn, static void brw_set_src0( struct brw_instruction *insn, struct brw_reg reg ) { - assert(reg.file != BRW_MESSAGE_REGISTER_FILE); - if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE) assert(reg.nr < 128); @@ -199,7 +197,7 @@ void brw_set_src1( struct brw_instruction *insn, * in the future: */ assert (reg.address_mode == BRW_ADDRESS_DIRECT); - //assert (reg.file == BRW_GENERAL_REGISTER_FILE); + /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */ if (insn->header.access_mode == BRW_ALIGN_1) { insn->bits3.da1.src1_subreg_nr = reg.subnr; @@ -252,9 +250,10 @@ static void brw_set_math_message( struct brw_context *brw, GLboolean saturate, GLuint dataType ) { + struct intel_context *intel = &brw->intel; brw_set_src1(insn, brw_imm_d(0)); - if (BRW_IS_IGDNG(brw)) { + if (intel->is_ironlake) { insn->bits3.math_igdng.function = function; insn->bits3.math_igdng.int_type = integer_type; insn->bits3.math_igdng.precision = low_precision; @@ -319,9 +318,10 @@ static void brw_set_urb_message( struct brw_context *brw, GLuint offset, GLuint swizzle_control ) { + struct intel_context *intel = &brw->intel; brw_set_src1(insn, brw_imm_d(0)); - if (BRW_IS_IGDNG(brw)) { + if (intel->is_ironlake || 
intel->gen >= 6) { insn->bits3.urb_igdng.opcode = 0; /* ? */ insn->bits3.urb_igdng.offset = offset; insn->bits3.urb_igdng.swizzle_control = swizzle_control; @@ -332,8 +332,16 @@ static void brw_set_urb_message( struct brw_context *brw, insn->bits3.urb_igdng.response_length = response_length; insn->bits3.urb_igdng.msg_length = msg_length; insn->bits3.urb_igdng.end_of_thread = end_of_thread; - insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_URB; - insn->bits2.send_igdng.end_of_thread = end_of_thread; + if (intel->gen >= 6) { + /* For SNB, the SFID bits moved to the condmod bits, and + * EOT stayed in bits3 above. Does the EOT bit setting + * below on Ironlake even do anything? + */ + insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB; + } else { + insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_URB; + insn->bits2.send_igdng.end_of_thread = end_of_thread; + } } else { insn->bits3.urb.opcode = 0; /* ? */ insn->bits3.urb.offset = offset; @@ -358,9 +366,10 @@ static void brw_set_dp_write_message( struct brw_context *brw, GLuint response_length, GLuint end_of_thread ) { + struct intel_context *intel = &brw->intel; brw_set_src1(insn, brw_imm_d(0)); - if (BRW_IS_IGDNG(brw)) { + if (intel->is_ironlake) { insn->bits3.dp_write_igdng.binding_table_index = binding_table_index; insn->bits3.dp_write_igdng.msg_control = msg_control; insn->bits3.dp_write_igdng.pixel_scoreboard_clear = pixel_scoreboard_clear; @@ -395,9 +404,10 @@ static void brw_set_dp_read_message( struct brw_context *brw, GLuint response_length, GLuint end_of_thread ) { + struct intel_context *intel = &brw->intel; brw_set_src1(insn, brw_imm_d(0)); - if (BRW_IS_IGDNG(brw)) { + if (intel->is_ironlake) { insn->bits3.dp_read_igdng.binding_table_index = binding_table_index; insn->bits3.dp_read_igdng.msg_control = msg_control; insn->bits3.dp_read_igdng.msg_type = msg_type; @@ -433,10 +443,11 @@ static void brw_set_sampler_message(struct brw_context *brw, GLuint header_present, GLuint simd_mode) { + struct intel_context *intel = &brw->intel; assert(eot == 0); brw_set_src1(insn, brw_imm_d(0)); - if (BRW_IS_IGDNG(brw)) { + if (intel->is_ironlake) { insn->bits3.sampler_igdng.binding_table_index = binding_table_index; insn->bits3.sampler_igdng.sampler = sampler; insn->bits3.sampler_igdng.msg_type = msg_type; @@ -447,7 +458,7 @@ static void brw_set_sampler_message(struct brw_context *brw, insn->bits3.sampler_igdng.end_of_thread = eot; insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_SAMPLER; insn->bits2.send_igdng.end_of_thread = eot; - } else if (BRW_IS_G4X(brw)) { + } else if (intel->is_g4x) { insn->bits3.sampler_g4x.binding_table_index = binding_table_index; insn->bits3.sampler_g4x.sampler = sampler; insn->bits3.sampler_g4x.msg_type = msg_type; @@ -648,10 +659,11 @@ struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size) struct brw_instruction *brw_ELSE(struct brw_compile *p, struct brw_instruction *if_insn) { + struct intel_context *intel = &p->brw->intel; struct brw_instruction *insn; GLuint br = 1; - if (BRW_IS_IGDNG(p->brw)) + if (intel->is_ironlake) br = 2; if (p->single_program_flow) { @@ -690,9 +702,10 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p, void brw_ENDIF(struct brw_compile *p, struct brw_instruction *patch_insn) { + struct intel_context *intel = &p->brw->intel; GLuint br = 1; - if (BRW_IS_IGDNG(p->brw)) + if (intel->is_ironlake) br = 2; if (p->single_program_flow) { @@ -803,10 +816,11 @@ struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size) struct brw_instruction 
*brw_WHILE(struct brw_compile *p, struct brw_instruction *do_insn) { + struct intel_context *intel = &p->brw->intel; struct brw_instruction *insn; GLuint br = 1; - if (BRW_IS_IGDNG(p->brw)) + if (intel->is_ironlake) br = 2; if (p->single_program_flow) @@ -846,14 +860,15 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p, void brw_land_fwd_jump(struct brw_compile *p, struct brw_instruction *jmp_insn) { + struct intel_context *intel = &p->brw->intel; struct brw_instruction *landing = &p->store[p->nr_insn]; GLuint jmpi = 1; - if (BRW_IS_IGDNG(p->brw)) + if (intel->is_ironlake) jmpi = 2; assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI); - assert(jmp_insn->bits1.da1.src1_reg_file = BRW_IMMEDIATE_VALUE); + assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE); jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1); } @@ -908,26 +923,40 @@ void brw_math( struct brw_compile *p, GLuint data_type, GLuint precision ) { - struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); - GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1; - GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1; + struct intel_context *intel = &p->brw->intel; - /* Example code doesn't set predicate_control for send - * instructions. - */ - insn->header.predicate_control = 0; - insn->header.destreg__conditionalmod = msg_reg_nr; + if (intel->gen >= 6) { + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH); - brw_set_dest(insn, dest); - brw_set_src0(insn, src); - brw_set_math_message(p->brw, - insn, - msg_length, response_length, - function, - BRW_MATH_INTEGER_UNSIGNED, - precision, - saturate, - data_type); + /* Math is the same ISA format as other opcodes, except that CondModifier + * becomes FC[3:0] and ThreadCtrl becomes FC[5:4]. + */ + insn->header.destreg__conditionalmod = function; + + brw_set_dest(insn, dest); + brw_set_src0(insn, src); + brw_set_src1(insn, brw_null_reg()); + } else { + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1; + GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1; + /* Example code doesn't set predicate_control for send + * instructions. + */ + insn->header.predicate_control = 0; + insn->header.destreg__conditionalmod = msg_reg_nr; + + brw_set_dest(insn, dest); + brw_set_src0(insn, src); + brw_set_math_message(p->brw, + insn, + msg_length, response_length, + function, + BRW_MATH_INTEGER_UNSIGNED, + precision, + saturate, + data_type); + } } /** @@ -1263,7 +1292,7 @@ void brw_SAMPLE(struct brw_compile *p, GLboolean need_stall = 0; if (writemask == 0) { - /*_mesa_printf("%s: zero writemask??\n", __FUNCTION__); */ + /*printf("%s: zero writemask??\n", __FUNCTION__); */ return; } @@ -1295,7 +1324,7 @@ void brw_SAMPLE(struct brw_compile *p, if (newmask != writemask) { need_stall = 1; - /* _mesa_printf("need stall %x %x\n", newmask , writemask); */ + /* printf("need stall %x %x\n", newmask , writemask); */ } else { struct brw_reg m1 = brw_message_reg(msg_reg_nr); @@ -1368,7 +1397,18 @@ void brw_urb_WRITE(struct brw_compile *p, GLuint offset, GLuint swizzle) { - struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + struct intel_context *intel = &p->brw->intel; + struct brw_instruction *insn; + + /* Sandybridge doesn't have the implied move for SENDs, + * and the first message register index comes from src0. 
+ */ + if (intel->gen >= 6) { + brw_MOV(p, brw_message_reg(msg_reg_nr), src0); + src0 = brw_message_reg(msg_reg_nr); + } + + insn = next_insn(p, BRW_OPCODE_SEND); assert(msg_length < BRW_MAX_MRF); @@ -1376,7 +1416,8 @@ void brw_urb_WRITE(struct brw_compile *p, brw_set_src0(insn, src0); brw_set_src1(insn, brw_imm_d(0)); - insn->header.destreg__conditionalmod = msg_reg_nr; + if (intel->gen < 6) + insn->header.destreg__conditionalmod = msg_reg_nr; brw_set_urb_message(p->brw, insn, diff --git a/i965/brw_fallback.c b/i965/brw_fallback.c index 562a178..ba401c2 100644 --- a/i965/brw_fallback.c +++ b/i965/brw_fallback.c @@ -36,18 +36,13 @@ #include "swrast/swrast.h" #include "tnl/tnl.h" #include "brw_context.h" -#include "brw_fallback.h" -#include "intel_chipset.h" #include "intel_fbo.h" #include "intel_regions.h" -#include "glapi/glapi.h" - #define FILE_DEBUG_FLAG DEBUG_FALLBACKS static GLboolean do_check_fallback(struct brw_context *brw) { - struct intel_context *intel = &brw->intel; GLcontext *ctx = &brw->intel.ctx; GLuint i; @@ -86,8 +81,7 @@ static GLboolean do_check_fallback(struct brw_context *brw) } /* _NEW_BUFFERS */ - if (IS_965(intel->intelScreen->deviceID) && - !IS_G4X(intel->intelScreen->deviceID)) { + if (!brw->has_surface_tile_offset) { for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) { struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i]; struct intel_renderbuffer *irb = intel_renderbuffer(rb); diff --git a/i965/brw_gs.c b/i965/brw_gs.c index 610b6c3..7261b31 100644 --- a/i965/brw_gs.c +++ b/i965/brw_gs.c @@ -47,6 +47,7 @@ static void compile_gs_prog( struct brw_context *brw, struct brw_gs_prog_key *key ) { + struct intel_context *intel = &brw->intel; struct brw_gs_compile c; const GLuint *program; GLuint program_size; @@ -54,13 +55,12 @@ static void compile_gs_prog( struct brw_context *brw, memset(&c, 0, sizeof(c)); c.key = *key; - c.need_ff_sync = BRW_IS_IGDNG(brw); /* Need to locate the two positions present in vertex + header. * These are currently hardcoded: */ c.nr_attrs = brw_count_bits(c.key.attrs); - if (BRW_IS_IGDNG(brw)) + if (intel->is_ironlake) c.nr_regs = (c.nr_attrs + 1) / 2 + 3; /* are vertices packed, or reg-aligned? */ else c.nr_regs = (c.nr_attrs + 1) / 2 + 1; /* are vertices packed, or reg-aligned? 
*/ @@ -125,12 +125,13 @@ static void compile_gs_prog( struct brw_context *brw, /* Upload */ dri_bo_unreference(brw->gs.prog_bo); - brw->gs.prog_bo = brw_upload_cache( &brw->cache, BRW_GS_PROG, - &c.key, sizeof(c.key), - NULL, 0, - program, program_size, - &c.prog_data, - &brw->gs.prog_data ); + brw->gs.prog_bo = brw_upload_cache_with_auxdata(&brw->cache, BRW_GS_PROG, + &c.key, sizeof(c.key), + NULL, 0, + program, program_size, + &c.prog_data, + sizeof(c.prog_data), + &brw->gs.prog_data); } static const GLenum gs_prim[GL_POLYGON+1] = { diff --git a/i965/brw_gs.h b/i965/brw_gs.h index 010c1c2..813b8d4 100644 --- a/i965/brw_gs.h +++ b/i965/brw_gs.h @@ -63,7 +63,6 @@ struct brw_gs_compile { GLuint nr_attrs; GLuint nr_regs; GLuint nr_bytes; - GLboolean need_ff_sync; }; #define ATTR_SIZE (4*4) diff --git a/i965/brw_gs_emit.c b/i965/brw_gs_emit.c index 0fc5b02..dd7b057 100644 --- a/i965/brw_gs_emit.c +++ b/i965/brw_gs_emit.c @@ -40,7 +40,6 @@ #include "brw_defines.h" #include "brw_context.h" #include "brw_eu.h" -#include "brw_util.h" #include "brw_gs.h" static void brw_gs_alloc_regs( struct brw_gs_compile *c, @@ -122,12 +121,14 @@ static void brw_gs_ff_sync(struct brw_gs_compile *c, int num_prim) void brw_gs_quads( struct brw_gs_compile *c, struct brw_gs_prog_key *key ) { + struct intel_context *intel = &c->func.brw->intel; + brw_gs_alloc_regs(c, 4); /* Use polygons for correct edgeflag behaviour. Note that vertex 3 * is the PV for quads, but vertex 0 for polygons: */ - if (c->need_ff_sync) + if (intel->needs_ff_sync) brw_gs_ff_sync(c, 1); if (key->pv_first) { brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START)); @@ -145,9 +146,11 @@ void brw_gs_quads( struct brw_gs_compile *c, struct brw_gs_prog_key *key ) void brw_gs_quad_strip( struct brw_gs_compile *c, struct brw_gs_prog_key *key ) { + struct intel_context *intel = &c->func.brw->intel; + brw_gs_alloc_regs(c, 4); - if (c->need_ff_sync) + if (intel->needs_ff_sync) brw_gs_ff_sync(c, 1); if (key->pv_first) { brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START)); @@ -165,9 +168,11 @@ void brw_gs_quad_strip( struct brw_gs_compile *c, struct brw_gs_prog_key *key ) void brw_gs_tris( struct brw_gs_compile *c ) { + struct intel_context *intel = &c->func.brw->intel; + brw_gs_alloc_regs(c, 3); - if (c->need_ff_sync) + if (intel->needs_ff_sync) brw_gs_ff_sync(c, 1); brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_TRILIST << 2) | R02_PRIM_START)); brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_TRILIST << 2)); @@ -176,9 +181,11 @@ void brw_gs_tris( struct brw_gs_compile *c ) void brw_gs_lines( struct brw_gs_compile *c ) { + struct intel_context *intel = &c->func.brw->intel; + brw_gs_alloc_regs(c, 2); - if (c->need_ff_sync) + if (intel->needs_ff_sync) brw_gs_ff_sync(c, 1); brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_START)); brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_END)); @@ -186,9 +193,11 @@ void brw_gs_lines( struct brw_gs_compile *c ) void brw_gs_points( struct brw_gs_compile *c ) { + struct intel_context *intel = &c->func.brw->intel; + brw_gs_alloc_regs(c, 1); - if (c->need_ff_sync) + if (intel->needs_ff_sync) brw_gs_ff_sync(c, 1); brw_gs_emit_vue(c, c->reg.vertex[0], 1, ((_3DPRIM_POINTLIST << 2) | R02_PRIM_START | R02_PRIM_END)); } diff --git a/i965/brw_gs_state.c b/i965/brw_gs_state.c index ed9d2ff..d8ad5ce 100644 --- a/i965/brw_gs_state.c +++ b/i965/brw_gs_state.c @@ -34,7 +34,6 @@ #include "brw_context.h" #include 
"brw_state.h" #include "brw_defines.h" -#include "main/macros.h" struct brw_gs_unit_key { unsigned int total_grf; @@ -72,6 +71,7 @@ gs_unit_populate_key(struct brw_context *brw, struct brw_gs_unit_key *key) static dri_bo * gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key) { + struct intel_context *intel = &brw->intel; struct brw_gs_unit_state gs; dri_bo *bo; @@ -98,7 +98,7 @@ gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key) else gs.thread4.max_threads = 0; - if (BRW_IS_IGDNG(brw)) + if (intel->is_ironlake) gs.thread4.rendering_enable = 1; if (INTEL_DEBUG & DEBUG_STATS) @@ -107,8 +107,7 @@ gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key) bo = brw_upload_cache(&brw->cache, BRW_GS_UNIT, key, sizeof(*key), &brw->gs.prog_bo, 1, - &gs, sizeof(gs), - NULL, NULL); + &gs, sizeof(gs)); if (key->prog_active) { /* Emit GS program relocation */ diff --git a/i965/brw_misc_state.c b/i965/brw_misc_state.c index 4b0d598..d030ed4 100644 --- a/i965/brw_misc_state.c +++ b/i965/brw_misc_state.c @@ -78,10 +78,7 @@ static void upload_drawing_rect(struct brw_context *brw) struct intel_context *intel = &brw->intel; GLcontext *ctx = &intel->ctx; - if (!intel->constant_cliprect) - return; - - BEGIN_BATCH(4, NO_LOOP_CLIPRECTS); + BEGIN_BATCH(4); OUT_BATCH(_3DSTATE_DRAWRECT_INFO_I965); OUT_BATCH(0); /* xmin, ymin */ OUT_BATCH(((ctx->DrawBuffer->Width - 1) & 0xffff) | @@ -116,7 +113,7 @@ static void upload_binding_table_pointers(struct brw_context *brw) { struct intel_context *intel = &brw->intel; - BEGIN_BATCH(6, IGNORE_CLIPRECTS); + BEGIN_BATCH(6); OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2)); if (brw->vs.bind_bo != NULL) OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */ @@ -139,6 +136,41 @@ const struct brw_tracked_state brw_binding_table_pointers = { .emit = upload_binding_table_pointers, }; +/** + * Upload the binding table pointers, which point each stage's array of surface + * state pointers. + * + * The binding table pointers are relative to the surface state base address, + * which is 0. + */ +static void upload_gen6_binding_table_pointers(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + + BEGIN_BATCH(4); + OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | + GEN6_BINDING_TABLE_MODIFY_VS | + GEN6_BINDING_TABLE_MODIFY_GS | + GEN6_BINDING_TABLE_MODIFY_PS | + (4 - 2)); + if (brw->vs.bind_bo != NULL) + OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */ + else + OUT_BATCH(0); + OUT_BATCH(0); /* gs */ + OUT_RELOC(brw->wm.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* wm/ps */ + ADVANCE_BATCH(); +} + +const struct brw_tracked_state gen6_binding_table_pointers = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_BATCH, + .cache = CACHE_NEW_SURF_BIND, + }, + .prepare = prepare_binding_table_pointers, + .emit = upload_gen6_binding_table_pointers, +}; /** * Upload pointers to the per-stage state. @@ -150,7 +182,7 @@ static void upload_pipelined_state_pointers(struct brw_context *brw ) { struct intel_context *intel = &brw->intel; - BEGIN_BATCH(7, IGNORE_CLIPRECTS); + BEGIN_BATCH(7); OUT_BATCH(CMD_PIPELINED_STATE_POINTERS << 16 | (7 - 2)); OUT_RELOC(brw->vs.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); if (brw->gs.prog_active) @@ -212,10 +244,17 @@ static void emit_depthbuffer(struct brw_context *brw) { struct intel_context *intel = &brw->intel; struct intel_region *region = brw->state.depth_region; - unsigned int len = (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? 
6 : 5; + unsigned int len; + + if (intel->gen >= 6) + len = 7; + else if (intel->is_g4x || intel->is_ironlake) + len = 6; + else + len = 5; if (region == NULL) { - BEGIN_BATCH(len, IGNORE_CLIPRECTS); + BEGIN_BATCH(len); OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2)); OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) | (BRW_SURFACE_NULL << 29)); @@ -223,9 +262,12 @@ static void emit_depthbuffer(struct brw_context *brw) OUT_BATCH(0); OUT_BATCH(0); - if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) + if (intel->is_g4x || intel->is_ironlake || intel->gen >= 6) OUT_BATCH(0); + if (intel->gen >= 6) + OUT_BATCH(0); + ADVANCE_BATCH(); } else { unsigned int format; @@ -246,8 +288,10 @@ static void emit_depthbuffer(struct brw_context *brw) } assert(region->tiling != I915_TILING_X); + if (IS_GEN6(intel->intelScreen->deviceID)) + assert(region->tiling != I915_TILING_NONE); - BEGIN_BATCH(len, IGNORE_CLIPRECTS); + BEGIN_BATCH(len); OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2)); OUT_BATCH(((region->pitch * region->cpp) - 1) | (format << 18) | @@ -262,9 +306,20 @@ static void emit_depthbuffer(struct brw_context *brw) ((region->height - 1) << 19)); OUT_BATCH(0); - if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) + if (intel->is_g4x || intel->is_ironlake || intel->gen >= 6) OUT_BATCH(0); + if (intel->gen >= 6) + OUT_BATCH(0); + + ADVANCE_BATCH(); + } + + /* Initialize it for safety. */ + if (intel->gen >= 6) { + BEGIN_BATCH(2); + OUT_BATCH(CMD_3D_CLEAR_PARAMS << 16 | (2 - 2)); + OUT_BATCH(0); ADVANCE_BATCH(); } } @@ -330,7 +385,7 @@ const struct brw_tracked_state brw_polygon_stipple = { static void upload_polygon_stipple_offset(struct brw_context *brw) { - __DRIdrawablePrivate *dPriv = brw->intel.driDrawable; + GLcontext *ctx = &brw->intel.ctx; struct brw_polygon_stipple_offset bpso; memset(&bpso, 0, sizeof(bpso)); @@ -346,8 +401,8 @@ static void upload_polygon_stipple_offset(struct brw_context *brw) * worry about. 
*/ if (brw->intel.ctx.DrawBuffer->Name == 0) { - bpso.bits0.x_offset = (32 - (dPriv->x & 31)) & 31; - bpso.bits0.y_offset = (32 - ((dPriv->y + dPriv->h) & 31)) & 31; + bpso.bits0.x_offset = 0; + bpso.bits0.y_offset = (32 - (ctx->DrawBuffer->Height & 31)) & 31; } else { bpso.bits0.y_offset = 0; @@ -374,8 +429,8 @@ const struct brw_tracked_state brw_polygon_stipple_offset = { static void upload_aa_line_parameters(struct brw_context *brw) { struct brw_aa_line_parameters balp; - - if (BRW_IS_965(brw)) + + if (!brw->has_aa_line_parameters) return; /* use legacy aa line coverage computation */ @@ -438,18 +493,20 @@ const struct brw_tracked_state brw_line_stipple = { static void upload_invarient_state( struct brw_context *brw ) { + struct intel_context *intel = &brw->intel; + { /* 0x61040000 Pipeline Select */ /* PipelineSelect : 0 */ struct brw_pipeline_select ps; memset(&ps, 0, sizeof(ps)); - ps.header.opcode = CMD_PIPELINE_SELECT(brw); + ps.header.opcode = brw->CMD_PIPELINE_SELECT; ps.header.pipeline_select = 0; BRW_BATCH_STRUCT(brw, &ps); } - { + if (intel->gen < 6) { struct brw_global_depth_offset_clamp gdo; memset(&gdo, 0, sizeof(gdo)); @@ -462,6 +519,32 @@ static void upload_invarient_state( struct brw_context *brw ) BRW_BATCH_STRUCT(brw, &gdo); } + intel_batchbuffer_emit_mi_flush(intel->batch); + + if (intel->gen >= 6) { + int i; + + BEGIN_BATCH(3); + OUT_BATCH(CMD_3D_MULTISAMPLE << 16 | (3 - 2)); + OUT_BATCH(MS_PIXEL_LOCATION_CENTER | + MS_NUMSAMPLES_1); + OUT_BATCH(0); /* positions for 4/8-sample */ + ADVANCE_BATCH(); + + BEGIN_BATCH(2); + OUT_BATCH(CMD_3D_SAMPLE_MASK << 16 | (2 - 2)); + OUT_BATCH(1); + ADVANCE_BATCH(); + + for (i = 0; i < 4; i++) { + BEGIN_BATCH(4); + OUT_BATCH(CMD_GS_SVB_INDEX << 16 | (4 - 2)); + OUT_BATCH(i << SVB_INDEX_SHIFT); + OUT_BATCH(0); + OUT_BATCH(0xffffffff); + ADVANCE_BATCH(); + } + } /* 0x61020000 State Instruction Pointer */ { @@ -480,7 +563,7 @@ static void upload_invarient_state( struct brw_context *brw ) struct brw_vf_statistics vfs; memset(&vfs, 0, sizeof(vfs)); - vfs.opcode = CMD_VF_STATISTICS(brw); + vfs.opcode = brw->CMD_VF_STATISTICS; if (INTEL_DEBUG & DEBUG_STATS) vfs.statistics_enable = 1; @@ -512,8 +595,21 @@ static void upload_state_base_address( struct brw_context *brw ) /* Output the structure (brw_state_base_address) directly to the * batchbuffer, so we can emit relocations inline. 
*/ - if (BRW_IS_IGDNG(brw)) { - BEGIN_BATCH(8, IGNORE_CLIPRECTS); + if (intel->gen >= 6) { + BEGIN_BATCH(10); + OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (10 - 2)); + OUT_BATCH(1); /* General state base address */ + OUT_BATCH(1); /* Surface state base address */ + OUT_BATCH(1); /* Dynamic state base address */ + OUT_BATCH(1); /* Indirect object base address */ + OUT_BATCH(1); /* Instruction base address */ + OUT_BATCH(1); /* General state upper bound */ + OUT_BATCH(1); /* Dynamic state upper bound */ + OUT_BATCH(1); /* Indirect object upper bound */ + OUT_BATCH(1); /* Instruction access upper bound */ + ADVANCE_BATCH(); + } else if (intel->is_ironlake) { + BEGIN_BATCH(8); OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (8 - 2)); OUT_BATCH(1); /* General state base address */ OUT_BATCH(1); /* Surface state base address */ @@ -524,7 +620,7 @@ static void upload_state_base_address( struct brw_context *brw ) OUT_BATCH(1); /* Instruction access upper bound */ ADVANCE_BATCH(); } else { - BEGIN_BATCH(6, IGNORE_CLIPRECTS); + BEGIN_BATCH(6); OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2)); OUT_BATCH(1); /* General state base address */ OUT_BATCH(1); /* Surface state base address */ diff --git a/i965/brw_program.c b/i965/brw_program.c index bac6918..c78f7b3 100644 --- a/i965/brw_program.c +++ b/i965/brw_program.c @@ -37,7 +37,6 @@ #include "tnl/tnl.h" #include "brw_context.h" -#include "brw_util.h" #include "brw_wm.h" static void brwBindProgram( GLcontext *ctx, @@ -112,9 +111,10 @@ static GLboolean brwIsProgramNative( GLcontext *ctx, return GL_TRUE; } -static void brwProgramStringNotify( GLcontext *ctx, - GLenum target, - struct gl_program *prog ) + +static GLboolean brwProgramStringNotify( GLcontext *ctx, + GLenum target, + struct gl_program *prog ) { struct brw_context *brw = brw_context(ctx); @@ -151,6 +151,9 @@ static void brwProgramStringNotify( GLcontext *ctx, */ _tnl_program_string(ctx, target, prog); } + + /* XXX check if program is legal, within limits */ + return GL_TRUE; } void brwInitFragProgFuncs( struct dd_function_table *functions ) diff --git a/i965/brw_queryobj.c b/i965/brw_queryobj.c index a195bc3..6cce7e5 100644 --- a/i965/brw_queryobj.c +++ b/i965/brw_queryobj.c @@ -73,7 +73,7 @@ brw_new_query_object(GLcontext *ctx, GLuint id) { struct brw_query_object *query; - query = _mesa_calloc(sizeof(struct brw_query_object)); + query = calloc(1, sizeof(struct brw_query_object)); query->Base.Id = id; query->Base.Result = 0; @@ -89,7 +89,7 @@ brw_delete_query(GLcontext *ctx, struct gl_query_object *q) struct brw_query_object *query = (struct brw_query_object *)q; dri_bo_unreference(query->bo); - _mesa_free(query); + free(query); } static void @@ -188,7 +188,7 @@ brw_emit_query_begin(struct brw_context *brw) if (brw->query.active || is_empty_list(&brw->query.active_head)) return; - BEGIN_BATCH(4, IGNORE_CLIPRECTS); + BEGIN_BATCH(4); OUT_BATCH(_3DSTATE_PIPE_CONTROL | PIPE_CONTROL_DEPTH_STALL | PIPE_CONTROL_WRITE_DEPTH_COUNT); @@ -227,7 +227,7 @@ brw_emit_query_end(struct brw_context *brw) if (!brw->query.active) return; - BEGIN_BATCH(4, IGNORE_CLIPRECTS); + BEGIN_BATCH(4); OUT_BATCH(_3DSTATE_PIPE_CONTROL | PIPE_CONTROL_DEPTH_STALL | PIPE_CONTROL_WRITE_DEPTH_COUNT); diff --git a/i965/brw_sf.c b/i965/brw_sf.c index 968890f..8e6839b 100644 --- a/i965/brw_sf.c +++ b/i965/brw_sf.c @@ -117,12 +117,13 @@ static void compile_sf_prog( struct brw_context *brw, /* Upload */ dri_bo_unreference(brw->sf.prog_bo); - brw->sf.prog_bo = brw_upload_cache( &brw->cache, BRW_SF_PROG, - &c.key, sizeof(c.key), - NULL, 0, - 
program, program_size, - &c.prog_data, - &brw->sf.prog_data ); + brw->sf.prog_bo = brw_upload_cache_with_auxdata(&brw->cache, BRW_SF_PROG, + &c.key, sizeof(c.key), + NULL, 0, + program, program_size, + &c.prog_data, + sizeof(c.prog_data), + &brw->sf.prog_data); } /* Calculate interpolants for triangle and line rasterization. diff --git a/i965/brw_sf_emit.c b/i965/brw_sf_emit.c index 3eae41e..bb08055 100644 --- a/i965/brw_sf_emit.c +++ b/i965/brw_sf_emit.c @@ -149,6 +149,7 @@ static void copy_colors( struct brw_sf_compile *c, static void do_flatshade_triangle( struct brw_sf_compile *c ) { struct brw_compile *p = &c->func; + struct intel_context *intel = &p->brw->intel; struct brw_reg ip = brw_ip_reg(); GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS); GLuint jmpi = 1; @@ -161,7 +162,7 @@ static void do_flatshade_triangle( struct brw_sf_compile *c ) if (c->key.primitive == SF_UNFILLED_TRIS) return; - if (BRW_IS_IGDNG(p->brw)) + if (intel->is_ironlake) jmpi = 2; brw_push_insn_state(p); @@ -187,6 +188,7 @@ static void do_flatshade_triangle( struct brw_sf_compile *c ) static void do_flatshade_line( struct brw_sf_compile *c ) { struct brw_compile *p = &c->func; + struct intel_context *intel = &p->brw->intel; struct brw_reg ip = brw_ip_reg(); GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS); GLuint jmpi = 1; @@ -199,7 +201,7 @@ static void do_flatshade_line( struct brw_sf_compile *c ) if (c->key.primitive == SF_UNFILLED_TRIS) return; - if (BRW_IS_IGDNG(p->brw)) + if (intel->is_ironlake) jmpi = 2; brw_push_insn_state(p); diff --git a/i965/brw_sf_state.c b/i965/brw_sf_state.c index bb69435..847c886 100644 --- a/i965/brw_sf_state.c +++ b/i965/brw_sf_state.c @@ -35,7 +35,6 @@ #include "brw_state.h" #include "brw_defines.h" #include "main/macros.h" -#include "intel_fbo.h" static void upload_sf_vp(struct brw_context *brw) { @@ -70,9 +69,9 @@ static void upload_sf_vp(struct brw_context *brw) * for DrawBuffer->_[XY]{min,max} */ - /* The scissor only needs to handle the intersection of drawable and - * scissor rect. Clipping to the boundaries of static shared buffers - * for front/back/depth is covered by looping over cliprects in brw_draw.c. + /* The scissor only needs to handle the intersection of drawable + * and scissor rect, since there are no longer cliprects for shared + * buffers with DRI2. * * Note that the hardware's coordinates are inclusive, while Mesa's min is * inclusive but max is exclusive. @@ -165,6 +164,7 @@ static dri_bo * sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key, dri_bo **reloc_bufs) { + struct intel_context *intel = &brw->intel; struct brw_sf_unit_state sf; dri_bo *bo; int chipset_max_threads; @@ -177,7 +177,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key, sf.thread3.dispatch_grf_start_reg = 3; - if (BRW_IS_IGDNG(brw)) + if (intel->is_ironlake) sf.thread3.urb_entry_read_offset = 3; else sf.thread3.urb_entry_read_offset = 1; @@ -187,10 +187,10 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key, sf.thread4.nr_urb_entries = key->nr_urb_entries; sf.thread4.urb_entry_allocation_size = key->sfsize - 1; - /* Each SF thread produces 1 PUE, and there can be up to 24(Pre-IGDNG) or - * 48(IGDNG) threads + /* Each SF thread produces 1 PUE, and there can be up to 24 (Pre-Ironlake) or + * 48 (Ironlake) threads. 
*/ - if (BRW_IS_IGDNG(brw)) + if (intel->is_ironlake) chipset_max_threads = 48; else chipset_max_threads = 24; @@ -308,8 +308,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key, bo = brw_upload_cache(&brw->cache, BRW_SF_UNIT, key, sizeof(*key), reloc_bufs, 2, - &sf, sizeof(sf), - NULL, NULL); + &sf, sizeof(sf)); /* STATE_PREFETCH command description describes this state as being * something loaded through the GPE (L2 ISC), so it's INSTRUCTION domain. diff --git a/i965/brw_state.h b/i965/brw_state.h index b129b1f..f790cfa 100644 --- a/i965/brw_state.h +++ b/i965/brw_state.h @@ -35,7 +35,7 @@ #include "brw_context.h" -static inline void +static INLINE void brw_add_validated_bo(struct brw_context *brw, dri_bo *bo) { assert(brw->state.validated_bo_count < ARRAY_SIZE(brw->state.validated_bos)); @@ -90,6 +90,23 @@ const struct brw_tracked_state brw_drawing_rect; const struct brw_tracked_state brw_indices; const struct brw_tracked_state brw_vertices; const struct brw_tracked_state brw_index_buffer; +const struct brw_tracked_state gen6_binding_table_pointers; +const struct brw_tracked_state gen6_blend_state; +const struct brw_tracked_state gen6_cc_state_pointers; +const struct brw_tracked_state gen6_cc_vp; +const struct brw_tracked_state gen6_clip_state; +const struct brw_tracked_state gen6_clip_vp; +const struct brw_tracked_state gen6_color_calc_state; +const struct brw_tracked_state gen6_depth_stencil_state; +const struct brw_tracked_state gen6_gs_state; +const struct brw_tracked_state gen6_sampler_state; +const struct brw_tracked_state gen6_scissor_state; +const struct brw_tracked_state gen6_sf_state; +const struct brw_tracked_state gen6_sf_vp; +const struct brw_tracked_state gen6_urb; +const struct brw_tracked_state gen6_viewport_state; +const struct brw_tracked_state gen6_vs_state; +const struct brw_tracked_state gen6_wm_state; /** * Use same key for WM and VS surfaces. 
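Each of the gen6_* declarations added to brw_state.h above is a tracked-state atom of the same shape as the existing gen4 ones: a set of dirty flags plus prepare/emit hooks that brw_validate_state()/brw_upload_state() walk in list order (see the gen6_binding_table_pointers atom earlier in this patch and the gen4_atoms/gen6_atoms arrays later in it). A minimal editorial sketch of that shape and of the dirty-bit test that decides whether an atom is re-run follows; the example_* names are illustrative stand-ins, only the field layout is paraphrased from how the atoms are declared and consumed in this diff.

/* Editorial sketch, not part of the patch: minimal mirrors of the
 * brw_state_flags / brw_tracked_state shapes implied by the atom
 * definitions in this diff. */
struct example_state_flags {
   unsigned mesa;    /* _NEW_* flags from core Mesa */
   unsigned brw;     /* BRW_NEW_* driver-internal flags */
   unsigned cache;   /* CACHE_NEW_* flags for cached state BOs */
};

struct example_tracked_state {
   struct example_state_flags dirty;
   void (*prepare)(void *brw);   /* validate/reference BOs before emitting */
   void (*emit)(void *brw);      /* write the packets into the batch */
};

/* An atom is re-run whenever any of its dirty bits intersect the currently
 * dirty state; this is the test the prepare/emit loops in
 * brw_state_upload.c use to pick which atoms to execute. */
static int example_check_state(const struct example_state_flags *a,
                               const struct example_state_flags *b)
{
   return (a->mesa & b->mesa) || (a->brw & b->brw) || (a->cache & b->cache);
}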
@@ -124,16 +141,26 @@ dri_bo *brw_cache_data(struct brw_cache *cache, dri_bo **reloc_bufs, GLuint nr_reloc_bufs); -dri_bo *brw_upload_cache( struct brw_cache *cache, - enum brw_cache_id cache_id, - const void *key, - GLuint key_sz, - dri_bo **reloc_bufs, - GLuint nr_reloc_bufs, - const void *data, - GLuint data_sz, - const void *aux, - void *aux_return ); +drm_intel_bo *brw_upload_cache(struct brw_cache *cache, + enum brw_cache_id cache_id, + const void *key, + GLuint key_sz, + dri_bo **reloc_bufs, + GLuint nr_reloc_bufs, + const void *data, + GLuint data_sz); + +drm_intel_bo *brw_upload_cache_with_auxdata(struct brw_cache *cache, + enum brw_cache_id cache_id, + const void *key, + GLuint key_sz, + dri_bo **reloc_bufs, + GLuint nr_reloc_bufs, + const void *data, + GLuint data_sz, + const void *aux, + GLuint aux_sz, + void *aux_return); dri_bo *brw_search_cache( struct brw_cache *cache, enum brw_cache_id cache_id, @@ -151,7 +178,7 @@ void brw_state_cache_bo_delete(struct brw_cache *cache, dri_bo *bo); /*********************************************************************** * brw_state_batch.c */ -#define BRW_BATCH_STRUCT(brw, s) intel_batchbuffer_data( brw->intel.batch, (s), sizeof(*(s)), IGNORE_CLIPRECTS) +#define BRW_BATCH_STRUCT(brw, s) intel_batchbuffer_data( brw->intel.batch, (s), sizeof(*(s))) #define BRW_CACHED_BATCH_STRUCT(brw, s) brw_cached_batch_struct( brw, (s), sizeof(*(s)) ) GLboolean brw_cached_batch_struct( struct brw_context *brw, diff --git a/i965/brw_state_batch.c b/i965/brw_state_batch.c index 7821898..3901941 100644 --- a/i965/brw_state_batch.c +++ b/i965/brw_state_batch.c @@ -48,7 +48,7 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw, struct header *newheader = (struct header *)data; if (brw->emit_state_always) { - intel_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS); + intel_batchbuffer_data(brw->intel.batch, data, sz); return GL_TRUE; } @@ -57,8 +57,8 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw, if (item->sz == sz && memcmp(item->header, newheader, sz) == 0) return GL_FALSE; if (item->sz != sz) { - _mesa_free(item->header); - item->header = _mesa_malloc(sz); + free(item->header); + item->header = malloc(sz); item->sz = sz; } goto emit; @@ -68,14 +68,14 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw, assert(!item); item = CALLOC_STRUCT(brw_cached_batch_item); - item->header = _mesa_malloc(sz); + item->header = malloc(sz); item->sz = sz; item->next = brw->cached_batch_items; brw->cached_batch_items = item; emit: memcpy(item->header, newheader, sz); - intel_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS); + intel_batchbuffer_data(brw->intel.batch, data, sz); return GL_TRUE; } diff --git a/i965/brw_state_cache.c b/i965/brw_state_cache.c index e4c9ba7..c08cb45 100644 --- a/i965/brw_state_cache.c +++ b/i965/brw_state_cache.c @@ -59,37 +59,27 @@ #include "main/imports.h" #include "brw_state.h" #include "intel_batchbuffer.h" - -/* XXX: Fixme - have to include these to get the sizes of the prog_key - * structs: - */ #include "brw_wm.h" -#include "brw_vs.h" -#include "brw_clip.h" -#include "brw_sf.h" -#include "brw_gs.h" static GLuint -hash_key(const void *key, GLuint key_size, - dri_bo **reloc_bufs, GLuint nr_reloc_bufs) +hash_key(struct brw_cache_item *item) { - GLuint *ikey = (GLuint *)key; - GLuint hash = 0, i; + GLuint *ikey = (GLuint *)item->key; + GLuint hash = item->cache_id, i; - assert(key_size % 4 == 0); + assert(item->key_size % 4 == 0); /* I'm sure this can be improved on: */ - for (i = 0; 
i < key_size/4; i++) { + for (i = 0; i < item->key_size/4; i++) { hash ^= ikey[i]; hash = (hash << 5) | (hash >> 27); } /* Include the BO pointers as key data as well */ - ikey = (GLuint *)reloc_bufs; - key_size = nr_reloc_bufs * sizeof(dri_bo *); - for (i = 0; i < key_size/4; i++) { + ikey = (GLuint *)item->reloc_bufs; + for (i = 0; i < item->nr_reloc_bufs * sizeof(drm_intel_bo *) / 4; i++) { hash ^= ikey[i]; hash = (hash << 5) | (hash >> 27); } @@ -114,11 +104,22 @@ update_cache_last(struct brw_cache *cache, enum brw_cache_id cache_id, cache->brw->state.dirty.cache |= 1 << cache_id; } +static int +brw_cache_item_equals(const struct brw_cache_item *a, + const struct brw_cache_item *b) +{ + return a->cache_id == b->cache_id && + a->hash == b->hash && + a->key_size == b->key_size && + (memcmp(a->key, b->key, a->key_size) == 0) && + a->nr_reloc_bufs == b->nr_reloc_bufs && + (memcmp(a->reloc_bufs, b->reloc_bufs, + a->nr_reloc_bufs * sizeof(dri_bo *)) == 0); +} static struct brw_cache_item * -search_cache(struct brw_cache *cache, enum brw_cache_id cache_id, - GLuint hash, const void *key, GLuint key_size, - dri_bo **reloc_bufs, GLuint nr_reloc_bufs) +search_cache(struct brw_cache *cache, GLuint hash, + struct brw_cache_item *lookup) { struct brw_cache_item *c; @@ -133,13 +134,7 @@ search_cache(struct brw_cache *cache, enum brw_cache_id cache_id, #endif for (c = cache->items[hash % cache->size]; c; c = c->next) { - if (c->cache_id == cache_id && - c->hash == hash && - c->key_size == key_size && - memcmp(c->key, key, key_size) == 0 && - c->nr_reloc_bufs == nr_reloc_bufs && - memcmp(c->reloc_bufs, reloc_bufs, - nr_reloc_bufs * sizeof(dri_bo *)) == 0) + if (brw_cache_item_equals(lookup, c)) return c; } @@ -155,7 +150,7 @@ rehash(struct brw_cache *cache) GLuint size, i; size = cache->size * 3; - items = (struct brw_cache_item**) _mesa_calloc(size * sizeof(*items)); + items = (struct brw_cache_item**) calloc(1, size * sizeof(*items)); for (i = 0; i < cache->size; i++) for (c = cache->items[i]; c; c = next) { @@ -182,10 +177,18 @@ brw_search_cache(struct brw_cache *cache, void *aux_return) { struct brw_cache_item *item; - GLuint hash = hash_key(key, key_size, reloc_bufs, nr_reloc_bufs); + struct brw_cache_item lookup; + GLuint hash; - item = search_cache(cache, cache_id, hash, key, key_size, - reloc_bufs, nr_reloc_bufs); + lookup.cache_id = cache_id; + lookup.key = key; + lookup.key_size = key_size; + lookup.reloc_bufs = reloc_bufs; + lookup.nr_reloc_bufs = nr_reloc_bufs; + hash = hash_key(&lookup); + lookup.hash = hash; + + item = search_cache(cache, hash, &lookup); if (item == NULL) return NULL; @@ -200,48 +203,52 @@ brw_search_cache(struct brw_cache *cache, } -dri_bo * -brw_upload_cache( struct brw_cache *cache, - enum brw_cache_id cache_id, - const void *key, - GLuint key_size, - dri_bo **reloc_bufs, - GLuint nr_reloc_bufs, - const void *data, - GLuint data_size, - const void *aux, - void *aux_return ) +drm_intel_bo * +brw_upload_cache_with_auxdata(struct brw_cache *cache, + enum brw_cache_id cache_id, + const void *key, + GLuint key_size, + dri_bo **reloc_bufs, + GLuint nr_reloc_bufs, + const void *data, + GLuint data_size, + const void *aux, + GLuint aux_size, + void *aux_return) { struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item); - GLuint hash = hash_key(key, key_size, reloc_bufs, nr_reloc_bufs); + GLuint hash; GLuint relocs_size = nr_reloc_bufs * sizeof(dri_bo *); - GLuint aux_size = cache->aux_size[cache_id]; void *tmp; dri_bo *bo; int i; + item->cache_id = cache_id; + item->key = 
key; + item->key_size = key_size; + item->reloc_bufs = reloc_bufs; + item->nr_reloc_bufs = nr_reloc_bufs; + hash = hash_key(item); + item->hash = hash; + /* Create the buffer object to contain the data */ bo = dri_bo_alloc(cache->brw->intel.bufmgr, cache->name[cache_id], data_size, 1 << 6); /* Set up the memory containing the key, aux_data, and reloc_bufs */ - tmp = _mesa_malloc(key_size + aux_size + relocs_size); + tmp = malloc(key_size + aux_size + relocs_size); memcpy(tmp, key, key_size); - memcpy(tmp + key_size, aux, cache->aux_size[cache_id]); + memcpy(tmp + key_size, aux, aux_size); memcpy(tmp + key_size + aux_size, reloc_bufs, relocs_size); for (i = 0; i < nr_reloc_bufs; i++) { if (reloc_bufs[i] != NULL) dri_bo_reference(reloc_bufs[i]); } - item->cache_id = cache_id; item->key = tmp; - item->hash = hash; - item->key_size = key_size; item->reloc_bufs = tmp + key_size + aux_size; - item->nr_reloc_bufs = nr_reloc_bufs; item->bo = bo; dri_bo_reference(bo); @@ -255,12 +262,11 @@ brw_upload_cache( struct brw_cache *cache, cache->n_items++; if (aux_return) { - assert(cache->aux_size[cache_id]); *(void **)aux_return = (void *)((char *)item->key + item->key_size); } if (INTEL_DEBUG & DEBUG_STATE) - _mesa_printf("upload %s: %d bytes to cache id %d\n", + printf("upload %s: %d bytes to cache id %d\n", cache->name[cache_id], data_size, cache_id); @@ -272,6 +278,23 @@ brw_upload_cache( struct brw_cache *cache, return bo; } +drm_intel_bo * +brw_upload_cache(struct brw_cache *cache, + enum brw_cache_id cache_id, + const void *key, + GLuint key_size, + dri_bo **reloc_bufs, + GLuint nr_reloc_bufs, + const void *data, + GLuint data_size) +{ + return brw_upload_cache_with_auxdata(cache, cache_id, + key, key_size, + reloc_bufs, nr_reloc_bufs, + data, data_size, + NULL, 0, + NULL); +} /** * Wrapper around brw_cache_data_sz using the cache_id's canonical key size. 
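The reworked hash_key() above now hashes a whole brw_cache_item: the cache id seeds the hash so that different cache ids sharing one table cannot collide on identical key bytes, and the key words plus the relocation BO pointers are folded in with a rotate-xor step. A self-contained sketch of that hashing scheme, assuming nothing beyond the code shown above (example_hash is an illustrative name, not a driver function):

#include <stdint.h>
#include <stddef.h>

/* Editorial sketch of the rotate-xor hash used by brw_state_cache.c after
 * this patch: seed with the cache id, then fold in the key words and the
 * relocation-BO pointer bytes one 32-bit word at a time. */
static uint32_t example_hash(uint32_t cache_id,
                             const uint32_t *key, size_t key_size,
                             void **reloc_bufs, size_t nr_reloc_bufs)
{
   uint32_t hash = cache_id;
   size_t i;

   /* key_size is in bytes and must be a multiple of 4, as asserted in the
    * real code. */
   for (i = 0; i < key_size / 4; i++) {
      hash ^= key[i];
      hash = (hash << 5) | (hash >> 27);
   }

   /* The BO pointers are part of the key as well, hashed as raw words. */
   for (i = 0; i < nr_reloc_bufs * sizeof(void *) / 4; i++) {
      hash ^= ((uint32_t *)reloc_bufs)[i];
      hash = (hash << 5) | (hash >> 27);
   }

   return hash;
}

Since the cache id also participates in brw_cache_item_equals(), lookups for one cache id never match an item uploaded under another, even when the key bytes and relocation BOs are identical.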
@@ -292,11 +315,18 @@ brw_cache_data(struct brw_cache *cache, GLuint nr_reloc_bufs) { dri_bo *bo; - struct brw_cache_item *item; - GLuint hash = hash_key(data, data_size, reloc_bufs, nr_reloc_bufs); - - item = search_cache(cache, cache_id, hash, data, data_size, - reloc_bufs, nr_reloc_bufs); + struct brw_cache_item *item, lookup; + GLuint hash; + + lookup.cache_id = cache_id; + lookup.key = data; + lookup.key_size = data_size; + lookup.reloc_bufs = reloc_bufs; + lookup.nr_reloc_bufs = nr_reloc_bufs; + hash = hash_key(&lookup); + lookup.hash = hash; + + item = search_cache(cache, hash, &lookup); if (item) { update_cache_last(cache, cache_id, item->bo); dri_bo_reference(item->bo); @@ -306,8 +336,7 @@ brw_cache_data(struct brw_cache *cache, bo = brw_upload_cache(cache, cache_id, data, data_size, reloc_bufs, nr_reloc_bufs, - data, data_size, - NULL, NULL); + data, data_size); return bo; } @@ -321,11 +350,9 @@ enum pool_type { static void brw_init_cache_id(struct brw_cache *cache, const char *name, - enum brw_cache_id id, - GLuint aux_size) + enum brw_cache_id id) { cache->name[id] = strdup(name); - cache->aux_size[id] = aux_size; } @@ -339,82 +366,31 @@ brw_init_non_surface_cache(struct brw_context *brw) cache->size = 7; cache->n_items = 0; cache->items = (struct brw_cache_item **) - _mesa_calloc(cache->size * sizeof(struct brw_cache_item)); - - brw_init_cache_id(cache, - "CC_VP", - BRW_CC_VP, - 0); - - brw_init_cache_id(cache, - "CC_UNIT", - BRW_CC_UNIT, - 0); - - brw_init_cache_id(cache, - "WM_PROG", - BRW_WM_PROG, - sizeof(struct brw_wm_prog_data)); - - brw_init_cache_id(cache, - "SAMPLER_DEFAULT_COLOR", - BRW_SAMPLER_DEFAULT_COLOR, - 0); - - brw_init_cache_id(cache, - "SAMPLER", - BRW_SAMPLER, - 0); - - brw_init_cache_id(cache, - "WM_UNIT", - BRW_WM_UNIT, - 0); - - brw_init_cache_id(cache, - "SF_PROG", - BRW_SF_PROG, - sizeof(struct brw_sf_prog_data)); - - brw_init_cache_id(cache, - "SF_VP", - BRW_SF_VP, - 0); - - brw_init_cache_id(cache, - "SF_UNIT", - BRW_SF_UNIT, - 0); - - brw_init_cache_id(cache, - "VS_UNIT", - BRW_VS_UNIT, - 0); - - brw_init_cache_id(cache, - "VS_PROG", - BRW_VS_PROG, - sizeof(struct brw_vs_prog_data)); - - brw_init_cache_id(cache, - "CLIP_UNIT", - BRW_CLIP_UNIT, - 0); - - brw_init_cache_id(cache, - "CLIP_PROG", - BRW_CLIP_PROG, - sizeof(struct brw_clip_prog_data)); - - brw_init_cache_id(cache, - "GS_UNIT", - BRW_GS_UNIT, - 0); - - brw_init_cache_id(cache, - "GS_PROG", - BRW_GS_PROG, - sizeof(struct brw_gs_prog_data)); + calloc(1, cache->size * sizeof(struct brw_cache_item)); + + brw_init_cache_id(cache, "CC_VP", BRW_CC_VP); + brw_init_cache_id(cache, "CC_UNIT", BRW_CC_UNIT); + brw_init_cache_id(cache, "WM_PROG", BRW_WM_PROG); + brw_init_cache_id(cache, "SAMPLER_DEFAULT_COLOR", BRW_SAMPLER_DEFAULT_COLOR); + brw_init_cache_id(cache, "SAMPLER", BRW_SAMPLER); + brw_init_cache_id(cache, "WM_UNIT", BRW_WM_UNIT); + brw_init_cache_id(cache, "SF_PROG", BRW_SF_PROG); + brw_init_cache_id(cache, "SF_VP", BRW_SF_VP); + + brw_init_cache_id(cache, "SF_UNIT", BRW_SF_UNIT); + + brw_init_cache_id(cache, "VS_UNIT", BRW_VS_UNIT); + + brw_init_cache_id(cache, "VS_PROG", BRW_VS_PROG); + + brw_init_cache_id(cache, "CLIP_UNIT", BRW_CLIP_UNIT); + + brw_init_cache_id(cache, "CLIP_PROG", BRW_CLIP_PROG); + + brw_init_cache_id(cache, "GS_UNIT", BRW_GS_UNIT); + + brw_init_cache_id(cache, "GS_PROG", BRW_GS_PROG); + brw_init_cache_id(cache, "BLEND_STATE", BRW_BLEND_STATE); } @@ -428,17 +404,10 @@ brw_init_surface_cache(struct brw_context *brw) cache->size = 7; cache->n_items = 0; cache->items = 
(struct brw_cache_item **) - _mesa_calloc(cache->size * sizeof(struct brw_cache_item)); - - brw_init_cache_id(cache, - "SS_SURFACE", - BRW_SS_SURFACE, - 0); + calloc(1, cache->size * sizeof(struct brw_cache_item)); - brw_init_cache_id(cache, - "SS_SURF_BIND", - BRW_SS_SURF_BIND, - 0); + brw_init_cache_id(cache, "SS_SURFACE", BRW_SS_SURFACE); + brw_init_cache_id(cache, "SS_SURF_BIND", BRW_SS_SURF_BIND); } @@ -457,7 +426,7 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache) GLuint i; if (INTEL_DEBUG & DEBUG_STATE) - _mesa_printf("%s\n", __FUNCTION__); + printf("%s\n", __FUNCTION__); for (i = 0; i < cache->size; i++) { for (c = cache->items[i]; c; c = next) { @@ -476,7 +445,7 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache) cache->n_items = 0; if (brw->curbe.last_buf) { - _mesa_free(brw->curbe.last_buf); + free(brw->curbe.last_buf); brw->curbe.last_buf = NULL; } @@ -497,7 +466,7 @@ brw_state_cache_bo_delete(struct brw_cache *cache, dri_bo *bo) GLuint i; if (INTEL_DEBUG & DEBUG_STATE) - _mesa_printf("%s\n", __FUNCTION__); + printf("%s\n", __FUNCTION__); for (i = 0; i < cache->size; i++) { for (prev = &cache->items[i]; *prev;) { @@ -525,7 +494,7 @@ void brw_state_cache_check_size(struct brw_context *brw) { if (INTEL_DEBUG & DEBUG_STATE) - _mesa_printf("%s (n_items=%d)\n", __FUNCTION__, brw->cache.n_items); + printf("%s (n_items=%d)\n", __FUNCTION__, brw->cache.n_items); /* un-tuned guess. We've got around 20 state objects for a total of around * 32k, so 1000 of them is around 1.5MB. @@ -544,7 +513,7 @@ brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache) GLuint i; if (INTEL_DEBUG & DEBUG_STATE) - _mesa_printf("%s\n", __FUNCTION__); + printf("%s\n", __FUNCTION__); brw_clear_cache(brw, cache); for (i = 0; i < BRW_MAX_CACHE; i++) { diff --git a/i965/brw_state_dump.c b/i965/brw_state_dump.c index e94fa7d..020ac52 100644 --- a/i965/brw_state_dump.c +++ b/i965/brw_state_dump.c @@ -28,7 +28,6 @@ #include "main/mtypes.h" #include "brw_context.h" -#include "brw_state.h" #include "brw_defines.h" /** diff --git a/i965/brw_state_upload.c b/i965/brw_state_upload.c index af8dfb4..9e54f29 100644 --- a/i965/brw_state_upload.c +++ b/i965/brw_state_upload.c @@ -35,6 +35,7 @@ #include "brw_state.h" #include "intel_batchbuffer.h" #include "intel_buffers.h" +#include "intel_chipset.h" /* This is used to initialize brw->state.atoms[]. We could use this * list directly except for a single atom, brw_constant_buffer, which @@ -42,7 +43,7 @@ * current fragment and vertex programs, and so cannot be a static * value. 
*/ -const struct brw_tracked_state *atoms[] = +static const struct brw_tracked_state *gen4_atoms[] = { &brw_check_fallback, @@ -101,6 +102,63 @@ const struct brw_tracked_state *atoms[] = &brw_constant_buffer }; +const struct brw_tracked_state *gen6_atoms[] = +{ + &brw_check_fallback, + + &brw_wm_input_sizes, + &brw_vs_prog, + &brw_gs_prog, + &brw_wm_prog, + + &gen6_clip_vp, + &gen6_sf_vp, + &gen6_cc_vp, + + /* Command packets: */ + &brw_invarient_state, + + &gen6_viewport_state, /* must do after *_vp stages */ + + &gen6_urb, + &gen6_blend_state, /* must do before cc unit */ + &gen6_color_calc_state, /* must do before cc unit */ + &gen6_depth_stencil_state, /* must do before cc unit */ + &gen6_cc_state_pointers, + + &brw_vs_surfaces, /* must do before unit */ + &brw_wm_constant_surface, /* must do before wm surfaces/bind bo */ + &brw_wm_surfaces, /* must do before samplers and unit */ + + &brw_wm_samplers, + &gen6_sampler_state, + + &gen6_vs_state, + &gen6_gs_state, + &gen6_clip_state, + &gen6_sf_state, + &gen6_wm_state, + + &gen6_scissor_state, + + &brw_state_base_address, + + &gen6_binding_table_pointers, + + &brw_depthbuffer, + + &brw_polygon_stipple, + &brw_polygon_stipple_offset, + + &brw_line_stipple, + &brw_aa_line_parameters, + + &brw_drawing_rect, + + &brw_indices, + &brw_index_buffer, + &brw_vertices, +}; void brw_init_state( struct brw_context *brw ) { @@ -208,7 +266,6 @@ static struct dirty_bit_map brw_bits[] = { DEFINE_BIT(BRW_NEW_CONTEXT), DEFINE_BIT(BRW_NEW_WM_INPUT_DIMENSIONS), DEFINE_BIT(BRW_NEW_PSP), - DEFINE_BIT(BRW_NEW_FENCE), DEFINE_BIT(BRW_NEW_INDICES), DEFINE_BIT(BRW_NEW_INDEX_BUFFER), DEFINE_BIT(BRW_NEW_VERTICES), @@ -218,6 +275,7 @@ static struct dirty_bit_map brw_bits[] = { }; static struct dirty_bit_map cache_bits[] = { + DEFINE_BIT(CACHE_NEW_BLEND_STATE), DEFINE_BIT(CACHE_NEW_CC_VP), DEFINE_BIT(CACHE_NEW_CC_UNIT), DEFINE_BIT(CACHE_NEW_WM_PROG), @@ -277,6 +335,8 @@ void brw_validate_state( struct brw_context *brw ) struct intel_context *intel = &brw->intel; struct brw_state_flags *state = &brw->state.dirty; GLuint i; + const struct brw_tracked_state **atoms; + int num_atoms; brw_clear_validated_bos(brw); @@ -285,6 +345,14 @@ void brw_validate_state( struct brw_context *brw ) brw_add_validated_bo(brw, intel->batch->buf); + if (IS_GEN6(intel->intelScreen->deviceID)) { + atoms = gen6_atoms; + num_atoms = ARRAY_SIZE(gen6_atoms); + } else { + atoms = gen4_atoms; + num_atoms = ARRAY_SIZE(gen4_atoms); + } + if (brw->emit_state_always) { state->mesa |= ~0; state->brw |= ~0; @@ -312,7 +380,7 @@ void brw_validate_state( struct brw_context *brw ) brw->intel.Fallback = GL_FALSE; /* boolean, not bitfield */ /* do prepare stage for all atoms */ - for (i = 0; i < Elements(atoms); i++) { + for (i = 0; i < num_atoms; i++) { const struct brw_tracked_state *atom = atoms[i]; if (brw->intel.Fallback) @@ -344,9 +412,20 @@ void brw_validate_state( struct brw_context *brw ) void brw_upload_state(struct brw_context *brw) { + struct intel_context *intel = &brw->intel; struct brw_state_flags *state = &brw->state.dirty; int i; static int dirty_count = 0; + const struct brw_tracked_state **atoms; + int num_atoms; + + if (IS_GEN6(intel->intelScreen->deviceID)) { + atoms = gen6_atoms; + num_atoms = ARRAY_SIZE(gen6_atoms); + } else { + atoms = gen4_atoms; + num_atoms = ARRAY_SIZE(gen4_atoms); + } brw_clear_validated_bos(brw); @@ -356,10 +435,10 @@ void brw_upload_state(struct brw_context *brw) * state atoms are ordered correctly in the list. 
*/ struct brw_state_flags examined, prev; - _mesa_memset(&examined, 0, sizeof(examined)); + memset(&examined, 0, sizeof(examined)); prev = *state; - for (i = 0; i < Elements(atoms); i++) { + for (i = 0; i < num_atoms; i++) { const struct brw_tracked_state *atom = atoms[i]; struct brw_state_flags generated; @@ -388,7 +467,7 @@ void brw_upload_state(struct brw_context *brw) } } else { - for (i = 0; i < Elements(atoms); i++) { + for (i = 0; i < num_atoms; i++) { const struct brw_tracked_state *atom = atoms[i]; if (brw->intel.Fallback) diff --git a/i965/brw_structs.h b/i965/brw_structs.h index 66d4127..3c2adfc 100644 --- a/i965/brw_structs.h +++ b/i965/brw_structs.h @@ -658,7 +658,105 @@ struct brw_clip_unit_state GLfloat viewport_ymax; }; +struct gen6_blend_state +{ + struct { + GLuint dest_blend_factor:5; + GLuint source_blend_factor:5; + GLuint pad3:1; + GLuint blend_func:3; + GLuint pad2:1; + GLuint ia_dest_blend_factor:5; + GLuint ia_source_blend_factor:5; + GLuint pad1:1; + GLuint ia_blend_func:3; + GLuint pad0:1; + GLuint ia_blend_enable:1; + GLuint blend_enable:1; + } blend0; + + struct { + GLuint post_blend_clamp_enable:1; + GLuint pre_blend_clamp_enable:1; + GLuint clamp_range:2; + GLuint pad0:4; + GLuint x_dither_offset:2; + GLuint y_dither_offset:2; + GLuint dither_enable:1; + GLuint alpha_test_func:3; + GLuint alpha_test_enable:1; + GLuint pad1:1; + GLuint logic_op_func:4; + GLuint logic_op_enable:1; + GLuint pad2:1; + GLuint write_disable_b:1; + GLuint write_disable_g:1; + GLuint write_disable_r:1; + GLuint write_disable_a:1; + GLuint pad3:1; + GLuint alpha_to_coverage_dither:1; + GLuint alpha_to_one:1; + GLuint alpha_to_coverage:1; + } blend1; +}; + +struct gen6_color_calc_state +{ + struct { + GLuint alpha_test_format:1; + GLuint pad0:14; + GLuint round_disable:1; + GLuint bf_stencil_ref:8; + GLuint stencil_ref:8; + } cc0; + union { + GLfloat alpha_ref_f; + struct { + GLuint ui:8; + GLuint pad0:24; + } alpha_ref_fi; + } cc1; + + GLfloat constant_r; + GLfloat constant_g; + GLfloat constant_b; + GLfloat constant_a; +}; + +struct gen6_depth_stencil_state +{ + struct { + GLuint pad0:3; + GLuint bf_stencil_pass_depth_pass_op:3; + GLuint bf_stencil_pass_depth_fail_op:3; + GLuint bf_stencil_fail_op:3; + GLuint bf_stencil_func:3; + GLuint bf_stencil_enable:1; + GLuint pad1:2; + GLuint stencil_write_enable:1; + GLuint stencil_pass_depth_pass_op:3; + GLuint stencil_pass_depth_fail_op:3; + GLuint stencil_fail_op:3; + GLuint stencil_func:3; + GLuint stencil_enable:1; + } ds0; + + struct { + GLuint bf_stencil_write_mask:8; + GLuint bf_stencil_test_mask:8; + GLuint stencil_write_mask:8; + GLuint stencil_test_mask:8; + } ds1; + + struct { + GLuint pad0:25; + GLuint depth_write_enable:1; + GLuint depth_test_func:3; + GLuint pad1:1; + GLuint depth_test_enable:1; + } ds2; +}; struct brw_cc_unit_state { @@ -752,8 +850,6 @@ struct brw_cc_unit_state } cc7; }; - - struct brw_sf_unit_state { struct thread0 thread0; @@ -813,6 +909,11 @@ struct brw_sf_unit_state }; +struct gen6_scissor_state +{ + GLuint ymin, xmin; + GLuint ymax, xmax; +}; struct brw_gs_unit_state { @@ -1043,6 +1144,15 @@ struct brw_sf_viewport } scissor; }; +struct gen6_sf_viewport { + GLfloat m00; + GLfloat m11; + GLfloat m22; + GLfloat m30; + GLfloat m31; + GLfloat m32; +}; + /* Documented in the subsystem/shared-functions/sampler chapter... 
*/ struct brw_surface_state diff --git a/i965/brw_tex_layout.c b/i965/brw_tex_layout.c index e59e52e..09edfd8 100644 --- a/i965/brw_tex_layout.c +++ b/i965/brw_tex_layout.c @@ -36,7 +36,6 @@ #include "intel_tex_layout.h" #include "intel_context.h" #include "main/macros.h" -#include "intel_chipset.h" #define FILE_DEBUG_FLAG DEBUG_MIPTREE @@ -49,7 +48,7 @@ GLboolean brw_miptree_layout(struct intel_context *intel, switch (mt->target) { case GL_TEXTURE_CUBE_MAP: - if (IS_IGDNG(intel->intelScreen->deviceID)) { + if (intel->is_ironlake) { GLuint align_h = 2, align_w = 4; GLuint level; GLuint x = 0; diff --git a/i965/brw_urb.c b/i965/brw_urb.c index 8c6f435..4f6b900 100644 --- a/i965/brw_urb.c +++ b/i965/brw_urb.c @@ -105,7 +105,8 @@ static GLboolean check_urb_layout( struct brw_context *brw ) brw->urb.sf_start = brw->urb.clip_start + brw->urb.nr_clip_entries * brw->urb.vsize; brw->urb.cs_start = brw->urb.sf_start + brw->urb.nr_sf_entries * brw->urb.sfsize; - return brw->urb.cs_start + brw->urb.nr_cs_entries * brw->urb.csize <= URB_SIZES(brw); + return brw->urb.cs_start + brw->urb.nr_cs_entries * + brw->urb.csize <= brw->urb.size; } /* Most minimal update, forces re-emit of URB fence packet after GS @@ -113,6 +114,7 @@ static GLboolean check_urb_layout( struct brw_context *brw ) */ static void recalculate_urb_fence( struct brw_context *brw ) { + struct intel_context *intel = &brw->intel; GLuint csize = brw->curbe.total_size; GLuint vsize = brw->vs.prog_data->urb_entry_size; GLuint sfsize = brw->sf.prog_data->urb_entry_size; @@ -146,7 +148,7 @@ static void recalculate_urb_fence( struct brw_context *brw ) brw->urb.constrained = 0; - if (BRW_IS_IGDNG(brw)) { + if (intel->is_ironlake) { brw->urb.nr_vs_entries = 128; brw->urb.nr_sf_entries = 48; if (check_urb_layout(brw)) { @@ -156,7 +158,7 @@ static void recalculate_urb_fence( struct brw_context *brw ) brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries; brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries; } - } else if (BRW_IS_G4X(brw)) { + } else if (intel->is_g4x) { brw->urb.nr_vs_entries = 64; if (check_urb_layout(brw)) { goto done; @@ -184,23 +186,23 @@ static void recalculate_urb_fence( struct brw_context *brw ) * entries and the values for minimum nr of entries * provided above. */ - _mesa_printf("couldn't calculate URB layout!\n"); + printf("couldn't calculate URB layout!\n"); exit(1); } if (INTEL_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS)) - _mesa_printf("URB CONSTRAINED\n"); + printf("URB CONSTRAINED\n"); } done: if (INTEL_DEBUG & DEBUG_URB) - _mesa_printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n", + printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. 
%d\n", brw->urb.vs_start, brw->urb.gs_start, brw->urb.clip_start, brw->urb.sf_start, brw->urb.cs_start, - URB_SIZES(brw)); + brw->urb.size); brw->state.dirty.brw |= BRW_NEW_URB_FENCE; } @@ -244,7 +246,7 @@ void brw_upload_urb_fence(struct brw_context *brw) uf.bits0.gs_fence = brw->urb.clip_start; uf.bits0.clp_fence = brw->urb.sf_start; uf.bits1.sf_fence = brw->urb.cs_start; - uf.bits1.cs_fence = URB_SIZES(brw); + uf.bits1.cs_fence = brw->urb.size; BRW_BATCH_STRUCT(brw, &uf); } diff --git a/i965/brw_vs.c b/i965/brw_vs.c index fd055e2..44b085e 100644 --- a/i965/brw_vs.c +++ b/i965/brw_vs.c @@ -35,6 +35,7 @@ #include "brw_util.h" #include "brw_state.h" #include "shader/prog_print.h" +#include "shader/prog_parameter.h" @@ -42,9 +43,11 @@ static void do_vs_prog( struct brw_context *brw, struct brw_vertex_program *vp, struct brw_vs_prog_key *key ) { + GLcontext *ctx = &brw->intel.ctx; GLuint program_size; const GLuint *program; struct brw_vs_compile c; + int aux_size; memset(&c, 0, sizeof(c)); memcpy(&c.key, key, sizeof(*key)); @@ -73,13 +76,27 @@ static void do_vs_prog( struct brw_context *brw, */ program = brw_get_program(&c.func, &program_size); + /* We upload from &c.prog_data including the constant_map assuming + * they're packed together. It would be nice to have a + * compile-time assert macro here. + */ + assert(c.constant_map == (int8_t *)&c.prog_data + + sizeof(c.prog_data)); + assert(ctx->Const.VertexProgram.MaxNativeParameters == + ARRAY_SIZE(c.constant_map)); + + aux_size = sizeof(c.prog_data); + if (c.vp->use_const_buffer) + aux_size += c.vp->program.Base.Parameters->NumParameters; + dri_bo_unreference(brw->vs.prog_bo); - brw->vs.prog_bo = brw_upload_cache( &brw->cache, BRW_VS_PROG, - &c.key, sizeof(c.key), - NULL, 0, - program, program_size, - &c.prog_data, - &brw->vs.prog_data ); + brw->vs.prog_bo = brw_upload_cache_with_auxdata(&brw->cache, BRW_VS_PROG, + &c.key, sizeof(c.key), + NULL, 0, + program, program_size, + &c.prog_data, + aux_size, + &brw->vs.prog_data); } @@ -109,6 +126,8 @@ static void brw_upload_vs_prog(struct brw_context *brw) &brw->vs.prog_data); if (brw->vs.prog_bo == NULL) do_vs_prog(brw, vp, &key); + brw->vs.constant_map = ((int8_t *)brw->vs.prog_data + + sizeof(*brw->vs.prog_data)); } diff --git a/i965/brw_vs.h b/i965/brw_vs.h index 4a59136..95e0501 100644 --- a/i965/brw_vs.h +++ b/i965/brw_vs.h @@ -51,6 +51,7 @@ struct brw_vs_compile { struct brw_compile func; struct brw_vs_prog_key key; struct brw_vs_prog_data prog_data; + int8_t constant_map[1024]; struct brw_vertex_program *vp; @@ -81,6 +82,8 @@ struct brw_vs_compile { GLint index; struct brw_reg reg; } current_const[3]; + + GLboolean needs_stack; }; void brw_vs_emit( struct brw_vs_compile *c ); diff --git a/i965/brw_vs_emit.c b/i965/brw_vs_emit.c index 27aac8b..a7c4b58 100644 --- a/i965/brw_vs_emit.c +++ b/i965/brw_vs_emit.c @@ -67,6 +67,7 @@ static void release_tmps( struct brw_vs_compile *c ) */ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) { + struct intel_context *intel = &c->func.brw->intel; GLuint i, reg = 0, mrf; int attributes_in_vue; @@ -103,9 +104,47 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) /* Vertex program parameters from curbe: */ if (c->vp->use_const_buffer) { - /* get constants from a real constant buffer */ - c->prog_data.curb_read_length = 0; - c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... 
*/ + int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries; + int constant = 0; + + /* We've got more constants than we can load with the push + * mechanism. This is often correlated with reladdr loads where + * we should probably be using a pull mechanism anyway to avoid + * excessive reading. However, the pull mechanism is slow in + * general. So, we try to allocate as many non-reladdr-loaded + * constants through the push buffer as we can before giving up. + */ + memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters); + for (i = 0; + i < c->vp->program.Base.NumInstructions && constant < max_constant; + i++) { + struct prog_instruction *inst = &c->vp->program.Base.Instructions[i]; + int arg; + + for (arg = 0; arg < 3 && constant < max_constant; arg++) { + if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR && + inst->SrcReg[arg].File != PROGRAM_CONSTANT && + inst->SrcReg[arg].File != PROGRAM_UNIFORM && + inst->SrcReg[arg].File != PROGRAM_ENV_PARAM && + inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) || + inst->SrcReg[arg].RelAddr) + continue; + + if (c->constant_map[inst->SrcReg[arg].Index] == -1) { + c->constant_map[inst->SrcReg[arg].Index] = constant++; + } + } + } + + for (i = 0; i < constant; i++) { + c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, + (i%2) * 4), + 0, 4, 1); + } + reg += (constant + 1) / 2; + c->prog_data.curb_read_length = reg - 1; + /* XXX 0 causes a bug elsewhere... */ + c->prog_data.nr_params = MAX2(constant * 4, 4); } else { /* use a section of the GRF for constants */ @@ -141,10 +180,12 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) c->first_output = reg; c->first_overflow_output = 0; - if (BRW_IS_IGDNG(c->func.brw)) - mrf = 8; + if (intel->gen >= 6) + mrf = 6; + else if (intel->is_ironlake) + mrf = 8; else - mrf = 4; + mrf = 4; for (i = 0; i < VERT_RESULT_MAX; i++) { if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) { @@ -213,8 +254,10 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) } } - c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0); - reg += 2; + if (c->needs_stack) { + c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0); + reg += 2; + } /* Some opcodes need an internal temporary: */ @@ -238,17 +281,19 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) */ attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs); - if (BRW_IS_IGDNG(c->func.brw)) - c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4; + if (intel->gen >= 6) + c->prog_data.urb_entry_size = (attributes_in_vue + 4 + 7) / 8; + else if (intel->is_ironlake) + c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4; else - c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4; + c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4; c->prog_data.total_grf = reg; if (INTEL_DEBUG & DEBUG_VS) { - _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs); - _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries); - _mesa_printf("%s reg = %d\n", __FUNCTION__, reg); + printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs); + printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries); + printf("%s reg = %d\n", __FUNCTION__, reg); } } @@ -438,9 +483,11 @@ static void emit_math1( struct brw_vs_compile *c, * whether that turns out to be a simulator bug or not: */ struct brw_compile *p = &c->func; + struct intel_context *intel = &p->brw->intel; struct brw_reg tmp = dst; - 
GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf || - dst.file != BRW_GENERAL_REGISTER_FILE); + GLboolean need_tmp = (intel->gen < 6 && + (dst.dw1.bits.writemask != 0xf || + dst.file != BRW_GENERAL_REGISTER_FILE)); if (need_tmp) tmp = get_tmp(c); @@ -469,9 +516,11 @@ static void emit_math2( struct brw_vs_compile *c, GLuint precision) { struct brw_compile *p = &c->func; + struct intel_context *intel = &p->brw->intel; struct brw_reg tmp = dst; - GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf || - dst.file != BRW_GENERAL_REGISTER_FILE); + GLboolean need_tmp = (intel->gen < 6 && + (dst.dw1.bits.writemask != 0xf || + dst.file != BRW_GENERAL_REGISTER_FILE)); if (need_tmp) tmp = get_tmp(c); @@ -761,15 +810,14 @@ get_constant(struct brw_vs_compile *c, { const struct prog_src_register *src = &inst->SrcReg[argIndex]; struct brw_compile *p = &c->func; - struct brw_reg const_reg; - struct brw_reg const2_reg; - const GLboolean relAddr = src->RelAddr; + struct brw_reg const_reg = c->current_const[argIndex].reg; assert(argIndex < 3); - if (c->current_const[argIndex].index != src->Index || relAddr) { + if (c->current_const[argIndex].index != src->Index) { struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0]; + /* Keep track of the last constant loaded in this slot, for reuse. */ c->current_const[argIndex].index = src->Index; #if 0 @@ -778,48 +826,74 @@ get_constant(struct brw_vs_compile *c, #endif /* need to fetch the constant now */ brw_dp_READ_4_vs(p, - c->current_const[argIndex].reg,/* writeback dest */ + const_reg, /* writeback dest */ 0, /* oword */ - relAddr, /* relative indexing? */ + 0, /* relative indexing? */ addrReg, /* address register */ 16 * src->Index, /* byte offset */ SURF_INDEX_VERT_CONST_BUFFER /* binding table index */ ); - - if (relAddr) { - /* second read */ - const2_reg = get_tmp(c); - - /* use upper half of address reg for second read */ - addrReg = stride(addrReg, 0, 4, 0); - addrReg.subnr = 16; - - brw_dp_READ_4_vs(p, - const2_reg, /* writeback dest */ - 1, /* oword */ - relAddr, /* relative indexing? */ - addrReg, /* address register */ - 16 * src->Index, /* byte offset */ - SURF_INDEX_VERT_CONST_BUFFER - ); - } } - const_reg = c->current_const[argIndex].reg; + /* replicate lower four floats into upper half (to get XYZWXYZW) */ + const_reg = stride(const_reg, 0, 4, 0); + const_reg.subnr = 0; - if (relAddr) { - /* merge the two Owords into the constant register */ - /* const_reg[7..4] = const2_reg[7..4] */ - brw_MOV(p, - suboffset(stride(const_reg, 0, 4, 1), 4), - suboffset(stride(const2_reg, 0, 4, 1), 4)); - release_tmp(c, const2_reg); - } - else { - /* replicate lower four floats into upper half (to get XYZWXYZW) */ - const_reg = stride(const_reg, 0, 4, 0); - const_reg.subnr = 0; - } + return const_reg; +} + +static struct brw_reg +get_reladdr_constant(struct brw_vs_compile *c, + const struct prog_instruction *inst, + GLuint argIndex) +{ + const struct prog_src_register *src = &inst->SrcReg[argIndex]; + struct brw_compile *p = &c->func; + struct brw_reg const_reg = c->current_const[argIndex].reg; + struct brw_reg const2_reg; + struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0]; + + assert(argIndex < 3); + + /* Can't reuse a reladdr constant load. */ + c->current_const[argIndex].index = -1; + + #if 0 + printf(" fetch const[a0.x+%d] for arg %d into reg %d\n", + src->Index, argIndex, c->current_const[argIndex].reg.nr); +#endif + + /* fetch the first vec4 */ + brw_dp_READ_4_vs(p, + const_reg, /* writeback dest */ + 0, /* oword */ + 1, /* relative indexing? 
*/ + addrReg, /* address register */ + 16 * src->Index, /* byte offset */ + SURF_INDEX_VERT_CONST_BUFFER /* binding table index */ + ); + /* second vec4 */ + const2_reg = get_tmp(c); + + /* use upper half of address reg for second read */ + addrReg = stride(addrReg, 0, 4, 0); + addrReg.subnr = 16; + + brw_dp_READ_4_vs(p, + const2_reg, /* writeback dest */ + 1, /* oword */ + 1, /* relative indexing? */ + addrReg, /* address register */ + 16 * src->Index, /* byte offset */ + SURF_INDEX_VERT_CONST_BUFFER + ); + + /* merge the two Owords into the constant register */ + /* const_reg[7..4] = const2_reg[7..4] */ + brw_MOV(p, + suboffset(stride(const_reg, 0, 4, 1), 4), + suboffset(stride(const2_reg, 0, 4, 1), 4)); + release_tmp(c, const2_reg); return const_reg; } @@ -927,7 +1001,13 @@ get_src_reg( struct brw_vs_compile *c, case PROGRAM_ENV_PARAM: case PROGRAM_LOCAL_PARAM: if (c->vp->use_const_buffer) { - return get_constant(c, inst, argIndex); + if (!relAddr && c->constant_map[index] != -1) { + assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0); + return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]]; + } else if (relAddr) + return get_reladdr_constant(c, inst, argIndex); + else + return get_constant(c, inst, argIndex); } else if (relAddr) { return deref(c, c->regs[PROGRAM_STATE_VAR][0], index); @@ -1113,11 +1193,13 @@ static void emit_swz( struct brw_vs_compile *c, static void emit_vertex_write( struct brw_vs_compile *c) { struct brw_compile *p = &c->func; + struct brw_context *brw = p->brw; + struct intel_context *intel = &brw->intel; struct brw_reg m0 = brw_message_reg(0); struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS]; struct brw_reg ndc; int eot; - GLuint len_vertext_header = 2; + GLuint len_vertex_header = 2; if (c->key.copy_edgeflag) { brw_MOV(p, @@ -1125,18 +1207,20 @@ static void emit_vertex_write( struct brw_vs_compile *c) get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG)); } - /* Build ndc coords */ - ndc = get_tmp(c); - /* ndc = 1.0 / pos.w */ - emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL); - /* ndc.xyz = pos * ndc */ - brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc); + if (intel->gen < 6) { + /* Build ndc coords */ + ndc = get_tmp(c); + /* ndc = 1.0 / pos.w */ + emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL); + /* ndc.xyz = pos * ndc */ + brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc); + } /* Update the header for point size, user clipping flags, and -ve rhw * workaround. */ if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) || - c->key.nr_userclip || BRW_IS_965(p->brw)) + c->key.nr_userclip || brw->has_negative_rhw_bug) { struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); GLuint i; @@ -1167,7 +1251,7 @@ static void emit_vertex_write( struct brw_vs_compile *c) * Later, clipping will detect ucp[6] and ensure the primitive is * clipped against all fixed planes. */ - if (BRW_IS_965(p->brw)) { + if (brw->has_negative_rhw_bug) { brw_CMP(p, vec8(brw_null_reg()), BRW_CONDITIONAL_L, @@ -1193,21 +1277,41 @@ static void emit_vertex_write( struct brw_vs_compile *c) * of zeros followed by two sets of NDC coordinates: */ brw_set_access_mode(p, BRW_ALIGN_1); - brw_MOV(p, offset(m0, 2), ndc); - - if (BRW_IS_IGDNG(p->brw)) { - /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */ - brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */ - /* m4, m5 contain the distances from vertex to the user clip planeXXX. 
- * Seems it is useless for us. - * m6 is used for aligning, so that the remainder of vertex element is - * reg-aligned. - */ - brw_MOV(p, offset(m0, 7), pos); /* the remainder of vertex element */ - len_vertext_header = 6; + + if (intel->gen >= 6) { + /* There are 16 DWs (D0-D15) in VUE header on Sandybridge: + * dword 0-3 (m1) of the header is indices, point width, clip flags. + * dword 4-7 (m2) is the 4D space position + * dword 8-15 (m3,m4) of the vertex header is the user clip distance. + * m5 is the first vertex data we fill, which is the vertex position. + */ + brw_MOV(p, offset(m0, 2), pos); + brw_MOV(p, offset(m0, 5), pos); + len_vertex_header = 4; + } else if (intel->is_ironlake) { + /* There are 20 DWs (D0-D19) in VUE header on Ironlake: + * dword 0-3 (m1) of the header is indices, point width, clip flags. + * dword 4-7 (m2) is the ndc position (set above) + * dword 8-11 (m3) of the vertex header is the 4D space position + * dword 12-19 (m4,m5) of the vertex header is the user clip distance. + * m6 is a pad so that the vertex element data is aligned + * m7 is the first vertex data we fill, which is the vertex position. + */ + brw_MOV(p, offset(m0, 2), ndc); + brw_MOV(p, offset(m0, 3), pos); + brw_MOV(p, offset(m0, 7), pos); + len_vertex_header = 6; } else { - brw_MOV(p, offset(m0, 3), pos); - len_vertext_header = 2; + /* There are 8 dwords in VUE header pre-Ironlake: + * dword 0-3 (m1) is indices, point width, clip flags. + * dword 4-7 (m2) is ndc position (set above) + * + * dword 8-11 (m3) is the first vertex data, which we always have be the + * vertex position. + */ + brw_MOV(p, offset(m0, 2), ndc); + brw_MOV(p, offset(m0, 3), pos); + len_vertex_header = 2; } eot = (c->first_overflow_output == 0); @@ -1218,7 +1322,7 @@ static void emit_vertex_write( struct brw_vs_compile *c) c->r0, /* src */ 0, /* allocate */ 1, /* used */ - MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */ + MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */ 0, /* response len */ eot, /* eot */ eot, /* writes complete */ @@ -1359,29 +1463,32 @@ void brw_vs_emit(struct brw_vs_compile *c ) #define MAX_LOOP_DEPTH 32 struct brw_compile *p = &c->func; struct brw_context *brw = p->brw; + struct intel_context *intel = &brw->intel; const GLuint nr_insns = c->vp->program.Base.NumInstructions; GLuint insn, if_depth = 0, loop_depth = 0; GLuint end_offset = 0; struct brw_instruction *end_inst, *last_inst; - struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH]; + struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH] = { 0 }; const struct brw_indirect stack_index = brw_indirect(0, 0); GLuint index; GLuint file; if (INTEL_DEBUG & DEBUG_VS) { - _mesa_printf("vs-mesa:\n"); + printf("vs-mesa:\n"); _mesa_print_program(&c->vp->program.Base); - _mesa_printf("\n"); + printf("\n"); } brw_set_compression_control(p, BRW_COMPRESSION_NONE); brw_set_access_mode(p, BRW_ALIGN_16); - - /* Message registers can't be read, so copy the output into GRF register - if they are used in source registers */ + for (insn = 0; insn < nr_insns; insn++) { GLuint i; struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn]; + + /* Message registers can't be read, so copy the output into GRF + * register if they are used in source registers + */ for (i = 0; i < 3; i++) { struct prog_src_register *src = &inst->SrcReg[i]; GLuint index = src->Index; @@ -1389,12 +1496,23 @@ void brw_vs_emit(struct brw_vs_compile *c ) if (file == PROGRAM_OUTPUT && index 
!= VERT_RESULT_HPOS) c->output_regs[index].used_in_src = GL_TRUE; } + + switch (inst->Opcode) { + case OPCODE_CAL: + case OPCODE_RET: + c->needs_stack = GL_TRUE; + break; + default: + break; + } } /* Static register allocation */ brw_vs_alloc_regs(c); - brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack)); + + if (c->needs_stack) + brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack)); for (insn = 0; insn < nr_insns; insn++) { @@ -1592,7 +1710,7 @@ void brw_vs_emit(struct brw_vs_compile *c ) loop_depth--; - if (BRW_IS_IGDNG(brw)) + if (intel->is_ironlake) br = 2; inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]); @@ -1708,9 +1826,9 @@ void brw_vs_emit(struct brw_vs_compile *c ) if (INTEL_DEBUG & DEBUG_VS) { int i; - _mesa_printf("vs-native:\n"); + printf("vs-native:\n"); for (i = 0; i < p->nr_insn; i++) brw_disasm(stderr, &p->store[i]); - _mesa_printf("\n"); + printf("\n"); } } diff --git a/i965/brw_vs_state.c b/i965/brw_vs_state.c index 7285466..fd9f2fe 100644 --- a/i965/brw_vs_state.c +++ b/i965/brw_vs_state.c @@ -82,9 +82,9 @@ vs_unit_populate_key(struct brw_context *brw, struct brw_vs_unit_key *key) static dri_bo * vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key) { + struct intel_context *intel = &brw->intel; struct brw_vs_unit_state vs; dri_bo *bo; - int chipset_max_threads; memset(&vs, 0, sizeof(vs)); @@ -98,7 +98,7 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key) */ vs.thread1.single_program_flow = 0; - if (BRW_IS_IGDNG(brw)) + if (intel->is_ironlake) vs.thread1.binding_table_entry_count = 0; /* hardware requirement */ else vs.thread1.binding_table_entry_count = key->nr_surfaces; @@ -109,7 +109,7 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key) vs.thread3.urb_entry_read_offset = 0; vs.thread3.const_urb_entry_read_offset = key->curbe_offset * 2; - if (BRW_IS_IGDNG(brw)) { + if (intel->is_ironlake) { switch (key->nr_urb_entries) { case 8: case 12: @@ -135,7 +135,7 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key) case 32: break; case 64: - assert(BRW_IS_G4X(brw)); + assert(intel->is_g4x); break; default: assert(0); @@ -145,17 +145,8 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key) vs.thread4.urb_entry_allocation_size = key->urb_size - 1; - if (BRW_IS_IGDNG(brw)) - chipset_max_threads = 72; - else if (BRW_IS_G4X(brw)) - chipset_max_threads = 32; - else - chipset_max_threads = 16; vs.thread4.max_threads = CLAMP(key->nr_urb_entries / 2, - 1, chipset_max_threads) - 1; - - if (INTEL_DEBUG & DEBUG_SINGLE_THREAD) - vs.thread4.max_threads = 0; + 1, brw->vs_max_threads) - 1; /* No samplers for ARB_vp programs: */ @@ -173,8 +164,7 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key) bo = brw_upload_cache(&brw->cache, BRW_VS_UNIT, key, sizeof(*key), &brw->vs.prog_bo, 1, - &vs, sizeof(vs), - NULL, NULL); + &vs, sizeof(vs)); /* Emit VS program relocation */ dri_bo_emit_reloc(bo, diff --git a/i965/brw_vs_surface_state.c b/i965/brw_vs_surface_state.c index 3bc9840..4007b5a 100644 --- a/i965/brw_vs_surface_state.c +++ b/i965/brw_vs_surface_state.c @@ -35,7 +35,6 @@ #include "brw_context.h" #include "brw_state.h" -#include "brw_defines.h" /* Creates a new VS constant buffer reflecting the current VS program's * constants, if needed by the VS program. 
@@ -68,13 +67,13 @@ brw_vs_update_constant_buffer(struct brw_context *brw) */ _mesa_load_state_parameters(&brw->intel.ctx, vp->program.Base.Parameters); - intel_bo_map_gtt_preferred(intel, const_buffer, GL_TRUE); + drm_intel_gem_bo_map_gtt(const_buffer); for (i = 0; i < params->NumParameters; i++) { memcpy(const_buffer->virtual + i * 4 * sizeof(float), params->ParameterValues[i], 4 * sizeof(float)); } - intel_bo_unmap_gtt_preferred(intel, const_buffer); + drm_intel_gem_bo_unmap_gtt(const_buffer); return const_buffer; } @@ -105,7 +104,7 @@ brw_update_vs_constant_surface( GLcontext *ctx, /* If there's no constant buffer, then no surface BO is needed to point at * it. */ - if (vp->const_buffer == 0) { + if (vp->const_buffer == NULL) { drm_intel_bo_unreference(brw->vs.surf_bo[surf]); brw->vs.surf_bo[surf] = NULL; return; @@ -133,7 +132,7 @@ brw_update_vs_constant_surface( GLcontext *ctx, brw->vs.surf_bo[surf] = brw_search_cache(&brw->surface_cache, BRW_SS_SURFACE, &key, sizeof(key), - &key.bo, key.bo ? 1 : 0, + &key.bo, 1, NULL); if (brw->vs.surf_bo[surf] == NULL) { brw->vs.surf_bo[surf] = brw_create_constant_surface(brw, &key); @@ -156,7 +155,7 @@ brw_vs_get_binding_table(struct brw_context *brw) if (bind_bo == NULL) { GLuint data_size = BRW_VS_MAX_SURF * sizeof(GLuint); - uint32_t *data = malloc(data_size); + uint32_t data[BRW_VS_MAX_SURF]; int i; for (i = 0; i < BRW_VS_MAX_SURF; i++) @@ -168,8 +167,7 @@ brw_vs_get_binding_table(struct brw_context *brw) bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND, NULL, 0, brw->vs.surf_bo, BRW_VS_MAX_SURF, - data, data_size, - NULL, NULL); + data, data_size); /* Emit binding table relocations to surface state */ for (i = 0; i < BRW_VS_MAX_SURF; i++) { @@ -182,8 +180,6 @@ brw_vs_get_binding_table(struct brw_context *brw) I915_GEM_DOMAIN_INSTRUCTION, 0); } } - - free(data); } return bind_bo; diff --git a/i965/brw_vtbl.c b/i965/brw_vtbl.c index 34aaea3..96a44bf 100644 --- a/i965/brw_vtbl.c +++ b/i965/brw_vtbl.c @@ -44,7 +44,6 @@ #include "brw_state.h" #include "brw_draw.h" #include "brw_state.h" -#include "brw_fallback.h" #include "brw_vs.h" #include "brw_wm.h" @@ -68,11 +67,11 @@ static void brw_destroy_context( struct intel_context *intel ) brw_draw_destroy( brw ); brw_clear_validated_bos(brw); if (brw->wm.compile_data) { - _mesa_free(brw->wm.compile_data->instruction); - _mesa_free(brw->wm.compile_data->vreg); - _mesa_free(brw->wm.compile_data->refs); - _mesa_free(brw->wm.compile_data->prog_instructions); - _mesa_free(brw->wm.compile_data); + free(brw->wm.compile_data->instruction); + free(brw->wm.compile_data->vreg); + free(brw->wm.compile_data->refs); + free(brw->wm.compile_data->prog_instructions); + free(brw->wm.compile_data); } for (i = 0; i < brw->state.nr_color_regions; i++) @@ -103,6 +102,9 @@ static void brw_destroy_context( struct intel_context *intel ) dri_bo_release(&brw->cc.prog_bo); dri_bo_release(&brw->cc.state_bo); dri_bo_release(&brw->cc.vp_bo); + dri_bo_release(&brw->cc.blend_state_bo); + dri_bo_release(&brw->cc.depth_stencil_state_bo); + dri_bo_release(&brw->cc.color_calc_state_bo); } @@ -140,6 +142,12 @@ static void brw_finish_batch(struct intel_context *intel) { struct brw_context *brw = brw_context(&intel->ctx); brw_emit_query_end(brw); + + if (brw->curbe.curbe_bo) { + drm_intel_gem_bo_unmap_gtt(brw->curbe.curbe_bo); + drm_intel_bo_unreference(brw->curbe.curbe_bo); + brw->curbe.curbe_bo = NULL; + } } @@ -150,11 +158,6 @@ static void brw_new_batch( struct intel_context *intel ) { struct brw_context *brw = 
brw_context(&intel->ctx); - /* Check that we didn't just wrap our batchbuffer at a bad time. */ - assert(!brw->no_batch_wrap); - - brw->curbe.need_new_bo = GL_TRUE; - /* Mark all context state as needing to be re-emitted. * This is probably not as severe as on 915, since almost all of our state * is just in referenced buffers. @@ -175,12 +178,6 @@ static void brw_new_batch( struct intel_context *intel ) } } - -static void brw_note_fence( struct intel_context *intel, GLuint fence ) -{ - brw_context(&intel->ctx)->state.dirty.brw |= BRW_NEW_FENCE; -} - static void brw_invalidate_state( struct intel_context *intel, GLuint new_state ) { /* nothing */ @@ -196,7 +193,6 @@ void brwInitVtbl( struct brw_context *brw ) brw->intel.vtbl.update_texture_state = 0; brw->intel.vtbl.invalidate_state = brw_invalidate_state; - brw->intel.vtbl.note_fence = brw_note_fence; brw->intel.vtbl.new_batch = brw_new_batch; brw->intel.vtbl.finish_batch = brw_finish_batch; brw->intel.vtbl.destroy = brw_destroy_context; diff --git a/i965/brw_wm.c b/i965/brw_wm.c index 6895f64..991e1b9 100644 --- a/i965/brw_wm.c +++ b/i965/brw_wm.c @@ -30,7 +30,6 @@ */ #include "brw_context.h" -#include "brw_util.h" #include "brw_wm.h" #include "brw_state.h" @@ -152,11 +151,11 @@ static void do_wm_prog( struct brw_context *brw, */ return; } - c->instruction = _mesa_calloc(BRW_WM_MAX_INSN * sizeof(*c->instruction)); - c->prog_instructions = _mesa_calloc(BRW_WM_MAX_INSN * + c->instruction = calloc(1, BRW_WM_MAX_INSN * sizeof(*c->instruction)); + c->prog_instructions = calloc(1, BRW_WM_MAX_INSN * sizeof(*c->prog_instructions)); - c->vreg = _mesa_calloc(BRW_WM_MAX_VREG * sizeof(*c->vreg)); - c->refs = _mesa_calloc(BRW_WM_MAX_REF * sizeof(*c->refs)); + c->vreg = calloc(1, BRW_WM_MAX_VREG * sizeof(*c->vreg)); + c->refs = calloc(1, BRW_WM_MAX_REF * sizeof(*c->refs)); } else { void *instruction = c->instruction; void *prog_instructions = c->prog_instructions; @@ -199,12 +198,13 @@ static void do_wm_prog( struct brw_context *brw, program = brw_get_program(&c->func, &program_size); dri_bo_unreference(brw->wm.prog_bo); - brw->wm.prog_bo = brw_upload_cache( &brw->cache, BRW_WM_PROG, - &c->key, sizeof(c->key), - NULL, 0, - program, program_size, - &c->prog_data, - &brw->wm.prog_data ); + brw->wm.prog_bo = brw_upload_cache_with_auxdata(&brw->cache, BRW_WM_PROG, + &c->key, sizeof(c->key), + NULL, 0, + program, program_size, + &c->prog_data, + sizeof(c->prog_data), + &brw->wm.prog_data); } @@ -336,11 +336,7 @@ static void brw_wm_populate_key( struct brw_context *brw, * drawable height in order to invert the Y axis. 
*/ if (fp->program.Base.InputsRead & FRAG_BIT_WPOS) { - if (brw->intel.driDrawable != NULL) { - key->origin_x = brw->intel.driDrawable->x; - key->origin_y = brw->intel.driDrawable->y; - key->drawable_height = brw->intel.driDrawable->h; - } + key->drawable_height = ctx->DrawBuffer->Height; } key->nr_color_regions = brw->state.nr_color_regions; diff --git a/i965/brw_wm.h b/i965/brw_wm.h index 9dcb6e1..88d84ee 100644 --- a/i965/brw_wm.h +++ b/i965/brw_wm.h @@ -76,10 +76,9 @@ struct brw_wm_prog_key { GLushort tex_swizzles[BRW_MAX_TEX_UNIT]; - GLuint program_string_id:32; - GLushort origin_x, origin_y; GLushort drawable_height; GLbitfield64 vp_outputs_written; + GLuint program_string_id:32; }; diff --git a/i965/brw_wm_debug.c b/i965/brw_wm_debug.c index 2208210..a78cc8b 100644 --- a/i965/brw_wm_debug.c +++ b/i965/brw_wm_debug.c @@ -41,21 +41,21 @@ void brw_wm_print_value( struct brw_wm_compile *c, if (c->state >= PASS2_DONE) brw_print_reg(value->hw_reg); else if( value == &c->undef_value ) - _mesa_printf("undef"); + printf("undef"); else if( value - c->vreg >= 0 && value - c->vreg < BRW_WM_MAX_VREG) - _mesa_printf("r%d", value - c->vreg); + printf("r%d", value - c->vreg); else if (value - c->creg >= 0 && value - c->creg < BRW_WM_MAX_PARAM) - _mesa_printf("c%d", value - c->creg); + printf("c%d", value - c->creg); else if (value - c->payload.input_interp >= 0 && value - c->payload.input_interp < FRAG_ATTRIB_MAX) - _mesa_printf("i%d", value - c->payload.input_interp); + printf("i%d", value - c->payload.input_interp); else if (value - c->payload.depth >= 0 && value - c->payload.depth < FRAG_ATTRIB_MAX) - _mesa_printf("d%d", value - c->payload.depth); + printf("d%d", value - c->payload.depth); else - _mesa_printf("?"); + printf("?"); } void brw_wm_print_ref( struct brw_wm_compile *c, @@ -64,16 +64,16 @@ void brw_wm_print_ref( struct brw_wm_compile *c, struct brw_reg hw_reg = ref->hw_reg; if (ref->unspill_reg) - _mesa_printf("UNSPILL(%x)/", ref->value->spill_slot); + printf("UNSPILL(%x)/", ref->value->spill_slot); if (c->state >= PASS2_DONE) brw_print_reg(ref->hw_reg); else { - _mesa_printf("%s", hw_reg.negate ? "-" : ""); - _mesa_printf("%s", hw_reg.abs ? "abs/" : ""); + printf("%s", hw_reg.negate ? "-" : ""); + printf("%s", hw_reg.abs ? "abs/" : ""); brw_wm_print_value(c, ref->value); if ((hw_reg.nr&1) || hw_reg.subnr) { - _mesa_printf("->%d.%d", (hw_reg.nr&1), hw_reg.subnr); + printf("->%d.%d", (hw_reg.nr&1), hw_reg.subnr); } } } @@ -84,22 +84,22 @@ void brw_wm_print_insn( struct brw_wm_compile *c, GLuint i, arg; GLuint nr_args = brw_wm_nr_args(inst->opcode); - _mesa_printf("["); + printf("["); for (i = 0; i < 4; i++) { if (inst->dst[i]) { brw_wm_print_value(c, inst->dst[i]); if (inst->dst[i]->spill_slot) - _mesa_printf("/SPILL(%x)",inst->dst[i]->spill_slot); + printf("/SPILL(%x)",inst->dst[i]->spill_slot); } else - _mesa_printf("#"); + printf("#"); if (i < 3) - _mesa_printf(","); + printf(","); } - _mesa_printf("]"); + printf("]"); if (inst->writemask != WRITEMASK_XYZW) - _mesa_printf(".%s%s%s%s", + printf(".%s%s%s%s", GET_BIT(inst->writemask, 0) ? "x" : "", GET_BIT(inst->writemask, 1) ? "y" : "", GET_BIT(inst->writemask, 2) ? 
"z" : "", @@ -107,58 +107,58 @@ void brw_wm_print_insn( struct brw_wm_compile *c, switch (inst->opcode) { case WM_PIXELXY: - _mesa_printf(" = PIXELXY"); + printf(" = PIXELXY"); break; case WM_DELTAXY: - _mesa_printf(" = DELTAXY"); + printf(" = DELTAXY"); break; case WM_PIXELW: - _mesa_printf(" = PIXELW"); + printf(" = PIXELW"); break; case WM_WPOSXY: - _mesa_printf(" = WPOSXY"); + printf(" = WPOSXY"); break; case WM_PINTERP: - _mesa_printf(" = PINTERP"); + printf(" = PINTERP"); break; case WM_LINTERP: - _mesa_printf(" = LINTERP"); + printf(" = LINTERP"); break; case WM_CINTERP: - _mesa_printf(" = CINTERP"); + printf(" = CINTERP"); break; case WM_FB_WRITE: - _mesa_printf(" = FB_WRITE"); + printf(" = FB_WRITE"); break; case WM_FRONTFACING: - _mesa_printf(" = FRONTFACING"); + printf(" = FRONTFACING"); break; default: - _mesa_printf(" = %s", _mesa_opcode_string(inst->opcode)); + printf(" = %s", _mesa_opcode_string(inst->opcode)); break; } if (inst->saturate) - _mesa_printf("_SAT"); + printf("_SAT"); for (arg = 0; arg < nr_args; arg++) { - _mesa_printf(" ["); + printf(" ["); for (i = 0; i < 4; i++) { if (inst->src[arg][i]) { brw_wm_print_ref(c, inst->src[arg][i]); } else - _mesa_printf("%%"); + printf("%%"); if (i < 3) - _mesa_printf(","); + printf(","); else - _mesa_printf("]"); + printf("]"); } } - _mesa_printf("\n"); + printf("\n"); } void brw_wm_print_program( struct brw_wm_compile *c, @@ -166,9 +166,9 @@ void brw_wm_print_program( struct brw_wm_compile *c, { GLuint insn; - _mesa_printf("%s:\n", stage); + printf("%s:\n", stage); for (insn = 0; insn < c->nr_insns; insn++) brw_wm_print_insn(c, &c->instruction[insn]); - _mesa_printf("\n"); + printf("\n"); } diff --git a/i965/brw_wm_emit.c b/i965/brw_wm_emit.c index 5390fd2..9315bca 100644 --- a/i965/brw_wm_emit.c +++ b/i965/brw_wm_emit.c @@ -138,19 +138,43 @@ void emit_wpos_xy(struct brw_wm_compile *c, * X and Y channels. */ if (mask & WRITEMASK_X) { - /* X' = X - origin */ - brw_ADD(p, - dst[0], - retype(arg0[0], BRW_REGISTER_TYPE_W), - brw_imm_d(0 - c->key.origin_x)); + if (c->fp->program.PixelCenterInteger) { + /* X' = X */ + brw_MOV(p, + dst[0], + retype(arg0[0], BRW_REGISTER_TYPE_W)); + } else { + /* X' = X + 0.5 */ + brw_ADD(p, + dst[0], + retype(arg0[0], BRW_REGISTER_TYPE_W), + brw_imm_f(0.5)); + } } if (mask & WRITEMASK_Y) { - /* Y' = height - (Y - origin_y) = height + origin_y - Y */ - brw_ADD(p, - dst[1], - negate(retype(arg0[1], BRW_REGISTER_TYPE_W)), - brw_imm_d(c->key.origin_y + c->key.drawable_height - 1)); + if (c->fp->program.OriginUpperLeft) { + if (c->fp->program.PixelCenterInteger) { + /* Y' = Y */ + brw_MOV(p, + dst[1], + retype(arg0[1], BRW_REGISTER_TYPE_W)); + } else { + /* Y' = Y + 0.5 */ + brw_ADD(p, + dst[1], + retype(arg0[1], BRW_REGISTER_TYPE_W), + brw_imm_f(0.5)); + } + } else { + float center_offset = c->fp->program.PixelCenterInteger ? 
0.0 : 0.5; + + /* Y' = (height - 1) - Y + center */ + brw_ADD(p, + dst[1], + negate(retype(arg0[1], BRW_REGISTER_TYPE_W)), + brw_imm_f(c->key.drawable_height - 1 + center_offset)); + } } } @@ -692,7 +716,7 @@ void emit_xpd(struct brw_compile *p, { GLuint i; - assert(!(mask & WRITEMASK_W) == WRITEMASK_X); + assert((mask & WRITEMASK_W) != WRITEMASK_W); for (i = 0 ; i < 3; i++) { if (mask & (1<<i)) { @@ -830,6 +854,7 @@ void emit_tex(struct brw_wm_compile *c, GLboolean shadow) { struct brw_compile *p = &c->func; + struct intel_context *intel = &p->brw->intel; struct brw_reg dst_retyped; GLuint cur_mrf = 2, response_length; GLuint i, nr_texcoords; @@ -873,7 +898,7 @@ void emit_tex(struct brw_wm_compile *c, } /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */ - if (!BRW_IS_IGDNG(p->brw) && c->dispatch_width == 8) + if (!intel->is_ironlake && c->dispatch_width == 8) nr_texcoords = 3; /* For shadow comparisons, we have to supply u,v,r. */ @@ -891,7 +916,7 @@ void emit_tex(struct brw_wm_compile *c, /* Fill in the shadow comparison reference value. */ if (shadow) { - if (BRW_IS_IGDNG(p->brw)) { + if (intel->is_ironlake) { /* Fill in the cube map array index value. */ brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0)); cur_mrf += mrf_per_channel; @@ -904,7 +929,7 @@ void emit_tex(struct brw_wm_compile *c, cur_mrf += mrf_per_channel; } - if (BRW_IS_IGDNG(p->brw)) { + if (intel->is_ironlake) { if (shadow) msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_IGDNG; else @@ -944,6 +969,7 @@ void emit_txb(struct brw_wm_compile *c, GLuint sampler) { struct brw_compile *p = &c->func; + struct intel_context *intel = &p->brw->intel; GLuint msgLength; GLuint msg_type; GLuint mrf_per_channel; @@ -955,8 +981,8 @@ void emit_txb(struct brw_wm_compile *c, * undefined, and trust the execution mask to keep the undefined pixels * from mattering. */ - if (c->dispatch_width == 16 || !BRW_IS_IGDNG(p->brw)) { - if (BRW_IS_IGDNG(p->brw)) + if (c->dispatch_width == 16 || !intel->is_ironlake) { + if (intel->is_ironlake) msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG; else msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS; @@ -1084,7 +1110,7 @@ static void emit_kil_nv( struct brw_wm_compile *c ) brw_push_insn_state(p); brw_set_mask_control(p, BRW_MASK_DISABLE); - brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK + brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */ brw_AND(p, r0uw, c->emit_mask_reg, r0uw); brw_pop_insn_state(p); } @@ -1174,7 +1200,7 @@ void emit_fb_write(struct brw_wm_compile *c, brw_push_insn_state(p); for (channel = 0; channel < 4; channel++) { - if (c->dispatch_width == 16 && (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))) { + if (c->dispatch_width == 16 && brw->has_compr4) { /* By setting the high bit of the MRF register number, we indicate * that we want COMPR4 mode - instead of doing the usual destination * + 1 for the second half we get destination + 4. @@ -1596,10 +1622,10 @@ void brw_wm_emit( struct brw_wm_compile *c ) break; default: - _mesa_printf("Unsupported opcode %i (%s) in fragment shader\n", - inst->opcode, inst->opcode < MAX_OPCODE ? - _mesa_opcode_string(inst->opcode) : - "unknown"); + printf("Unsupported opcode %i (%s) in fragment shader\n", + inst->opcode, inst->opcode < MAX_OPCODE ? 
+ _mesa_opcode_string(inst->opcode) : + "unknown"); } for (i = 0; i < 4; i++) @@ -1612,9 +1638,9 @@ void brw_wm_emit( struct brw_wm_compile *c ) if (INTEL_DEBUG & DEBUG_WM) { int i; - _mesa_printf("wm-native:\n"); + printf("wm-native:\n"); for (i = 0; i < p->nr_insn; i++) brw_disasm(stderr, &p->store[i]); - _mesa_printf("\n"); + printf("\n"); } } diff --git a/i965/brw_wm_fp.c b/i965/brw_wm_fp.c index 7d03179..d73c391 100644 --- a/i965/brw_wm_fp.c +++ b/i965/brw_wm_fp.c @@ -138,7 +138,6 @@ static struct prog_dst_register dst_reg(GLuint file, GLuint idx) reg.CondMask = COND_TR; reg.CondSwizzle = 0; reg.CondSrc = 0; - reg.pad = 0; return reg; } @@ -160,7 +159,7 @@ static struct prog_dst_register get_temp( struct brw_wm_compile *c ) int bit = _mesa_ffs( ~c->fp_temp ); if (!bit) { - _mesa_printf("%s: out of temporaries\n", __FILE__); + printf("%s: out of temporaries\n", __FILE__); exit(1); } @@ -1035,7 +1034,7 @@ static void print_insns( const struct prog_instruction *insn, { GLuint i; for (i = 0; i < nr; i++, insn++) { - _mesa_printf("%3d: ", i); + printf("%3d: ", i); if (insn->Opcode < MAX_OPCODE) _mesa_print_instruction(insn); else if (insn->Opcode < MAX_WM_OPCODE) { @@ -1046,7 +1045,7 @@ static void print_insns( const struct prog_instruction *insn, 3); } else - _mesa_printf("965 Opcode %d\n", insn->Opcode); + printf("965 Opcode %d\n", insn->Opcode); } } @@ -1061,9 +1060,9 @@ void brw_wm_pass_fp( struct brw_wm_compile *c ) GLuint insn; if (INTEL_DEBUG & DEBUG_WM) { - _mesa_printf("pre-fp:\n"); + printf("pre-fp:\n"); _mesa_print_program(&fp->program.Base); - _mesa_printf("\n"); + printf("\n"); } c->pixel_xy = src_undef(); @@ -1169,9 +1168,9 @@ void brw_wm_pass_fp( struct brw_wm_compile *c ) } if (INTEL_DEBUG & DEBUG_WM) { - _mesa_printf("pass_fp:\n"); + printf("pass_fp:\n"); print_insns( c->prog_instructions, c->nr_fp_insns ); - _mesa_printf("\n"); + printf("\n"); } } diff --git a/i965/brw_wm_glsl.c b/i965/brw_wm_glsl.c index e8c2cb6..562608e 100644 --- a/i965/brw_wm_glsl.c +++ b/i965/brw_wm_glsl.c @@ -743,7 +743,7 @@ static void emit_kil(struct brw_wm_compile *c) struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); brw_push_insn_state(p); brw_set_mask_control(p, BRW_MASK_DISABLE); - brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK + brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */ brw_AND(p, depth, c->emit_mask_reg, depth); brw_pop_insn_state(p); } @@ -1826,6 +1826,7 @@ get_argument_regs(struct brw_wm_compile *c, static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c) { + struct intel_context *intel = &brw->intel; #define MAX_IF_DEPTH 32 #define MAX_LOOP_DEPTH 32 struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH]; @@ -1848,7 +1849,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c) c->cur_inst = i; #if 0 - _mesa_printf("Inst %d: ", i); + printf("Inst %d: ", i); _mesa_print_instruction(inst); #endif @@ -1876,10 +1877,6 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c) else brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE); - dst_flags = inst->DstReg.WriteMask; - if (inst->SaturateMode == SATURATE_ZERO_ONE) - dst_flags |= SATURATE; - switch (inst->Opcode) { case WM_PIXELXY: emit_pixel_xy(c, dst, dst_flags); @@ -2043,6 +2040,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c) if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8); break; case OPCODE_ELSE: + assert(if_depth > 0); if_inst[if_depth-1] = brw_ELSE(p, 
if_inst[if_depth-1]); break; case OPCODE_ENDIF: @@ -2096,9 +2094,10 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c) struct brw_instruction *inst0, *inst1; GLuint br = 1; - if (BRW_IS_IGDNG(brw)) + if (intel->is_ironlake) br = 2; - + + assert(loop_depth > 0); loop_depth--; inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]); /* patch all the BREAK/CONT instructions from last BGNLOOP */ @@ -2116,7 +2115,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c) } break; default: - _mesa_printf("unsupported IR in fragment shader %d\n", + printf("unsupported IR in fragment shader %d\n", inst->Opcode); } @@ -2128,10 +2127,10 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c) post_wm_emit(c); if (INTEL_DEBUG & DEBUG_WM) { - _mesa_printf("wm-native:\n"); + printf("wm-native:\n"); for (i = 0; i < p->nr_insn; i++) brw_disasm(stderr, &p->store[i]); - _mesa_printf("\n"); + printf("\n"); } } @@ -2142,7 +2141,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c) void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c) { if (INTEL_DEBUG & DEBUG_WM) { - _mesa_printf("brw_wm_glsl_emit:\n"); + printf("brw_wm_glsl_emit:\n"); } /* initial instruction translation/simplification */ diff --git a/i965/brw_wm_pass0.c b/i965/brw_wm_pass0.c index ff4c082..60bd92e 100644 --- a/i965/brw_wm_pass0.c +++ b/i965/brw_wm_pass0.c @@ -105,7 +105,7 @@ static const struct brw_wm_ref *get_param_ref( struct brw_wm_compile *c, GLuint i = c->prog_data.nr_params++; if (i >= BRW_WM_MAX_PARAM) { - _mesa_printf("%s: out of params\n", __FUNCTION__); + printf("%s: out of params\n", __FUNCTION__); c->prog_data.error = 1; return NULL; } @@ -154,7 +154,7 @@ static const struct brw_wm_ref *get_const_ref( struct brw_wm_compile *c, return c->constref[i].ref; } else { - _mesa_printf("%s: out of constrefs\n", __FUNCTION__); + printf("%s: out of constrefs\n", __FUNCTION__); c->prog_data.error = 1; return NULL; } diff --git a/i965/brw_wm_sampler_state.c b/i965/brw_wm_sampler_state.c index aa2e519..d7650af 100644 --- a/i965/brw_wm_sampler_state.c +++ b/i965/brw_wm_sampler_state.c @@ -89,7 +89,6 @@ struct wm_sampler_key { float max_aniso; GLenum minfilter, magfilter; GLenum comparemode, comparefunc; - dri_bo *sdc_bo; /** If target is cubemap, take context setting. 
*/ @@ -105,7 +104,7 @@ static void brw_update_sampler_state(struct wm_sampler_entry *key, dri_bo *sdc_bo, struct brw_sampler_state *sampler) { - _mesa_memset(sampler, 0, sizeof(*sampler)); + memset(sampler, 0, sizeof(*sampler)); switch (key->minfilter) { case GL_NEAREST: @@ -230,7 +229,7 @@ brw_wm_sampler_populate_key(struct brw_context *brw, GLcontext *ctx = &brw->intel.ctx; int unit; - memset(key, 0, sizeof(*key)); + key->sampler_count = 0; for (unit = 0; unit < BRW_MAX_TEX_UNIT; unit++) { if (ctx->Texture.Unit[unit]._ReallyEnabled) { @@ -241,6 +240,8 @@ brw_wm_sampler_populate_key(struct brw_context *brw, struct gl_texture_image *firstImage = texObj->Image[0][intelObj->firstLevel]; + memset(entry, 0, sizeof(*entry)); + entry->tex_target = texObj->Target; entry->seamless_cube_map = (texObj->Target == GL_TEXTURE_CUBE_MAP) @@ -262,10 +263,10 @@ brw_wm_sampler_populate_key(struct brw_context *brw, dri_bo_unreference(brw->wm.sdc_bo[unit]); if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) { float bordercolor[4] = { - texObj->BorderColor[0], - texObj->BorderColor[0], - texObj->BorderColor[0], - texObj->BorderColor[0] + texObj->BorderColor.f[0], + texObj->BorderColor.f[0], + texObj->BorderColor.f[0], + texObj->BorderColor.f[0] }; /* GL specs that border color for depth textures is taken from the * R channel, while the hardware uses A. Spam R into all the @@ -274,7 +275,7 @@ brw_wm_sampler_populate_key(struct brw_context *brw, brw->wm.sdc_bo[unit] = upload_default_color(brw, bordercolor); } else { brw->wm.sdc_bo[unit] = upload_default_color(brw, - texObj->BorderColor); + texObj->BorderColor.f); } key->sampler_count = unit + 1; } @@ -289,7 +290,7 @@ static void upload_wm_samplers( struct brw_context *brw ) { GLcontext *ctx = &brw->intel.ctx; struct wm_sampler_key key; - int i; + int i, sampler_key_size; brw_wm_sampler_populate_key(brw, &key); @@ -303,8 +304,11 @@ static void upload_wm_samplers( struct brw_context *brw ) if (brw->wm.sampler_count == 0) return; + /* Only include the populated portion of the key in the search. */ + sampler_key_size = offsetof(struct wm_sampler_key, + sampler[key.sampler_count]); brw->wm.sampler_bo = brw_search_cache(&brw->cache, BRW_SAMPLER, - &key, sizeof(key), + &key, sampler_key_size, brw->wm.sdc_bo, key.sampler_count, NULL); @@ -324,10 +328,9 @@ static void upload_wm_samplers( struct brw_context *brw ) } brw->wm.sampler_bo = brw_upload_cache(&brw->cache, BRW_SAMPLER, - &key, sizeof(key), + &key, sampler_key_size, brw->wm.sdc_bo, key.sampler_count, - &sampler, sizeof(sampler), - NULL, NULL); + &sampler, sizeof(sampler)); /* Emit SDC relocations */ for (i = 0; i < BRW_MAX_TEX_UNIT; i++) { diff --git a/i965/brw_wm_state.c b/i965/brw_wm_state.c index f89ed9b..a7f80db 100644 --- a/i965/brw_wm_state.c +++ b/i965/brw_wm_state.c @@ -49,8 +49,6 @@ struct brw_wm_unit_key { unsigned int curbe_offset; unsigned int urb_size; - unsigned int max_threads; - unsigned int nr_surfaces, sampler_count; GLboolean uses_depth, computes_depth, uses_kill, is_glsl; GLboolean polygon_stipple, stats_wm, line_stipple, offset_enable; @@ -67,18 +65,6 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key) memset(key, 0, sizeof(*key)); - if (INTEL_DEBUG & DEBUG_SINGLE_THREAD) - key->max_threads = 1; - else { - /* WM maximum threads is number of EUs times number of threads per EU. 
*/ - if (BRW_IS_IGDNG(brw)) - key->max_threads = 12 * 6; - else if (BRW_IS_G4X(brw)) - key->max_threads = 10 * 5; - else - key->max_threads = 8 * 4; - } - /* CACHE_NEW_WM_PROG */ key->total_grf = brw->wm.prog_data->total_grf; key->urb_entry_read_length = brw->wm.prog_data->urb_read_length; @@ -140,6 +126,7 @@ static dri_bo * wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key, dri_bo **reloc_bufs) { + struct intel_context *intel = &brw->intel; struct brw_wm_unit_state wm; dri_bo *bo; @@ -150,7 +137,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key, wm.thread1.depth_coef_urb_read_offset = 1; wm.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; - if (BRW_IS_IGDNG(brw)) + if (intel->is_ironlake) wm.thread1.binding_table_entry_count = 0; /* hardware requirement */ else wm.thread1.binding_table_entry_count = key->nr_surfaces; @@ -170,7 +157,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key, wm.thread3.const_urb_entry_read_length = key->curb_entry_read_length; wm.thread3.const_urb_entry_read_offset = key->curbe_offset * 2; - if (BRW_IS_IGDNG(brw)) + if (intel->is_ironlake) wm.wm4.sampler_count = 0; /* hardware requirement */ else wm.wm4.sampler_count = (key->sampler_count + 1) / 4; @@ -191,7 +178,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key, else wm.wm5.enable_16_pix = 1; - wm.wm5.max_threads = key->max_threads - 1; + wm.wm5.max_threads = brw->wm_max_threads - 1; wm.wm5.thread_dispatch_enable = 1; /* AKA: color_write */ wm.wm5.legacy_line_rast = 0; wm.wm5.legacy_global_depth_bias = 0; @@ -223,8 +210,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key, bo = brw_upload_cache(&brw->cache, BRW_WM_UNIT, key, sizeof(*key), reloc_bufs, 3, - &wm, sizeof(wm), - NULL, NULL); + &wm, sizeof(wm)); /* Emit WM program relocation */ dri_bo_emit_reloc(bo, @@ -268,7 +254,7 @@ static void upload_wm_unit( struct brw_context *brw ) */ assert(key.total_scratch <= 12 * 1024); if (key.total_scratch) { - GLuint total = key.total_scratch * key.max_threads; + GLuint total = key.total_scratch * brw->wm_max_threads; if (brw->wm.scratch_bo && total > brw->wm.scratch_bo->size) { dri_bo_unreference(brw->wm.scratch_bo); diff --git a/i965/brw_wm_surface_state.c b/i965/brw_wm_surface_state.c index 8335e5a..ce0bf0b 100644 --- a/i965/brw_wm_surface_state.c +++ b/i965/brw_wm_surface_state.c @@ -207,33 +207,14 @@ brw_create_texture_surface( struct brw_context *brw, surf.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW; surf.ss0.surface_type = translate_tex_target(key->target); - if (key->bo) { - surf.ss0.surface_format = translate_tex_format(key->format, - key->internal_format, - key->depthmode); - } - else { - switch (key->depth) { - case 32: - surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM; - break; - default: - case 24: - surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8X8_UNORM; - break; - case 16: - surf.ss0.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM; - break; - } - } + surf.ss0.surface_format = translate_tex_format(key->format, + key->internal_format, + key->depthmode); /* This is ok for all textures with channel width 8bit or less: */ /* surf.ss0.data_return_format = BRW_SURFACERETURNFORMAT_S1; */ - if (key->bo) - surf.ss1.base_addr = key->bo->offset; /* reloc */ - else - surf.ss1.base_addr = key->offset; + surf.ss1.base_addr = key->bo->offset; /* reloc */ surf.ss2.mip_count = key->last_level - key->first_level; surf.ss2.width = 
key->width - 1; @@ -255,18 +236,14 @@ brw_create_texture_surface( struct brw_context *brw, bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE, key, sizeof(*key), - &key->bo, key->bo ? 1 : 0, - &surf, sizeof(surf), - NULL, NULL); - - if (key->bo) { - /* Emit relocation to surface contents */ - dri_bo_emit_reloc(bo, - I915_GEM_DOMAIN_SAMPLER, 0, - 0, - offsetof(struct brw_surface_state, ss1), - key->bo); - } + &key->bo, 1, + &surf, sizeof(surf)); + + /* Emit relocation to surface contents */ + drm_intel_bo_emit_reloc(bo, offsetof(struct brw_surface_state, ss1), + key->bo, 0, + I915_GEM_DOMAIN_SAMPLER, 0); + return bo; } @@ -282,19 +259,12 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit ) memset(&key, 0, sizeof(key)); - if (intelObj->imageOverride) { - key.pitch = intelObj->pitchOverride / intelObj->mt->cpp; - key.depth = intelObj->depthOverride; - key.bo = NULL; - key.offset = intelObj->textureOffset; - } else { - key.format = firstImage->TexFormat; - key.internal_format = firstImage->InternalFormat; - key.pitch = intelObj->mt->pitch; - key.depth = firstImage->Depth; - key.bo = intelObj->mt->region->buffer; - key.offset = 0; - } + key.format = firstImage->TexFormat; + key.internal_format = firstImage->InternalFormat; + key.pitch = intelObj->mt->pitch; + key.depth = firstImage->Depth; + key.bo = intelObj->mt->region->buffer; + key.offset = 0; key.target = tObj->Target; key.depthmode = tObj->DepthMode; @@ -309,7 +279,7 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit ) brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache, BRW_SS_SURFACE, &key, sizeof(key), - &key.bo, key.bo ? 1 : 0, + &key.bo, 1, NULL); if (brw->wm.surf_bo[surf] == NULL) { brw->wm.surf_bo[surf] = brw_create_texture_surface(brw, &key); @@ -337,10 +307,7 @@ brw_create_constant_surface( struct brw_context *brw, surf.ss0.surface_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT; assert(key->bo); - if (key->bo) - surf.ss1.base_addr = key->bo->offset; /* reloc */ - else - surf.ss1.base_addr = key->offset; + surf.ss1.base_addr = key->bo->offset; /* reloc */ surf.ss2.width = w & 0x7f; /* bits 6:0 of size or width */ surf.ss2.height = (w >> 7) & 0x1fff; /* bits 19:7 of size or width */ @@ -350,21 +317,16 @@ brw_create_constant_surface( struct brw_context *brw, bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE, key, sizeof(*key), - &key->bo, key->bo ? 1 : 0, - &surf, sizeof(surf), - NULL, NULL); - - if (key->bo) { - /* Emit relocation to surface contents. Section 5.1.1 of the gen4 - * bspec ("Data Cache") says that the data cache does not exist as - * a separate cache and is just the sampler cache. - */ - dri_bo_emit_reloc(bo, - I915_GEM_DOMAIN_SAMPLER, 0, - 0, - offsetof(struct brw_surface_state, ss1), - key->bo); - } + &key->bo, 1, + &surf, sizeof(surf)); + + /* Emit relocation to surface contents. Section 5.1.1 of the gen4 + * bspec ("Data Cache") says that the data cache does not exist as + * a separate cache and is just the sampler cache. + */ + drm_intel_bo_emit_reloc(bo, offsetof(struct brw_surface_state, ss1), + key->bo, 0, + I915_GEM_DOMAIN_SAMPLER, 0); return bo; } @@ -422,7 +384,7 @@ brw_update_wm_constant_surface( GLcontext *ctx, /* If there's no constant buffer, then no surface BO is needed to point at * it. 
*/ - if (fp->const_buffer == 0) { + if (fp->const_buffer == NULL) { drm_intel_bo_unreference(brw->wm.surf_bo[surf]); brw->wm.surf_bo[surf] = NULL; return; @@ -450,7 +412,7 @@ brw_update_wm_constant_surface( GLcontext *ctx, brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache, BRW_SS_SURFACE, &key, sizeof(key), - &key.bo, key.bo ? 1 : 0, + &key.bo, 1, NULL); if (brw->wm.surf_bo[surf] == NULL) { brw->wm.surf_bo[surf] = brw_create_constant_surface(brw, &key); @@ -511,7 +473,8 @@ brw_update_renderbuffer_surface(struct brw_context *brw, struct gl_renderbuffer *rb, unsigned int unit) { - GLcontext *ctx = &brw->intel.ctx; + struct intel_context *intel = &brw->intel; + GLcontext *ctx = &intel->ctx; dri_bo *region_bo = NULL; struct intel_renderbuffer *irb = intel_renderbuffer(rb); struct intel_region *region = irb ? irb->region : NULL; @@ -522,7 +485,8 @@ brw_update_renderbuffer_surface(struct brw_context *brw, GLubyte color_mask[4]; GLboolean color_blend; uint32_t tiling; - uint32_t draw_offset; + uint32_t draw_x; + uint32_t draw_y; } key; memset(&key, 0, sizeof(key)); @@ -564,7 +528,8 @@ brw_update_renderbuffer_surface(struct brw_context *brw, } key.pitch = region->pitch; key.cpp = region->cpp; - key.draw_offset = region->draw_offset; /* cur 3d or cube face offset */ + key.draw_x = region->draw_x; + key.draw_y = region->draw_y; } else { key.surface_type = BRW_SURFACE_NULL; key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM; @@ -572,20 +537,24 @@ brw_update_renderbuffer_surface(struct brw_context *brw, key.width = 1; key.height = 1; key.cpp = 4; - key.draw_offset = 0; + key.draw_x = 0; + key.draw_y = 0; } - /* _NEW_COLOR */ - memcpy(key.color_mask, ctx->Color.ColorMask, - sizeof(key.color_mask)); - /* As mentioned above, disable writes to the alpha component when the - * renderbuffer is XRGB. - */ - if (ctx->DrawBuffer->Visual.alphaBits == 0) - key.color_mask[3] = GL_FALSE; + if (intel->gen < 6) { + /* _NEW_COLOR */ + memcpy(key.color_mask, ctx->Color.ColorMask[unit], + sizeof(key.color_mask)); - key.color_blend = (!ctx->Color._LogicOpEnabled && - ctx->Color.BlendEnabled); + /* As mentioned above, disable writes to the alpha component when the + * renderbuffer is XRGB. + */ + if (ctx->DrawBuffer->Visual.alphaBits == 0) + key.color_mask[3] = GL_FALSE; + + key.color_blend = (!ctx->Color._LogicOpEnabled && + (ctx->Color.BlendEnabled & (1 << unit))); + } dri_bo_unreference(brw->wm.surf_bo[unit]); brw->wm.surf_bo[unit] = brw_search_cache(&brw->surface_cache, @@ -602,25 +571,32 @@ brw_update_renderbuffer_surface(struct brw_context *brw, surf.ss0.surface_format = key.surface_format; surf.ss0.surface_type = key.surface_type; if (key.tiling == I915_TILING_NONE) { - surf.ss1.base_addr = key.draw_offset; + surf.ss1.base_addr = (key.draw_x + key.draw_y * key.pitch) * key.cpp; } else { - uint32_t tile_offset = key.draw_offset % 4096; - - surf.ss1.base_addr = key.draw_offset - tile_offset; - - assert(BRW_IS_G4X(brw) || tile_offset == 0); - if (BRW_IS_G4X(brw)) { - if (key.tiling == I915_TILING_X) { - /* Note that the low bits of these fields are missing, so - * there's the possibility of getting in trouble. 
- */ - surf.ss5.x_offset = (tile_offset % 512) / key.cpp / 4; - surf.ss5.y_offset = tile_offset / 512 / 2; - } else { - surf.ss5.x_offset = (tile_offset % 128) / key.cpp / 4; - surf.ss5.y_offset = tile_offset / 128 / 2; - } + uint32_t tile_base, tile_x, tile_y; + uint32_t pitch = key.pitch * key.cpp; + + if (key.tiling == I915_TILING_X) { + tile_x = key.draw_x % (512 / key.cpp); + tile_y = key.draw_y % 8; + tile_base = ((key.draw_y / 8) * (8 * pitch)); + tile_base += (key.draw_x - tile_x) / (512 / key.cpp) * 4096; + } else { + /* Y */ + tile_x = key.draw_x % (128 / key.cpp); + tile_y = key.draw_y % 32; + tile_base = ((key.draw_y / 32) * (32 * pitch)); + tile_base += (key.draw_x - tile_x) / (128 / key.cpp) * 4096; } + assert(intel->is_g4x || (tile_x == 0 && tile_y == 0)); + assert(tile_x % 4 == 0); + assert(tile_y % 2 == 0); + /* Note that the low bits of these fields are missing, so + * there's the possibility of getting in trouble. + */ + surf.ss1.base_addr = tile_base; + surf.ss5.x_offset = tile_x / 4; + surf.ss5.y_offset = tile_y / 2; } if (region_bo != NULL) surf.ss1.base_addr += region_bo->offset; /* reloc */ @@ -630,20 +606,21 @@ brw_update_renderbuffer_surface(struct brw_context *brw, brw_set_surface_tiling(&surf, key.tiling); surf.ss3.pitch = (key.pitch * key.cpp) - 1; - /* _NEW_COLOR */ - surf.ss0.color_blend = key.color_blend; - surf.ss0.writedisable_red = !key.color_mask[0]; - surf.ss0.writedisable_green = !key.color_mask[1]; - surf.ss0.writedisable_blue = !key.color_mask[2]; - surf.ss0.writedisable_alpha = !key.color_mask[3]; + if (intel->gen < 6) { + /* _NEW_COLOR */ + surf.ss0.color_blend = key.color_blend; + surf.ss0.writedisable_red = !key.color_mask[0]; + surf.ss0.writedisable_green = !key.color_mask[1]; + surf.ss0.writedisable_blue = !key.color_mask[2]; + surf.ss0.writedisable_alpha = !key.color_mask[3]; + } /* Key size will never match key size for textures, so we're safe. */ brw->wm.surf_bo[unit] = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE, &key, sizeof(key), ®ion_bo, 1, - &surf, sizeof(surf), - NULL, NULL); + &surf, sizeof(surf)); if (region_bo != NULL) { /* We might sample from it, and we might render to it, so flag * them both. We might be able to figure out from other state @@ -690,8 +667,7 @@ brw_wm_get_binding_table(struct brw_context *brw) bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND, NULL, 0, brw->wm.surf_bo, brw->wm.nr_surfaces, - data, data_size, - NULL, NULL); + data, data_size); /* Emit binding table relocations to surface state */ for (i = 0; i < BRW_WM_MAX_SURF; i++) { diff --git a/i965/gen6_cc.c b/i965/gen6_cc.c new file mode 100644 index 0000000..f7acad6 --- /dev/null +++ b/i965/gen6_cc.c @@ -0,0 +1,296 @@ +/* + * Copyright © 2009 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <eric@anholt.net> + * + */ + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "brw_util.h" +#include "intel_batchbuffer.h" +#include "main/macros.h" + +struct gen6_blend_state_key { + GLboolean color_blend, alpha_enabled; + GLboolean dither; + + GLenum logic_op; + + GLenum blend_eq_rgb, blend_eq_a; + GLenum blend_src_rgb, blend_src_a; + GLenum blend_dst_rgb, blend_dst_a; + + GLenum alpha_func; +}; + +static void +blend_state_populate_key(struct brw_context *brw, + struct gen6_blend_state_key *key) +{ + GLcontext *ctx = &brw->intel.ctx; + + memset(key, 0, sizeof(*key)); + + /* _NEW_COLOR */ + if (ctx->Color._LogicOpEnabled) + key->logic_op = ctx->Color.LogicOp; + else + key->logic_op = GL_COPY; + + /* _NEW_COLOR */ + key->color_blend = ctx->Color.BlendEnabled; + if (key->color_blend) { + key->blend_eq_rgb = ctx->Color.BlendEquationRGB; + key->blend_eq_a = ctx->Color.BlendEquationA; + key->blend_src_rgb = ctx->Color.BlendSrcRGB; + key->blend_dst_rgb = ctx->Color.BlendDstRGB; + key->blend_src_a = ctx->Color.BlendSrcA; + key->blend_dst_a = ctx->Color.BlendDstA; + } + + /* _NEW_COLOR */ + key->alpha_enabled = ctx->Color.AlphaEnabled; + if (key->alpha_enabled) { + key->alpha_func = ctx->Color.AlphaFunc; + } + + /* _NEW_COLOR */ + key->dither = ctx->Color.DitherFlag; +} + +/** + * Creates the state cache entry for the given CC unit key. 
+ */ +static drm_intel_bo * +blend_state_create_from_key(struct brw_context *brw, + struct gen6_blend_state_key *key) +{ + struct gen6_blend_state blend; + drm_intel_bo *bo; + + memset(&blend, 0, sizeof(blend)); + + if (key->logic_op != GL_COPY) { + blend.blend1.logic_op_enable = 1; + blend.blend1.logic_op_func = intel_translate_logic_op(key->logic_op); + } else if (key->color_blend) { + GLenum eqRGB = key->blend_eq_rgb; + GLenum eqA = key->blend_eq_a; + GLenum srcRGB = key->blend_src_rgb; + GLenum dstRGB = key->blend_dst_rgb; + GLenum srcA = key->blend_src_a; + GLenum dstA = key->blend_dst_a; + + if (eqRGB == GL_MIN || eqRGB == GL_MAX) { + srcRGB = dstRGB = GL_ONE; + } + + if (eqA == GL_MIN || eqA == GL_MAX) { + srcA = dstA = GL_ONE; + } + + blend.blend0.dest_blend_factor = brw_translate_blend_factor(dstRGB); + blend.blend0.source_blend_factor = brw_translate_blend_factor(srcRGB); + blend.blend0.blend_func = brw_translate_blend_equation(eqRGB); + + blend.blend0.ia_dest_blend_factor = brw_translate_blend_factor(dstA); + blend.blend0.ia_source_blend_factor = brw_translate_blend_factor(srcA); + blend.blend0.ia_blend_func = brw_translate_blend_equation(eqA); + + blend.blend0.blend_enable = 1; + blend.blend0.ia_blend_enable = (srcA != srcRGB || + dstA != dstRGB || + eqA != eqRGB); + } + + if (key->alpha_enabled) { + blend.blend1.alpha_test_enable = 1; + blend.blend1.alpha_test_func = intel_translate_compare_func(key->alpha_func); + + } + + if (key->dither) { + blend.blend1.dither_enable = 1; + blend.blend1.y_dither_offset = 0; + blend.blend1.x_dither_offset = 0; + } + + bo = brw_upload_cache(&brw->cache, BRW_BLEND_STATE, + key, sizeof(*key), + NULL, 0, + &blend, sizeof(blend)); + + return bo; +} + +static void +prepare_blend_state(struct brw_context *brw) +{ + struct gen6_blend_state_key key; + + blend_state_populate_key(brw, &key); + + drm_intel_bo_unreference(brw->cc.blend_state_bo); + brw->cc.blend_state_bo = brw_search_cache(&brw->cache, BRW_BLEND_STATE, + &key, sizeof(key), + NULL, 0, + NULL); + + if (brw->cc.blend_state_bo == NULL) + brw->cc.blend_state_bo = blend_state_create_from_key(brw, &key); +} + +const struct brw_tracked_state gen6_blend_state = { + .dirty = { + .mesa = _NEW_COLOR, + .brw = 0, + .cache = 0, + }, + .prepare = prepare_blend_state, +}; + +struct gen6_color_calc_state_key { + GLubyte blend_constant_color[4]; + GLclampf alpha_ref; + GLubyte stencil_ref[2]; +}; + +static void +color_calc_state_populate_key(struct brw_context *brw, + struct gen6_color_calc_state_key *key) +{ + GLcontext *ctx = &brw->intel.ctx; + + memset(key, 0, sizeof(*key)); + + /* _NEW_STENCIL */ + if (ctx->Stencil._Enabled) { + const unsigned back = ctx->Stencil._BackFace; + + key->stencil_ref[0] = ctx->Stencil.Ref[0]; + if (ctx->Stencil._TestTwoSide) + key->stencil_ref[1] = ctx->Stencil.Ref[back]; + } + + /* _NEW_COLOR */ + if (ctx->Color.AlphaEnabled) + key->alpha_ref = ctx->Color.AlphaRef; + + key->blend_constant_color[0] = ctx->Color.BlendColor[0]; + key->blend_constant_color[1] = ctx->Color.BlendColor[1]; + key->blend_constant_color[2] = ctx->Color.BlendColor[2]; + key->blend_constant_color[3] = ctx->Color.BlendColor[3]; +} + +/** + * Creates the state cache entry for the given CC state key. 
+ */ +static drm_intel_bo * +color_calc_state_create_from_key(struct brw_context *brw, + struct gen6_color_calc_state_key *key) +{ + struct gen6_color_calc_state cc; + drm_intel_bo *bo; + + memset(&cc, 0, sizeof(cc)); + + cc.cc0.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8; + UNCLAMPED_FLOAT_TO_UBYTE(cc.cc1.alpha_ref_fi.ui, key->alpha_ref); + + cc.cc0.stencil_ref = key->stencil_ref[0]; + cc.cc0.bf_stencil_ref = key->stencil_ref[1]; + + cc.constant_r = key->blend_constant_color[0]; + cc.constant_g = key->blend_constant_color[1]; + cc.constant_b = key->blend_constant_color[2]; + cc.constant_a = key->blend_constant_color[3]; + + bo = brw_upload_cache(&brw->cache, BRW_COLOR_CALC_STATE, + key, sizeof(*key), + NULL, 0, + &cc, sizeof(cc)); + + return bo; +} + +static void +prepare_color_calc_state(struct brw_context *brw) +{ + struct gen6_color_calc_state_key key; + + color_calc_state_populate_key(brw, &key); + + drm_intel_bo_unreference(brw->cc.state_bo); + brw->cc.state_bo = brw_search_cache(&brw->cache, BRW_COLOR_CALC_STATE, + &key, sizeof(key), + NULL, 0, + NULL); + + if (brw->cc.state_bo == NULL) + brw->cc.state_bo = color_calc_state_create_from_key(brw, &key); +} + +const struct brw_tracked_state gen6_color_calc_state = { + .dirty = { + .mesa = _NEW_COLOR, + .brw = 0, + .cache = 0, + }, + .prepare = prepare_color_calc_state, +}; + +static void upload_cc_state_pointers(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + + BEGIN_BATCH(4); + OUT_BATCH(CMD_3D_CC_STATE_POINTERS << 16 | (4 - 2)); + OUT_RELOC(brw->cc.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1); + OUT_RELOC(brw->cc.blend_state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1); + OUT_RELOC(brw->cc.depth_stencil_state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1); + ADVANCE_BATCH(); + + intel_batchbuffer_emit_mi_flush(intel->batch); +} + + +static void prepare_cc_state_pointers(struct brw_context *brw) +{ + brw_add_validated_bo(brw, brw->cc.state_bo); + brw_add_validated_bo(brw, brw->cc.blend_state_bo); + brw_add_validated_bo(brw, brw->cc.depth_stencil_state_bo); +} + +const struct brw_tracked_state gen6_cc_state_pointers = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_BATCH, + .cache = (CACHE_NEW_BLEND_STATE | + CACHE_NEW_COLOR_CALC_STATE | + CACHE_NEW_DEPTH_STENCIL_STATE) + }, + .prepare = prepare_cc_state_pointers, + .emit = upload_cc_state_pointers, +}; diff --git a/i965/gen6_clip_state.c b/i965/gen6_clip_state.c new file mode 100644 index 0000000..06f8145 --- /dev/null +++ b/i965/gen6_clip_state.c @@ -0,0 +1,75 @@ +/* + * Copyright © 2009 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <eric@anholt.net> + * + */ + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "intel_batchbuffer.h" + +static void +upload_clip_state(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + GLcontext *ctx = &intel->ctx; + uint32_t depth_clamp = 0; + uint32_t provoking; + + if (!ctx->Transform.DepthClamp) + depth_clamp = GEN6_CLIP_Z_TEST; + + if (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION) { + provoking = 0; + } else { + provoking = + (2 << GEN6_CLIP_TRI_PROVOKE_SHIFT) | + (2 << GEN6_CLIP_TRIFAN_PROVOKE_SHIFT) | + (1 << GEN6_CLIP_LINE_PROVOKE_SHIFT); + } + + BEGIN_BATCH(4); + OUT_BATCH(CMD_3D_CLIP_STATE << 16 | (4 - 2)); + OUT_BATCH(GEN6_CLIP_STATISTICS_ENABLE); + OUT_BATCH(GEN6_CLIP_ENABLE | + GEN6_CLIP_API_OGL | + GEN6_CLIP_MODE_REJECT_ALL | /* XXX: debug: get VS working */ + GEN6_CLIP_XY_TEST | + depth_clamp | + provoking); + OUT_BATCH(0); + ADVANCE_BATCH(); + + intel_batchbuffer_emit_mi_flush(intel->batch); +} + +const struct brw_tracked_state gen6_clip_state = { + .dirty = { + .mesa = _NEW_TRANSFORM, + .brw = BRW_NEW_CONTEXT, + .cache = 0 + }, + .emit = upload_clip_state, +}; diff --git a/i965/gen6_depthstencil.c b/i965/gen6_depthstencil.c new file mode 100644 index 0000000..4924f0f --- /dev/null +++ b/i965/gen6_depthstencil.c @@ -0,0 +1,165 @@ +/* + * Copyright © 2009 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * Authors: + * Eric Anholt <eric@anholt.net> + * + */ + +#include "brw_context.h" +#include "brw_state.h" + +struct brw_depth_stencil_state_key { + GLenum depth_func; + GLboolean depth_test, depth_write; + GLboolean stencil, stencil_two_side; + GLenum stencil_func[2], stencil_fail_op[2]; + GLenum stencil_pass_depth_fail_op[2], stencil_pass_depth_pass_op[2]; + GLubyte stencil_write_mask[2], stencil_test_mask[2]; +}; + +static void +depth_stencil_state_populate_key(struct brw_context *brw, + struct brw_depth_stencil_state_key *key) +{ + GLcontext *ctx = &brw->intel.ctx; + const unsigned back = ctx->Stencil._BackFace; + + memset(key, 0, sizeof(*key)); + + /* _NEW_STENCIL */ + key->stencil = ctx->Stencil._Enabled; + key->stencil_two_side = ctx->Stencil._TestTwoSide; + + if (key->stencil) { + key->stencil_func[0] = ctx->Stencil.Function[0]; + key->stencil_fail_op[0] = ctx->Stencil.FailFunc[0]; + key->stencil_pass_depth_fail_op[0] = ctx->Stencil.ZFailFunc[0]; + key->stencil_pass_depth_pass_op[0] = ctx->Stencil.ZPassFunc[0]; + key->stencil_write_mask[0] = ctx->Stencil.WriteMask[0]; + key->stencil_test_mask[0] = ctx->Stencil.ValueMask[0]; + } + if (key->stencil_two_side) { + key->stencil_func[1] = ctx->Stencil.Function[back]; + key->stencil_fail_op[1] = ctx->Stencil.FailFunc[back]; + key->stencil_pass_depth_fail_op[1] = ctx->Stencil.ZFailFunc[back]; + key->stencil_pass_depth_pass_op[1] = ctx->Stencil.ZPassFunc[back]; + key->stencil_write_mask[1] = ctx->Stencil.WriteMask[back]; + key->stencil_test_mask[1] = ctx->Stencil.ValueMask[back]; + } + + key->depth_test = ctx->Depth.Test; + if (key->depth_test) { + key->depth_func = ctx->Depth.Func; + key->depth_write = ctx->Depth.Mask; + } +} + +/** + * Creates the state cache entry for the given DEPTH_STENCIL_STATE state key. 
+ */ +static dri_bo * +depth_stencil_state_create_from_key(struct brw_context *brw, + struct brw_depth_stencil_state_key *key) +{ + struct gen6_depth_stencil_state ds; + dri_bo *bo; + + memset(&ds, 0, sizeof(ds)); + + /* _NEW_STENCIL */ + if (key->stencil) { + ds.ds0.stencil_enable = 1; + ds.ds0.stencil_func = + intel_translate_compare_func(key->stencil_func[0]); + ds.ds0.stencil_fail_op = + intel_translate_stencil_op(key->stencil_fail_op[0]); + ds.ds0.stencil_pass_depth_fail_op = + intel_translate_stencil_op(key->stencil_pass_depth_fail_op[0]); + ds.ds0.stencil_pass_depth_pass_op = + intel_translate_stencil_op(key->stencil_pass_depth_pass_op[0]); + ds.ds1.stencil_write_mask = key->stencil_write_mask[0]; + ds.ds1.stencil_test_mask = key->stencil_test_mask[0]; + + if (key->stencil_two_side) { + ds.ds0.bf_stencil_enable = 1; + ds.ds0.bf_stencil_func = + intel_translate_compare_func(key->stencil_func[1]); + ds.ds0.bf_stencil_fail_op = + intel_translate_stencil_op(key->stencil_fail_op[1]); + ds.ds0.bf_stencil_pass_depth_fail_op = + intel_translate_stencil_op(key->stencil_pass_depth_fail_op[1]); + ds.ds0.bf_stencil_pass_depth_pass_op = + intel_translate_stencil_op(key->stencil_pass_depth_pass_op[1]); + ds.ds1.bf_stencil_write_mask = key->stencil_write_mask[1]; + ds.ds1.bf_stencil_test_mask = key->stencil_test_mask[1]; + } + + /* Not really sure about this: + */ + if (key->stencil_write_mask[0] || + (key->stencil_two_side && key->stencil_write_mask[1])) + ds.ds0.stencil_write_enable = 1; + } + + /* _NEW_DEPTH */ + if (key->depth_test) { + ds.ds2.depth_test_enable = 1; + ds.ds2.depth_test_func = intel_translate_compare_func(key->depth_func); + ds.ds2.depth_write_enable = key->depth_write; + } + + bo = brw_upload_cache(&brw->cache, BRW_DEPTH_STENCIL_STATE, + key, sizeof(*key), + NULL, 0, + &ds, sizeof(ds)); + + return bo; +} + +static void +prepare_depth_stencil_state(struct brw_context *brw) +{ + struct brw_depth_stencil_state_key key; + + depth_stencil_state_populate_key(brw, &key); + + dri_bo_unreference(brw->cc.depth_stencil_state_bo); + brw->cc.depth_stencil_state_bo = brw_search_cache(&brw->cache, + BRW_DEPTH_STENCIL_STATE, + &key, sizeof(key), + NULL, 0, + NULL); + + if (brw->cc.depth_stencil_state_bo == NULL) + brw->cc.depth_stencil_state_bo = + depth_stencil_state_create_from_key(brw, &key); +} + +const struct brw_tracked_state gen6_depth_stencil_state = { + .dirty = { + .mesa = _NEW_DEPTH | _NEW_STENCIL, + .brw = 0, + .cache = 0, + }, + .prepare = prepare_depth_stencil_state, +}; diff --git a/i965/gen6_gs_state.c b/i965/gen6_gs_state.c new file mode 100644 index 0000000..161e7b8 --- /dev/null +++ b/i965/gen6_gs_state.c @@ -0,0 +1,91 @@ +/* + * Copyright © 2009 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <eric@anholt.net> + * + */ + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "intel_batchbuffer.h" + +static void +upload_gs_state(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + + /* Disable all the constant buffers. */ + BEGIN_BATCH(5); + OUT_BATCH(CMD_3D_CONSTANT_GS_STATE << 16 | (5 - 2)); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + ADVANCE_BATCH(); + + intel_batchbuffer_emit_mi_flush(intel->batch); + + if (brw->gs.prog_bo) { + BEGIN_BATCH(7); + OUT_BATCH(CMD_3D_GS_STATE << 16 | (7 - 2)); + OUT_RELOC(brw->gs.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); + OUT_BATCH((0 << GEN6_GS_SAMPLER_COUNT_SHIFT) | + (0 << GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); + OUT_BATCH(0); /* scratch space base offset */ + OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) | + (brw->gs.prog_data->urb_read_length << GEN6_GS_URB_READ_LENGTH_SHIFT) | + (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT)); + OUT_BATCH((0 << GEN6_GS_MAX_THREADS_SHIFT) | + GEN6_GS_STATISTICS_ENABLE | + GEN6_GS_RENDERING_ENABLE); + OUT_BATCH(GEN6_GS_ENABLE); + ADVANCE_BATCH(); + } else { + BEGIN_BATCH(7); + OUT_BATCH(CMD_3D_GS_STATE << 16 | (7 - 2)); + OUT_BATCH(0); /* prog_bo */ + OUT_BATCH((0 << GEN6_GS_SAMPLER_COUNT_SHIFT) | + (0 << GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); + OUT_BATCH(0); /* scratch space base offset */ + OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) | + (0 << GEN6_GS_URB_READ_LENGTH_SHIFT) | + (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT)); + OUT_BATCH((0 << GEN6_GS_MAX_THREADS_SHIFT) | + GEN6_GS_STATISTICS_ENABLE | + GEN6_GS_RENDERING_ENABLE); + OUT_BATCH(0); + ADVANCE_BATCH(); + } +} + +const struct brw_tracked_state gen6_gs_state = { + .dirty = { + .mesa = _NEW_TRANSFORM, + .brw = (BRW_NEW_CURBE_OFFSETS | + BRW_NEW_URB_FENCE | + BRW_NEW_CONTEXT), + .cache = CACHE_NEW_GS_PROG + }, + .emit = upload_gs_state, +}; diff --git a/i965/gen6_sampler_state.c b/i965/gen6_sampler_state.c new file mode 100644 index 0000000..ab8e751 --- /dev/null +++ b/i965/gen6_sampler_state.c @@ -0,0 +1,71 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <eric@anholt.net> + * + */ + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "intel_batchbuffer.h" + +static void +upload_sampler_state_pointers(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + + BEGIN_BATCH(4); + OUT_BATCH(CMD_3D_SAMPLER_STATE_POINTERS << 16 | + VS_SAMPLER_STATE_CHANGE | + GS_SAMPLER_STATE_CHANGE | + PS_SAMPLER_STATE_CHANGE | + (4 - 2)); + OUT_BATCH(0); /* VS */ + OUT_BATCH(0); /* GS */ + if (brw->wm.sampler_bo) + OUT_RELOC(brw->wm.sampler_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); + else + OUT_BATCH(0); + + ADVANCE_BATCH(); + + intel_batchbuffer_emit_mi_flush(intel->batch); +} + + +static void +prepare_sampler_state_pointers(struct brw_context *brw) +{ + brw_add_validated_bo(brw, brw->wm.sampler_bo); +} + +const struct brw_tracked_state gen6_sampler_state = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_BATCH, + .cache = CACHE_NEW_SAMPLER + }, + .prepare = prepare_sampler_state_pointers, + .emit = upload_sampler_state_pointers, +}; diff --git a/i965/gen6_scissor_state.c b/i965/gen6_scissor_state.c new file mode 100644 index 0000000..2e21e5f --- /dev/null +++ b/i965/gen6_scissor_state.c @@ -0,0 +1,105 @@ +/* + * Copyright © 2009 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <eric@anholt.net> + * + */ + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "intel_batchbuffer.h" + +static void +prepare_scissor_state(struct brw_context *brw) +{ + GLcontext *ctx = &brw->intel.ctx; + const GLboolean render_to_fbo = (ctx->DrawBuffer->Name != 0); + struct gen6_scissor_state scissor; + + /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */ + + /* The scissor only needs to handle the intersection of drawable and + * scissor rect. Clipping to the boundaries of static shared buffers + * for front/back/depth is covered by looping over cliprects in brw_draw.c. + * + * Note that the hardware's coordinates are inclusive, while Mesa's min is + * inclusive but max is exclusive. 
+ */ + if (render_to_fbo) { + /* texmemory: Y=0=bottom */ + scissor.xmin = ctx->DrawBuffer->_Xmin; + scissor.xmax = ctx->DrawBuffer->_Xmax - 1; + scissor.ymin = ctx->DrawBuffer->_Ymin; + scissor.ymax = ctx->DrawBuffer->_Ymax - 1; + } + else { + /* memory: Y=0=top */ + scissor.xmin = ctx->DrawBuffer->_Xmin; + scissor.xmax = ctx->DrawBuffer->_Xmax - 1; + scissor.ymin = ctx->DrawBuffer->Height - ctx->DrawBuffer->_Ymax; + scissor.ymax = ctx->DrawBuffer->Height - ctx->DrawBuffer->_Ymin - 1; + } + + drm_intel_bo_unreference(brw->sf.state_bo); + brw->sf.state_bo = brw_cache_data(&brw->cache, BRW_SF_UNIT, + &scissor, sizeof(scissor), + NULL, 0); +} + +const struct brw_tracked_state gen6_scissor_state = { + .dirty = { + .mesa = _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT, + .brw = 0, + .cache = 0, + }, + .prepare = prepare_scissor_state, +}; + +static void upload_scissor_state_pointers(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + + BEGIN_BATCH(2); + OUT_BATCH(CMD_3D_SCISSOR_STATE_POINTERS << 16 | (2 - 2)); + OUT_RELOC(brw->sf.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); + ADVANCE_BATCH(); + + intel_batchbuffer_emit_mi_flush(intel->batch); +} + + +static void prepare_scissor_state_pointers(struct brw_context *brw) +{ + brw_add_validated_bo(brw, brw->sf.state_bo); +} + +const struct brw_tracked_state gen6_scissor_state_pointers = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_BATCH, + .cache = CACHE_NEW_SF_UNIT + }, + .prepare = prepare_scissor_state_pointers, + .emit = upload_scissor_state_pointers, +}; diff --git a/i965/gen6_sf_state.c b/i965/gen6_sf_state.c new file mode 100644 index 0000000..8d96b44 --- /dev/null +++ b/i965/gen6_sf_state.c @@ -0,0 +1,187 @@ +/* + * Copyright © 2009 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <eric@anholt.net> + * + */ + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "brw_util.h" +#include "main/macros.h" +#include "intel_batchbuffer.h" + +static uint32_t +get_attr_override(struct brw_context *brw, int attr) +{ + uint32_t attr_override; + int attr_index = 0, i; + + /* Find the source index (0 = first attribute after the 4D position) + * for this output attribute. attr is currently a VERT_RESULT_* but should + * be FRAG_ATTRIB_*. 
+ */ + for (i = 0; i < attr; i++) { + if (brw->vs.prog_data->outputs_written & BITFIELD64_BIT(i)) + attr_index++; + } + attr_override = attr_index; + + return attr_index; +} + +static void +upload_sf_state(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + GLcontext *ctx = &intel->ctx; + /* CACHE_NEW_VS_PROG */ + uint32_t num_inputs = brw_count_bits(brw->vs.prog_data->outputs_written); + /* This should probably be FS inputs read */ + uint32_t num_outputs = brw_count_bits(brw->vs.prog_data->outputs_written); + uint32_t dw1, dw2, dw3, dw4; + int i; + /* _NEW_BUFFER */ + GLboolean render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0; + int attr = 0; + + dw1 = + num_outputs << GEN6_SF_NUM_OUTPUTS_SHIFT | + (num_inputs + 1) / 2 << GEN6_SF_URB_ENTRY_READ_LENGTH_SHIFT | + 3 << GEN6_SF_URB_ENTRY_READ_OFFSET_SHIFT; + dw2 = GEN6_SF_VIEWPORT_TRANSFORM_ENABLE | + GEN6_SF_STATISTICS_ENABLE; + dw3 = 0; + dw4 = 0; + + /* _NEW_POLYGON */ + if ((ctx->Polygon.FrontFace == GL_CCW) ^ render_to_fbo) + dw2 |= GEN6_SF_WINDING_CCW; + + /* _NEW_SCISSOR */ + if (ctx->Scissor.Enabled) + dw3 |= GEN6_SF_SCISSOR_ENABLE; + + /* _NEW_POLYGON */ + if (ctx->Polygon.CullFlag) { + switch (ctx->Polygon.CullFaceMode) { + case GL_FRONT: + dw3 |= GEN6_SF_CULL_BOTH; + break; + case GL_BACK: + dw3 |= GEN6_SF_CULL_BACK; + break; + case GL_FRONT_AND_BACK: + dw3 |= GEN6_SF_CULL_BOTH; + break; + default: + assert(0); + break; + } + } else { + dw3 |= GEN6_SF_CULL_NONE; + } + + /* _NEW_LINE */ + dw3 |= U_FIXED(CLAMP(ctx->Line.Width, 0.0, 7.99), 7) << + GEN6_SF_LINE_WIDTH_SHIFT; + if (ctx->Line.SmoothFlag) { + dw3 |= GEN6_SF_LINE_AA_ENABLE; + dw3 |= GEN6_SF_LINE_AA_MODE_TRUE; + dw3 |= GEN6_SF_LINE_END_CAP_WIDTH_1_0; + } + + /* _NEW_POINT */ + if (ctx->Point._Attenuated) + dw4 |= GEN6_SF_USE_STATE_POINT_WIDTH; + + dw4 |= U_FIXED(CLAMP(ctx->Point.Size, 0.125, 225.875), 3) << + GEN6_SF_POINT_WIDTH_SHIFT; + if (render_to_fbo) + dw1 |= GEN6_SF_POINT_SPRITE_LOWERLEFT; + + /* _NEW_LIGHT */ + if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) { + dw4 |= + (2 << GEN6_SF_TRI_PROVOKE_SHIFT) | + (2 << GEN6_SF_TRIFAN_PROVOKE_SHIFT) | + (1 << GEN6_SF_LINE_PROVOKE_SHIFT); + } else { + dw4 |= + (1 << GEN6_SF_TRIFAN_PROVOKE_SHIFT); + } + + BEGIN_BATCH(20); + OUT_BATCH(CMD_3D_SF_STATE << 16 | (20 - 2)); + OUT_BATCH(dw1); + OUT_BATCH(dw2); + OUT_BATCH(dw3); + OUT_BATCH(dw4); + OUT_BATCH_F(ctx->Polygon.OffsetUnits * 2); /* constant. 
copied from gen4 */ + OUT_BATCH_F(ctx->Polygon.OffsetFactor); /* scale */ + OUT_BATCH_F(0.0); /* XXX: global depth offset clamp */ + for (i = 0; i < 8; i++) { + uint32_t attr_overrides = 0; + + /* These should be generating FS inputs read instead of VS + * outputs written + */ + for (; attr < 64; attr++) { + if (brw->vs.prog_data->outputs_written & BITFIELD64_BIT(attr)) { + attr_overrides |= get_attr_override(brw, attr); + attr++; + break; + } + } + + for (; attr < 64; attr++) { + if (brw->vs.prog_data->outputs_written & BITFIELD64_BIT(attr)) { + attr_overrides |= get_attr_override(brw, attr) << 16; + attr++; + break; + } + } + OUT_BATCH(attr_overrides); + } + OUT_BATCH(0); /* point sprite texcoord bitmask */ + OUT_BATCH(0); /* constant interp bitmask */ + OUT_BATCH(0); /* wrapshortest enables 0-7 */ + OUT_BATCH(0); /* wrapshortest enables 8-15 */ + ADVANCE_BATCH(); + + intel_batchbuffer_emit_mi_flush(intel->batch); +} + +const struct brw_tracked_state gen6_sf_state = { + .dirty = { + .mesa = (_NEW_LIGHT | + _NEW_POLYGON | + _NEW_LINE | + _NEW_SCISSOR | + _NEW_BUFFERS), + .brw = BRW_NEW_CONTEXT, + .cache = CACHE_NEW_VS_PROG + }, + .emit = upload_sf_state, +}; diff --git a/i965/gen6_urb.c b/i965/gen6_urb.c new file mode 100644 index 0000000..5445e40 --- /dev/null +++ b/i965/gen6_urb.c @@ -0,0 +1,83 @@ +/* + * Copyright © 2009 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <eric@anholt.net> + * + */ + +#include "main/macros.h" +#include "intel_batchbuffer.h" +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" + +static void +prepare_urb( struct brw_context *brw ) +{ + brw->urb.nr_vs_entries = 24; + if (brw->gs.prog_bo) + brw->urb.nr_gs_entries = 4; + else + brw->urb.nr_gs_entries = 0; + /* CACHE_NEW_VS_PROG */ + brw->urb.vs_size = MIN2(brw->vs.prog_data->urb_entry_size, 1); + + /* Check that the number of URB rows (8 floats each) allocated is less + * than the URB space. 
+ */ + assert((brw->urb.nr_vs_entries + + brw->urb.nr_gs_entries) * brw->urb.vs_size * 8 < 64 * 1024); +} + +static void +upload_urb(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + + assert(brw->urb.nr_vs_entries % 4 == 0); + assert(brw->urb.nr_gs_entries % 4 == 0); + /* GS requirement */ + assert(!brw->gs.prog_bo || brw->urb.vs_size < 5); + + intel_batchbuffer_emit_mi_flush(intel->batch); + + BEGIN_BATCH(3); + OUT_BATCH(CMD_URB << 16 | (3 - 2)); + OUT_BATCH(((brw->urb.vs_size - 1) << GEN6_URB_VS_SIZE_SHIFT) | + ((brw->urb.nr_vs_entries) << GEN6_URB_VS_ENTRIES_SHIFT)); + OUT_BATCH(((brw->urb.vs_size - 1) << GEN6_URB_GS_SIZE_SHIFT) | + ((brw->urb.nr_gs_entries) << GEN6_URB_GS_ENTRIES_SHIFT)); + ADVANCE_BATCH(); + + intel_batchbuffer_emit_mi_flush(intel->batch); +} + +const struct brw_tracked_state gen6_urb = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_CONTEXT, + .cache = CACHE_NEW_VS_PROG, + }, + .prepare = prepare_urb, + .emit = upload_urb, +}; diff --git a/i965/gen6_viewport_state.c b/i965/gen6_viewport_state.c new file mode 100644 index 0000000..0c2aa42 --- /dev/null +++ b/i965/gen6_viewport_state.c @@ -0,0 +1,173 @@ +/* + * Copyright © 2009 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <eric@anholt.net> + * + */ + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "intel_batchbuffer.h" +#include "main/macros.h" + +/* The clip VP defines the guardband region where expensive clipping is skipped + * and fragments are allowed to be generated and clipped out cheaply by the SF. + * + * By setting it to NDC bounds of [-1,1], we don't do GB clipping. It's + * supposed to cause seams to become visible in apps due to shared edges taking + * different clip/no clip paths depending on whether the rest of the prim ends + * up in the guardband or not. 
+ */ +static void +prepare_clip_vp(struct brw_context *brw) +{ + struct brw_clipper_viewport vp; + + vp.xmin = -1.0; + vp.xmax = 1.0; + vp.ymin = -1.0; + vp.ymax = 1.0; + + drm_intel_bo_unreference(brw->clip.vp_bo); + brw->clip.vp_bo = brw_cache_data(&brw->cache, BRW_CLIP_VP, + &vp, sizeof(vp), + NULL, 0); +} + +const struct brw_tracked_state gen6_clip_vp = { + .dirty = { + .mesa = _NEW_VIEWPORT, /* XXX: not really, but we need nonzero */ + .brw = 0, + .cache = 0, + }, + .prepare = prepare_clip_vp, +}; + +static void +prepare_sf_vp(struct brw_context *brw) +{ + GLcontext *ctx = &brw->intel.ctx; + const GLfloat depth_scale = 1.0F / ctx->DrawBuffer->_DepthMaxF; + struct brw_sf_viewport sfv; + GLfloat y_scale, y_bias; + const GLboolean render_to_fbo = (ctx->DrawBuffer->Name != 0); + const GLfloat *v = ctx->Viewport._WindowMap.m; + + memset(&sfv, 0, sizeof(sfv)); + + /* _NEW_BUFFERS */ + if (render_to_fbo) { + y_scale = 1.0; + y_bias = 0; + } else { + y_scale = -1.0; + y_bias = ctx->DrawBuffer->Height; + } + + /* _NEW_VIEWPORT */ + sfv.viewport.m00 = v[MAT_SX]; + sfv.viewport.m11 = v[MAT_SY] * y_scale; + sfv.viewport.m22 = v[MAT_SZ] * depth_scale; + sfv.viewport.m30 = v[MAT_TX]; + sfv.viewport.m31 = v[MAT_TY] * y_scale + y_bias; + sfv.viewport.m32 = v[MAT_TZ] * depth_scale; + + drm_intel_bo_unreference(brw->sf.vp_bo); + brw->sf.vp_bo = brw_cache_data(&brw->cache, BRW_SF_VP, + &sfv, sizeof(sfv), + NULL, 0); +} + +const struct brw_tracked_state gen6_sf_vp = { + .dirty = { + .mesa = _NEW_VIEWPORT | _NEW_BUFFERS, + .brw = 0, + .cache = 0, + }, + .prepare = prepare_sf_vp, +}; + +static void +prepare_cc_vp(struct brw_context *brw) +{ + GLcontext *ctx = &brw->intel.ctx; + struct brw_cc_viewport ccv; + + /* _NEW_TRANSOFORM */ + if (ctx->Transform.DepthClamp) { + /* _NEW_VIEWPORT */ + ccv.min_depth = MIN2(ctx->Viewport.Near, ctx->Viewport.Far); + ccv.max_depth = MAX2(ctx->Viewport.Near, ctx->Viewport.Far); + } else { + ccv.min_depth = 0.0; + ccv.max_depth = 1.0; + } + + drm_intel_bo_unreference(brw->cc.vp_bo); + brw->cc.vp_bo = brw_cache_data(&brw->cache, BRW_CC_VP, &ccv, sizeof(ccv), + NULL, 0); +} + +const struct brw_tracked_state gen6_cc_vp = { + .dirty = { + .mesa = _NEW_VIEWPORT | _NEW_TRANSFORM, + .brw = 0, + .cache = 0, + }, + .prepare = prepare_cc_vp, +}; + +static void prepare_viewport_state_pointers(struct brw_context *brw) +{ + brw_add_validated_bo(brw, brw->sf.state_bo); +} + +static void upload_viewport_state_pointers(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + + BEGIN_BATCH(4); + OUT_BATCH(CMD_VIEWPORT_STATE_POINTERS << 16 | (4 - 2) | + GEN6_CC_VIEWPORT_MODIFY | + GEN6_SF_VIEWPORT_MODIFY | + GEN6_CLIP_VIEWPORT_MODIFY); + OUT_RELOC(brw->clip.vp_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); + OUT_RELOC(brw->sf.vp_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); + OUT_RELOC(brw->cc.vp_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); + ADVANCE_BATCH(); + + intel_batchbuffer_emit_mi_flush(intel->batch); +} + +const struct brw_tracked_state gen6_viewport_state = { + .dirty = { + .mesa = 0, + .brw = BRW_NEW_BATCH, + .cache = (CACHE_NEW_CLIP_VP | + CACHE_NEW_SF_VP | + CACHE_NEW_CC_VP) + }, + .prepare = prepare_viewport_state_pointers, + .emit = upload_viewport_state_pointers, +}; diff --git a/i965/gen6_vs_state.c b/i965/gen6_vs_state.c new file mode 100644 index 0000000..fe597df --- /dev/null +++ b/i965/gen6_vs_state.c @@ -0,0 +1,119 @@ +/* + * Copyright © 2009 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and 
associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <eric@anholt.net> + * + */ + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "brw_util.h" +#include "shader/prog_parameter.h" +#include "shader/prog_statevars.h" +#include "intel_batchbuffer.h" + +static void +upload_vs_state(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + GLcontext *ctx = &intel->ctx; + const struct brw_vertex_program *vp = + brw_vertex_program_const(brw->vertex_program); + unsigned int nr_params = vp->program.Base.Parameters->NumParameters; + drm_intel_bo *constant_bo; + int i; + + if (vp->use_const_buffer || nr_params == 0) { + /* Disable the push constant buffers. */ + BEGIN_BATCH(5); + OUT_BATCH(CMD_3D_CONSTANT_VS_STATE << 16 | (5 - 2)); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + ADVANCE_BATCH(); + } else { + if (brw->vertex_program->IsNVProgram) + _mesa_load_tracked_matrices(ctx); + + /* Updates the ParamaterValues[i] pointers for all parameters of the + * basic type of PROGRAM_STATE_VAR. 
+ */ + _mesa_load_state_parameters(ctx, vp->program.Base.Parameters); + + constant_bo = drm_intel_bo_alloc(intel->bufmgr, "VS constant_bo", + nr_params * 4 * sizeof(float), + 4096); + drm_intel_gem_bo_map_gtt(constant_bo); + for (i = 0; i < nr_params; i++) { + memcpy((char *)constant_bo->virtual + i * 4 * sizeof(float), + vp->program.Base.Parameters->ParameterValues[i], + 4 * sizeof(float)); + } + drm_intel_gem_bo_unmap_gtt(constant_bo); + + BEGIN_BATCH(5); + OUT_BATCH(CMD_3D_CONSTANT_VS_STATE << 16 | + GEN6_CONSTANT_BUFFER_0_ENABLE | + (5 - 2)); + OUT_RELOC(constant_bo, + I915_GEM_DOMAIN_RENDER, 0, /* XXX: bad domain */ + ALIGN(nr_params, 2) / 2 - 1); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + ADVANCE_BATCH(); + + drm_intel_bo_unreference(constant_bo); + } + + intel_batchbuffer_emit_mi_flush(intel->batch); + + BEGIN_BATCH(6); + OUT_BATCH(CMD_3D_VS_STATE << 16 | (6 - 2)); + OUT_RELOC(brw->vs.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); + OUT_BATCH((0 << GEN6_VS_SAMPLER_COUNT_SHIFT) | + (brw->vs.nr_surfaces << GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); + OUT_BATCH(0); /* scratch space base offset */ + OUT_BATCH((1 << GEN6_VS_DISPATCH_START_GRF_SHIFT) | + (brw->vs.prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) | + (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT)); + OUT_BATCH((0 << GEN6_VS_MAX_THREADS_SHIFT) | + GEN6_VS_STATISTICS_ENABLE); + ADVANCE_BATCH(); + + intel_batchbuffer_emit_mi_flush(intel->batch); +} + +const struct brw_tracked_state gen6_vs_state = { + .dirty = { + .mesa = _NEW_TRANSFORM | _NEW_PROGRAM_CONSTANTS, + .brw = (BRW_NEW_CURBE_OFFSETS | + BRW_NEW_NR_VS_SURFACES | + BRW_NEW_URB_FENCE | + BRW_NEW_CONTEXT), + .cache = CACHE_NEW_VS_PROG + }, + .emit = upload_vs_state, +}; diff --git a/i965/gen6_wm_state.c b/i965/gen6_wm_state.c new file mode 100644 index 0000000..1eb17ca --- /dev/null +++ b/i965/gen6_wm_state.c @@ -0,0 +1,160 @@ +/* + * Copyright © 2009 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * Authors: + * Eric Anholt <eric@anholt.net> + * + */ + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "brw_util.h" +#include "shader/prog_parameter.h" +#include "shader/prog_statevars.h" +#include "intel_batchbuffer.h" + +static void +upload_wm_state(struct brw_context *brw) +{ + struct intel_context *intel = &brw->intel; + GLcontext *ctx = &intel->ctx; + const struct brw_fragment_program *fp = + brw_fragment_program_const(brw->fragment_program); + unsigned int nr_params = fp->program.Base.Parameters->NumParameters; + drm_intel_bo *constant_bo; + int i; + uint32_t dw2, dw4, dw5, dw6; + + if (fp->use_const_buffer || nr_params == 0) { + /* Disable the push constant buffers. */ + BEGIN_BATCH(5); + OUT_BATCH(CMD_3D_CONSTANT_PS_STATE << 16 | (5 - 2)); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + ADVANCE_BATCH(); + } else { + /* Updates the ParamaterValues[i] pointers for all parameters of the + * basic type of PROGRAM_STATE_VAR. + */ + _mesa_load_state_parameters(ctx, fp->program.Base.Parameters); + + constant_bo = drm_intel_bo_alloc(intel->bufmgr, "WM constant_bo", + nr_params * 4 * sizeof(float), + 4096); + drm_intel_gem_bo_map_gtt(constant_bo); + for (i = 0; i < nr_params; i++) { + memcpy((char *)constant_bo->virtual + i * 4 * sizeof(float), + fp->program.Base.Parameters->ParameterValues[i], + 4 * sizeof(float)); + } + drm_intel_gem_bo_unmap_gtt(constant_bo); + + BEGIN_BATCH(5); + OUT_BATCH(CMD_3D_CONSTANT_PS_STATE << 16 | + GEN6_CONSTANT_BUFFER_0_ENABLE | + (5 - 2)); + OUT_RELOC(constant_bo, + I915_GEM_DOMAIN_RENDER, 0, /* XXX: bad domain */ + ALIGN(nr_params, 2) / 2 - 1); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + ADVANCE_BATCH(); + + drm_intel_bo_unreference(constant_bo); + } + + intel_batchbuffer_emit_mi_flush(intel->batch); + + dw2 = dw4 = dw5 = dw6 = 0; + dw4 |= GEN6_WM_STATISTICS_ENABLE; + dw5 |= GEN6_WM_LINE_AA_WIDTH_1_0; + dw5 |= GEN6_WM_LINE_END_CAP_AA_WIDTH_0_5; + + /* BRW_NEW_NR_SURFACES */ + dw2 |= brw->wm.nr_surfaces << GEN6_WM_BINDING_TABLE_ENTRY_COUNT_SHIFT; + + /* CACHE_NEW_SAMPLER */ + dw2 |= (ALIGN(brw->wm.sampler_count, 4) / 4) << GEN6_WM_SAMPLER_COUNT_SHIFT; + dw4 |= (1 << GEN6_WM_DISPATCH_START_GRF_SHIFT_0); + + dw5 |= (40 - 1) << GEN6_WM_MAX_THREADS_SHIFT; + dw5 |= GEN6_WM_DISPATCH_ENABLE; + + /* BRW_NEW_FRAGMENT_PROGRAM */ + if (fp->isGLSL) + dw5 |= GEN6_WM_8_DISPATCH_ENABLE; + else + dw5 |= GEN6_WM_16_DISPATCH_ENABLE; + + /* _NEW_LINE */ + if (ctx->Line.StippleFlag) + dw5 |= GEN6_WM_LINE_STIPPLE_ENABLE; + + /* _NEW_POLYGONSTIPPLE */ + if (ctx->Polygon.StippleFlag) + dw5 |= GEN6_WM_POLYGON_STIPPLE_ENABLE; + + /* BRW_NEW_FRAGMENT_PROGRAM */ + if (fp->program.Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) + dw5 |= GEN6_WM_USES_SOURCE_DEPTH | GEN6_WM_USES_SOURCE_W; + if (fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) + dw5 |= GEN6_WM_COMPUTED_DEPTH; + + /* _NEW_COLOR */ + if (fp->program.UsesKill || ctx->Color.AlphaEnabled) + dw5 |= GEN6_WM_KILL_ENABLE; + + /* This should probably be FS inputs read */ + dw6 |= brw_count_bits(brw->vs.prog_data->outputs_written) << + GEN6_WM_NUM_SF_OUTPUTS_SHIFT; + + BEGIN_BATCH(9); + OUT_BATCH(CMD_3D_WM_STATE << 16 | (9 - 2)); + OUT_RELOC(brw->wm.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); + OUT_BATCH(dw2); + OUT_BATCH(0); /* scratch space base offset */ + OUT_BATCH(dw4); + OUT_BATCH(dw5); + OUT_BATCH(dw6); + OUT_BATCH(0); /* kernel 1 pointer */ + OUT_BATCH(0); /* kernel 2 pointer */ + ADVANCE_BATCH(); + + 
intel_batchbuffer_emit_mi_flush(intel->batch); +} + +const struct brw_tracked_state gen6_wm_state = { + .dirty = { + .mesa = _NEW_LINE | _NEW_POLYGONSTIPPLE | _NEW_COLOR, + .brw = (BRW_NEW_CURBE_OFFSETS | + BRW_NEW_FRAGMENT_PROGRAM | + BRW_NEW_NR_WM_SURFACES | + BRW_NEW_URB_FENCE | + BRW_NEW_BATCH), + .cache = CACHE_NEW_SAMPLER + }, + .emit = upload_wm_state, +};
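
For reference: the gen6 blend, color-calc and depth/stencil atoms above all share one prepare-time pattern — fill a plain key struct from GL state, look it up in the BRW state cache, and only pack and upload a new state object on a miss. The standalone sketch below restates that flow; the linear cache, blend_key and the packed words are hypothetical stand-ins for brw_cache/brw_upload_cache and struct gen6_blend_state, not the driver's actual API.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-ins for gen6_blend_state_key and the BRW state cache;
 * only the control flow mirrors prepare_blend_state() in the diff above. */
struct blend_key {
    uint32_t logic_op;
    uint32_t src_factor, dst_factor;
    uint32_t blend_enable, dither;
};

struct cache_entry {
    struct blend_key key;
    void *payload;                    /* the driver would hold a GPU buffer object */
    struct cache_entry *next;
};

static struct cache_entry *cache_head;

static void *search_cache(const struct blend_key *key)
{
    for (struct cache_entry *e = cache_head; e; e = e->next)
        if (memcmp(&e->key, key, sizeof(*key)) == 0)
            return e->payload;
    return NULL;
}

static void *upload_cache(const struct blend_key *key, const void *data, size_t size)
{
    struct cache_entry *e = malloc(sizeof(*e));
    e->key = *key;
    e->payload = malloc(size);
    memcpy(e->payload, data, size);
    e->next = cache_head;
    cache_head = e;
    return e->payload;
}

/* prepare_blend_state() analogue: derive key, search, create on a miss. */
static void *prepare_blend_state(const struct blend_key *key)
{
    void *bo = search_cache(key);
    if (bo == NULL) {
        uint32_t packed[2] = { 0, 0 };        /* stand-in for struct gen6_blend_state */
        packed[0] = key->blend_enable | (key->src_factor << 1) | (key->dst_factor << 6);
        packed[1] = key->dither << 31;
        bo = upload_cache(key, packed, sizeof(packed));
    }
    return bo;
}

int main(void)
{
    struct blend_key k = { .logic_op = 0x1503 /* GL_COPY */, .src_factor = 1,
                           .dst_factor = 0, .blend_enable = 1, .dither = 0 };
    void *a = prepare_blend_state(&k);
    void *b = prepare_blend_state(&k);        /* second call is a cache hit */
    return (a == b) ? 0 : 1;
}

Repeated draws with unchanged color state then return the same object, which is what lets the driver avoid re-packing and re-uploading BLEND_STATE on every state flush.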
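
gen6_scissor_state.c converts Mesa's drawable/scissor intersection (min inclusive, max exclusive, Y up when rendering to an FBO) into the hardware's fully inclusive rectangle, flipping Y for window-system buffers whose origin is the top-left corner. A minimal restatement of that conversion, with made-up field names rather than the driver's structs:

#include <stdio.h>

struct scissor_rect { int xmin, xmax, ymin, ymax; };   /* inclusive, like the HW */

/* Mesa-style bounds: [xmin, xmax) x [ymin, ymax), Y = 0 at the bottom. */
static struct scissor_rect
to_hw_scissor(int xmin, int xmax, int ymin, int ymax,
              int fb_height, int render_to_fbo)
{
    struct scissor_rect r;
    r.xmin = xmin;
    r.xmax = xmax - 1;                      /* exclusive max -> inclusive max */
    if (render_to_fbo) {                    /* texture/FBO: Y = 0 = bottom, no flip */
        r.ymin = ymin;
        r.ymax = ymax - 1;
    } else {                                /* window: Y = 0 = top, flip */
        r.ymin = fb_height - ymax;
        r.ymax = fb_height - ymin - 1;
    }
    return r;
}

int main(void)
{
    /* 100x100 scissor at the bottom-left of a 480-high window. */
    struct scissor_rect r = to_hw_scissor(0, 100, 0, 100, 480, 0);
    printf("%d..%d x %d..%d\n", r.xmin, r.xmax, r.ymin, r.ymax);   /* 0..99 x 380..479 */
    return 0;
}

The "- 1" on the max edges is the exclusive-to-inclusive adjustment the comment in the diff refers to.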
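
3DSTATE_SF packs the line width and point size as unsigned fixed-point values (7 and 3 fractional bits respectively) after clamping them to the ranges seen in upload_sf_state(). A sketch of that packing, assuming Mesa's U_FIXED simply scales by 2^frac_bits and truncates:

#include <stdint.h>
#include <stdio.h>

/* Assumed behaviour of the U_FIXED/CLAMP helpers used in upload_sf_state():
 * clamp to a range, then scale by 2^frac_bits and truncate. */
static uint32_t u_fixed(float value, int frac_bits)
{
    return (uint32_t)(value * (float)(1 << frac_bits));
}

static float clampf(float v, float lo, float hi)
{
    return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
    float line_width = 2.5f;     /* GL line width */
    float point_size = 64.0f;    /* GL point size */

    /* Line width uses 7 fractional bits, point size 3, per the clamps above. */
    uint32_t lw_bits = u_fixed(clampf(line_width, 0.0f, 7.99f), 7);
    uint32_t ps_bits = u_fixed(clampf(point_size, 0.125f, 225.875f), 3);

    printf("line width bits: 0x%x (%.3f)\n", lw_bits, lw_bits / 128.0f);
    printf("point size bits: 0x%x (%.3f)\n", ps_bits, ps_bits / 8.0f);
    return 0;
}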