Import i915 and i965 dri drivers from mesa 7.6.0.7.6.0

author: Luc Verhaegen <libv@skynet.be> 2010-03-14 05:54:10 +0100
committer: Luc Verhaegen <libv@skynet.be> 2010-03-14 05:54:10 +0100
commit: 4cfe98db595fd3876e252bd23ed48bf82836f795 (patch)
tree: 4a05e56b9c3ebba1f42460ffe2e6cf6e7e08eafe
parent: da7796a7d40ca2cd4a03e99ddf38e3a7256a401f (diff)
106 files changed, 5429 insertions, 2809 deletions
diff --git a/configure.ac b/configure.ac
index b73fe88..af47813 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,7 +1,7 @@
 # Process this file with autoconf to produce a configure script
 
 AC_PREREQ(2.57)
-AC_INIT([mesa-dri-i9xx], 7.5.0, [], mesa-dri-i9xx)
+AC_INIT([mesa-dri-i9xx], 7.6.0, [], mesa-dri-i9xx)
 
 AM_INIT_AUTOMAKE([dist-bzip2])
 
@@ -16,9 +16,8 @@ AC_PROG_CC
 AC_HEADER_STDC
 
 PKG_CHECK_MODULES([DRM], [libdrm >= 2.4.3])
-# changes in struct gl_fragment_program.
-PKG_CHECK_MODULES([DRI], [libmesadri >= 7.5.0 libmesadri < 7.6.0
-			  libmesadricommon >= 7.5.0 libmesadricommon < 7.6.0])
+PKG_CHECK_MODULES([DRI], [libmesadri >= 7.6.0 libmesadri < 7.7.0
+			  libmesadricommon >= 7.6.0 libmesadricommon < 7.7.0])
 
 AC_OUTPUT([
 	Makefile
diff --git a/i915/Makefile.am b/i915/Makefile.am
index 2dc608c..451e9fd 100644
--- a/i915/Makefile.am
+++ b/i915/Makefile.am
@@ -20,6 +20,7 @@ i915_dri_la_SOURCES = \
 	../shared/intel_batchbuffer.c \
 	../shared/intel_clear.c \
 	../shared/intel_extensions.c \
+	../shared/intel_generatemipmap.c \
 	../shared/intel_mipmap_tree.c \
 	../shared/intel_tex_layout.c \
 	../shared/intel_tex_image.c \
@@ -32,7 +33,7 @@ i915_dri_la_SOURCES = \
 	../shared/intel_pixel_bitmap.c \
 	../shared/intel_pixel_copy.c \
 	../shared/intel_pixel_draw.c \
-	intel_pixel_read.c \
+	../shared/intel_pixel_read.c \
 	../shared/intel_buffers.c \
 	../shared/intel_blit.c \
 	../shared/intel_swapbuffers.c \
@@ -51,5 +52,6 @@ i915_dri_la_SOURCES = \
 	../shared/intel_screen.c \
 	../shared/intel_span.c \
 	../shared/intel_state.c \
+	../shared/intel_syncobj.c \
 	intel_tris.c \
 	../shared/intel_fbo.c
diff --git a/i915/i830_context.h b/i915/i830_context.h
index 1bdb320..f73cbbf 100644
--- a/i915/i830_context.h
+++ b/i915/i830_context.h
@@ -40,6 +40,7 @@
 #define I830_UPLOAD_BUFFERS          0x2
 #define I830_UPLOAD_STIPPLE          0x4
 #define I830_UPLOAD_INVARIENT        0x8
+#define I830_UPLOAD_RASTER_RULES     0x10
 #define I830_UPLOAD_TEX(i)           (0x10<<(i))
 #define I830_UPLOAD_TEXBLEND(i)      (0x100<<(i))
 #define I830_UPLOAD_TEX_ALL          (0x0f0)
@@ -99,6 +100,11 @@
 
 #define I830_TEXBLEND_SIZE	12      /* (4 args + op) * 2 + COLOR_FACTOR */
 
+enum {
+   I830_RASTER_RULES,
+   I830_RASTER_RULES_SIZE
+};
+
 struct i830_texture_object
 {
    struct intel_texture_object intel;
@@ -112,6 +118,7 @@ struct i830_hw_state
    GLuint Ctx[I830_CTX_SETUP_SIZE];
    GLuint Buffer[I830_DEST_SETUP_SIZE];
    GLuint Stipple[I830_STP_SETUP_SIZE];
+   GLuint RasterRules[I830_RASTER_RULES_SIZE];
    GLuint Tex[I830_TEX_UNITS][I830_TEX_SETUP_SIZE];
    GLuint TexBlend[I830_TEX_UNITS][I830_TEXBLEND_SIZE];
    GLuint TexBlendWordsUsed[I830_TEX_UNITS];
@@ -197,6 +204,7 @@ extern void i830InitStateFuncs(struct dd_function_table *functions);
 extern void i830EmitState(struct i830_context *i830);
 
 extern void i830InitState(struct i830_context *i830);
+extern void i830_update_provoking_vertex(GLcontext *ctx);
 
 /* i830_metaops.c
  */
diff --git a/i915/i830_reg.h b/i915/i830_reg.h
index d210c2d..ae13170 100644
--- a/i915/i830_reg.h
+++ b/i915/i830_reg.h
@@ -48,19 +48,6 @@
 #define AA_LINE_ENABLE			((1<<1) | 1)
 #define AA_LINE_DISABLE			(1<<1)
 
-#define _3DSTATE_BUF_INFO_CMD	(CMD_3D | (0x1d<<24) | (0x8e<<16) | 1)
-/* Dword 1 */
-#define BUF_3D_ID_COLOR_BACK	(0x3<<24)
-#define BUF_3D_ID_DEPTH 	(0x7<<24)
-#define BUF_3D_USE_FENCE	(1<<23)
-#define BUF_3D_TILED_SURFACE	(1<<22)
-#define BUF_3D_TILE_WALK_X	0
-#define BUF_3D_TILE_WALK_Y	(1<<21)
-#define BUF_3D_PITCH(x)         (((x)/4)<<2)
-/* Dword 2 */
-#define BUF_3D_ADDR(x)		((x) & ~0x3)
-
-
 #define _3DSTATE_COLOR_FACTOR_CMD	(CMD_3D | (0x1d<<24) | (0x1<<16))
 
 #define _3DSTATE_COLOR_FACTOR_N_CMD(stage)	(CMD_3D | (0x1d<<24) | \
@@ -433,8 +420,11 @@
 #define ENABLE_LINE_STRIP_PROVOKE_VRTX	(1<<8)
 #define ENABLE_TRI_FAN_PROVOKE_VRTX	(1<<5)
 #define ENABLE_TRI_STRIP_PROVOKE_VRTX	(1<<2)
+#define LINE_STRIP_PROVOKE_VRTX_MASK	(3<<6)
 #define LINE_STRIP_PROVOKE_VRTX(x)	((x)<<6)
+#define TRI_FAN_PROVOKE_VRTX_MASK	(3<<3)
 #define TRI_FAN_PROVOKE_VRTX(x) 	((x)<<3)
+#define TRI_STRIP_PROVOKE_VRTX_MASK	(3<<0)
 #define TRI_STRIP_PROVOKE_VRTX(x)	(x)
 
 /* _3DSTATE_SCISSOR_ENABLE, p200 */
diff --git a/i915/i830_state.c b/i915/i830_state.c
index 8ef6c91..645ebe3 100644
--- a/i915/i830_state.c
+++ b/i915/i830_state.c
@@ -1047,6 +1047,16 @@ i830_init_packets(struct i830_context *i830)
                                          TEXBIND_SET1(TEXCOORDSRC_VTXSET_1) |
                                          TEXBIND_SET0(TEXCOORDSRC_VTXSET_0));
 
+   i830->state.RasterRules[I830_RASTER_RULES] = (_3DSTATE_RASTER_RULES_CMD |
+						 ENABLE_POINT_RASTER_RULE |
+						 OGL_POINT_RASTER_RULE |
+						 ENABLE_LINE_STRIP_PROVOKE_VRTX |
+						 ENABLE_TRI_FAN_PROVOKE_VRTX |
+						 ENABLE_TRI_STRIP_PROVOKE_VRTX |
+						 LINE_STRIP_PROVOKE_VRTX(1) |
+						 TRI_FAN_PROVOKE_VRTX(2) |
+						 TRI_STRIP_PROVOKE_VRTX(2));
+
 
    i830->state.Stipple[I830_STPREG_ST0] = _3DSTATE_STIPPLE;
 
@@ -1058,6 +1068,27 @@ i830_init_packets(struct i830_context *i830)
    i830->state.Buffer[I830_DESTREG_SR2] = 0;
 }
 
+void
+i830_update_provoking_vertex(GLcontext * ctx)
+{
+   struct i830_context *i830 = i830_context(ctx);
+
+   I830_STATECHANGE(i830, I830_UPLOAD_RASTER_RULES);
+   i830->state.RasterRules[I830_RASTER_RULES] &= ~(LINE_STRIP_PROVOKE_VRTX_MASK |
+						   TRI_FAN_PROVOKE_VRTX_MASK |
+						   TRI_STRIP_PROVOKE_VRTX_MASK);
+
+   /* _NEW_LIGHT */
+   if (ctx->Light.ProvokingVertex == GL_LAST_VERTEX_CONVENTION) {
+      i830->state.RasterRules[I830_RASTER_RULES] |= (LINE_STRIP_PROVOKE_VRTX(1) |
+						     TRI_FAN_PROVOKE_VRTX(2) |
+						     TRI_STRIP_PROVOKE_VRTX(2));
+   } else {
+      i830->state.RasterRules[I830_RASTER_RULES] |= (LINE_STRIP_PROVOKE_VRTX(0) |
+						     TRI_FAN_PROVOKE_VRTX(1) |
+						     TRI_STRIP_PROVOKE_VRTX(0));
+    }
+}
 
 void
 i830InitStateFuncs(struct dd_function_table *functions)
@@ -1101,6 +1132,7 @@ i830InitState(struct i830_context *i830)
    i830->current = &i830->state;
    i830->state.emitted = 0;
    i830->state.active = (I830_UPLOAD_INVARIENT |
+                         I830_UPLOAD_RASTER_RULES |
                          I830_UPLOAD_TEXBLEND(0) |
                          I830_UPLOAD_STIPPLE |
                          I830_UPLOAD_CTX | I830_UPLOAD_BUFFERS);
diff --git a/i915/i830_texstate.c b/i915/i830_texstate.c
index 753c25b..6f998fa 100644
--- a/i915/i830_texstate.c
+++ b/i915/i830_texstate.c
@@ -174,14 +174,16 @@ i830_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
    state[I830_TEXREG_TM0LI] = (_3DSTATE_LOAD_STATE_IMMEDIATE_2 |
                                (LOAD_TEXTURE_MAP0 << unit) | 4);
 
-/*    state[I830_TEXREG_TM0S0] = (TM0S0_USE_FENCE | */
-/* 			       t->intel.TextureOffset); */
-
-
    state[I830_TEXREG_TM0S1] =
       (((firstImage->Height - 1) << TM0S1_HEIGHT_SHIFT) |
        ((firstImage->Width - 1) << TM0S1_WIDTH_SHIFT) | format);
 
+   if (intelObj->mt->region->tiling != I915_TILING_NONE) {
+      state[I830_TEXREG_TM0S1] |= TM0S1_TILED_SURFACE;
+      if (intelObj->mt->region->tiling == I915_TILING_Y)
+	 state[I830_TEXREG_TM0S1] |= TM0S1_TILE_WALK;
+   }
+
    state[I830_TEXREG_TM0S2] =
       ((((pitch / 4) - 1) << TM0S2_PITCH_SHIFT) | TM0S2_CUBE_FACE_ENA_MASK);
 
diff --git a/i915/i830_vtbl.c b/i915/i830_vtbl.c
index 3bf02de..983f672 100644
--- a/i915/i830_vtbl.c
+++ b/i915/i830_vtbl.c
@@ -299,7 +299,7 @@ i830_emit_invarient_state(struct intel_context *intel)
 {
    BATCH_LOCALS;
 
-   BEGIN_BATCH(30, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(29, IGNORE_CLIPRECTS);
 
    OUT_BATCH(_3DSTATE_DFLT_DIFFUSE_CMD);
    OUT_BATCH(0);
@@ -351,15 +351,6 @@ i830_emit_invarient_state(struct intel_context *intel)
    OUT_BATCH(_3DSTATE_MAP_COORD_TRANSFORM);
    OUT_BATCH(DISABLE_TEX_TRANSFORM | TEXTURE_SET(3));
 
-   OUT_BATCH(_3DSTATE_RASTER_RULES_CMD |
-             ENABLE_POINT_RASTER_RULE |
-             OGL_POINT_RASTER_RULE |
-             ENABLE_LINE_STRIP_PROVOKE_VRTX |
-             ENABLE_TRI_FAN_PROVOKE_VRTX |
-             ENABLE_TRI_STRIP_PROVOKE_VRTX |
-             LINE_STRIP_PROVOKE_VRTX(1) |
-             TRI_FAN_PROVOKE_VRTX(2) | TRI_STRIP_PROVOKE_VRTX(2));
-
    OUT_BATCH(_3DSTATE_VERTEX_TRANSFORM);
    OUT_BATCH(DISABLE_VIEWPORT_TRANSFORM | DISABLE_PERSPECTIVE_DIVIDE);
 
@@ -394,6 +385,9 @@ get_state_size(struct i830_hw_state *state)
    if (dirty & I830_UPLOAD_INVARIENT)
       sz += 40 * sizeof(int);
 
+   if (dirty & I830_UPLOAD_RASTER_RULES)
+      sz += sizeof(state->RasterRules);
+
    if (dirty & I830_UPLOAD_CTX)
       sz += sizeof(state->Ctx);
 
@@ -486,6 +480,11 @@ i830_emit_state(struct intel_context *intel)
       i830_emit_invarient_state(intel);
    }
 
+   if (dirty & I830_UPLOAD_RASTER_RULES) {
+      DBG("I830_UPLOAD_RASTER_RULES:\n");
+      emit(intel, state->RasterRules, sizeof(state->RasterRules));
+   }
+
    if (dirty & I830_UPLOAD_CTX) {
       DBG("I830_UPLOAD_CTX:\n");
       emit(intel, state->Ctx, sizeof(state->Ctx));
@@ -552,7 +551,7 @@ i830_emit_state(struct intel_context *intel)
          if (state->tex_buffer[i]) {
             OUT_RELOC(state->tex_buffer[i],
 		      I915_GEM_DOMAIN_SAMPLER, 0,
-                      state->tex_offset[i] | TM0S0_USE_FENCE);
+                      state->tex_offset[i]);
          }
 	 else if (state == &i830->meta) {
 	    assert(i == 0);
@@ -634,21 +633,11 @@ i830_state_draw_region(struct intel_context *intel,
    /*
     * Set stride/cpp values
     */
-   if (color_region) {
-      state->Buffer[I830_DESTREG_CBUFADDR0] = _3DSTATE_BUF_INFO_CMD;
-      state->Buffer[I830_DESTREG_CBUFADDR1] =
-         (BUF_3D_ID_COLOR_BACK |
-          BUF_3D_PITCH(color_region->pitch * color_region->cpp) |
-          BUF_3D_USE_FENCE);
-   }
+   i915_set_buf_info_for_region(&state->Buffer[I830_DESTREG_CBUFADDR0],
+				color_region, BUF_3D_ID_COLOR_BACK);
 
-   if (depth_region) {
-      state->Buffer[I830_DESTREG_DBUFADDR0] = _3DSTATE_BUF_INFO_CMD;
-      state->Buffer[I830_DESTREG_DBUFADDR1] =
-         (BUF_3D_ID_DEPTH |
-          BUF_3D_PITCH(depth_region->pitch * depth_region->cpp) |
-          BUF_3D_USE_FENCE);
-   }
+   i915_set_buf_info_for_region(&state->Buffer[I830_DESTREG_DBUFADDR0],
+				depth_region, BUF_3D_ID_DEPTH);
 
    /*
     * Compute/set I830_DESTREG_DV1 value
@@ -718,26 +707,6 @@ i830_set_draw_region(struct intel_context *intel,
    i830_state_draw_region(intel, &i830->state, color_regions[0], depth_region);
 }
 
-#if 0
-static void
-i830_update_color_z_regions(intelContextPtr intel,
-                            const intelRegion * colorRegion,
-                            const intelRegion * depthRegion)
-{
-   i830ContextPtr i830 = I830_CONTEXT(intel);
-
-   i830->state.Buffer[I830_DESTREG_CBUFADDR1] =
-      (BUF_3D_ID_COLOR_BACK | BUF_3D_PITCH(colorRegion->pitch) |
-       BUF_3D_USE_FENCE);
-   i830->state.Buffer[I830_DESTREG_CBUFADDR2] = colorRegion->offset;
-
-   i830->state.Buffer[I830_DESTREG_DBUFADDR1] =
-      (BUF_3D_ID_DEPTH | BUF_3D_PITCH(depthRegion->pitch) | BUF_3D_USE_FENCE);
-   i830->state.Buffer[I830_DESTREG_DBUFADDR2] = depthRegion->offset;
-}
-#endif
-
-
 /* This isn't really handled at the moment.
  */
 static void
@@ -768,9 +737,10 @@ i830_assert_not_dirty( struct intel_context *intel )
 }
 
 static void
-i830_note_unlock( struct intel_context *intel )
+i830_invalidate_state(struct intel_context *intel, GLuint new_state)
 {
-    /* nothing */
+   if (new_state & _NEW_LIGHT)
+      i830_update_provoking_vertex(&intel->ctx);
 }
 
 void
@@ -787,6 +757,6 @@ i830InitVtbl(struct i830_context *i830)
    i830->intel.vtbl.render_start = i830_render_start;
    i830->intel.vtbl.render_prevalidate = i830_render_prevalidate;
    i830->intel.vtbl.assert_not_dirty = i830_assert_not_dirty;
-   i830->intel.vtbl.note_unlock = i830_note_unlock; 
    i830->intel.vtbl.finish_batch = intel_finish_vb;
+   i830->intel.vtbl.invalidate_state = i830_invalidate_state;
 }
diff --git a/i915/i915_context.c b/i915/i915_context.c
index 1f9f363..3ab7d68 100644
--- a/i915/i915_context.c
+++ b/i915/i915_context.c
@@ -27,6 +27,7 @@
 
 #include "i915_context.h"
 #include "main/imports.h"
+#include "main/macros.h"
 #include "intel_tex.h"
 #include "intel_tris.h"
 #include "tnl/t_context.h"
@@ -73,8 +74,12 @@ i915InvalidateState(GLcontext * ctx, GLuint new_state)
          p->params_uptodate = 0;
    }
 
-   if (new_state & (_NEW_FOG | _NEW_HINT | _NEW_PROGRAM))
+   if (new_state & (_NEW_FOG | _NEW_HINT | _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS))
       i915_update_fog(ctx);
+   if (new_state & (_NEW_STENCIL | _NEW_BUFFERS | _NEW_POLYGON))
+      i915_update_stencil(ctx);
+   if (new_state & (_NEW_LIGHT))
+       i915_update_provoking_vertex(ctx);
 }
 
 
@@ -162,6 +167,9 @@ i915CreateContext(const __GLcontextModes * mesaVis,
    ctx->Const.FragmentProgram.MaxNativeTexIndirections =
       I915_MAX_TEX_INDIRECT;
    ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0; /* I don't think we have one */
+   ctx->Const.FragmentProgram.MaxEnvParams =
+      MIN2(ctx->Const.FragmentProgram.MaxNativeParameters,
+	   ctx->Const.FragmentProgram.MaxEnvParams);
 
    ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
 
diff --git a/i915/i915_context.h b/i915/i915_context.h
index 87bbf5f..8de4a9d 100644
--- a/i915/i915_context.h
+++ b/i915/i915_context.h
@@ -48,6 +48,7 @@
 #define I915_UPLOAD_FOG              0x20
 #define I915_UPLOAD_INVARIENT        0x40
 #define I915_UPLOAD_DEFAULTS         0x80
+#define I915_UPLOAD_RASTER_RULES     0x100
 #define I915_UPLOAD_TEX(i)           (0x00010000<<(i))
 #define I915_UPLOAD_TEX_ALL          (0x00ff0000)
 #define I915_UPLOAD_TEX_0_SHIFT      16
@@ -82,7 +83,9 @@
 #define I915_CTXREG_IAB   	 	6
 #define I915_CTXREG_BLENDCOLOR0		7
 #define I915_CTXREG_BLENDCOLOR1		8
-#define I915_CTX_SETUP_SIZE		9
+#define I915_CTXREG_BF_STENCIL_OPS	9
+#define I915_CTXREG_BF_STENCIL_MASKS	10
+#define I915_CTX_SETUP_SIZE		11
 
 #define I915_FOGREG_COLOR		0
 #define I915_FOGREG_MODE0		1
@@ -110,6 +113,10 @@
 #define I915_DEFREG_Z1    5
 #define I915_DEF_SETUP_SIZE    6
 
+enum {
+   I915_RASTER_RULES,
+   I915_RASTER_RULES_SETUP_SIZE,
+};
 
 #define I915_MAX_CONSTANT      32
 #define I915_CONSTANT_SIZE     (2+(4*I915_MAX_CONSTANT))
@@ -206,6 +213,7 @@ struct i915_hw_state
    GLuint Stipple[I915_STP_SETUP_SIZE];
    GLuint Fog[I915_FOG_SETUP_SIZE];
    GLuint Defaults[I915_DEF_SETUP_SIZE];
+   GLuint RasterRules[I915_RASTER_RULES_SETUP_SIZE];
    GLuint Tex[I915_TEX_UNITS][I915_TEX_SETUP_SIZE];
    GLuint Constant[I915_CONSTANT_SIZE];
    GLuint ConstantSize;
@@ -321,6 +329,8 @@ extern void i915_print_ureg(const char *msg, GLuint ureg);
 extern void i915InitStateFunctions(struct dd_function_table *functions);
 extern void i915InitState(struct i915_context *i915);
 extern void i915_update_fog(GLcontext * ctx);
+extern void i915_update_stencil(GLcontext * ctx);
+extern void i915_update_provoking_vertex(GLcontext *ctx);
 
 
 /*======================================================================
diff --git a/i915/i915_reg.h b/i915/i915_reg.h
index 8891e11..b5fa7fd 100644
--- a/i915/i915_reg.h
+++ b/i915/i915_reg.h
@@ -86,27 +86,15 @@
 #define BFM_ENABLE_STENCIL_WRITE_MASK     (1<<16)
 #define BFM_STENCIL_TEST_MASK_SHIFT       8
 #define BFM_STENCIL_TEST_MASK_MASK        (0xff<<8)
+#define BFM_STENCIL_TEST_MASK(x)	  (((x)&0xff) << 8)
 #define BFM_STENCIL_WRITE_MASK_SHIFT      0
 #define BFM_STENCIL_WRITE_MASK_MASK       (0xff<<0)
+#define BFM_STENCIL_WRITE_MASK(x)	  ((x)&0xff)
 
 
 
 /* 3DSTATE_BIN_CONTROL p141 */
 
-/* p143 */
-#define _3DSTATE_BUF_INFO_CMD	(CMD_3D | (0x1d<<24) | (0x8e<<16) | 1)
-/* Dword 1 */
-#define BUF_3D_ID_COLOR_BACK	(0x3<<24)
-#define BUF_3D_ID_DEPTH 	(0x7<<24)
-#define BUF_3D_USE_FENCE	(1<<23)
-#define BUF_3D_TILED_SURFACE	(1<<22)
-#define BUF_3D_TILE_WALK_X	0
-#define BUF_3D_TILE_WALK_Y	(1<<21)
-#define BUF_3D_PITCH(x)         (((x)/4)<<2)
-/* Dword 2 */
-#define BUF_3D_ADDR(x)		((x) & ~0x3)
-
-
 /* 3DSTATE_CHROMA_KEY */
 
 /* 3DSTATE_CLEAR_PARAMETERS, p150 */
@@ -155,6 +143,7 @@
 /* p161 */
 #define _3DSTATE_DST_BUF_VARS_CMD	(CMD_3D | (0x1d<<24) | (0x85<<16))
 /* Dword 1 */
+#define CLASSIC_EARLY_DEPTH		(1<<31)
 #define TEX_DEFAULT_COLOR_OGL           (0<<30)
 #define TEX_DEFAULT_COLOR_D3D           (1<<30)
 #define ZR_EARLY_DEPTH                  (1<<29)
@@ -308,7 +297,9 @@
 #define TEXKILL_4D                      (1<<9)
 #define ENABLE_LINE_STRIP_PROVOKE_VRTX	(1<<8)
 #define ENABLE_TRI_FAN_PROVOKE_VRTX	(1<<5)
+#define LINE_STRIP_PROVOKE_VRTX_MASK	(3 << 6)
 #define LINE_STRIP_PROVOKE_VRTX(x)	((x)<<6)
+#define TRI_FAN_PROVOKE_VRTX_MASK	(3 << 3)
 #define TRI_FAN_PROVOKE_VRTX(x) 	((x)<<3)
 
 /* _3DSTATE_SCISSOR_ENABLE, p256 */
diff --git a/i915/i915_state.c b/i915/i915_state.c
index 814fb59..b60efea 100644
--- a/i915/i915_state.c
+++ b/i915/i915_state.c
@@ -48,73 +48,119 @@
 
 #define FILE_DEBUG_FLAG DEBUG_STATE
 
-static void
-i915StencilFuncSeparate(GLcontext * ctx, GLenum face, GLenum func, GLint ref,
-                        GLuint mask)
+void
+i915_update_stencil(GLcontext * ctx)
 {
    struct i915_context *i915 = I915_CONTEXT(ctx);
-   int test = intel_translate_compare_func(func);
+   GLuint front_ref, front_writemask, front_mask;
+   GLenum front_func, front_fail, front_pass_z_fail, front_pass_z_pass;
+   GLuint back_ref, back_writemask, back_mask;
+   GLenum back_func, back_fail, back_pass_z_fail, back_pass_z_pass;
 
-   mask = mask & 0xff;
-
-   DBG("%s : func: %s, ref : 0x%x, mask: 0x%x\n", __FUNCTION__,
-       _mesa_lookup_enum_by_nr(func), ref, mask);
+   I915_STATECHANGE(i915, I915_UPLOAD_CTX);
 
+   /* The 915 considers CW to be "front" for two-sided stencil, so choose
+    * appropriately.
+    */
+   /* _NEW_POLYGON | _NEW_STENCIL */
+   if (ctx->Polygon.FrontFace == GL_CW) {
+      front_ref = ctx->Stencil.Ref[0];
+      front_mask = ctx->Stencil.ValueMask[0];
+      front_writemask = ctx->Stencil.WriteMask[0];
+      front_func = ctx->Stencil.Function[0];
+      front_fail = ctx->Stencil.FailFunc[0];
+      front_pass_z_fail = ctx->Stencil.ZFailFunc[0];
+      front_pass_z_pass = ctx->Stencil.ZPassFunc[0];
+      back_ref = ctx->Stencil.Ref[ctx->Stencil._BackFace];
+      back_mask = ctx->Stencil.ValueMask[ctx->Stencil._BackFace];
+      back_writemask = ctx->Stencil.WriteMask[ctx->Stencil._BackFace];
+      back_func = ctx->Stencil.Function[ctx->Stencil._BackFace];
+      back_fail = ctx->Stencil.FailFunc[ctx->Stencil._BackFace];
+      back_pass_z_fail = ctx->Stencil.ZFailFunc[ctx->Stencil._BackFace];
+      back_pass_z_pass = ctx->Stencil.ZPassFunc[ctx->Stencil._BackFace];
+   } else {
+      front_ref = ctx->Stencil.Ref[ctx->Stencil._BackFace];
+      front_mask = ctx->Stencil.ValueMask[ctx->Stencil._BackFace];
+      front_writemask = ctx->Stencil.WriteMask[ctx->Stencil._BackFace];
+      front_func = ctx->Stencil.Function[ctx->Stencil._BackFace];
+      front_fail = ctx->Stencil.FailFunc[ctx->Stencil._BackFace];
+      front_pass_z_fail = ctx->Stencil.ZFailFunc[ctx->Stencil._BackFace];
+      front_pass_z_pass = ctx->Stencil.ZPassFunc[ctx->Stencil._BackFace];
+      back_ref = ctx->Stencil.Ref[0];
+      back_mask = ctx->Stencil.ValueMask[0];
+      back_writemask = ctx->Stencil.WriteMask[0];
+      back_func = ctx->Stencil.Function[0];
+      back_fail = ctx->Stencil.FailFunc[0];
+      back_pass_z_fail = ctx->Stencil.ZFailFunc[0];
+      back_pass_z_pass = ctx->Stencil.ZPassFunc[0];
+   }
 
-   I915_STATECHANGE(i915, I915_UPLOAD_CTX);
-   i915->state.Ctx[I915_CTXREG_STATE4] &= ~MODE4_ENABLE_STENCIL_TEST_MASK;
+   /* Set front state. */
+   i915->state.Ctx[I915_CTXREG_STATE4] &= ~(MODE4_ENABLE_STENCIL_TEST_MASK |
+					    MODE4_ENABLE_STENCIL_WRITE_MASK);
    i915->state.Ctx[I915_CTXREG_STATE4] |= (ENABLE_STENCIL_TEST_MASK |
-                                           STENCIL_TEST_MASK(mask));
+					   ENABLE_STENCIL_WRITE_MASK |
+					   STENCIL_TEST_MASK(front_mask) |
+					   STENCIL_WRITE_MASK(front_writemask));
 
    i915->state.Ctx[I915_CTXREG_LIS5] &= ~(S5_STENCIL_REF_MASK |
-                                          S5_STENCIL_TEST_FUNC_MASK);
+					  S5_STENCIL_TEST_FUNC_MASK |
+					  S5_STENCIL_FAIL_MASK |
+					  S5_STENCIL_PASS_Z_FAIL_MASK |
+					  S5_STENCIL_PASS_Z_PASS_MASK);
+
+   i915->state.Ctx[I915_CTXREG_LIS5] |=
+      (front_ref << S5_STENCIL_REF_SHIFT) |
+      (intel_translate_compare_func(front_func) << S5_STENCIL_TEST_FUNC_SHIFT) |
+      (intel_translate_stencil_op(front_fail) << S5_STENCIL_FAIL_SHIFT) |
+      (intel_translate_stencil_op(front_pass_z_fail) <<
+       S5_STENCIL_PASS_Z_FAIL_SHIFT) |
+      (intel_translate_stencil_op(front_pass_z_pass) <<
+       S5_STENCIL_PASS_Z_PASS_SHIFT);
+
+   /* Set back state if different from front. */
+   if (ctx->Stencil._TestTwoSide) {
+      i915->state.Ctx[I915_CTXREG_BF_STENCIL_OPS] &=
+	 ~(BFO_STENCIL_REF_MASK |
+	   BFO_STENCIL_TEST_MASK |
+	   BFO_STENCIL_FAIL_MASK |
+	   BFO_STENCIL_PASS_Z_FAIL_MASK |
+	   BFO_STENCIL_PASS_Z_PASS_MASK);
+      i915->state.Ctx[I915_CTXREG_BF_STENCIL_OPS] |= BFO_STENCIL_TWO_SIDE |
+	 (back_ref << BFO_STENCIL_REF_SHIFT) |
+	 (intel_translate_compare_func(back_func) << BFO_STENCIL_TEST_SHIFT) |
+	 (intel_translate_stencil_op(back_fail) << BFO_STENCIL_FAIL_SHIFT) |
+	 (intel_translate_stencil_op(back_pass_z_fail) <<
+	  BFO_STENCIL_PASS_Z_FAIL_SHIFT) |
+	 (intel_translate_stencil_op(back_pass_z_pass) <<
+	  BFO_STENCIL_PASS_Z_PASS_SHIFT);
+
+      i915->state.Ctx[I915_CTXREG_BF_STENCIL_MASKS] &=
+	 ~(BFM_STENCIL_TEST_MASK_MASK |
+	   BFM_STENCIL_WRITE_MASK_MASK);
+      i915->state.Ctx[I915_CTXREG_BF_STENCIL_MASKS] |=
+	 BFM_STENCIL_TEST_MASK(back_mask) |
+	 BFM_STENCIL_WRITE_MASK(back_writemask);
+   } else {
+      i915->state.Ctx[I915_CTXREG_BF_STENCIL_OPS] &= ~BFO_STENCIL_TWO_SIDE;
+   }
+}
 
-   i915->state.Ctx[I915_CTXREG_LIS5] |= ((ref << S5_STENCIL_REF_SHIFT) |
-                                         (test <<
-                                          S5_STENCIL_TEST_FUNC_SHIFT));
+static void
+i915StencilFuncSeparate(GLcontext * ctx, GLenum face, GLenum func, GLint ref,
+                        GLuint mask)
+{
 }
 
 static void
 i915StencilMaskSeparate(GLcontext * ctx, GLenum face, GLuint mask)
 {
-   struct i915_context *i915 = I915_CONTEXT(ctx);
-
-   DBG("%s : mask 0x%x\n", __FUNCTION__, mask);
-   
-   mask = mask & 0xff;
-
-   I915_STATECHANGE(i915, I915_UPLOAD_CTX);
-   i915->state.Ctx[I915_CTXREG_STATE4] &= ~MODE4_ENABLE_STENCIL_WRITE_MASK;
-   i915->state.Ctx[I915_CTXREG_STATE4] |= (ENABLE_STENCIL_WRITE_MASK |
-                                           STENCIL_WRITE_MASK(mask));
 }
 
-
 static void
 i915StencilOpSeparate(GLcontext * ctx, GLenum face, GLenum fail, GLenum zfail,
                       GLenum zpass)
 {
-   struct i915_context *i915 = I915_CONTEXT(ctx);
-   int fop = intel_translate_stencil_op(fail);
-   int dfop = intel_translate_stencil_op(zfail);
-   int dpop = intel_translate_stencil_op(zpass);
-
-
-   DBG("%s: fail : %s, zfail: %s, zpass : %s\n", __FUNCTION__,
-       _mesa_lookup_enum_by_nr(fail),
-       _mesa_lookup_enum_by_nr(zfail), _mesa_lookup_enum_by_nr(zpass));
-
-   I915_STATECHANGE(i915, I915_UPLOAD_CTX);
-
-   i915->state.Ctx[I915_CTXREG_LIS5] &= ~(S5_STENCIL_FAIL_MASK |
-                                          S5_STENCIL_PASS_Z_FAIL_MASK |
-                                          S5_STENCIL_PASS_Z_PASS_MASK);
-
-   i915->state.Ctx[I915_CTXREG_LIS5] |= ((fop << S5_STENCIL_FAIL_SHIFT) |
-                                         (dfop <<
-                                          S5_STENCIL_PASS_Z_FAIL_SHIFT) |
-                                         (dpop <<
-                                          S5_STENCIL_PASS_Z_PASS_SHIFT));
 }
 
 static void
@@ -945,6 +991,17 @@ i915_init_packets(struct i915_context *i915)
          _3DSTATE_CONST_BLEND_COLOR_CMD;
       i915->state.Ctx[I915_CTXREG_BLENDCOLOR1] = 0;
 
+      i915->state.Ctx[I915_CTXREG_BF_STENCIL_MASKS] =
+	 _3DSTATE_BACKFACE_STENCIL_MASKS |
+	 BFM_ENABLE_STENCIL_TEST_MASK |
+	 BFM_ENABLE_STENCIL_WRITE_MASK |
+	 (0xff << BFM_STENCIL_WRITE_MASK_SHIFT) |
+	 (0xff << BFM_STENCIL_TEST_MASK_SHIFT);
+      i915->state.Ctx[I915_CTXREG_BF_STENCIL_OPS] =
+	 _3DSTATE_BACKFACE_STENCIL_OPS |
+	 BFO_ENABLE_STENCIL_REF |
+	 BFO_ENABLE_STENCIL_FUNCS |
+	 BFO_ENABLE_STENCIL_TWO_SIDE;
    }
 
    {
@@ -976,6 +1033,13 @@ i915_init_packets(struct i915_context *i915)
       i915->state.Buffer[I915_DESTREG_SR2] = 0;
    }
 
+   i915->state.RasterRules[I915_RASTER_RULES] = _3DSTATE_RASTER_RULES_CMD |
+      ENABLE_POINT_RASTER_RULE |
+      OGL_POINT_RASTER_RULE |
+      ENABLE_LINE_STRIP_PROVOKE_VRTX |
+      ENABLE_TRI_FAN_PROVOKE_VRTX |
+      LINE_STRIP_PROVOKE_VRTX(1) |
+      TRI_FAN_PROVOKE_VRTX(2) | ENABLE_TEXKILL_3D_4D | TEXKILL_4D;
 
 #if 0
    {
@@ -996,7 +1060,33 @@ i915_init_packets(struct i915_context *i915)
    i915->state.active = (I915_UPLOAD_PROGRAM |
                          I915_UPLOAD_STIPPLE |
                          I915_UPLOAD_CTX |
-                         I915_UPLOAD_BUFFERS | I915_UPLOAD_INVARIENT);
+                         I915_UPLOAD_BUFFERS |
+			 I915_UPLOAD_INVARIENT |
+			 I915_UPLOAD_RASTER_RULES);
+}
+
+void
+i915_update_provoking_vertex(GLcontext * ctx)
+{
+   struct i915_context *i915 = I915_CONTEXT(ctx);
+
+   I915_STATECHANGE(i915, I915_UPLOAD_CTX);
+   i915->state.Ctx[I915_CTXREG_LIS6] &= ~(S6_TRISTRIP_PV_MASK);
+
+   I915_STATECHANGE(i915, I915_UPLOAD_RASTER_RULES);
+   i915->state.RasterRules[I915_RASTER_RULES] &= ~(LINE_STRIP_PROVOKE_VRTX_MASK |
+						   TRI_FAN_PROVOKE_VRTX_MASK);
+
+   /* _NEW_LIGHT */
+   if (ctx->Light.ProvokingVertex == GL_LAST_VERTEX_CONVENTION) {
+      i915->state.RasterRules[I915_RASTER_RULES] |= (LINE_STRIP_PROVOKE_VRTX(1) |
+						     TRI_FAN_PROVOKE_VRTX(2));
+      i915->state.Ctx[I915_CTXREG_LIS6] |= (2 << S6_TRISTRIP_PV_SHIFT);
+   } else {
+      i915->state.RasterRules[I915_RASTER_RULES] |= (LINE_STRIP_PROVOKE_VRTX(0) |
+						     TRI_FAN_PROVOKE_VRTX(1));
+      i915->state.Ctx[I915_CTXREG_LIS6] |= (0 << S6_TRISTRIP_PV_SHIFT);
+    }
 }
 
 void
diff --git a/i915/i915_tex_layout.c b/i915/i915_tex_layout.c
index 7cc1c09..d9588e5 100644
--- a/i915/i915_tex_layout.c
+++ b/i915/i915_tex_layout.c
@@ -55,6 +55,17 @@ static GLint step_offsets[6][2] = {
    [FACE_NEG_Z] = {-1, 1},
 };
 
+
+static GLint bottom_offsets[6] = {
+   [FACE_POS_X] = 16 + 0 * 8,
+   [FACE_POS_Y] = 16 + 1 * 8,
+   [FACE_POS_Z] = 16 + 2 * 8,
+   [FACE_NEG_X] = 16 + 3 * 8,
+   [FACE_NEG_Y] = 16 + 4 * 8,
+   [FACE_NEG_Z] = 16 + 5 * 8,
+};
+
+
 /**
  * Cube texture map layout for i830M-GM915.
  *
@@ -101,7 +112,8 @@ static GLint step_offsets[6][2] = {
  */
 static void
 i915_miptree_layout_cube(struct intel_context *intel,
-			 struct intel_mipmap_tree * mt)
+			 struct intel_mipmap_tree * mt,
+			 uint32_t tiling)
 {
    const GLuint dim = mt->width0;
    GLuint face;
@@ -111,7 +123,7 @@ i915_miptree_layout_cube(struct intel_context *intel,
    assert(lvlWidth == lvlHeight); /* cubemap images are square */
 
    /* double pitch for cube layouts */
-   mt->pitch = intel_miptree_pitch_align (intel, mt, dim * 2);
+   mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, dim * 2);
    mt->total_height = dim * 4;
 
    for (level = mt->first_level; level <= mt->last_level; level++) {
@@ -145,7 +157,8 @@ i915_miptree_layout_cube(struct intel_context *intel,
 
 static void
 i915_miptree_layout_3d(struct intel_context *intel,
-		       struct intel_mipmap_tree * mt)
+		       struct intel_mipmap_tree * mt,
+		       uint32_t tiling)
 {
    GLuint width = mt->width0;
    GLuint height = mt->height0;
@@ -154,7 +167,7 @@ i915_miptree_layout_3d(struct intel_context *intel,
    GLint level;
 
    /* Calculate the size of a single slice. */
-   mt->pitch = intel_miptree_pitch_align (intel, mt, mt->width0);
+   mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
 
    /* XXX: hardware expects/requires 9 levels at minimum. */
    for (level = mt->first_level; level <= MAX2(8, mt->last_level); level++) {
@@ -189,14 +202,15 @@ i915_miptree_layout_3d(struct intel_context *intel,
 
 static void
 i915_miptree_layout_2d(struct intel_context *intel,
-		       struct intel_mipmap_tree * mt)
+		       struct intel_mipmap_tree * mt,
+		       uint32_t tiling)
 {
    GLuint width = mt->width0;
    GLuint height = mt->height0;
    GLuint img_height;
    GLint level;
 
-   mt->pitch = intel_miptree_pitch_align (intel, mt, mt->width0);
+   mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
    mt->total_height = 0;
 
    for (level = mt->first_level; level <= mt->last_level; level++) {
@@ -217,19 +231,20 @@ i915_miptree_layout_2d(struct intel_context *intel,
 }
 
 GLboolean
-i915_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt)
+i915_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt,
+		    uint32_t tiling)
 {
    switch (mt->target) {
    case GL_TEXTURE_CUBE_MAP:
-      i915_miptree_layout_cube(intel, mt);
+      i915_miptree_layout_cube(intel, mt, tiling);
       break;
    case GL_TEXTURE_3D:
-      i915_miptree_layout_3d(intel, mt);
+      i915_miptree_layout_3d(intel, mt, tiling);
       break;
    case GL_TEXTURE_1D:
    case GL_TEXTURE_2D:
    case GL_TEXTURE_RECTANGLE_ARB:
-      i915_miptree_layout_2d(intel, mt);
+      i915_miptree_layout_2d(intel, mt, tiling);
       break;
    default:
       _mesa_problem(NULL, "Unexpected tex target in i915_miptree_layout()");
@@ -297,7 +312,7 @@ i915_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt)
  * +---+   +---+   +---+   +---+   +---+   +---+
  *
  * The bottom row continues with the remaining 2x2 then the 1x1 mip contents
- * in order, with each of them aligned to a 4x4 block boundary.  Thus, for
+ * in order, with each of them aligned to a 8x8 block boundary.  Thus, for
  * 32x32 cube maps and smaller, the bottom row layout is going to dictate the
  * pitch of the tree.  For a tree with 4x4 images, the pitch is at least
  * 14 * 8 = 112 texels, for 2x2 it is at least 12 * 8 texels, and for 1x1
@@ -306,7 +321,8 @@ i915_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt)
 
 static void
 i945_miptree_layout_cube(struct intel_context *intel,
-			 struct intel_mipmap_tree * mt)
+			 struct intel_mipmap_tree * mt,
+			 uint32_t tiling)
 {
    const GLuint dim = mt->width0;
    GLuint face;
@@ -320,9 +336,9 @@ i945_miptree_layout_cube(struct intel_context *intel,
     * or the final row of 4x4, 2x2 and 1x1 faces below this.
     */
    if (dim > 32)
-      mt->pitch = intel_miptree_pitch_align (intel, mt, dim * 2);
+      mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, dim * 2);
    else
-      mt->pitch = intel_miptree_pitch_align (intel, mt, 14 * 8);
+      mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, 14 * 8);
 
    if (dim >= 4)
       mt->total_height = dim * 4 + 4;
@@ -375,10 +391,11 @@ i945_miptree_layout_cube(struct intel_context *intel,
 	       x = (face - 4) * 8;
 	       break;
 	    }
+	    break;
 
 	 case 2:
 	    y = mt->total_height - 4;
-	    x = 16 + face * 8;
+	    x = bottom_offsets[face];
 	    break;
 
 	 case 1:
@@ -396,7 +413,8 @@ i945_miptree_layout_cube(struct intel_context *intel,
 
 static void
 i945_miptree_layout_3d(struct intel_context *intel,
-		       struct intel_mipmap_tree * mt)
+		       struct intel_mipmap_tree * mt,
+		       uint32_t tiling)
 {
    GLuint width = mt->width0;
    GLuint height = mt->height0;
@@ -405,7 +423,7 @@ i945_miptree_layout_3d(struct intel_context *intel,
    GLuint pack_y_pitch;
    GLuint level;
 
-   mt->pitch = intel_miptree_pitch_align (intel, mt, mt->width0);
+   mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
    mt->total_height = 0;
 
    pack_y_pitch = MAX2(mt->height0, 2);
@@ -450,22 +468,23 @@ i945_miptree_layout_3d(struct intel_context *intel,
 }
 
 GLboolean
-i945_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt)
+i945_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt,
+		    uint32_t tiling)
 {
    switch (mt->target) {
    case GL_TEXTURE_CUBE_MAP:
       if (mt->compressed)
-	 i945_miptree_layout_cube(intel, mt);
+	 i945_miptree_layout_cube(intel, mt, tiling);
       else
-	 i915_miptree_layout_cube(intel, mt);
+	 i915_miptree_layout_cube(intel, mt, tiling);
       break;
    case GL_TEXTURE_3D:
-      i945_miptree_layout_3d(intel, mt);
+      i945_miptree_layout_3d(intel, mt, tiling);
       break;
    case GL_TEXTURE_1D:
    case GL_TEXTURE_2D:
    case GL_TEXTURE_RECTANGLE_ARB:
-      i945_miptree_layout_2d(intel, mt);
+      i945_miptree_layout_2d(intel, mt, tiling);
       break;
    default:
       _mesa_problem(NULL, "Unexpected tex target in i945_miptree_layout()");
diff --git a/i915/i915_texstate.c b/i915/i915_texstate.c
index a37dd7f..32d4b30 100644
--- a/i915/i915_texstate.c
+++ b/i915/i915_texstate.c
@@ -185,8 +185,13 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
 
    state[I915_TEXREG_MS3] =
       (((firstImage->Height - 1) << MS3_HEIGHT_SHIFT) |
-       ((firstImage->Width - 1) << MS3_WIDTH_SHIFT) | format |
-       MS3_USE_FENCE_REGS);
+       ((firstImage->Width - 1) << MS3_WIDTH_SHIFT) | format);
+
+   if (intelObj->mt->region->tiling != I915_TILING_NONE) {
+      state[I915_TEXREG_MS3] |= MS3_TILED_SURFACE;
+      if (intelObj->mt->region->tiling == I915_TILING_Y)
+	 state[I915_TEXREG_MS3] |= MS3_TILE_WALK;
+   }
 
    state[I915_TEXREG_MS4] =
      ((((pitch / 4) - 1) << MS4_PITCH_SHIFT) | MS4_CUBE_FACE_ENA_MASK |
diff --git a/i915/i915_vtbl.c b/i915/i915_vtbl.c
index 1150046..9a723d3 100644
--- a/i915/i915_vtbl.c
+++ b/i915/i915_vtbl.c
@@ -42,6 +42,7 @@
 #include "intel_regions.h"
 #include "intel_tris.h"
 #include "intel_fbo.h"
+#include "intel_chipset.h"
 
 #include "i915_reg.h"
 #include "i915_context.h"
@@ -175,7 +176,7 @@ i915_emit_invarient_state(struct intel_context *intel)
 {
    BATCH_LOCALS;
 
-   BEGIN_BATCH(20, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(17, IGNORE_CLIPRECTS);
 
    OUT_BATCH(_3DSTATE_AA_CMD |
              AA_LINE_ECAAR_WIDTH_ENABLE |
@@ -199,14 +200,6 @@ i915_emit_invarient_state(struct intel_context *intel)
              CSB_TCB(3, 3) |
              CSB_TCB(4, 4) | CSB_TCB(5, 5) | CSB_TCB(6, 6) | CSB_TCB(7, 7));
 
-   OUT_BATCH(_3DSTATE_RASTER_RULES_CMD |
-             ENABLE_POINT_RASTER_RULE |
-             OGL_POINT_RASTER_RULE |
-             ENABLE_LINE_STRIP_PROVOKE_VRTX |
-             ENABLE_TRI_FAN_PROVOKE_VRTX |
-             LINE_STRIP_PROVOKE_VRTX(1) |
-             TRI_FAN_PROVOKE_VRTX(2) | ENABLE_TEXKILL_3D_4D | TEXKILL_4D);
-
    /* Need to initialize this to zero.
     */
    OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(3) | (0));
@@ -224,11 +217,6 @@ i915_emit_invarient_state(struct intel_context *intel)
    OUT_BATCH(_3DSTATE_LOAD_INDIRECT | 0);       /* disable indirect state */
    OUT_BATCH(0);
 
-
-   /* Don't support twosided stencil yet */
-   OUT_BATCH(_3DSTATE_BACKFACE_STENCIL_OPS | BFO_ENABLE_STENCIL_TWO_SIDE | 0);
-   OUT_BATCH(0);
-
    ADVANCE_BATCH();
 }
 
@@ -262,6 +250,9 @@ get_state_size(struct i915_hw_state *state)
    if (dirty & I915_UPLOAD_INVARIENT)
       sz += 30 * 4;
 
+   if (dirty & I915_UPLOAD_RASTER_RULES)
+      sz += sizeof(state->RasterRules);
+
    if (dirty & I915_UPLOAD_CTX)
       sz += sizeof(state->Ctx);
 
@@ -370,6 +361,12 @@ i915_emit_state(struct intel_context *intel)
       i915_emit_invarient_state(intel);
    }
 
+   if (dirty & I915_UPLOAD_RASTER_RULES) {
+      if (INTEL_DEBUG & DEBUG_STATE)
+         fprintf(stderr, "I915_UPLOAD_RASTER_RULES:\n");
+      emit(intel, state->RasterRules, sizeof(state->RasterRules));
+   }
+
    if (dirty & I915_UPLOAD_CTX) {
       if (INTEL_DEBUG & DEBUG_STATE)
          fprintf(stderr, "I915_UPLOAD_CTX:\n");
@@ -529,6 +526,23 @@ i915_destroy_context(struct intel_context *intel)
    _tnl_free_vertices(&intel->ctx);
 }
 
+void
+i915_set_buf_info_for_region(uint32_t *state, struct intel_region *region,
+			     uint32_t buffer_id)
+{
+   state[0] = _3DSTATE_BUF_INFO_CMD;
+   state[1] = buffer_id;
+
+   if (region != NULL) {
+      state[1] |= BUF_3D_PITCH(region->pitch * region->cpp);
+
+      if (region->tiling != I915_TILING_NONE) {
+	 state[1] |= BUF_3D_TILED_SURFACE;
+	 if (region->tiling == I915_TILING_Y)
+	    state[1] |= BUF_3D_TILE_WALK_Y;
+      }
+   }
+}
 
 /**
  * Set the drawing regions for the color and depth/stencil buffers.
@@ -562,21 +576,11 @@ i915_state_draw_region(struct intel_context *intel,
    /*
     * Set stride/cpp values
     */
-   if (color_region) {
-      state->Buffer[I915_DESTREG_CBUFADDR0] = _3DSTATE_BUF_INFO_CMD;
-      state->Buffer[I915_DESTREG_CBUFADDR1] =
-         (BUF_3D_ID_COLOR_BACK |
-          BUF_3D_PITCH(color_region->pitch * color_region->cpp) |
-          BUF_3D_USE_FENCE);
-   }
+   i915_set_buf_info_for_region(&state->Buffer[I915_DESTREG_CBUFADDR0],
+				color_region, BUF_3D_ID_COLOR_BACK);
 
-   if (depth_region) {
-      state->Buffer[I915_DESTREG_DBUFADDR0] = _3DSTATE_BUF_INFO_CMD;
-      state->Buffer[I915_DESTREG_DBUFADDR1] =
-         (BUF_3D_ID_DEPTH |
-          BUF_3D_PITCH(depth_region->pitch * depth_region->cpp) |
-          BUF_3D_USE_FENCE);
-   }
+   i915_set_buf_info_for_region(&state->Buffer[I915_DESTREG_DBUFADDR0],
+				depth_region, BUF_3D_ID_DEPTH);
 
    /*
     * Compute/set I915_DESTREG_DV1 value
@@ -604,6 +608,14 @@ i915_state_draw_region(struct intel_context *intel,
       }
    }
 
+   /* This isn't quite safe, thus being hidden behind an option.  When changing
+    * the value of this bit, the pipeline needs to be MI_FLUSHed.  And it
+    * can only be set when a depth buffer is already defined.
+    */
+   if (IS_945(intel->intelScreen->deviceID) && intel->use_early_z &&
+       depth_region->tiling != I915_TILING_NONE)
+      value |= CLASSIC_EARLY_DEPTH;
+
    if (depth_region && depth_region->cpp == 4) {
       value |= DEPTH_FRMT_24_FIXED_8_OTHER;
    }
@@ -676,13 +688,6 @@ i915_assert_not_dirty( struct intel_context *intel )
    assert(!dirty);
 }
 
-static void
-i915_note_unlock( struct intel_context *intel )
-{
-    /* nothing */
-}
-
-
 void
 i915InitVtbl(struct i915_context *i915)
 {
@@ -697,6 +702,5 @@ i915InitVtbl(struct i915_context *i915)
    i915->intel.vtbl.update_texture_state = i915UpdateTextureState;
    i915->intel.vtbl.flush_cmd = i915_flush_cmd;
    i915->intel.vtbl.assert_not_dirty = i915_assert_not_dirty;
-   i915->intel.vtbl.note_unlock = i915_note_unlock; 
    i915->intel.vtbl.finish_batch = intel_finish_vb;
 }
diff --git a/i915/intel_tris.c b/i915/intel_tris.c
index 1d39278..a905455 100644
--- a/i915/intel_tris.c
+++ b/i915/intel_tris.c
@@ -1255,11 +1255,9 @@ intel_meta_draw_poly(struct intel_context *intel,
 {
    union fi *vb;
    GLint i;
-   GLboolean was_locked = intel->locked;
    unsigned int saved_vertex_size = intel->vertex_size;
 
-   if (!was_locked)
-       LOCK_HARDWARE(intel);
+   LOCK_HARDWARE(intel);
 
    intel->vertex_size = 6;
 
@@ -1283,8 +1281,7 @@ intel_meta_draw_poly(struct intel_context *intel,
 
    intel->vertex_size = saved_vertex_size;
 
-   if (!was_locked)
-       UNLOCK_HARDWARE(intel);
+   UNLOCK_HARDWARE(intel);
 }
 
 static void
diff --git a/i965/Makefile.am b/i965/Makefile.am
index 5a118af..8ada8b5 100644
--- a/i965/Makefile.am
+++ b/i965/Makefile.am
@@ -17,6 +17,7 @@ i965_dri_la_SOURCES = \
 	../shared/intel_decode.c \
 	../shared/intel_extensions.c \
 	../shared/intel_fbo.c \
+	../shared/intel_generatemipmap.c \
 	../shared/intel_mipmap_tree.c \
 	../shared/intel_regions.c \
 	../shared/intel_screen.c \
@@ -25,8 +26,10 @@ i965_dri_la_SOURCES = \
 	../shared/intel_pixel_bitmap.c \
 	../shared/intel_pixel_copy.c \
 	../shared/intel_pixel_draw.c \
+	../shared/intel_pixel_read.c \
 	../shared/intel_state.c \
 	../shared/intel_swapbuffers.c \
+	../shared/intel_syncobj.c \
 	../shared/intel_tex.c \
 	../shared/intel_tex_copy.c \
 	../shared/intel_tex_format.c \
@@ -44,6 +47,7 @@ i965_dri_la_SOURCES = \
 	brw_clip_util.c \
 	brw_context.c \
 	brw_curbe.c \
+	brw_disasm.c \
 	brw_draw.c \
 	brw_draw_upload.c \
 	brw_eu.c \
@@ -72,6 +76,7 @@ i965_dri_la_SOURCES = \
 	brw_vs_constval.c \
 	brw_vs_emit.c \
 	brw_vs_state.c \
+	brw_vs_surface_state.c \
 	brw_vtbl.c \
 	brw_wm.c \
 	brw_wm_debug.c \
diff --git a/i965/brw_clip.c b/i965/brw_clip.c
index 5cffceb..20a927c 100644
--- a/i965/brw_clip.c
+++ b/i965/brw_clip.c
@@ -65,21 +65,31 @@ static void compile_clip_prog( struct brw_context *brw,
    c.func.single_program_flow = 1;
 
    c.key = *key;
-
+   c.need_ff_sync = BRW_IS_IGDNG(brw);
 
    /* Need to locate the two positions present in vertex + header.
     * These are currently hardcoded:
     */
    c.header_position_offset = ATTR_SIZE;
 
-   for (i = 0, delta = REG_SIZE; i < VERT_RESULT_MAX; i++)
+   if (BRW_IS_IGDNG(brw))
+       delta = 3 * REG_SIZE;
+   else
+       delta = REG_SIZE;
+
+   for (i = 0; i < VERT_RESULT_MAX; i++)
       if (c.key.attrs & (1<<i)) {
 	 c.offset[i] = delta;
 	 delta += ATTR_SIZE;
       }
 
    c.nr_attrs = brw_count_bits(c.key.attrs);
-   c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
+   
+   if (BRW_IS_IGDNG(brw))
+       c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
+   else
+       c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
+
    c.nr_bytes = c.nr_regs * REG_SIZE;
 
    c.prog_data.clip_mode = c.key.clip_mode; /* XXX */
@@ -148,11 +158,16 @@ static void upload_clip_prog(struct brw_context *brw)
    key.do_flat_shading = (ctx->Light.ShadeModel == GL_FLAT);
    /* _NEW_TRANSFORM */
    key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled);
-   key.clip_mode = BRW_CLIPMODE_NORMAL;
+
+   if (BRW_IS_IGDNG(brw))
+       key.clip_mode = BRW_CLIPMODE_KERNEL_CLIP;
+   else
+       key.clip_mode = BRW_CLIPMODE_NORMAL;
 
    /* _NEW_POLYGON */
    if (key.primitive == GL_TRIANGLES) {
-      if (ctx->Polygon.CullFaceMode == GL_FRONT_AND_BACK) 
+      if (ctx->Polygon.CullFlag &&
+	  ctx->Polygon.CullFaceMode == GL_FRONT_AND_BACK)
 	 key.clip_mode = BRW_CLIPMODE_REJECT_ALL;
       else {
 	 GLuint fill_front = CLIP_CULL;
diff --git a/i965/brw_clip.h b/i965/brw_clip.h
index e067478..957df44 100644
--- a/i965/brw_clip.h
+++ b/i965/brw_clip.h
@@ -100,6 +100,8 @@ struct brw_clip_compile {
       
       struct brw_reg fixed_planes;
       struct brw_reg plane_equation;
+       
+      struct brw_reg ff_sync;
    } reg;
 
    /* 3 different ways of expressing vertex size:
@@ -117,6 +119,7 @@ struct brw_clip_compile {
 
    GLuint header_position_offset;
    GLuint offset[VERT_ATTRIB_MAX];
+   GLboolean need_ff_sync;
 };
 
 #define ATTR_SIZE  (4*4)
@@ -171,5 +174,6 @@ struct brw_reg get_tmp( struct brw_clip_compile *c );
 
 void brw_clip_project_position(struct brw_clip_compile *c,
              struct brw_reg pos );
-
+void brw_clip_ff_sync(struct brw_clip_compile *c);
+void brw_clip_init_ff_sync(struct brw_clip_compile *c);
 #endif
diff --git a/i965/brw_clip_line.c b/i965/brw_clip_line.c
index d830e49..048ca62 100644
--- a/i965/brw_clip_line.c
+++ b/i965/brw_clip_line.c
@@ -85,6 +85,10 @@ static void brw_clip_line_alloc_regs( struct brw_clip_compile *c )
       i++;
    }
 
+   if (c->need_ff_sync) {
+      c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
+      i++;
+   }
 
    c->first_tmp = i;
    c->last_tmp = i;
@@ -130,7 +134,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
    struct brw_instruction *plane_loop;
    struct brw_instruction *plane_active;
    struct brw_instruction *is_negative;
-   struct brw_instruction *is_neg2;
+   struct brw_instruction *is_neg2 = NULL;
    struct brw_instruction *not_culled;
    struct brw_reg v1_null_ud = retype(vec1(brw_null_reg()), BRW_REGISTER_TYPE_UD);
 
@@ -148,7 +152,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
    brw_clip_init_clipmask(c);
 
    /* -ve rhw workaround */
-   if (!BRW_IS_G4X(p->brw)) {
+   if (BRW_IS_965(p->brw)) {
       brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
       brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
               brw_imm_ud(1<<20));
@@ -185,7 +189,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
               * Both can be negative on GM965/G965 due to RHW workaround
               * if so, this object should be rejected.
               */
-             if (!BRW_IS_G4X(p->brw)) {
+             if (BRW_IS_965(p->brw)) {
                  brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE, c->reg.dp0, brw_imm_f(0.0));
                  is_neg2 = brw_IF(p, BRW_EXECUTE_1);
                  {
@@ -210,7 +214,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
 
              /* If both are positive, do nothing */
              /* Only on GM965/G965 */
-             if (!BRW_IS_G4X(p->brw)) {
+             if (BRW_IS_965(p->brw)) {
                  brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0));
                  is_neg2 = brw_IF(p, BRW_EXECUTE_1);
              }
@@ -225,7 +229,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
                  brw_set_predicate_control(p, BRW_PREDICATE_NONE);
              }
 
-             if (!BRW_IS_G4X(p->brw)) {
+             if (BRW_IS_965(p->brw)) {
                  brw_ENDIF(p, is_neg2);
              }
          }
@@ -263,6 +267,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
 void brw_emit_line_clip( struct brw_clip_compile *c )
 {
    brw_clip_line_alloc_regs(c);
+   brw_clip_init_ff_sync(c);
 
    if (c->key.do_flat_shading)
       brw_clip_copy_colors(c, 0, 1);
diff --git a/i965/brw_clip_point.c b/i965/brw_clip_point.c
index d17b199..8458f61 100644
--- a/i965/brw_clip_point.c
+++ b/i965/brw_clip_point.c
@@ -50,5 +50,7 @@ void brw_emit_point_clip( struct brw_clip_compile *c )
    /* Send an empty message to kill the thread:
     */
    brw_clip_tri_alloc_regs(c, 0);
+   brw_clip_init_ff_sync(c);
+
    brw_clip_kill_thread(c);
 }
diff --git a/i965/brw_clip_state.c b/i965/brw_clip_state.c
index 9b0d7ea..5762c95 100644
--- a/i965/brw_clip_state.c
+++ b/i965/brw_clip_state.c
@@ -95,7 +95,14 @@ clip_unit_create_from_key(struct brw_context *brw,
        * even number.
        */
       assert(key->nr_urb_entries % 2 == 0);
-      clip.thread4.max_threads = 2 - 1;
+      
+      /* Although up to 16 concurrent Clip threads are allowed on IGDNG, 
+       * only 2 threads can output VUEs at a time.
+       */
+      if (BRW_IS_IGDNG(brw))
+         clip.thread4.max_threads = 16 - 1;        
+      else
+         clip.thread4.max_threads = 2 - 1;
    } else {
       assert(key->nr_urb_entries >= 5);
       clip.thread4.max_threads = 1 - 1;
diff --git a/i965/brw_clip_tri.c b/i965/brw_clip_tri.c
index 7fd37bd..0efd772 100644
--- a/i965/brw_clip_tri.c
+++ b/i965/brw_clip_tri.c
@@ -77,6 +77,10 @@ void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
    if (c->nr_attrs & 1) {
       for (j = 0; j < 3; j++) {
 	 GLuint delta = c->nr_attrs*16 + 32;
+
+         if (BRW_IS_IGDNG(c->func.brw))
+             delta = c->nr_attrs * 16 + 32 * 3;
+
 	 brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0));
       }
    }
@@ -115,6 +119,11 @@ void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
       i++;
    }
 
+   if (c->need_ff_sync) {
+      c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
+      i++;
+   }
+
    c->first_tmp = i;
    c->last_tmp = i;
 
@@ -559,10 +568,11 @@ void brw_emit_tri_clip( struct brw_clip_compile *c )
    brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
    brw_clip_tri_init_vertices(c);
    brw_clip_init_clipmask(c);
+   brw_clip_init_ff_sync(c);
 
    /* if -ve rhw workaround bit is set, 
       do cliptest */
-   if (!BRW_IS_G4X(p->brw)) {
+   if (BRW_IS_965(p->brw)) {
       brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
       brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), 
               brw_imm_ud(1<<20));
@@ -579,11 +589,12 @@ void brw_emit_tri_clip( struct brw_clip_compile *c )
    if (c->key.do_flat_shading) 
       brw_clip_tri_flat_shade(c); 
       
-   if (c->key.clip_mode == BRW_CLIPMODE_NORMAL)
+   if ((c->key.clip_mode == BRW_CLIPMODE_NORMAL) ||
+       (c->key.clip_mode == BRW_CLIPMODE_KERNEL_CLIP))
       do_clip_tri(c);
    else 
       maybe_do_clip_tri(c);
-      
+
    brw_clip_tri_emit_polygon(c);
 
    /* Send an empty message to kill the thread:
diff --git a/i965/brw_clip_unfilled.c b/i965/brw_clip_unfilled.c
index d7ca517..ad1bfa4 100644
--- a/i965/brw_clip_unfilled.c
+++ b/i965/brw_clip_unfilled.c
@@ -453,6 +453,7 @@ void brw_emit_unfilled_clip( struct brw_clip_compile *c )
 
    brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
    brw_clip_tri_init_vertices(c);
+   brw_clip_init_ff_sync(c);
 
    assert(c->offset[VERT_RESULT_EDGE]);
 
diff --git a/i965/brw_clip_util.c b/i965/brw_clip_util.c
index 9d3b0be..5a73abd 100644
--- a/i965/brw_clip_util.c
+++ b/i965/brw_clip_util.c
@@ -140,6 +140,10 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
 
    /* Just copy the vertex header:
     */
+   /*
+    * After CLIP stage, only first 256 bits of the VUE are read
+    * back on IGDNG, so needn't change it
+    */
    brw_copy_indirect_to_indirect(p, dest_ptr, v0_ptr, 1);
       
    /* Iterate over each attribute (could be done in pairs?)
@@ -147,6 +151,9 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
    for (i = 0; i < c->nr_attrs; i++) {
       GLuint delta = i*16 + 32;
 
+      if (BRW_IS_IGDNG(p->brw))
+          delta = i * 16 + 32 * 3;
+
       if (delta == c->offset[VERT_RESULT_EDGE]) {
 	 if (force_edgeflag) 
 	    brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(1));
@@ -177,6 +184,10 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
 
    if (i & 1) {
       GLuint delta = i*16 + 32;
+
+      if (BRW_IS_IGDNG(p->brw))
+          delta = i * 16 + 32 * 3;
+
       brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0));
    }
 
@@ -202,6 +213,8 @@ void brw_clip_emit_vue(struct brw_clip_compile *c,
    struct brw_compile *p = &c->func;
    GLuint start = c->last_mrf;
 
+   brw_clip_ff_sync(c);
+
    assert(!(allocate && eot));
    
    /* Cycle through mrf regs - probably futile as we have to wait for
@@ -252,6 +265,7 @@ void brw_clip_kill_thread(struct brw_clip_compile *c)
 {
    struct brw_compile *p = &c->func;
 
+   brw_clip_ff_sync(c);
    /* Send an empty message to kill the thread and release any
     * allocated urb entry:
     */
@@ -343,3 +357,40 @@ void brw_clip_init_clipmask( struct brw_clip_compile *c )
    }
 }
 
+void brw_clip_ff_sync(struct brw_clip_compile *c)
+{
+    if (c->need_ff_sync) {
+        struct brw_compile *p = &c->func;
+        struct brw_instruction *need_ff_sync;
+
+        brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
+        brw_AND(p, brw_null_reg(), c->reg.ff_sync, brw_imm_ud(0x1));
+        need_ff_sync = brw_IF(p, BRW_EXECUTE_1);
+        {
+            brw_OR(p, c->reg.ff_sync, c->reg.ff_sync, brw_imm_ud(0x1));
+            brw_ff_sync(p, 
+                    c->reg.R0,
+                    0,
+                    c->reg.R0,
+                    1,	
+                    1,		/* used */
+                    1,  	/* msg length */
+                    1,		/* response length */
+                    0,		/* eot */
+                    1,		/* write compelete */
+                    0,		/* urb offset */
+                    BRW_URB_SWIZZLE_NONE);
+        }
+        brw_ENDIF(p, need_ff_sync);
+        brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    }
+}
+
+void brw_clip_init_ff_sync(struct brw_clip_compile *c)
+{
+    if (c->need_ff_sync) {
+	struct brw_compile *p = &c->func;
+        
+        brw_MOV(p, c->reg.ff_sync, brw_imm_ud(0));
+    }
+}
diff --git a/i965/brw_context.c b/i965/brw_context.c
index 4dbe551..c300c33 100644
--- a/i965/brw_context.c
+++ b/i965/brw_context.c
@@ -125,8 +125,34 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
 
    /* We want the GLSL compiler to emit code that uses condition codes */
    ctx->Shader.EmitCondCodes = GL_TRUE;
-
-/*    ctx->Const.MaxNativeVertexProgramTemps = 32; */
+   ctx->Shader.EmitNVTempInitialization = GL_TRUE;
+
+   ctx->Const.VertexProgram.MaxNativeInstructions = (16 * 1024);
+   ctx->Const.VertexProgram.MaxAluInstructions = 0;
+   ctx->Const.VertexProgram.MaxTexInstructions = 0;
+   ctx->Const.VertexProgram.MaxTexIndirections = 0;
+   ctx->Const.VertexProgram.MaxNativeAluInstructions = 0;
+   ctx->Const.VertexProgram.MaxNativeTexInstructions = 0;
+   ctx->Const.VertexProgram.MaxNativeTexIndirections = 0;
+   ctx->Const.VertexProgram.MaxNativeAttribs = 16;
+   ctx->Const.VertexProgram.MaxNativeTemps = 256;
+   ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
+   ctx->Const.VertexProgram.MaxNativeParameters = 1024;
+   ctx->Const.VertexProgram.MaxEnvParams =
+      MIN2(ctx->Const.VertexProgram.MaxNativeParameters,
+	   ctx->Const.VertexProgram.MaxEnvParams);
+
+   ctx->Const.FragmentProgram.MaxNativeInstructions = (16 * 1024);
+   ctx->Const.FragmentProgram.MaxNativeAluInstructions = (16 * 1024);
+   ctx->Const.FragmentProgram.MaxNativeTexInstructions = (16 * 1024);
+   ctx->Const.FragmentProgram.MaxNativeTexIndirections = (16 * 1024);
+   ctx->Const.FragmentProgram.MaxNativeAttribs = 12;
+   ctx->Const.FragmentProgram.MaxNativeTemps = 256;
+   ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;
+   ctx->Const.FragmentProgram.MaxNativeParameters = 1024;
+   ctx->Const.FragmentProgram.MaxEnvParams =
+      MIN2(ctx->Const.FragmentProgram.MaxNativeParameters,
+	   ctx->Const.FragmentProgram.MaxEnvParams);
 
    brw_init_state( brw );
 
diff --git a/i965/brw_context.h b/i965/brw_context.h
index 577497b..a5209ac 100644
--- a/i965/brw_context.h
+++ b/i965/brw_context.h
@@ -129,8 +129,8 @@ struct brw_context;
 #define BRW_NEW_PRIMITIVE               0x40
 #define BRW_NEW_CONTEXT                 0x80
 #define BRW_NEW_WM_INPUT_DIMENSIONS     0x100
-#define BRW_NEW_INPUT_VARYING           0x200
 #define BRW_NEW_PSP                     0x800
+#define BRW_NEW_WM_SURFACES		0x1000
 #define BRW_NEW_FENCE                   0x2000
 #define BRW_NEW_INDICES			0x4000
 #define BRW_NEW_VERTICES		0x8000
@@ -143,6 +143,7 @@ struct brw_context;
 #define BRW_NEW_DEPTH_BUFFER		0x20000
 #define BRW_NEW_NR_WM_SURFACES		0x40000
 #define BRW_NEW_NR_VS_SURFACES		0x80000
+#define BRW_NEW_INDEX_BUFFER		0x100000
 
 struct brw_state_flags {
    /** State update flags signalled by mesa internals */
@@ -173,6 +174,9 @@ struct brw_fragment_program {
 
    dri_bo *const_buffer;    /** Program constant buffer/surface */
    GLboolean use_const_buffer;
+
+   /** for debugging, which texture units are referenced */
+   GLbitfield tex_units_used;
 };
 
 
@@ -386,6 +390,8 @@ struct brw_cached_batch_item {
 struct brw_vertex_element {
    const struct gl_client_array *glarray;
 
+   /** The corresponding Mesa vertex attribute */
+   gl_vert_attrib attrib;
    /** Size of a complete element */
    GLuint element_size;
    /** Number of uploaded elements for this input. */
@@ -401,7 +407,6 @@ struct brw_vertex_element {
 
 
 struct brw_vertex_info {
-   GLuint varying;  /* varying:1[VERT_ATTRIB_MAX] */
    GLuint sizes[ATTRIB_BIT_DWORDS * 2]; /* sizes:2[VERT_ATTRIB_MAX] */
 };
 
@@ -439,9 +444,13 @@ struct brw_query_object {
    unsigned int count;
 };
 
+
+/**
+ * brw_context is derived from intel_context.
+ */
 struct brw_context 
 {
-   struct intel_context intel;
+   struct intel_context intel;  /**< base class, must be first field */
    GLuint primitive;
 
    GLboolean emit_state_always;
@@ -450,8 +459,6 @@ struct brw_context
 
    struct {
       struct brw_state_flags dirty;
-      struct brw_tracked_state **atoms;
-      GLuint nr_atoms;
 
       GLuint nr_color_regions;
       struct intel_region *color_regions[MAX_DRAW_BUFFERS];
@@ -471,12 +478,16 @@ struct brw_context
       int validated_bo_count;
    } state;
 
-   struct brw_cache cache;
+   struct brw_cache cache;  /** non-surface items */
+   struct brw_cache surface_cache;  /* surface items */
    struct brw_cached_batch_item *cached_batch_items;
 
    struct {
       struct brw_vertex_element inputs[VERT_ATTRIB_MAX];
 
+      struct brw_vertex_element *enabled[VERT_ATTRIB_MAX];
+      GLuint nr_enabled;
+
 #define BRW_NR_UPLOAD_BUFS 17
 #define BRW_UPLOAD_INIT_SIZE (128*1024)
 
@@ -500,8 +511,15 @@ struct brw_context
        */
       const struct _mesa_index_buffer *ib;
 
+      /* Updates to these fields are signaled by BRW_NEW_INDEX_BUFFER. */
       dri_bo *bo;
       unsigned int offset;
+      unsigned int size;
+      /* Offset to index buffer index to use in CMD_3D_PRIM so that we can
+       * avoid re-uploading the IB packet over and over if we're actually
+       * referencing the same index buffer.
+       */
+      unsigned int start_vertex_offset;
    } ib;
 
    /* Active vertex program: 
@@ -555,11 +573,6 @@ struct brw_context
       GLuint vs_size;
       GLuint total_size;
 
-      /* Dynamic tracker which changes to reflect the state referenced
-       * by active fp and vp program parameters:
-       */
-      struct brw_tracked_state tracked_state;
-
       dri_bo *curbe_bo;
       /** Offset within curbe_bo of space for current curbe entry */
       GLuint curbe_offset;
@@ -713,6 +726,8 @@ void brw_upload_urb_fence(struct brw_context *brw);
  */
 void brw_upload_cs_urb_state(struct brw_context *brw);
 
+/* brw_disasm.c */
+int brw_disasm (FILE *file, struct brw_instruction *inst);
 
 /*======================================================================
  * Inline conversion functions.  These are better-typed than the
diff --git a/i965/brw_curbe.c b/i965/brw_curbe.c
index 9197fed..4be6c77 100644
--- a/i965/brw_curbe.c
+++ b/i965/brw_curbe.c
@@ -36,6 +36,7 @@
 #include "main/macros.h"
 #include "main/enums.h"
 #include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
 #include "shader/prog_statevars.h"
 #include "intel_batchbuffer.h"
 #include "intel_regions.h"
@@ -188,13 +189,6 @@ static void prepare_constant_buffer(struct brw_context *brw)
    GLfloat *buf;
    GLuint i;
 
-   /* Update our own dependency flags.  This works because this
-    * function will also be called whenever fp or vp changes.
-    */
-   brw->curbe.tracked_state.dirty.mesa = (_NEW_TRANSFORM|_NEW_PROJECTION);
-   brw->curbe.tracked_state.dirty.mesa |= vp->program.Base.Parameters->StateFlags;
-   brw->curbe.tracked_state.dirty.mesa |= fp->program.Base.Parameters->StateFlags;
-
    if (sz == 0) {
       if (brw->curbe.last_buf) {
 	 free(brw->curbe.last_buf);
@@ -254,6 +248,12 @@ static void prepare_constant_buffer(struct brw_context *brw)
       GLuint offset = brw->curbe.vs_start * 16;
       GLuint nr = brw->vs.prog_data->nr_params / 4;
 
+      if (brw->vertex_program->IsNVProgram)
+	 _mesa_load_tracked_matrices(ctx);
+
+      /* Updates the ParamaterValues[i] pointers for all parameters of the
+       * basic type of PROGRAM_STATE_VAR.
+       */
       _mesa_load_state_parameters(ctx, vp->program.Base.Parameters); 
 
       /* XXX just use a memcpy here */
@@ -335,78 +335,11 @@ static void prepare_constant_buffer(struct brw_context *brw)
     */
 }
 
-
-/**
- * Copy Mesa program parameters into given constant buffer.
- */
-static void
-update_constant_buffer(struct brw_context *brw,
-                       const struct gl_program_parameter_list *params,
-                       dri_bo *const_buffer)
-{
-   struct intel_context *intel = &brw->intel;
-   const int size = params->NumParameters * 4 * sizeof(GLfloat);
-
-   /* copy Mesa program constants into the buffer */
-   if (const_buffer && size > 0) {
-
-      assert(const_buffer);
-      assert(const_buffer->size >= size);
-
-      if (intel->intelScreen->kernel_exec_fencing) {
-         drm_intel_gem_bo_map_gtt(const_buffer);
-         memcpy(const_buffer->virtual, params->ParameterValues, size);
-         drm_intel_gem_bo_unmap_gtt(const_buffer);
-      }
-      else {
-         dri_bo_subdata(const_buffer, 0, size, params->ParameterValues);
-      }
-
-      if (0) {
-         int i;
-         for (i = 0; i < params->NumParameters; i++) {
-            float *p = params->ParameterValues[i];
-            printf("%d: %f %f %f %f\n", i, p[0], p[1], p[2], p[3]);
-         }
-      }
-   }
-}
-
-
-/** Copy current vertex program's parameters into the constant buffer */
-static void
-update_vertex_constant_buffer(struct brw_context *brw)
-{
-   struct brw_vertex_program *vp =
-      (struct brw_vertex_program *) brw->vertex_program;
-   if (0) {
-      printf("update VS constants in buffer %p\n", vp->const_buffer);
-      printf("program %u\n", vp->program.Base.Id);
-   }
-   if (vp->use_const_buffer)
-      update_constant_buffer(brw, vp->program.Base.Parameters, vp->const_buffer);
-}
-
-
-/** Copy current fragment program's parameters into the constant buffer */
-static void
-update_fragment_constant_buffer(struct brw_context *brw)
-{
-   struct brw_fragment_program *fp =
-      (struct brw_fragment_program *) brw->fragment_program;
-   if (fp->use_const_buffer)
-      update_constant_buffer(brw, fp->program.Base.Parameters, fp->const_buffer);
-}
-
-
 static void emit_constant_buffer(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
    GLuint sz = brw->curbe.total_size;
 
-   update_vertex_constant_buffer(brw);
-   update_fragment_constant_buffer(brw);
-
    BEGIN_BATCH(2, IGNORE_CLIPRECTS);
    if (sz == 0) {
       OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2));
@@ -428,7 +361,7 @@ static void emit_constant_buffer(struct brw_context *brw)
  */
 const struct brw_tracked_state brw_constant_buffer = {
    .dirty = {
-      .mesa = (_NEW_TRANSFORM|_NEW_PROJECTION),      /* plus fp and vp flags */
+      .mesa = _NEW_PROGRAM_CONSTANTS,
       .brw  = (BRW_NEW_FRAGMENT_PROGRAM |
 	       BRW_NEW_VERTEX_PROGRAM |
 	       BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */
diff --git a/i965/brw_defines.h b/i965/brw_defines.h
index 98fc909..78d457a 100644
--- a/i965/brw_defines.h
+++ b/i965/brw_defines.h
@@ -139,6 +139,7 @@
 #define BRW_CLIPMODE_CLIP_NON_REJECTED   2
 #define BRW_CLIPMODE_REJECT_ALL          3
 #define BRW_CLIPMODE_ACCEPT_ALL          4
+#define BRW_CLIPMODE_KERNEL_CLIP         5
 
 #define BRW_CLIP_NDCSPACE     0
 #define BRW_CLIP_SCREENSPACE  1
@@ -470,8 +471,9 @@
 #define BRW_CONDITIONAL_GE    4
 #define BRW_CONDITIONAL_L     5
 #define BRW_CONDITIONAL_LE    6
-#define BRW_CONDITIONAL_C     7
+#define BRW_CONDITIONAL_R     7
 #define BRW_CONDITIONAL_O     8
+#define BRW_CONDITIONAL_U     9
 
 #define BRW_DEBUG_NONE        0
 #define BRW_DEBUG_BREAKPOINT  1
@@ -511,6 +513,7 @@
 #define BRW_OPCODE_RSL        11
 #define BRW_OPCODE_ASR        12
 #define BRW_OPCODE_CMP        16
+#define BRW_OPCODE_CMPN       17
 #define BRW_OPCODE_JMPI       32
 #define BRW_OPCODE_IF         34
 #define BRW_OPCODE_IFF        35
@@ -670,6 +673,25 @@
 #define BRW_SAMPLER_MESSAGE_SIMD8_LD                  3
 #define BRW_SAMPLER_MESSAGE_SIMD16_LD                 3
 
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG            0
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_IGDNG          0
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_IGDNG           0
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG       1
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_BIAS_IGDNG     1
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS_IGDNG      1
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_IGDNG        2
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_IGDNG      2
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD_IGDNG       2
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG    3
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE_IGDNG  3
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE_IGDNG   3
+
+/* for IGDNG only */
+#define BRW_SAMPLER_SIMD_MODE_SIMD4X2                   0
+#define BRW_SAMPLER_SIMD_MODE_SIMD8                     1
+#define BRW_SAMPLER_SIMD_MODE_SIMD16                    2
+#define BRW_SAMPLER_SIMD_MODE_SIMD32_64                 3
+
 #define BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW   0
 #define BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH  1
 #define BRW_DATAPORT_OWORD_BLOCK_2_OWORDS     2
@@ -819,8 +841,11 @@
 #include "intel_chipset.h"
 
 #define BRW_IS_G4X(brw)         (IS_G4X((brw)->intel.intelScreen->deviceID))
-#define CMD_PIPELINE_SELECT(brw)        (BRW_IS_G4X(brw) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965)
-#define CMD_VF_STATISTICS(brw)          (BRW_IS_G4X(brw) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965)
-#define URB_SIZES(brw)                  (BRW_IS_G4X(brw) ? 384 : 256)  /* 512 bit units */
+#define BRW_IS_IGDNG(brw)         (IS_IGDNG((brw)->intel.intelScreen->deviceID))
+#define BRW_IS_965(brw)         (!(BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)))
+#define CMD_PIPELINE_SELECT(brw)        ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965)
+#define CMD_VF_STATISTICS(brw)          ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965)
+#define URB_SIZES(brw)                  (BRW_IS_IGDNG(brw) ? 1024 : \
+                                         (BRW_IS_G4X(brw) ? 384 : 256))  /* 512 bit units */
 
 #endif
diff --git a/i965/brw_disasm.c b/i965/brw_disasm.c
new file mode 100644
index 0000000..9fef230
--- /dev/null
+++ b/i965/brw_disasm.c
@@ -0,0 +1,903 @@
+/*
+ * Copyright © 2008 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  The copyright holders make no representations
+ * about the suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <stdarg.h>
+
+#include "main/mtypes.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+
+struct {
+    char    *name;
+    int	    nsrc;
+    int	    ndst;
+} opcode[128] = {
+    [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
+
+    [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
+
+    [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
+
+    [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
+    [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 1, .ndst = 01 },
+    [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
+    [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 },
+};
+
+char *conditional_modifier[16] = {
+    [BRW_CONDITIONAL_NONE] = "",
+    [BRW_CONDITIONAL_Z] = ".e",
+    [BRW_CONDITIONAL_NZ] = ".ne",
+    [BRW_CONDITIONAL_G] = ".g",
+    [BRW_CONDITIONAL_GE] = ".ge",
+    [BRW_CONDITIONAL_L] = ".l",
+    [BRW_CONDITIONAL_LE] = ".le",
+    [BRW_CONDITIONAL_R] = ".r",
+    [BRW_CONDITIONAL_O] = ".o",
+    [BRW_CONDITIONAL_U] = ".u",
+};
+
+char *negate[2] = {
+    [0] = "",
+    [1] = "-",
+};
+
+char *_abs[2] = {
+    [0] = "",
+    [1] = "(abs)",
+};
+
+char *vert_stride[16] = {
+    [0] = "0",
+    [1] = "1",
+    [2] = "2",
+    [3] = "4",
+    [4] = "8",
+    [5] = "16",
+    [6] = "32",
+    [15] = "VxH",
+};
+
+char *width[8] = {
+    [0] = "1",
+    [1] = "2",
+    [2] = "4",
+    [3] = "8",
+    [4] = "16",
+};
+
+char *horiz_stride[4] = {
+    [0] = "0",
+    [1] = "1",
+    [2] = "2",
+    [3] = "4"
+};
+
+char *chan_sel[4] = {
+    [0] = "x",
+    [1] = "y",
+    [2] = "z",
+    [3] = "w",
+};
+
+char *dest_condmod[16] = {
+};
+
+char *debug_ctrl[2] = {
+    [0] = "",
+    [1] = ".breakpoint"
+};
+
+char *saturate[2] = {
+    [0] = "",
+    [1] = ".sat"
+};
+
+char *exec_size[8] = {
+    [0] = "1",
+    [1] = "2",
+    [2] = "4",
+    [3] = "8",
+    [4] = "16",
+    [5] = "32"
+};
+
+char *pred_inv[2] = {
+    [0] = "+",
+    [1] = "-"
+};
+
+char *pred_ctrl_align16[16] = {
+    [1] = "",
+    [2] = ".x",
+    [3] = ".y",
+    [4] = ".z",
+    [5] = ".w",
+    [6] = ".any4h",
+    [7] = ".all4h",
+};
+
+char *pred_ctrl_align1[16] = {
+    [1] = "",
+    [2] = ".anyv",
+    [3] = ".allv",
+    [4] = ".any2h",
+    [5] = ".all2h",
+    [6] = ".any4h",
+    [7] = ".all4h",
+    [8] = ".any8h",
+    [9] = ".all8h",
+    [10] = ".any16h",
+    [11] = ".all16h",
+};
+
+char *thread_ctrl[4] = {
+    [0] = "",
+    [2] = "switch"
+};
+
+char *compr_ctrl[4] = {
+    [0] = "",
+    [1] = "sechalf",
+    [2] = "compr",
+};
+
+char *dep_ctrl[4] = {
+    [0] = "",
+    [1] = "NoDDClr",
+    [2] = "NoDDChk",
+    [3] = "NoDDClr,NoDDChk",
+};
+
+char *mask_ctrl[4] = {
+    [0] = "",
+    [1] = "nomask",
+};
+
+char *access_mode[2] = {
+    [0] = "align1",
+    [1] = "align16",
+};
+
+char *reg_encoding[8] = {
+    [0] = "UD",
+    [1] = "D",
+    [2] = "UW",
+    [3] = "W",
+    [4] = "UB",
+    [5] = "B",
+    [7] = "F"
+};
+
+char *imm_encoding[8] = {
+    [0] = "UD",
+    [1] = "D",
+    [2] = "UW",
+    [3] = "W",
+    [5] = "VF",
+    [5] = "V",
+    [7] = "F"
+};
+
+char *reg_file[4] = {
+    [0] = "A",
+    [1] = "g",
+    [2] = "m",
+    [3] = "imm",
+};
+
+char *writemask[16] = {
+    [0x0] = ".",
+    [0x1] = ".x",
+    [0x2] = ".y",
+    [0x3] = ".xy",
+    [0x4] = ".z",
+    [0x5] = ".xz",
+    [0x6] = ".yz",
+    [0x7] = ".xyz",
+    [0x8] = ".w",
+    [0x9] = ".xw",
+    [0xa] = ".yw",
+    [0xb] = ".xyw",
+    [0xc] = ".zw",
+    [0xd] = ".xzw",
+    [0xe] = ".yzw",
+    [0xf] = "",
+};
+
+char *end_of_thread[2] = {
+    [0] = "",
+    [1] = "EOT"
+};
+
+char *target_function[16] = {
+    [BRW_MESSAGE_TARGET_NULL] = "null",
+    [BRW_MESSAGE_TARGET_MATH] = "math",
+    [BRW_MESSAGE_TARGET_SAMPLER] = "sampler",
+    [BRW_MESSAGE_TARGET_GATEWAY] = "gateway",
+    [BRW_MESSAGE_TARGET_DATAPORT_READ] = "read",
+    [BRW_MESSAGE_TARGET_DATAPORT_WRITE] = "write",
+    [BRW_MESSAGE_TARGET_URB] = "urb",
+    [BRW_MESSAGE_TARGET_THREAD_SPAWNER] = "thread_spawner"
+};
+
+char *math_function[16] = {
+    [BRW_MATH_FUNCTION_INV] = "inv",
+    [BRW_MATH_FUNCTION_LOG] = "log",
+    [BRW_MATH_FUNCTION_EXP] = "exp",
+    [BRW_MATH_FUNCTION_SQRT] = "sqrt",
+    [BRW_MATH_FUNCTION_RSQ] = "rsq",
+    [BRW_MATH_FUNCTION_SIN] = "sin",
+    [BRW_MATH_FUNCTION_COS] = "cos",
+    [BRW_MATH_FUNCTION_SINCOS] = "sincos",
+    [BRW_MATH_FUNCTION_TAN] = "tan",
+    [BRW_MATH_FUNCTION_POW] = "pow",
+    [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER] = "intdivmod",
+    [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT] = "intmod",
+    [BRW_MATH_FUNCTION_INT_DIV_REMAINDER] = "intdiv",
+};
+
+char *math_saturate[2] = {
+    [0] = "",
+    [1] = "sat"
+};
+
+char *math_signed[2] = {
+    [0] = "",
+    [1] = "signed"
+};
+
+char *math_scalar[2] = {
+    [0] = "",
+    [1] = "scalar"
+};
+
+char *math_precision[2] = {
+    [0] = "",
+    [1] = "partial_precision"
+};
+
+char *urb_swizzle[4] = {
+    [BRW_URB_SWIZZLE_NONE] = "",
+    [BRW_URB_SWIZZLE_INTERLEAVE] = "interleave",
+    [BRW_URB_SWIZZLE_TRANSPOSE] = "transpose",
+};
+
+char *urb_allocate[2] = {
+    [0] = "",
+    [1] = "allocate"
+};
+
+char *urb_used[2] = {
+    [0] = "",
+    [1] = "used"
+};
+
+char *urb_complete[2] = {
+    [0] = "",
+    [1] = "complete"
+};
+
+char *sampler_target_format[4] = {
+    [0] = "F",
+    [2] = "UD",
+    [3] = "D"
+};
+
+
+static int column;
+
+static int string (FILE *file, char *string)
+{
+    fputs (string, file);
+    column += strlen (string);
+    return 0;
+}
+
+static int format (FILE *f, char *format, ...)
+{
+    char    buf[1024];
+    va_list	args;
+    va_start (args, format);
+
+    vsnprintf (buf, sizeof (buf) - 1, format, args);
+    string (f, buf);
+    return 0;
+}
+
+static int newline (FILE *f)
+{
+    putc ('\n', f);
+    column = 0;
+    return 0;
+}
+
+static int pad (FILE *f, int c)
+{
+    do
+	string (f, " ");
+    while (column < c);
+    return 0;
+}
+
+static int control (FILE *file, char *name, char *ctrl[], GLuint id, int *space)
+{
+    if (!ctrl[id]) {
+	fprintf (file, "*** invalid %s value %d ",
+		 name, id);
+	return 1;
+    }
+    if (ctrl[id][0])
+    {
+	if (space && *space)
+	    string (file, " ");
+	string (file, ctrl[id]);
+	if (space)
+	    *space = 1;
+    }
+    return 0;
+}
+
+static int print_opcode (FILE *file, int id)
+{
+    if (!opcode[id].name) {
+	format (file, "*** invalid opcode value %d ", id);
+	return 1;
+    }
+    string (file, opcode[id].name);
+    return 0;
+}
+
+static int reg (FILE *file, GLuint _reg_file, GLuint _reg_nr)
+{
+    int	err = 0;
+    if (_reg_file == BRW_ARCHITECTURE_REGISTER_FILE) {
+	switch (_reg_nr & 0xf0) {
+	case BRW_ARF_NULL:
+	    string (file, "null");
+	    return -1;
+	case BRW_ARF_ADDRESS:
+	    format (file, "a%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_ACCUMULATOR:
+	    format (file, "acc%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_MASK:
+	    format (file, "mask%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_MASK_STACK:
+	    format (file, "msd%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_STATE:
+	    format (file, "sr%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_CONTROL:
+	    format (file, "cr%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_NOTIFICATION_COUNT:
+	    format (file, "n%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_IP:
+	    string (file, "ip");
+	    return -1;
+	    break;
+	default:
+	    format (file, "ARF%d", _reg_nr);
+	    break;
+	}
+    } else {
+	err  |= control (file, "src reg file", reg_file, _reg_file, NULL);
+	format (file, "%d", _reg_nr);
+    }
+    return err;
+}
+
+static int dest (FILE *file, struct brw_instruction *inst)
+{
+    int	err = 0;
+
+    if (inst->header.access_mode == BRW_ALIGN_1)
+    {
+	if (inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT)
+	{
+	    err |= reg (file, inst->bits1.da1.dest_reg_file, inst->bits1.da1.dest_reg_nr);
+	    if (err == -1)
+		return 0;
+	    if (inst->bits1.da1.dest_subreg_nr)
+		format (file, ".%d", inst->bits1.da1.dest_subreg_nr);
+	    format (file, "<%d>", inst->bits1.da1.dest_horiz_stride);
+	    err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da1.dest_reg_type, NULL);
+	}
+	else
+	{
+	    string (file, "g[a0");
+	    if (inst->bits1.ia1.dest_subreg_nr)
+		format (file, ".%d", inst->bits1.ia1.dest_subreg_nr);
+	    if (inst->bits1.ia1.dest_indirect_offset)
+		format (file, " %d", inst->bits1.ia1.dest_indirect_offset);
+	    string (file, "]");
+	    format (file, "<%d>", inst->bits1.ia1.dest_horiz_stride);
+	    err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.ia1.dest_reg_type, NULL);
+	}
+    }
+    else
+    {
+	if (inst->bits1.da16.dest_address_mode == BRW_ADDRESS_DIRECT)
+	{
+	    err |= reg (file, inst->bits1.da16.dest_reg_file, inst->bits1.da16.dest_reg_nr);
+	    if (err == -1)
+		return 0;
+	    if (inst->bits1.da16.dest_subreg_nr)
+		format (file, ".%d", inst->bits1.da16.dest_subreg_nr);
+	    string (file, "<1>");
+	    err |= control (file, "writemask", writemask, inst->bits1.da16.dest_writemask, NULL);
+	    err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da16.dest_reg_type, NULL);
+	}
+	else
+	{
+	    err = 1;
+	    string (file, "Indirect align16 address mode not supported");
+	}
+    }
+
+    return 0;
+}
+
+static int src_align1_region (FILE *file,
+			      GLuint _vert_stride, GLuint _width, GLuint _horiz_stride)
+{
+    int err = 0;
+    string (file, "<");
+    err |= control (file, "vert stride", vert_stride, _vert_stride, NULL);
+    string (file, ",");
+    err |= control (file, "width", width, _width, NULL);
+    string (file, ",");
+    err |= control (file, "horiz_stride", horiz_stride, _horiz_stride, NULL);
+    string (file, ">");
+    return err;
+}
+
+static int src_da1 (FILE *file, GLuint type, GLuint _reg_file,
+		    GLuint _vert_stride, GLuint _width, GLuint _horiz_stride,
+		    GLuint reg_num, GLuint sub_reg_num, GLuint __abs, GLuint _negate)
+{
+    int err = 0;
+    err |= control (file, "negate", negate, _negate, NULL);
+    err |= control (file, "abs", _abs, __abs, NULL);
+
+    err |= reg (file, _reg_file, reg_num);
+    if (err == -1)
+	return 0;
+    if (sub_reg_num)
+	format (file, ".%d", sub_reg_num);
+    src_align1_region (file, _vert_stride, _width, _horiz_stride);
+    err |= control (file, "src reg encoding", reg_encoding, type, NULL);
+    return err;
+}
+
+static int src_ia1 (FILE *file,
+		    GLuint type,
+		    GLuint _reg_file,
+		    GLint _addr_imm,
+		    GLuint _addr_subreg_nr,
+		    GLuint _negate,
+		    GLuint __abs,
+		    GLuint _addr_mode,
+		    GLuint _horiz_stride,
+		    GLuint _width,
+		    GLuint _vert_stride)
+{
+    int err = 0;
+    err |= control (file, "negate", negate, _negate, NULL);
+    err |= control (file, "abs", _abs, __abs, NULL);
+
+    string (file, "g[a0");
+    if (_addr_subreg_nr)
+	format (file, ".%d", _addr_subreg_nr);
+    if (_addr_imm)
+	format (file, " %d", _addr_imm);
+    string (file, "]");
+    src_align1_region (file, _vert_stride, _width, _horiz_stride);
+    err |= control (file, "src reg encoding", reg_encoding, type, NULL);
+    return err;
+}
+
+static int src_da16 (FILE *file,
+		     GLuint _reg_type,
+		     GLuint _reg_file,
+		     GLuint _vert_stride,
+		     GLuint _reg_nr,
+		     GLuint _subreg_nr,
+		     GLuint __abs,
+		     GLuint _negate,
+		     GLuint swz_x,
+		     GLuint swz_y,
+		     GLuint swz_z,
+		     GLuint swz_w)
+{
+    int err = 0;
+    err |= control (file, "negate", negate, _negate, NULL);
+    err |= control (file, "abs", _abs, __abs, NULL);
+
+    err |= reg (file, _reg_file, _reg_nr);
+    if (err == -1)
+	return 0;
+    if (_subreg_nr)
+	format (file, ".%d", _subreg_nr);
+    string (file, "<");
+    err |= control (file, "vert stride", vert_stride, _vert_stride, NULL);
+    string (file, ",1,1>");
+    err |= control (file, "src da16 reg type", reg_encoding, _reg_type, NULL);
+    /*
+     * Three kinds of swizzle display:
+     *  identity - nothing printed
+     *  1->all	 - print the single channel
+     *  1->1     - print the mapping
+     */
+    if (swz_x == BRW_CHANNEL_X &&
+	swz_y == BRW_CHANNEL_Y &&
+	swz_z == BRW_CHANNEL_Z &&
+	swz_w == BRW_CHANNEL_W)
+    {
+	;
+    }
+    else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w)
+    {
+	string (file, ".");
+	err |= control (file, "channel select", chan_sel, swz_x, NULL);
+    }
+    else
+    {
+	string (file, ".");
+	err |= control (file, "channel select", chan_sel, swz_x, NULL);
+	err |= control (file, "channel select", chan_sel, swz_y, NULL);
+	err |= control (file, "channel select", chan_sel, swz_z, NULL);
+	err |= control (file, "channel select", chan_sel, swz_w, NULL);
+    }
+    return err;
+}
+
+
+static int imm (FILE *file, GLuint type, struct brw_instruction *inst) {
+    switch (type) {
+    case BRW_REGISTER_TYPE_UD:
+	format (file, "0x%08xUD", inst->bits3.ud);
+	break;
+    case BRW_REGISTER_TYPE_D:
+	format (file, "%dD", inst->bits3.d);
+	break;
+    case BRW_REGISTER_TYPE_UW:
+	format (file, "0x%04xUW", (uint16_t) inst->bits3.ud);
+	break;
+    case BRW_REGISTER_TYPE_W:
+	format (file, "%dW", (int16_t) inst->bits3.d);
+	break;
+    case BRW_REGISTER_TYPE_UB:
+	format (file, "0x%02xUB", (int8_t) inst->bits3.ud);
+	break;
+    case BRW_REGISTER_TYPE_VF:
+	format (file, "Vector Float");
+	break;
+    case BRW_REGISTER_TYPE_V:
+	format (file, "0x%08xV", inst->bits3.ud);
+	break;
+    case BRW_REGISTER_TYPE_F:
+	format (file, "%-gF", inst->bits3.f);
+    }
+    return 0;
+}
+
+static int src0 (FILE *file, struct brw_instruction *inst)
+{
+    if (inst->bits1.da1.src0_reg_file == BRW_IMMEDIATE_VALUE)
+	return imm (file, inst->bits1.da1.src0_reg_type,
+		    inst);
+    else if (inst->header.access_mode == BRW_ALIGN_1)
+    {
+	if (inst->bits2.da1.src0_address_mode == BRW_ADDRESS_DIRECT)
+	{
+	    return src_da1 (file,
+			    inst->bits1.da1.src0_reg_type,
+			    inst->bits1.da1.src0_reg_file,
+			    inst->bits2.da1.src0_vert_stride,
+			    inst->bits2.da1.src0_width,
+			    inst->bits2.da1.src0_horiz_stride,
+			    inst->bits2.da1.src0_reg_nr,
+			    inst->bits2.da1.src0_subreg_nr,
+			    inst->bits2.da1.src0_abs,
+			    inst->bits2.da1.src0_negate);
+	}
+	else
+	{
+	    return src_ia1 (file,
+			    inst->bits1.ia1.src0_reg_type,
+			    inst->bits1.ia1.src0_reg_file,
+			    inst->bits2.ia1.src0_indirect_offset,
+			    inst->bits2.ia1.src0_subreg_nr,
+			    inst->bits2.ia1.src0_negate,
+			    inst->bits2.ia1.src0_abs,
+			    inst->bits2.ia1.src0_address_mode,
+			    inst->bits2.ia1.src0_horiz_stride,
+			    inst->bits2.ia1.src0_width,
+			    inst->bits2.ia1.src0_vert_stride);
+	}
+    }
+    else
+    {
+	if (inst->bits2.da16.src0_address_mode == BRW_ADDRESS_DIRECT)
+	{
+	    return src_da16 (file,
+			     inst->bits1.da16.src0_reg_type,
+			     inst->bits1.da16.src0_reg_file,
+			     inst->bits2.da16.src0_vert_stride,
+			     inst->bits2.da16.src0_reg_nr,
+			     inst->bits2.da16.src0_subreg_nr,
+			     inst->bits2.da16.src0_abs,
+			     inst->bits2.da16.src0_negate,
+			     inst->bits2.da16.src0_swz_x,
+			     inst->bits2.da16.src0_swz_y,
+			     inst->bits2.da16.src0_swz_z,
+			     inst->bits2.da16.src0_swz_w);
+	}
+	else
+	{
+	    string (file, "Indirect align16 address mode not supported");
+	    return 1;
+	}
+    }
+}
+
+static int src1 (FILE *file, struct brw_instruction *inst)
+{
+    if (inst->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE)
+	return imm (file, inst->bits1.da1.src1_reg_type,
+		    inst);
+    else if (inst->header.access_mode == BRW_ALIGN_1)
+    {
+	if (inst->bits3.da1.src1_address_mode == BRW_ADDRESS_DIRECT)
+	{
+	    return src_da1 (file,
+			    inst->bits1.da1.src1_reg_type,
+			    inst->bits1.da1.src1_reg_file,
+			    inst->bits3.da1.src1_vert_stride,
+			    inst->bits3.da1.src1_width,
+			    inst->bits3.da1.src1_horiz_stride,
+			    inst->bits3.da1.src1_reg_nr,
+			    inst->bits3.da1.src1_subreg_nr,
+			    inst->bits3.da1.src1_abs,
+			    inst->bits3.da1.src1_negate);
+	}
+	else
+	{
+	    return src_ia1 (file,
+			    inst->bits1.ia1.src1_reg_type,
+			    inst->bits1.ia1.src1_reg_file,
+			    inst->bits3.ia1.src1_indirect_offset,
+			    inst->bits3.ia1.src1_subreg_nr,
+			    inst->bits3.ia1.src1_negate,
+			    inst->bits3.ia1.src1_abs,
+			    inst->bits3.ia1.src1_address_mode,
+			    inst->bits3.ia1.src1_horiz_stride,
+			    inst->bits3.ia1.src1_width,
+			    inst->bits3.ia1.src1_vert_stride);
+	}
+    }
+    else
+    {
+	if (inst->bits3.da16.src1_address_mode == BRW_ADDRESS_DIRECT)
+	{
+	    return src_da16 (file,
+			     inst->bits1.da16.src1_reg_type,
+			     inst->bits1.da16.src1_reg_file,
+			     inst->bits3.da16.src1_vert_stride,
+			     inst->bits3.da16.src1_reg_nr,
+			     inst->bits3.da16.src1_subreg_nr,
+			     inst->bits3.da16.src1_abs,
+			     inst->bits3.da16.src1_negate,
+			     inst->bits3.da16.src1_swz_x,
+			     inst->bits3.da16.src1_swz_y,
+			     inst->bits3.da16.src1_swz_z,
+			     inst->bits3.da16.src1_swz_w);
+	}
+	else
+	{
+	    string (file, "Indirect align16 address mode not supported");
+	    return 1;
+	}
+    }
+}
+
+int brw_disasm (FILE *file, struct brw_instruction *inst)
+{
+    int	err = 0;
+    int space = 0;
+
+    if (inst->header.predicate_control) {
+	string (file, "(");
+	err |= control (file, "predicate inverse", pred_inv, inst->header.predicate_inverse, NULL);
+	string (file, "f0");
+	if (inst->bits2.da1.flag_reg_nr)
+	    format (file, ".%d", inst->bits2.da1.flag_reg_nr);
+	if (inst->header.access_mode == BRW_ALIGN_1)
+	    err |= control (file, "predicate control align1", pred_ctrl_align1,
+			    inst->header.predicate_control, NULL);
+	else
+	    err |= control (file, "predicate control align16", pred_ctrl_align16,
+			    inst->header.predicate_control, NULL);
+	string (file, ") ");
+    }
+
+    err |= print_opcode (file, inst->header.opcode);
+    err |= control (file, "saturate", saturate, inst->header.saturate, NULL);
+    err |= control (file, "debug control", debug_ctrl, inst->header.debug_control, NULL);
+
+    if (inst->header.opcode != BRW_OPCODE_SEND)
+	err |= control (file, "conditional modifier", conditional_modifier,
+			inst->header.destreg__conditionalmod, NULL);
+
+    if (inst->header.opcode != BRW_OPCODE_NOP) {
+	string (file, "(");
+	err |= control (file, "execution size", exec_size, inst->header.execution_size, NULL);
+	string (file, ")");
+    }
+
+    if (inst->header.opcode == BRW_OPCODE_SEND)
+	format (file, " %d", inst->header.destreg__conditionalmod);
+
+    if (opcode[inst->header.opcode].ndst > 0) {
+	pad (file, 16);
+	err |= dest (file, inst);
+    }
+    if (opcode[inst->header.opcode].nsrc > 0) {
+	pad (file, 32);
+	err |= src0 (file, inst);
+    }
+    if (opcode[inst->header.opcode].nsrc > 1) {
+	pad (file, 48);
+	err |= src1 (file, inst);
+    }
+
+    if (inst->header.opcode == BRW_OPCODE_SEND) {
+	newline (file);
+	pad (file, 16);
+	space = 0;
+	err |= control (file, "target function", target_function,
+			inst->bits3.generic.msg_target, &space);
+	switch (inst->bits3.generic.msg_target) {
+	case BRW_MESSAGE_TARGET_MATH:
+	    err |= control (file, "math function", math_function,
+			    inst->bits3.math.function, &space);
+	    err |= control (file, "math saturate", math_saturate,
+			    inst->bits3.math.saturate, &space);
+	    err |= control (file, "math signed", math_signed,
+			    inst->bits3.math.int_type, &space);
+	    err |= control (file, "math scalar", math_scalar,
+			    inst->bits3.math.data_type, &space);
+	    err |= control (file, "math precision", math_precision,
+			    inst->bits3.math.precision, &space);
+	    break;
+	case BRW_MESSAGE_TARGET_SAMPLER:
+	    format (file, " (%d, %d, ",
+		    inst->bits3.sampler.binding_table_index,
+		    inst->bits3.sampler.sampler);
+	    err |= control (file, "sampler target format", sampler_target_format,
+			    inst->bits3.sampler.return_format, NULL);
+	    string (file, ")");
+	    break;
+	case BRW_MESSAGE_TARGET_DATAPORT_WRITE:
+	    format (file, " (%d, %d, %d, %d)",
+		    inst->bits3.dp_write.binding_table_index,
+		    (inst->bits3.dp_write.pixel_scoreboard_clear << 3) |
+		    inst->bits3.dp_write.msg_control,
+		    inst->bits3.dp_write.msg_type,
+		    inst->bits3.dp_write.send_commit_msg);
+	    break;
+	case BRW_MESSAGE_TARGET_URB:
+	    format (file, " %d", inst->bits3.urb.offset);
+	    space = 1;
+	    err |= control (file, "urb swizzle", urb_swizzle,
+			    inst->bits3.urb.swizzle_control, &space);
+	    err |= control (file, "urb allocate", urb_allocate,
+			    inst->bits3.urb.allocate, &space);
+	    err |= control (file, "urb used", urb_used,
+			    inst->bits3.urb.used, &space);
+	    err |= control (file, "urb complete", urb_complete,
+			    inst->bits3.urb.complete, &space);
+	    break;
+	case BRW_MESSAGE_TARGET_THREAD_SPAWNER:
+	    break;
+	default:
+	    format (file, "unsupported target %d", inst->bits3.generic.msg_target);
+	    break;
+	}
+	if (space)
+	    string (file, " ");
+	format (file, "mlen %d",
+		inst->bits3.generic.msg_length);
+	format (file, " rlen %d",
+		inst->bits3.generic.response_length);
+    }
+    pad (file, 64);
+    if (inst->header.opcode != BRW_OPCODE_NOP) {
+	string (file, "{");
+	space = 1;
+	err |= control(file, "access mode", access_mode, inst->header.access_mode, &space);
+	err |= control (file, "mask control", mask_ctrl, inst->header.mask_control, &space);
+	err |= control (file, "dependency control", dep_ctrl, inst->header.dependency_control, &space);
+	err |= control (file, "compression control", compr_ctrl, inst->header.compression_control, &space);
+	err |= control (file, "thread control", thread_ctrl, inst->header.thread_control, &space);
+	if (inst->header.opcode == BRW_OPCODE_SEND)
+	    err |= control (file, "end of thread", end_of_thread,
+			    inst->bits3.generic.end_of_thread, &space);
+	if (space)
+	    string (file, " ");
+	string (file, "}");
+    }
+    string (file, ";");
+    newline (file);
+    return err;
+}
diff --git a/i965/brw_draw.c b/i965/brw_draw.c
index 5342622..c53bd47 100644
--- a/i965/brw_draw.c
+++ b/i965/brw_draw.c
@@ -141,6 +141,8 @@ static void brw_emit_prim(struct brw_context *brw,
 
    prim_packet.verts_per_instance = trim(prim->mode, prim->count);
    prim_packet.start_vert_location = prim->start;
+   if (prim->indexed)
+      prim_packet.start_vert_location += brw->ib.start_vertex_offset;
    prim_packet.instance_count = 1;
    prim_packet.start_instance_location = 0;
    prim_packet.base_vert_location = 0;
@@ -185,21 +187,16 @@ static void brw_merge_inputs( struct brw_context *brw,
 
    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
       brw->vb.inputs[i].glarray = arrays[i];
+      brw->vb.inputs[i].attrib = (gl_vert_attrib) i;
 
       if (arrays[i]->StrideB != 0)
-	 brw->vb.info.varying |= 1 << i;
-
 	 brw->vb.info.sizes[i/16] |= (brw->vb.inputs[i].glarray->Size - 1) <<
 	    ((i%16) * 2);
    }
 
-   /* Raise statechanges if input sizes and varying have changed: 
-    */
+   /* Raise statechanges if input sizes have changed. */
    if (memcmp(brw->vb.info.sizes, old.sizes, sizeof(old.sizes)) != 0)
       brw->state.dirty.brw |= BRW_NEW_INPUT_DIMENSIONS;
-
-   if (brw->vb.info.varying != old.varying)
-      brw->state.dirty.brw |= BRW_NEW_INPUT_VARYING;
 }
 
 /* XXX: could split the primitive list to fallback only on the
@@ -416,6 +413,8 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
  out:
    UNLOCK_HARDWARE(intel);
 
+   brw_state_cache_check_size(brw);
+
    if (warn)
       fprintf(stderr, "i965: Single primitive emit potentially exceeded "
 	      "available aperture space\n");
@@ -426,54 +425,31 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
    return retval;
 }
 
-static GLboolean brw_need_rebase( GLcontext *ctx,
-				  const struct gl_client_array *arrays[],
-				  const struct _mesa_index_buffer *ib,
-				  GLuint min_index )
-{
-   if (min_index == 0) 
-      return GL_FALSE;
-
-   if (ib) {
-      if (!vbo_all_varyings_in_vbos(arrays))
-	 return GL_TRUE;
-      else
-	 return GL_FALSE;
-   }
-   else {
-      /* Hmm.  This isn't quite what I wanted.  BRW can actually
-       * handle the mixed case well enough that we shouldn't need to
-       * rebase.  However, it's probably not very common, nor hugely
-       * expensive to do it this way:
-       */
-      if (!vbo_all_varyings_in_vbos(arrays))
-	 return GL_TRUE;
-      else
-	 return GL_FALSE;
-   }
-}
-				  
-
 void brw_draw_prims( GLcontext *ctx,
 		     const struct gl_client_array *arrays[],
 		     const struct _mesa_prim *prim,
 		     GLuint nr_prims,
 		     const struct _mesa_index_buffer *ib,
+		     GLboolean index_bounds_valid,
 		     GLuint min_index,
 		     GLuint max_index )
 {
    GLboolean retval;
 
-   /* Decide if we want to rebase.  If so we end up recursing once
-    * only into this function.
-    */
-   if (brw_need_rebase( ctx, arrays, ib, min_index )) {
-      vbo_rebase_prims( ctx, arrays, 
-			prim, nr_prims, 
-			ib, min_index, max_index, 
-			brw_draw_prims );
-      
-      return;
+   if (!vbo_all_varyings_in_vbos(arrays)) {
+      if (!index_bounds_valid)
+	 vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index);
+
+      /* Decide if we want to rebase.  If so we end up recursing once
+       * only into this function.
+       */
+      if (min_index != 0) {
+	 vbo_rebase_prims(ctx, arrays,
+			  prim, nr_prims,
+			  ib, min_index, max_index,
+			  brw_draw_prims );
+	 return;
+      }
    }
 
    /* Make a first attempt at drawing:
diff --git a/i965/brw_draw.h b/i965/brw_draw.h
index 9aebbdb..2a14db2 100644
--- a/i965/brw_draw.h
+++ b/i965/brw_draw.h
@@ -39,6 +39,7 @@ void brw_draw_prims( GLcontext *ctx,
 		     const struct _mesa_prim *prims,
 		     GLuint nr_prims,
 		     const struct _mesa_index_buffer *ib,
+		     GLboolean index_bounds_valid,
 		     GLuint min_index,
 		     GLuint max_index );
 
diff --git a/i965/brw_draw_upload.c b/i965/brw_draw_upload.c
index b91b20b..4aa17fa 100644
--- a/i965/brw_draw_upload.c
+++ b/i965/brw_draw_upload.c
@@ -343,16 +343,13 @@ static void brw_prepare_vertices(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = intel_context(ctx);
-   GLuint tmp = brw->vs.prog_data->inputs_read; 
+   GLbitfield vs_inputs = brw->vs.prog_data->inputs_read; 
    GLuint i;
    const unsigned char *ptr = NULL;
    GLuint interleave = 0;
    unsigned int min_index = brw->vb.min_index;
    unsigned int max_index = brw->vb.max_index;
 
-   struct brw_vertex_element *enabled[VERT_ATTRIB_MAX];
-   GLuint nr_enabled = 0;
-
    struct brw_vertex_element *upload[VERT_ATTRIB_MAX];
    GLuint nr_uploads = 0;
 
@@ -362,12 +359,13 @@ static void brw_prepare_vertices(struct brw_context *brw)
       _mesa_printf("%s %d..%d\n", __FUNCTION__, min_index, max_index);
 
    /* Accumulate the list of enabled arrays. */
-   while (tmp) {
-      GLuint i = _mesa_ffsll(tmp)-1;
+   brw->vb.nr_enabled = 0;
+   while (vs_inputs) {
+      GLuint i = _mesa_ffsll(vs_inputs) - 1;
       struct brw_vertex_element *input = &brw->vb.inputs[i];
 
-      tmp &= ~(1<<i);
-      enabled[nr_enabled++] = input;
+      vs_inputs &= ~(1 << i);
+      brw->vb.enabled[brw->vb.nr_enabled++] = input;
    }
 
    /* XXX: In the rare cases where this happens we fallback all
@@ -376,16 +374,15 @@ static void brw_prepare_vertices(struct brw_context *brw)
     * cases with > 17 vertex attributes enabled, so it probably
     * isn't an issue at this point.
     */
-   if (nr_enabled >= BRW_VEP_MAX) {
+   if (brw->vb.nr_enabled >= BRW_VEP_MAX) {
       intel->Fallback = 1;
       return;
    }
 
-   for (i = 0; i < nr_enabled; i++) {
-      struct brw_vertex_element *input = enabled[i];
+   for (i = 0; i < brw->vb.nr_enabled; i++) {
+      struct brw_vertex_element *input = brw->vb.enabled[i];
 
       input->element_size = get_size(input->glarray->Type) * input->glarray->Size;
-      input->count = input->glarray->StrideB ? max_index + 1 - min_index : 1;
 
       if (input->glarray->BufferObj->Name != 0) {
 	 struct intel_buffer_object *intel_buffer =
@@ -398,7 +395,23 @@ static void brw_prepare_vertices(struct brw_context *brw)
 	 dri_bo_reference(input->bo);
 	 input->offset = (unsigned long)input->glarray->Ptr;
 	 input->stride = input->glarray->StrideB;
+	 input->count = input->glarray->_MaxElement;
+
+	 /* This is a common place to reach if the user mistakenly supplies
+	  * a pointer in place of a VBO offset.  If we just let it go through,
+	  * we may end up dereferencing a pointer beyond the bounds of the
+	  * GTT.  We would hope that the VBO's max_index would save us, but
+	  * Mesa appears to hand us min/max values not clipped to the
+	  * array object's _MaxElement, and _MaxElement frequently appears
+	  * to be wrong anyway.
+	  *
+	  * The VBO spec allows application termination in this case, and it's
+	  * probably a service to the poor programmer to do so rather than
+	  * trying to just not render.
+	  */
+	 assert(input->offset < input->bo->size);
       } else {
+	 input->count = input->glarray->StrideB ? max_index + 1 - min_index : 1;
 	 if (input->bo != NULL) {
 	    /* Already-uploaded vertex data is present from a previous
 	     * prepare_vertices, but we had to re-validate state due to
@@ -410,7 +423,7 @@ static void brw_prepare_vertices(struct brw_context *brw)
 	 /* Queue the buffer object up to be uploaded in the next pass,
 	  * when we've decided if we're doing interleaved or not.
 	  */
-	 if (i == 0) {
+	 if (input->attrib == VERT_ATTRIB_POS) {
 	    /* Position array not properly enabled:
 	     */
             if (input->glarray->StrideB == 0) {
@@ -466,8 +479,8 @@ static void brw_prepare_vertices(struct brw_context *brw)
 
    brw_prepare_query_begin(brw);
 
-   for (i = 0; i < nr_enabled; i++) {
-      struct brw_vertex_element *input = enabled[i];
+   for (i = 0; i < brw->vb.nr_enabled; i++) {
+      struct brw_vertex_element *input = brw->vb.enabled[i];
 
       brw_add_validated_bo(brw, input->bo);
    }
@@ -477,34 +490,44 @@ static void brw_emit_vertices(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = intel_context(ctx);
-   GLuint tmp = brw->vs.prog_data->inputs_read;
-   struct brw_vertex_element *enabled[VERT_ATTRIB_MAX];
    GLuint i;
-   GLuint nr_enabled = 0;
 
-  /* Accumulate the list of enabled arrays. */
-   while (tmp) {
-      i = _mesa_ffsll(tmp)-1;
-      struct brw_vertex_element *input = &brw->vb.inputs[i];
+   brw_emit_query_begin(brw);
 
-      tmp &= ~(1<<i);
-      enabled[nr_enabled++] = input;
+   /* If the VS doesn't read any inputs (calculating vertex position from
+    * a state variable for some reason, for example), emit a single pad
+    * VERTEX_ELEMENT struct and bail.
+    *
+    * The stale VB state stays in place, but they don't do anything unless
+    * a VE loads from them.
+    */
+   if (brw->vb.nr_enabled == 0) {
+      BEGIN_BATCH(3, IGNORE_CLIPRECTS);
+      OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | 1);
+      OUT_BATCH((0 << BRW_VE0_INDEX_SHIFT) |
+		BRW_VE0_VALID |
+		(BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) |
+		(0 << BRW_VE0_SRC_OFFSET_SHIFT));
+      OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) |
+		(BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
+		(BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
+		(BRW_VE1_COMPONENT_STORE_1_FLT << BRW_VE1_COMPONENT_3_SHIFT));
+      ADVANCE_BATCH();
+      return;
    }
 
-   brw_emit_query_begin(brw);
-
    /* Now emit VB and VEP state packets.
     *
     * This still defines a hardware VB for each input, even if they
     * are interleaved or from the same VBO.  TBD if this makes a
     * performance difference.
     */
-   BEGIN_BATCH(1 + nr_enabled * 4, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(1 + brw->vb.nr_enabled * 4, IGNORE_CLIPRECTS);
    OUT_BATCH((CMD_VERTEX_BUFFER << 16) |
-	     ((1 + nr_enabled * 4) - 2));
+	     ((1 + brw->vb.nr_enabled * 4) - 2));
 
-   for (i = 0; i < nr_enabled; i++) {
-      struct brw_vertex_element *input = enabled[i];
+   for (i = 0; i < brw->vb.nr_enabled; i++) {
+      struct brw_vertex_element *input = brw->vb.enabled[i];
 
       OUT_BATCH((i << BRW_VB0_INDEX_SHIFT) |
 		BRW_VB0_ACCESS_VERTEXDATA |
@@ -512,15 +535,27 @@ static void brw_emit_vertices(struct brw_context *brw)
       OUT_RELOC(input->bo,
 		I915_GEM_DOMAIN_VERTEX, 0,
 		input->offset);
-      OUT_BATCH(brw->vb.max_index);
+      if (BRW_IS_IGDNG(brw)) {
+          if (input->stride) {
+              OUT_RELOC(input->bo,
+                        I915_GEM_DOMAIN_VERTEX, 0,
+                        input->offset + input->stride * input->count);
+          } else {
+              assert(input->count == 1);
+              OUT_RELOC(input->bo,
+                        I915_GEM_DOMAIN_VERTEX, 0,
+                        input->offset + input->element_size);
+          }
+      } else
+          OUT_BATCH(input->stride ? input->count : 0);
       OUT_BATCH(0); /* Instance data step rate */
    }
    ADVANCE_BATCH();
 
-   BEGIN_BATCH(1 + nr_enabled * 2, IGNORE_CLIPRECTS);
-   OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | ((1 + nr_enabled * 2) - 2));
-   for (i = 0; i < nr_enabled; i++) {
-      struct brw_vertex_element *input = enabled[i];
+   BEGIN_BATCH(1 + brw->vb.nr_enabled * 2, IGNORE_CLIPRECTS);
+   OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | ((1 + brw->vb.nr_enabled * 2) - 2));
+   for (i = 0; i < brw->vb.nr_enabled; i++) {
+      struct brw_vertex_element *input = brw->vb.enabled[i];
       uint32_t format = get_surface_type(input->glarray->Type,
 					 input->glarray->Size,
 					 input->glarray->Format,
@@ -542,11 +577,18 @@ static void brw_emit_vertices(struct brw_context *brw)
 		BRW_VE0_VALID |
 		(format << BRW_VE0_FORMAT_SHIFT) |
 		(0 << BRW_VE0_SRC_OFFSET_SHIFT));
-      OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
-		(comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
-		(comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
-		(comp3 << BRW_VE1_COMPONENT_3_SHIFT) |
-		((i * 4) << BRW_VE1_DST_OFFSET_SHIFT));
+
+      if (BRW_IS_IGDNG(brw))
+          OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
+                    (comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
+                    (comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
+                    (comp3 << BRW_VE1_COMPONENT_3_SHIFT));
+      else
+          OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
+                    (comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
+                    (comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
+                    (comp3 << BRW_VE1_COMPONENT_3_SHIFT) |
+                    ((i * 4) << BRW_VE1_DST_OFFSET_SHIFT));
    }
    ADVANCE_BATCH();
 }
@@ -570,17 +612,20 @@ static void brw_prepare_indices(struct brw_context *brw)
    dri_bo *bo = NULL;
    struct gl_buffer_object *bufferobj;
    GLuint offset;
+   GLuint ib_type_size;
 
    if (index_buffer == NULL)
       return;
 
-   ib_size = get_size(index_buffer->type) * index_buffer->count;
+   ib_type_size = get_size(index_buffer->type);
+   ib_size = ib_type_size * index_buffer->count;
    bufferobj = index_buffer->obj;;
 
    /* Turn into a proper VBO:
     */
    if (!bufferobj->Name) {
-     
+      brw->ib.start_vertex_offset = 0;
+
       /* Get new bufferobj, offset:
        */
       get_space(brw, ib_size, &bo, &offset);
@@ -596,6 +641,7 @@ static void brw_prepare_indices(struct brw_context *brw)
       }
    } else {
       offset = (GLuint) (unsigned long) index_buffer->ptr;
+      brw->ib.start_vertex_offset = 0;
 
       /* If the index buffer isn't aligned to its element size, we have to
        * rebase it into a temporary.
@@ -616,39 +662,62 @@ static void brw_prepare_indices(struct brw_context *brw)
 	  bo = intel_bufferobj_buffer(intel, intel_buffer_object(bufferobj),
 				      INTEL_READ);
 	  dri_bo_reference(bo);
+
+	  /* Use CMD_3D_PRIM's start_vertex_offset to avoid re-uploading
+	   * the index buffer state when we're just moving the start index
+	   * of our drawing.
+	   */
+	  brw->ib.start_vertex_offset = offset / ib_type_size;
+	  offset = 0;
+	  ib_size = bo->size;
        }
    }
 
-   dri_bo_unreference(brw->ib.bo);
-   brw->ib.bo = bo;
-   brw->ib.offset = offset;
+   if (brw->ib.bo != bo ||
+       brw->ib.offset != offset ||
+       brw->ib.size != ib_size)
+   {
+      drm_intel_bo_unreference(brw->ib.bo);
+      brw->ib.bo = bo;
+      brw->ib.offset = offset;
+      brw->ib.size = ib_size;
+
+      brw->state.dirty.brw |= BRW_NEW_INDEX_BUFFER;
+   } else {
+      drm_intel_bo_unreference(bo);
+   }
 
    brw_add_validated_bo(brw, brw->ib.bo);
 }
 
-static void brw_emit_indices(struct brw_context *brw)
+const struct brw_tracked_state brw_indices = {
+   .dirty = {
+      .mesa = 0,
+      .brw = BRW_NEW_INDICES,
+      .cache = 0,
+   },
+   .prepare = brw_prepare_indices,
+};
+
+static void brw_emit_index_buffer(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
    const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
-   GLuint ib_size;
 
    if (index_buffer == NULL)
       return;
 
-   ib_size = get_size(index_buffer->type) * index_buffer->count;
-
    /* Emit the indexbuffer packet:
     */
    {
       struct brw_indexbuffer ib;
 
       memset(&ib, 0, sizeof(ib));
-   
+
       ib.header.bits.opcode = CMD_INDEX_BUFFER;
       ib.header.bits.length = sizeof(ib)/4 - 2;
       ib.header.bits.index_format = get_index_type(index_buffer->type);
       ib.header.bits.cut_index_enable = 0;
-   
 
       BEGIN_BATCH(4, IGNORE_CLIPRECTS);
       OUT_BATCH( ib.header.dword );
@@ -657,18 +726,17 @@ static void brw_emit_indices(struct brw_context *brw)
 		brw->ib.offset);
       OUT_RELOC(brw->ib.bo,
 		I915_GEM_DOMAIN_VERTEX, 0,
-		brw->ib.offset + ib_size);
+		brw->ib.offset + brw->ib.size);
       OUT_BATCH( 0 );
       ADVANCE_BATCH();
    }
 }
 
-const struct brw_tracked_state brw_indices = {
+const struct brw_tracked_state brw_index_buffer = {
    .dirty = {
       .mesa = 0,
-      .brw = BRW_NEW_BATCH | BRW_NEW_INDICES,
+      .brw = BRW_NEW_BATCH | BRW_NEW_INDEX_BUFFER,
       .cache = 0,
    },
-   .prepare = brw_prepare_indices,
-   .emit = brw_emit_indices,
+   .emit = brw_emit_index_buffer,
 };
diff --git a/i965/brw_eu.c b/i965/brw_eu.c
index c53efba..1df5613 100644
--- a/i965/brw_eu.c
+++ b/i965/brw_eu.c
@@ -62,7 +62,7 @@ void brw_set_predicate_control( struct brw_compile *p, GLuint pc )
 
 void brw_set_conditionalmod( struct brw_compile *p, GLuint conditional )
 {
-   p->current->header.destreg__conditonalmod = conditional;
+   p->current->header.destreg__conditionalmod = conditional;
 }
 
 void brw_set_access_mode( struct brw_compile *p, GLuint access_mode )
diff --git a/i965/brw_eu.h b/i965/brw_eu.h
index 003332f..30603bd 100644
--- a/i965/brw_eu.h
+++ b/i965/brw_eu.h
@@ -97,7 +97,7 @@ struct brw_glsl_call;
 
 
 #define BRW_EU_MAX_INSN_STACK 5
-#define BRW_EU_MAX_INSN 4000
+#define BRW_EU_MAX_INSN 10000
 
 struct brw_compile {
    struct brw_instruction store[BRW_EU_MAX_INSN];
@@ -171,9 +171,9 @@ static INLINE struct brw_reg brw_reg( GLuint file,
 {
    struct brw_reg reg;
    if (type == BRW_GENERAL_REGISTER_FILE)
-      assert(nr < 128);
+      assert(nr < BRW_MAX_GRF);
    else if (type == BRW_MESSAGE_REGISTER_FILE)
-      assert(nr < 9);
+      assert(nr < BRW_MAX_MRF);
    else if (type == BRW_ARCHITECTURE_REGISTER_FILE)
       assert(nr <= BRW_ARF_IP);
 
@@ -538,6 +538,7 @@ static INLINE struct brw_reg brw_mask_reg( GLuint subnr )
 
 static INLINE struct brw_reg brw_message_reg( GLuint nr )
 {
+   assert(nr < BRW_MAX_MRF);
    return brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE,
 		       nr,
 		       0);
@@ -815,6 +816,19 @@ void brw_urb_WRITE(struct brw_compile *p,
 		   GLuint offset,
 		   GLuint swizzle);
 
+void brw_ff_sync(struct brw_compile *p,
+		   struct brw_reg dest,
+		   GLuint msg_reg_nr,
+		   struct brw_reg src0,
+		   GLboolean allocate,
+		   GLboolean used,
+		   GLuint msg_length,
+		   GLuint response_length,
+		   GLboolean eot,
+		   GLboolean writes_complete,
+		   GLuint offset,
+		   GLuint swizzle);
+
 void brw_fb_WRITE(struct brw_compile *p,
 		   struct brw_reg dest,
 		   GLuint msg_reg_nr,
@@ -834,7 +848,9 @@ void brw_SAMPLE(struct brw_compile *p,
 		GLuint msg_type,
 		GLuint response_length,
 		GLuint msg_length,
-		GLboolean eot);
+		GLboolean eot,
+		GLuint header_present,
+		GLuint simd_mode);
 
 void brw_math_16( struct brw_compile *p,
 		  struct brw_reg dest,
diff --git a/i965/brw_eu_emit.c b/i965/brw_eu_emit.c
index 2a147fb..241cdc3 100644
--- a/i965/brw_eu_emit.c
+++ b/i965/brw_eu_emit.c
@@ -241,7 +241,8 @@ void brw_set_src1( struct brw_instruction *insn,
 
 
 
-static void brw_set_math_message( struct brw_instruction *insn,
+static void brw_set_math_message( struct brw_context *brw,
+				  struct brw_instruction *insn,
 				  GLuint msg_length,
 				  GLuint response_length,
 				  GLuint function,
@@ -252,18 +253,35 @@ static void brw_set_math_message( struct brw_instruction *insn,
 {
    brw_set_src1(insn, brw_imm_d(0));
 
-   insn->bits3.math.function = function;
-   insn->bits3.math.int_type = integer_type;
-   insn->bits3.math.precision = low_precision;
-   insn->bits3.math.saturate = saturate;
-   insn->bits3.math.data_type = dataType;
-   insn->bits3.math.response_length = response_length;
-   insn->bits3.math.msg_length = msg_length;
-   insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
-   insn->bits3.math.end_of_thread = 0;
+   if (BRW_IS_IGDNG(brw)) {
+       insn->bits3.math_igdng.function = function;
+       insn->bits3.math_igdng.int_type = integer_type;
+       insn->bits3.math_igdng.precision = low_precision;
+       insn->bits3.math_igdng.saturate = saturate;
+       insn->bits3.math_igdng.data_type = dataType;
+       insn->bits3.math_igdng.snapshot = 0;
+       insn->bits3.math_igdng.header_present = 0;
+       insn->bits3.math_igdng.response_length = response_length;
+       insn->bits3.math_igdng.msg_length = msg_length;
+       insn->bits3.math_igdng.end_of_thread = 0;
+       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_MATH;
+       insn->bits2.send_igdng.end_of_thread = 0;
+   } else {
+       insn->bits3.math.function = function;
+       insn->bits3.math.int_type = integer_type;
+       insn->bits3.math.precision = low_precision;
+       insn->bits3.math.saturate = saturate;
+       insn->bits3.math.data_type = dataType;
+       insn->bits3.math.response_length = response_length;
+       insn->bits3.math.msg_length = msg_length;
+       insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
+       insn->bits3.math.end_of_thread = 0;
+   }
 }
 
-static void brw_set_urb_message( struct brw_instruction *insn,
+
+static void brw_set_ff_sync_message( struct brw_context *brw,
+				 struct brw_instruction *insn,
 				 GLboolean allocate,
 				 GLboolean used,
 				 GLuint msg_length,
@@ -273,21 +291,64 @@ static void brw_set_urb_message( struct brw_instruction *insn,
 				 GLuint offset,
 				 GLuint swizzle_control )
 {
-   brw_set_src1(insn, brw_imm_d(0));
+	brw_set_src1(insn, brw_imm_d(0));
+
+	insn->bits3.urb_igdng.opcode = 1;
+	insn->bits3.urb_igdng.offset = offset;
+	insn->bits3.urb_igdng.swizzle_control = swizzle_control;
+	insn->bits3.urb_igdng.allocate = allocate;
+	insn->bits3.urb_igdng.used = used;
+	insn->bits3.urb_igdng.complete = complete;
+	insn->bits3.urb_igdng.header_present = 1;
+	insn->bits3.urb_igdng.response_length = response_length;
+	insn->bits3.urb_igdng.msg_length = msg_length;
+	insn->bits3.urb_igdng.end_of_thread = end_of_thread;
+	insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_URB;
+	insn->bits2.send_igdng.end_of_thread = end_of_thread;
+}
 
-   insn->bits3.urb.opcode = 0;	/* ? */
-   insn->bits3.urb.offset = offset;
-   insn->bits3.urb.swizzle_control = swizzle_control;
-   insn->bits3.urb.allocate = allocate;
-   insn->bits3.urb.used = used;	/* ? */
-   insn->bits3.urb.complete = complete;
-   insn->bits3.urb.response_length = response_length;
-   insn->bits3.urb.msg_length = msg_length;
-   insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
-   insn->bits3.urb.end_of_thread = end_of_thread;
+static void brw_set_urb_message( struct brw_context *brw,
+				 struct brw_instruction *insn,
+				 GLboolean allocate,
+				 GLboolean used,
+				 GLuint msg_length,
+				 GLuint response_length,
+				 GLboolean end_of_thread,
+				 GLboolean complete,
+				 GLuint offset,
+				 GLuint swizzle_control )
+{
+    brw_set_src1(insn, brw_imm_d(0));
+
+    if (BRW_IS_IGDNG(brw)) {
+        insn->bits3.urb_igdng.opcode = 0;	/* ? */
+        insn->bits3.urb_igdng.offset = offset;
+        insn->bits3.urb_igdng.swizzle_control = swizzle_control;
+        insn->bits3.urb_igdng.allocate = allocate;
+        insn->bits3.urb_igdng.used = used;	/* ? */
+        insn->bits3.urb_igdng.complete = complete;
+        insn->bits3.urb_igdng.header_present = 1;
+        insn->bits3.urb_igdng.response_length = response_length;
+        insn->bits3.urb_igdng.msg_length = msg_length;
+        insn->bits3.urb_igdng.end_of_thread = end_of_thread;
+        insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_URB;
+        insn->bits2.send_igdng.end_of_thread = end_of_thread;
+    } else {
+        insn->bits3.urb.opcode = 0;	/* ? */
+        insn->bits3.urb.offset = offset;
+        insn->bits3.urb.swizzle_control = swizzle_control;
+        insn->bits3.urb.allocate = allocate;
+        insn->bits3.urb.used = used;	/* ? */
+        insn->bits3.urb.complete = complete;
+        insn->bits3.urb.response_length = response_length;
+        insn->bits3.urb.msg_length = msg_length;
+        insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
+        insn->bits3.urb.end_of_thread = end_of_thread;
+    }
 }
 
-static void brw_set_dp_write_message( struct brw_instruction *insn,
+static void brw_set_dp_write_message( struct brw_context *brw,
+				      struct brw_instruction *insn,
 				      GLuint binding_table_index,
 				      GLuint msg_control,
 				      GLuint msg_type,
@@ -298,18 +359,33 @@ static void brw_set_dp_write_message( struct brw_instruction *insn,
 {
    brw_set_src1(insn, brw_imm_d(0));
 
-   insn->bits3.dp_write.binding_table_index = binding_table_index;
-   insn->bits3.dp_write.msg_control = msg_control;
-   insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
-   insn->bits3.dp_write.msg_type = msg_type;
-   insn->bits3.dp_write.send_commit_msg = 0;
-   insn->bits3.dp_write.response_length = response_length;
-   insn->bits3.dp_write.msg_length = msg_length;
-   insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
-   insn->bits3.urb.end_of_thread = end_of_thread;
+   if (BRW_IS_IGDNG(brw)) {
+       insn->bits3.dp_write_igdng.binding_table_index = binding_table_index;
+       insn->bits3.dp_write_igdng.msg_control = msg_control;
+       insn->bits3.dp_write_igdng.pixel_scoreboard_clear = pixel_scoreboard_clear;
+       insn->bits3.dp_write_igdng.msg_type = msg_type;
+       insn->bits3.dp_write_igdng.send_commit_msg = 0;
+       insn->bits3.dp_write_igdng.header_present = 1;
+       insn->bits3.dp_write_igdng.response_length = response_length;
+       insn->bits3.dp_write_igdng.msg_length = msg_length;
+       insn->bits3.dp_write_igdng.end_of_thread = end_of_thread;
+       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
+       insn->bits2.send_igdng.end_of_thread = end_of_thread;
+   } else {
+       insn->bits3.dp_write.binding_table_index = binding_table_index;
+       insn->bits3.dp_write.msg_control = msg_control;
+       insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
+       insn->bits3.dp_write.msg_type = msg_type;
+       insn->bits3.dp_write.send_commit_msg = 0;
+       insn->bits3.dp_write.response_length = response_length;
+       insn->bits3.dp_write.msg_length = msg_length;
+       insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
+       insn->bits3.dp_write.end_of_thread = end_of_thread;
+   }
 }
 
-static void brw_set_dp_read_message( struct brw_instruction *insn,
+static void brw_set_dp_read_message( struct brw_context *brw,
+				      struct brw_instruction *insn,
 				      GLuint binding_table_index,
 				      GLuint msg_control,
 				      GLuint msg_type,
@@ -320,15 +396,29 @@ static void brw_set_dp_read_message( struct brw_instruction *insn,
 {
    brw_set_src1(insn, brw_imm_d(0));
 
-   insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
-   insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
-   insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
-   insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
-   insn->bits3.dp_read.response_length = response_length;  /*16:19*/
-   insn->bits3.dp_read.msg_length = msg_length;  /*20:23*/
-   insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
-   insn->bits3.dp_read.pad1 = 0;  /*28:30*/
-   insn->bits3.dp_read.end_of_thread = end_of_thread;  /*31*/
+   if (BRW_IS_IGDNG(brw)) {
+       insn->bits3.dp_read_igdng.binding_table_index = binding_table_index;
+       insn->bits3.dp_read_igdng.msg_control = msg_control;
+       insn->bits3.dp_read_igdng.msg_type = msg_type;
+       insn->bits3.dp_read_igdng.target_cache = target_cache;
+       insn->bits3.dp_read_igdng.header_present = 1;
+       insn->bits3.dp_read_igdng.response_length = response_length;
+       insn->bits3.dp_read_igdng.msg_length = msg_length;
+       insn->bits3.dp_read_igdng.pad1 = 0;
+       insn->bits3.dp_read_igdng.end_of_thread = end_of_thread;
+       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
+       insn->bits2.send_igdng.end_of_thread = end_of_thread;
+   } else {
+       insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
+       insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
+       insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
+       insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
+       insn->bits3.dp_read.response_length = response_length;  /*16:19*/
+       insn->bits3.dp_read.msg_length = msg_length;  /*20:23*/
+       insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
+       insn->bits3.dp_read.pad1 = 0;  /*28:30*/
+       insn->bits3.dp_read.end_of_thread = end_of_thread;  /*31*/
+   }
 }
 
 static void brw_set_sampler_message(struct brw_context *brw,
@@ -338,11 +428,25 @@ static void brw_set_sampler_message(struct brw_context *brw,
                                     GLuint msg_type,
                                     GLuint response_length,
                                     GLuint msg_length,
-                                    GLboolean eot)
+                                    GLboolean eot,
+                                    GLuint header_present,
+                                    GLuint simd_mode)
 {
+   assert(eot == 0);
    brw_set_src1(insn, brw_imm_d(0));
 
-   if (BRW_IS_G4X(brw)) {
+   if (BRW_IS_IGDNG(brw)) {
+      insn->bits3.sampler_igdng.binding_table_index = binding_table_index;
+      insn->bits3.sampler_igdng.sampler = sampler;
+      insn->bits3.sampler_igdng.msg_type = msg_type;
+      insn->bits3.sampler_igdng.simd_mode = simd_mode;
+      insn->bits3.sampler_igdng.header_present = header_present;
+      insn->bits3.sampler_igdng.response_length = response_length;
+      insn->bits3.sampler_igdng.msg_length = msg_length;
+      insn->bits3.sampler_igdng.end_of_thread = eot;
+      insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_SAMPLER;
+      insn->bits2.send_igdng.end_of_thread = eot;
+   } else if (BRW_IS_G4X(brw)) {
       insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
       insn->bits3.sampler_g4x.sampler = sampler;
       insn->bits3.sampler_g4x.msg_type = msg_type;
@@ -377,8 +481,8 @@ static struct brw_instruction *next_insn( struct brw_compile *p,
    /* Reset this one-shot flag: 
     */
 
-   if (p->current->header.destreg__conditonalmod) {
-      p->current->header.destreg__conditonalmod = 0;   
+   if (p->current->header.destreg__conditionalmod) {
+      p->current->header.destreg__conditionalmod = 0;
       p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
    }
 
@@ -484,6 +588,10 @@ struct brw_instruction *brw_JMPI(struct brw_compile *p,
 {
    struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
 
+   insn->header.execution_size = 1;
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.mask_control = BRW_MASK_DISABLE;
+
    p->current->header.predicate_control = BRW_PREDICATE_NONE;
 
    return insn;
@@ -540,6 +648,10 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p,
 				 struct brw_instruction *if_insn)
 {
    struct brw_instruction *insn;
+   GLuint br = 1;
+
+   if (BRW_IS_IGDNG(p->brw))
+      br = 2;
 
    if (p->single_program_flow) {
       insn = next_insn(p, BRW_OPCODE_ADD);
@@ -566,8 +678,8 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p,
    } else {
       assert(if_insn->header.opcode == BRW_OPCODE_IF);
 
-      if_insn->bits3.if_else.jump_count = insn - if_insn;
-      if_insn->bits3.if_else.pop_count = 1;
+      if_insn->bits3.if_else.jump_count = br * (insn - if_insn);
+      if_insn->bits3.if_else.pop_count = 0;
       if_insn->bits3.if_else.pad0 = 0;
    }
 
@@ -577,6 +689,11 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p,
 void brw_ENDIF(struct brw_compile *p, 
 	       struct brw_instruction *patch_insn)
 {
+   GLuint br = 1;
+
+   if (BRW_IS_IGDNG(p->brw))
+      br = 2; 
+ 
    if (p->single_program_flow) {
       /* In single program flow mode, there's no need to execute an ENDIF,
        * since we don't need to do any stack operations, and if we're executing
@@ -608,11 +725,11 @@ void brw_ENDIF(struct brw_compile *p,
 	 /* Automagically turn it into an IFF:
 	  */
 	 patch_insn->header.opcode = BRW_OPCODE_IFF;
-	 patch_insn->bits3.if_else.jump_count = insn - patch_insn + 1;
+	 patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
 	 patch_insn->bits3.if_else.pop_count = 0;
 	 patch_insn->bits3.if_else.pad0 = 0;
       } else if (patch_insn->header.opcode == BRW_OPCODE_ELSE) {
-	 patch_insn->bits3.if_else.jump_count = insn - patch_insn + 1;
+	 patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
 	 patch_insn->bits3.if_else.pop_count = 1;
 	 patch_insn->bits3.if_else.pad0 = 0;
       } else {
@@ -686,6 +803,10 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
                                   struct brw_instruction *do_insn)
 {
    struct brw_instruction *insn;
+   GLuint br = 1;
+
+   if (BRW_IS_IGDNG(p->brw))
+      br = 2;
 
    if (p->single_program_flow)
       insn = next_insn(p, BRW_OPCODE_ADD);
@@ -706,7 +827,7 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
       insn->header.execution_size = do_insn->header.execution_size;
 
       assert(do_insn->header.opcode == BRW_OPCODE_DO);
-      insn->bits3.if_else.jump_count = do_insn - insn + 1;
+      insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
       insn->bits3.if_else.pop_count = 0;
       insn->bits3.if_else.pad0 = 0;
    }
@@ -725,11 +846,15 @@ void brw_land_fwd_jump(struct brw_compile *p,
 		       struct brw_instruction *jmp_insn)
 {
    struct brw_instruction *landing = &p->store[p->nr_insn];
+   GLuint jmpi = 1;
+
+   if (BRW_IS_IGDNG(p->brw))
+       jmpi = 2;
 
    assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
    assert(jmp_insn->bits1.da1.src1_reg_file = BRW_IMMEDIATE_VALUE);
 
-   jmp_insn->bits3.ud = (landing - jmp_insn) - 1; 
+   jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
 }
 
 
@@ -746,7 +871,7 @@ void brw_CMP(struct brw_compile *p,
 {
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
 
-   insn->header.destreg__conditonalmod = conditional;
+   insn->header.destreg__conditionalmod = conditional;
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, src1);
@@ -790,11 +915,12 @@ void brw_math( struct brw_compile *p,
     * instructions.
     */
    insn->header.predicate_control = 0; 
-   insn->header.destreg__conditonalmod = msg_reg_nr;
+   insn->header.destreg__conditionalmod = msg_reg_nr;
 
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src);
-   brw_set_math_message(insn, 
+   brw_set_math_message(p->brw,
+			insn, 
 			msg_length, response_length, 
 			function,
 			BRW_MATH_INTEGER_UNSIGNED,
@@ -826,11 +952,12 @@ void brw_math_16( struct brw_compile *p,
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 
    insn = next_insn(p, BRW_OPCODE_SEND);
-   insn->header.destreg__conditonalmod = msg_reg_nr;
+   insn->header.destreg__conditionalmod = msg_reg_nr;
 
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src);
-   brw_set_math_message(insn, 
+   brw_set_math_message(p->brw,
+			insn, 
 			msg_length, response_length, 
 			function,
 			BRW_MATH_INTEGER_UNSIGNED,
@@ -842,11 +969,12 @@ void brw_math_16( struct brw_compile *p,
     */
    insn = next_insn(p, BRW_OPCODE_SEND);
    insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
-   insn->header.destreg__conditonalmod = msg_reg_nr+1;
+   insn->header.destreg__conditionalmod = msg_reg_nr+1;
 
    brw_set_dest(insn, offset(dest,1));
    brw_set_src0(insn, src);
-   brw_set_math_message(insn, 
+   brw_set_math_message(p->brw, 
+			insn, 
 			msg_length, response_length, 
 			function,
 			BRW_MATH_INTEGER_UNSIGNED,
@@ -888,12 +1016,13 @@ void brw_dp_WRITE_16( struct brw_compile *p,
    
       insn->header.predicate_control = 0; /* XXX */
       insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
   
       brw_set_dest(insn, dest);
       brw_set_src0(insn, src);
 
-      brw_set_dp_write_message(insn,
+      brw_set_dp_write_message(p->brw,
+			       insn,
 			       255, /* binding table index (255=stateless) */
 			       BRW_DATAPORT_OWORD_BLOCK_4_OWORDS, /* msg_control */
 			       BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */
@@ -933,12 +1062,13 @@ void brw_dp_READ_16( struct brw_compile *p,
    
       insn->header.predicate_control = 0; /* XXX */
       insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
   
       brw_set_dest(insn, dest);	/* UW? */
       brw_set_src0(insn, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));
 
-      brw_set_dp_read_message(insn,
+      brw_set_dp_read_message(p->brw,
+			      insn,
 			      255, /* binding table index (255=stateless) */
 			      3,  /* msg_control (3 means 4 Owords) */
 			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
@@ -986,7 +1116,7 @@ void brw_dp_READ_4( struct brw_compile *p,
    
       insn->header.predicate_control = BRW_PREDICATE_NONE;
       insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
       insn->header.mask_control = BRW_MASK_DISABLE;
   
       /* cast dest to a uword[8] vector */
@@ -995,7 +1125,8 @@ void brw_dp_READ_4( struct brw_compile *p,
       brw_set_dest(insn, dest);
       brw_set_src0(insn, brw_null_reg());
 
-      brw_set_dp_read_message(insn,
+      brw_set_dp_read_message(p->brw,
+			      insn,
 			      bind_table_index,
 			      0,  /* msg_control (0 means 1 Oword) */
 			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
@@ -1059,14 +1190,15 @@ void brw_dp_READ_4_vs(struct brw_compile *p,
    
       insn->header.predicate_control = BRW_PREDICATE_NONE;
       insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
       insn->header.mask_control = BRW_MASK_DISABLE;
       /*insn->header.access_mode = BRW_ALIGN_16;*/
   
       brw_set_dest(insn, dest);
       brw_set_src0(insn, brw_null_reg());
 
-      brw_set_dp_read_message(insn,
+      brw_set_dp_read_message(p->brw,
+			      insn,
 			      bind_table_index,
 			      oword,  /* 0 = lower Oword, 1 = upper Oword */
 			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
@@ -1092,11 +1224,12 @@ void brw_fb_WRITE(struct brw_compile *p,
    
    insn->header.predicate_control = 0; /* XXX */
    insn->header.compression_control = BRW_COMPRESSION_NONE; 
-   insn->header.destreg__conditonalmod = msg_reg_nr;
+   insn->header.destreg__conditionalmod = msg_reg_nr;
   
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src0);
-   brw_set_dp_write_message(insn,
+   brw_set_dp_write_message(p->brw,
+			    insn,
 			    binding_table_index,
 			    BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE, /* msg_control */
 			    BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE, /* msg_type */
@@ -1122,7 +1255,9 @@ void brw_SAMPLE(struct brw_compile *p,
 		GLuint msg_type,
 		GLuint response_length,
 		GLuint msg_length,
-		GLboolean eot)
+		GLboolean eot,
+		GLuint header_present,
+		GLuint simd_mode)
 {
    GLboolean need_stall = 0;
    
@@ -1187,7 +1322,7 @@ void brw_SAMPLE(struct brw_compile *p,
    
       insn->header.predicate_control = 0; /* XXX */
       insn->header.compression_control = BRW_COMPRESSION_NONE;
-      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
 
       brw_set_dest(insn, dest);
       brw_set_src0(insn, src0);
@@ -1197,7 +1332,9 @@ void brw_SAMPLE(struct brw_compile *p,
 			      msg_type,
 			      response_length, 
 			      msg_length,
-			      eot);
+			      eot,
+			      header_present,
+			      simd_mode);
    }
 
    if (need_stall) {
@@ -1232,15 +1369,16 @@ void brw_urb_WRITE(struct brw_compile *p,
 {
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
 
-   assert(msg_length < 16);
+   assert(msg_length < BRW_MAX_MRF);
 
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, brw_imm_d(0));
 
-   insn->header.destreg__conditonalmod = msg_reg_nr;
+   insn->header.destreg__conditionalmod = msg_reg_nr;
 
-   brw_set_urb_message(insn,
+   brw_set_urb_message(p->brw,
+		       insn,
 		       allocate,
 		       used,
 		       msg_length,
@@ -1251,3 +1389,37 @@ void brw_urb_WRITE(struct brw_compile *p,
 		       swizzle);
 }
 
+void brw_ff_sync(struct brw_compile *p,
+		   struct brw_reg dest,
+		   GLuint msg_reg_nr,
+		   struct brw_reg src0,
+		   GLboolean allocate,
+		   GLboolean used,
+		   GLuint msg_length,
+		   GLuint response_length,
+		   GLboolean eot,
+		   GLboolean writes_complete,
+		   GLuint offset,
+		   GLuint swizzle)
+{
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+
+   assert(msg_length < 16);
+
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, src0);
+   brw_set_src1(insn, brw_imm_d(0));
+
+   insn->header.destreg__conditionalmod = msg_reg_nr;
+
+   brw_set_ff_sync_message(p->brw,
+		       insn,
+		       allocate,
+		       used,
+		       msg_length,
+		       response_length, 
+		       eot, 
+		       writes_complete, 
+		       offset,
+		       swizzle);
+}
diff --git a/i965/brw_fallback.c b/i965/brw_fallback.c
index 2993574..d27c6c2 100644
--- a/i965/brw_fallback.c
+++ b/i965/brw_fallback.c
@@ -37,6 +37,9 @@
 #include "tnl/tnl.h"
 #include "brw_context.h"
 #include "brw_fallback.h"
+#include "intel_chipset.h"
+#include "intel_fbo.h"
+#include "intel_regions.h"
 
 #include "glapi/glapi.h"
 
@@ -44,6 +47,7 @@
 
 static GLboolean do_check_fallback(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel;
    GLcontext *ctx = &brw->intel.ctx;
    GLuint i;
 
@@ -81,6 +85,33 @@ static GLboolean do_check_fallback(struct brw_context *brw)
       return GL_TRUE;
    }
 
+   /* _NEW_BUFFERS */
+   if (IS_965(intel->intelScreen->deviceID) &&
+       !IS_G4X(intel->intelScreen->deviceID)) {
+      for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
+	 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+	 struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+
+	 /* The original gen4 hardware couldn't set up WM surfaces pointing
+	  * at an offset within a tile, which can happen when rendering to
+	  * anything but the base level of a texture or the +X face/0 depth.
+	  * This was fixed with the 4 Series hardware.
+	  *
+	  * For these original chips, you would have to make the depth and
+	  * color destination surfaces include information on the texture
+	  * type, LOD, face, and various limits to use them as a destination.
+	  * I would have done this, but there's also a nasty requirement that
+	  * the depth and the color surfaces all be of the same LOD, which
+	  * may be a worse requirement than this alignment.  (Also, we may
+	  * want to just demote the texture to untiled, instead).
+	  */
+	 if (irb->region && irb->region->tiling != I915_TILING_NONE &&
+	     (irb->region->draw_offset & 4095)) {
+	    DBG("FALLBACK: non-tile-aligned destination for tiled FBO\n");
+	    return GL_TRUE;
+	 }
+      }
+   }
 
    return GL_FALSE;
 }
diff --git a/i965/brw_gs.c b/i965/brw_gs.c
index a8b74a0..48c2b9a 100644
--- a/i965/brw_gs.c
+++ b/i965/brw_gs.c
@@ -54,12 +54,17 @@ static void compile_gs_prog( struct brw_context *brw,
    memset(&c, 0, sizeof(c));
    
    c.key = *key;
-
+   c.need_ff_sync = BRW_IS_IGDNG(brw);
    /* Need to locate the two positions present in vertex + header.
     * These are currently hardcoded:
     */
    c.nr_attrs = brw_count_bits(c.key.attrs);
-   c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
+
+   if (BRW_IS_IGDNG(brw))
+       c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
+   else
+       c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
+
    c.nr_bytes = c.nr_regs * REG_SIZE;
 
    
diff --git a/i965/brw_gs.h b/i965/brw_gs.h
index 18a4537..bbb991e 100644
--- a/i965/brw_gs.h
+++ b/i965/brw_gs.h
@@ -62,6 +62,7 @@ struct brw_gs_compile {
    GLuint nr_attrs;
    GLuint nr_regs;
    GLuint nr_bytes;
+   GLboolean need_ff_sync;
 };
 
 #define ATTR_SIZE  (4*4)
diff --git a/i965/brw_gs_emit.c b/i965/brw_gs_emit.c
index 22e0d25..a9b2aa2 100644
--- a/i965/brw_gs_emit.c
+++ b/i965/brw_gs_emit.c
@@ -101,6 +101,23 @@ static void brw_gs_emit_vue(struct brw_gs_compile *c,
 		 BRW_URB_SWIZZLE_NONE);
 }
 
+static void brw_gs_ff_sync(struct brw_gs_compile *c, int num_prim)
+{
+	struct brw_compile *p = &c->func;
+	brw_MOV(p, get_element_ud(c->reg.R0, 1), brw_imm_ud(num_prim));
+	brw_ff_sync(p, 
+				c->reg.R0,
+				0,
+				c->reg.R0,
+				1,	
+				1,		/* used */
+				1,  	/* msg length */
+				1,		/* response length */
+				0,		/* eot */
+				1,		/* write compelete */
+				0,		/* urb offset */
+				BRW_URB_SWIZZLE_NONE);
+}
 
 
 void brw_gs_quads( struct brw_gs_compile *c )
@@ -110,6 +127,8 @@ void brw_gs_quads( struct brw_gs_compile *c )
    /* Use polygons for correct edgeflag behaviour. Note that vertex 3
     * is the PV for quads, but vertex 0 for polygons:
     */
+   if (c->need_ff_sync)
+	   brw_gs_ff_sync(c, 1);    
    brw_gs_emit_vue(c, c->reg.vertex[3], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
    brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2));
    brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_POLYGON << 2)); 
@@ -120,6 +139,8 @@ void brw_gs_quad_strip( struct brw_gs_compile *c )
 {
    brw_gs_alloc_regs(c, 4);
    
+   if (c->need_ff_sync)
+	   brw_gs_ff_sync(c, 1);      
    brw_gs_emit_vue(c, c->reg.vertex[2], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
    brw_gs_emit_vue(c, c->reg.vertex[3], 0, (_3DPRIM_POLYGON << 2));
    brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2)); 
@@ -129,6 +150,9 @@ void brw_gs_quad_strip( struct brw_gs_compile *c )
 void brw_gs_tris( struct brw_gs_compile *c )
 {
    brw_gs_alloc_regs(c, 3);
+
+   if (c->need_ff_sync)
+	   brw_gs_ff_sync(c, 1);      
    brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_TRILIST << 2) | R02_PRIM_START));
    brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_TRILIST << 2));
    brw_gs_emit_vue(c, c->reg.vertex[2], 1, ((_3DPRIM_TRILIST << 2) | R02_PRIM_END));
@@ -137,6 +161,9 @@ void brw_gs_tris( struct brw_gs_compile *c )
 void brw_gs_lines( struct brw_gs_compile *c )
 {
    brw_gs_alloc_regs(c, 2);
+
+   if (c->need_ff_sync)
+	   brw_gs_ff_sync(c, 1);      
    brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_START));
    brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_END));
 }
@@ -144,6 +171,9 @@ void brw_gs_lines( struct brw_gs_compile *c )
 void brw_gs_points( struct brw_gs_compile *c )
 {
    brw_gs_alloc_regs(c, 1);
+
+   if (c->need_ff_sync)
+	   brw_gs_ff_sync(c, 1);      
    brw_gs_emit_vue(c, c->reg.vertex[0], 1, ((_3DPRIM_POINTLIST << 2) | R02_PRIM_START | R02_PRIM_END));
 }
 
diff --git a/i965/brw_gs_state.c b/i965/brw_gs_state.c
index 27023cf..a761c03 100644
--- a/i965/brw_gs_state.c
+++ b/i965/brw_gs_state.c
@@ -95,6 +95,9 @@ gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
 
    gs.thread4.max_threads = 0; /* Hardware requirement */
 
+   if (BRW_IS_IGDNG(brw))
+      gs.thread4.rendering_enable = 1;
+
    if (INTEL_DEBUG & DEBUG_STATS)
       gs.thread4.stats_enable = 1;
 
diff --git a/i965/brw_misc_state.c b/i965/brw_misc_state.c
index 9bc5c35..ea71857 100644
--- a/i965/brw_misc_state.c
+++ b/i965/brw_misc_state.c
@@ -118,7 +118,10 @@ static void upload_binding_table_pointers(struct brw_context *brw)
 
    BEGIN_BATCH(6, IGNORE_CLIPRECTS);
    OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2));
-   OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */
+   if (brw->vs.bind_bo != NULL)
+      OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */
+   else
+      OUT_BATCH(0);
    OUT_BATCH(0); /* gs */
    OUT_BATCH(0); /* clip */
    OUT_BATCH(0); /* sf */
@@ -169,6 +172,7 @@ static void prepare_psp_urb_cbs(struct brw_context *brw)
    brw_add_validated_bo(brw, brw->vs.state_bo);
    brw_add_validated_bo(brw, brw->gs.state_bo);
    brw_add_validated_bo(brw, brw->clip.state_bo);
+   brw_add_validated_bo(brw, brw->sf.state_bo);
    brw_add_validated_bo(brw, brw->wm.state_bo);
    brw_add_validated_bo(brw, brw->cc.state_bo);
 }
@@ -208,7 +212,7 @@ static void emit_depthbuffer(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
    struct intel_region *region = brw->state.depth_region;
-   unsigned int len = BRW_IS_G4X(brw) ? 6 : 5;
+   unsigned int len = (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? 6 : 5;
 
    if (region == NULL) {
       BEGIN_BATCH(len, IGNORE_CLIPRECTS);
@@ -219,7 +223,7 @@ static void emit_depthbuffer(struct brw_context *brw)
       OUT_BATCH(0);
       OUT_BATCH(0);
 
-      if (BRW_IS_G4X(brw))
+      if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
          OUT_BATCH(0);
 
       ADVANCE_BATCH();
@@ -241,6 +245,8 @@ static void emit_depthbuffer(struct brw_context *brw)
 	 return;
       }
 
+      assert(region->tiling != I915_TILING_X);
+
       BEGIN_BATCH(len, IGNORE_CLIPRECTS);
       OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
       OUT_BATCH(((region->pitch * region->cpp) - 1) |
@@ -256,7 +262,7 @@ static void emit_depthbuffer(struct brw_context *brw)
 		((region->height - 1) << 19));
       OUT_BATCH(0);
 
-      if (BRW_IS_G4X(brw))
+      if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
          OUT_BATCH(0);
 
       ADVANCE_BATCH();
@@ -369,7 +375,7 @@ static void upload_aa_line_parameters(struct brw_context *brw)
 {
    struct brw_aa_line_parameters balp;
    
-   if (!BRW_IS_G4X(brw))
+   if (BRW_IS_965(brw))
       return;
 
    /* use legacy aa line coverage computation */
@@ -506,14 +512,27 @@ static void upload_state_base_address( struct brw_context *brw )
    /* Output the structure (brw_state_base_address) directly to the
     * batchbuffer, so we can emit relocations inline.
     */
-   BEGIN_BATCH(6, IGNORE_CLIPRECTS);
-   OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2));
-   OUT_BATCH(1); /* General state base address */
-   OUT_BATCH(1); /* Surface state base address */
-   OUT_BATCH(1); /* Indirect object base address */
-   OUT_BATCH(1); /* General state upper bound */
-   OUT_BATCH(1); /* Indirect object upper bound */
-   ADVANCE_BATCH();
+   if (BRW_IS_IGDNG(brw)) {
+       BEGIN_BATCH(8, IGNORE_CLIPRECTS);
+       OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (8 - 2));
+       OUT_BATCH(1); /* General state base address */
+       OUT_BATCH(1); /* Surface state base address */
+       OUT_BATCH(1); /* Indirect object base address */
+       OUT_BATCH(1); /* Instruction base address */
+       OUT_BATCH(1); /* General state upper bound */
+       OUT_BATCH(1); /* Indirect object upper bound */
+       OUT_BATCH(1); /* Instruction access upper bound */
+       ADVANCE_BATCH();
+   } else {
+       BEGIN_BATCH(6, IGNORE_CLIPRECTS);
+       OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2));
+       OUT_BATCH(1); /* General state base address */
+       OUT_BATCH(1); /* Surface state base address */
+       OUT_BATCH(1); /* Indirect object base address */
+       OUT_BATCH(1); /* General state upper bound */
+       OUT_BATCH(1); /* Indirect object upper bound */
+       ADVANCE_BATCH();
+   }
 }
 
 const struct brw_tracked_state brw_state_base_address = {
diff --git a/i965/brw_queryobj.c b/i965/brw_queryobj.c
index cb9169e..a195bc3 100644
--- a/i965/brw_queryobj.c
+++ b/i965/brw_queryobj.c
@@ -146,17 +146,12 @@ static void brw_wait_query(GLcontext *ctx, struct gl_query_object *q)
 
 static void brw_check_query(GLcontext *ctx, struct gl_query_object *q)
 {
-   /* XXX: Need to expose dri_bo_is_idle from bufmgr. */
-#if 0
    struct brw_query_object *query = (struct brw_query_object *)q;
 
-   if (dri_bo_is_idle(query->bo)) {
+   if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) {
       brw_queryobj_get_results(query);
       query->Base.Ready = GL_TRUE;
    }
-#else
-   brw_wait_query(ctx, q);
-#endif
 }
 
 /** Called to set up the query BO and account for its aperture space */
diff --git a/i965/brw_sf.c b/i965/brw_sf.c
index c3c8597..e1c2c77 100644
--- a/i965/brw_sf.c
+++ b/i965/brw_sf.c
@@ -166,6 +166,9 @@ static void upload_sf_prog(struct brw_context *brw)
    key.do_flat_shading = (ctx->Light.ShadeModel == GL_FLAT);
    key.do_twoside_color = (ctx->Light.Enabled && ctx->Light.Model.TwoSide);
 
+   /* _NEW_HINT */
+   key.linear_color = (ctx->Hint.PerspectiveCorrection == GL_FASTEST);
+
    /* _NEW_POLYGON */
    if (key.do_twoside_color) {
       /* If we're rendering to a FBO, we have to invert the polygon
@@ -188,7 +191,7 @@ static void upload_sf_prog(struct brw_context *brw)
 
 const struct brw_tracked_state brw_sf_prog = {
    .dirty = {
-      .mesa  = (_NEW_LIGHT|_NEW_POLYGON|_NEW_POINT),
+      .mesa  = (_NEW_HINT | _NEW_LIGHT | _NEW_POLYGON | _NEW_POINT),
       .brw   = (BRW_NEW_REDUCED_PRIMITIVE),
       .cache = CACHE_NEW_VS_PROG
    },
diff --git a/i965/brw_sf.h b/i965/brw_sf.h
index 1c0fb70..6426b6d 100644
--- a/i965/brw_sf.h
+++ b/i965/brw_sf.h
@@ -51,7 +51,8 @@ struct brw_sf_prog_key {
    GLuint do_flat_shading:1;
    GLuint frontface_ccw:1;
    GLuint do_point_sprite:1;
-   GLuint pad:10;
+   GLuint linear_color:1;  /**< linear interp vs. perspective interp */
+   GLuint pad:25;
    GLenum SpriteOrigin;
 };
 
diff --git a/i965/brw_sf_emit.c b/i965/brw_sf_emit.c
index 862835f..ca8f97f 100644
--- a/i965/brw_sf_emit.c
+++ b/i965/brw_sf_emit.c
@@ -151,6 +151,8 @@ static void do_flatshade_triangle( struct brw_sf_compile *c )
    struct brw_compile *p = &c->func;
    struct brw_reg ip = brw_ip_reg();
    GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
+   GLuint jmpi = 1;
+
    if (!nr)
       return;
 
@@ -159,18 +161,21 @@ static void do_flatshade_triangle( struct brw_sf_compile *c )
    if (c->key.primitive == SF_UNFILLED_TRIS)
       return;
 
+   if (BRW_IS_IGDNG(p->brw))
+       jmpi = 2;
+
    brw_push_insn_state(p);
    
-   brw_MUL(p, c->pv, c->pv, brw_imm_ud(nr*2+1));
+   brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr*2+1)));
    brw_JMPI(p, ip, ip, c->pv);
 
    copy_colors(c, c->vert[1], c->vert[0]);
    copy_colors(c, c->vert[2], c->vert[0]);
-   brw_JMPI(p, ip, ip, brw_imm_ud(nr*4+1));
+   brw_JMPI(p, ip, ip, brw_imm_d(jmpi*(nr*4+1)));
 
    copy_colors(c, c->vert[0], c->vert[1]);
    copy_colors(c, c->vert[2], c->vert[1]);
-   brw_JMPI(p, ip, ip, brw_imm_ud(nr*2));
+   brw_JMPI(p, ip, ip, brw_imm_d(jmpi*nr*2));
 
    copy_colors(c, c->vert[0], c->vert[2]);
    copy_colors(c, c->vert[1], c->vert[2]);
@@ -184,7 +189,8 @@ static void do_flatshade_line( struct brw_sf_compile *c )
    struct brw_compile *p = &c->func;
    struct brw_reg ip = brw_ip_reg();
    GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
-   
+   GLuint jmpi = 1;
+
    if (!nr)
       return;
 
@@ -193,13 +199,16 @@ static void do_flatshade_line( struct brw_sf_compile *c )
    if (c->key.primitive == SF_UNFILLED_TRIS)
       return;
 
+   if (BRW_IS_IGDNG(p->brw))
+       jmpi = 2;
+
    brw_push_insn_state(p);
    
-   brw_MUL(p, c->pv, c->pv, brw_imm_ud(nr+1));
+   brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr+1)));
    brw_JMPI(p, ip, ip, c->pv);
    copy_colors(c, c->vert[1], c->vert[0]);
 
-   brw_JMPI(p, ip, ip, brw_imm_ud(nr));
+   brw_JMPI(p, ip, ip, brw_imm_ud(jmpi*nr));
    copy_colors(c, c->vert[0], c->vert[1]);
 
    brw_pop_insn_state(p);
@@ -218,7 +227,7 @@ static void alloc_regs( struct brw_sf_compile *c )
 
    /* Values computed by fixed function unit:
     */
-   c->pv  = retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD);
+   c->pv  = retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_D);
    c->det = brw_vec1_grf(1, 2);
    c->dx0 = brw_vec1_grf(1, 3);
    c->dx2 = brw_vec1_grf(1, 4);
@@ -295,9 +304,6 @@ static void invert_det( struct brw_sf_compile *c)
 
 }
 
-#define NON_PERPECTIVE_ATTRS  (FRAG_BIT_WPOS | \
-                               FRAG_BIT_COL0 | \
-			       FRAG_BIT_COL1)
 
 static GLboolean calculate_masks( struct brw_sf_compile *c,
 				  GLuint reg,
@@ -306,9 +312,16 @@ static GLboolean calculate_masks( struct brw_sf_compile *c,
 				  GLushort *pc_linear)
 {
    GLboolean is_last_attr = (reg == c->nr_setup_regs - 1);
-   GLuint persp_mask = c->key.attrs & ~NON_PERPECTIVE_ATTRS;
+   GLuint persp_mask;
    GLuint linear_mask;
 
+   if (c->key.do_flat_shading || c->key.linear_color)
+      persp_mask = c->key.attrs & ~(FRAG_BIT_WPOS |
+                                    FRAG_BIT_COL0 |
+                                    FRAG_BIT_COL1);
+   else
+      persp_mask = c->key.attrs & ~(FRAG_BIT_WPOS);
+
    if (c->key.do_flat_shading)
       linear_mask = c->key.attrs & ~(FRAG_BIT_COL0|FRAG_BIT_COL1);
    else
@@ -674,7 +687,7 @@ void brw_emit_anyprim_setup( struct brw_sf_compile *c )
 					       (1<<_3DPRIM_POLYGON) |
 					       (1<<_3DPRIM_RECTLIST) |
 					       (1<<_3DPRIM_TRIFAN_NOSTIPPLE)));
-   jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
+   jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
    {
       saveflag = p->flag_value;
       brw_push_insn_state(p); 
@@ -695,7 +708,7 @@ void brw_emit_anyprim_setup( struct brw_sf_compile *c )
 					       (1<<_3DPRIM_LINESTRIP_CONT) |
 					       (1<<_3DPRIM_LINESTRIP_BF) |
 					       (1<<_3DPRIM_LINESTRIP_CONT_BF)));
-   jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
+   jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
    {
       saveflag = p->flag_value;
       brw_push_insn_state(p); 
@@ -708,7 +721,7 @@ void brw_emit_anyprim_setup( struct brw_sf_compile *c )
 
    brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
    brw_AND(p, v1_null_ud, payload_attr, brw_imm_ud(1<<BRW_SPRITE_POINT_ENABLE));
-   jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
+   jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
    {
       saveflag = p->flag_value;
       brw_push_insn_state(p); 
diff --git a/i965/brw_sf_state.c b/i965/brw_sf_state.c
index c999187..bc0f076 100644
--- a/i965/brw_sf_state.c
+++ b/i965/brw_sf_state.c
@@ -113,7 +113,7 @@ struct brw_sf_unit_key {
 
    unsigned int nr_urb_entries, urb_size, sfsize;
 
-   GLenum front_face, cull_face;
+   GLenum front_face, cull_face, provoking_vertex;
    unsigned scissor:1;
    unsigned line_smooth:1;
    unsigned point_sprite:1;
@@ -153,6 +153,9 @@ sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
    key->point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
    key->point_attenuated = ctx->Point._Attenuated;
 
+   /* _NEW_LIGHT */
+   key->provoking_vertex = ctx->Light.ProvokingVertex;
+
    key->render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
 }
 
@@ -162,7 +165,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 {
    struct brw_sf_unit_state sf;
    dri_bo *bo;
-
+   int chipset_max_threads;
    memset(&sf, 0, sizeof(sf));
 
    sf.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1;
@@ -171,13 +174,26 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    sf.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
 
    sf.thread3.dispatch_grf_start_reg = 3;
-   sf.thread3.urb_entry_read_offset = 1;
+
+   if (BRW_IS_IGDNG(brw))
+       sf.thread3.urb_entry_read_offset = 3;
+   else
+       sf.thread3.urb_entry_read_offset = 1;
+
    sf.thread3.urb_entry_read_length = key->urb_entry_read_length;
 
    sf.thread4.nr_urb_entries = key->nr_urb_entries;
    sf.thread4.urb_entry_allocation_size = key->sfsize - 1;
-   /* Each SF thread produces 1 PUE, and there can be up to 24 threads */
-   sf.thread4.max_threads = MIN2(24, key->nr_urb_entries) - 1;
+
+   /* Each SF thread produces 1 PUE, and there can be up to 24(Pre-IGDNG) or 
+    * 48(IGDNG) threads 
+    */
+   if (BRW_IS_IGDNG(brw))
+      chipset_max_threads = 48;
+   else
+      chipset_max_threads = 24;
+
+   sf.thread4.max_threads = MIN2(chipset_max_threads, key->nr_urb_entries) - 1;
 
    if (INTEL_DEBUG & DEBUG_SINGLE_THREAD)
       sf.thread4.max_threads = 0;
@@ -233,7 +249,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    else if (sf.sf6.line_width <= 0x2)
        sf.sf6.line_width = 0;
 
-   /* _NEW_POINT */
+   /* _NEW_BUFFERS */
    key->render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
    if (!key->render_to_fbo) {
       /* Rendering to an OpenGL window */
@@ -263,6 +279,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    }
    /* XXX clamp max depends on AA vs. non-AA */
 
+   /* _NEW_POINT */
    sf.sf7.sprite_point = key->point_sprite;
    sf.sf7.point_size = CLAMP(rint(key->point_size), 1, 255) * (1<<3);
    sf.sf7.use_point_size_state = !key->point_attenuated;
@@ -270,9 +287,15 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 
    /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons:
     */
-   sf.sf7.trifan_pv = 2;
-   sf.sf7.linestrip_pv = 1;
-   sf.sf7.tristrip_pv = 2;
+   if (key->provoking_vertex == GL_LAST_VERTEX_CONVENTION) {
+      sf.sf7.trifan_pv = 2;
+      sf.sf7.linestrip_pv = 1;
+      sf.sf7.tristrip_pv = 2;
+   } else {
+      sf.sf7.trifan_pv = 1;
+      sf.sf7.linestrip_pv = 0;
+      sf.sf7.tristrip_pv = 0;
+   }
    sf.sf7.line_last_pixel_enable = 0;
 
    /* Set bias for OpenGL rasterization rules:
@@ -286,6 +309,9 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 			 &sf, sizeof(sf),
 			 NULL, NULL);
 
+   /* STATE_PREFETCH command description describes this state as being
+    * something loaded through the GPE (L2 ISC), so it's INSTRUCTION domain.
+    */
    /* Emit SF program relocation */
    dri_bo_emit_reloc(bo,
 		     I915_GEM_DOMAIN_INSTRUCTION, 0,
@@ -326,9 +352,11 @@ static void upload_sf_unit( struct brw_context *brw )
 const struct brw_tracked_state brw_sf_unit = {
    .dirty = {
       .mesa  = (_NEW_POLYGON | 
+		_NEW_LIGHT |
 		_NEW_LINE | 
 		_NEW_POINT | 
-		_NEW_SCISSOR),
+		_NEW_SCISSOR |
+		_NEW_BUFFERS),
       .brw   = BRW_NEW_URB_FENCE,
       .cache = (CACHE_NEW_SF_VP |
 		CACHE_NEW_SF_PROG)
diff --git a/i965/brw_state.h b/i965/brw_state.h
index 81b0a45..5335eac 100644
--- a/i965/brw_state.h
+++ b/i965/brw_state.h
@@ -72,11 +72,13 @@ const struct brw_tracked_state brw_sf_vp;
 const struct brw_tracked_state brw_state_base_address;
 const struct brw_tracked_state brw_urb_fence;
 const struct brw_tracked_state brw_vertex_state;
+const struct brw_tracked_state brw_vs_surfaces;
 const struct brw_tracked_state brw_vs_prog;
 const struct brw_tracked_state brw_vs_unit;
 const struct brw_tracked_state brw_wm_input_sizes;
 const struct brw_tracked_state brw_wm_prog;
 const struct brw_tracked_state brw_wm_samplers;
+const struct brw_tracked_state brw_wm_constant_surface;
 const struct brw_tracked_state brw_wm_surfaces;
 const struct brw_tracked_state brw_wm_unit;
 
@@ -84,12 +86,24 @@ const struct brw_tracked_state brw_psp_urb_cbs;
 
 const struct brw_tracked_state brw_pipe_control;
 
-const struct brw_tracked_state brw_clear_surface_cache;
-const struct brw_tracked_state brw_clear_batch_cache;
-
 const struct brw_tracked_state brw_drawing_rect;
 const struct brw_tracked_state brw_indices;
 const struct brw_tracked_state brw_vertices;
+const struct brw_tracked_state brw_index_buffer;
+
+/**
+ * Use same key for WM and VS surfaces.
+ */
+struct brw_surface_key {
+   GLenum target, depthmode;
+   dri_bo *bo;
+   GLint format, internal_format;
+   GLint first_level, last_level;
+   GLint width, height, depth;
+   GLint pitch, cpp;
+   uint32_t tiling;
+   GLuint offset;
+};
 
 /***********************************************************************
  * brw_state.c
@@ -135,8 +149,8 @@ dri_bo *brw_search_cache( struct brw_cache *cache,
 			  void *aux_return);
 void brw_state_cache_check_size( struct brw_context *brw );
 
-void brw_init_cache( struct brw_context *brw );
-void brw_destroy_cache( struct brw_context *brw );
+void brw_init_caches( struct brw_context *brw );
+void brw_destroy_caches( struct brw_context *brw );
 
 /***********************************************************************
  * brw_state_batch.c
@@ -148,6 +162,11 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
 				   const void *data,
 				   GLuint sz );
 void brw_destroy_batch_cache( struct brw_context *brw );
-void brw_clear_batch_cache_flush( struct brw_context *brw );
+void brw_clear_batch_cache( struct brw_context *brw );
+
+/* brw_wm_surface_state.c */
+dri_bo *
+brw_create_constant_surface( struct brw_context *brw,
+                             struct brw_surface_key *key );
 
 #endif
diff --git a/i965/brw_state_batch.c b/i965/brw_state_batch.c
index 811940e..7821898 100644
--- a/i965/brw_state_batch.c
+++ b/i965/brw_state_batch.c
@@ -79,7 +79,7 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
    return GL_TRUE;
 }
 
-static void clear_batch_cache( struct brw_context *brw )
+void brw_clear_batch_cache( struct brw_context *brw )
 {
    struct brw_cached_batch_item *item = brw->cached_batch_items;
 
@@ -93,18 +93,7 @@ static void clear_batch_cache( struct brw_context *brw )
    brw->cached_batch_items = NULL;
 }
 
-void brw_clear_batch_cache_flush( struct brw_context *brw )
-{
-   clear_batch_cache(brw);
-
-   brw->state.dirty.mesa |= ~0;
-   brw->state.dirty.brw |= ~0;
-   brw->state.dirty.cache |= ~0;
-}
-
-
-
 void brw_destroy_batch_cache( struct brw_context *brw )
 {
-   clear_batch_cache(brw);
+   brw_clear_batch_cache(brw);
 }
diff --git a/i965/brw_state_cache.c b/i965/brw_state_cache.c
index d5b5166..e40d7a0 100644
--- a/i965/brw_state_cache.c
+++ b/i965/brw_state_cache.c
@@ -56,9 +56,9 @@
  * incorrect program is run for the other instance.
  */
 
+#include "main/imports.h"
 #include "brw_state.h"
 #include "intel_batchbuffer.h"
-#include "main/imports.h"
 
 /* XXX: Fixme - have to include these to get the sizes of the prog_key
  * structs:
@@ -69,8 +69,10 @@
 #include "brw_sf.h"
 #include "brw_gs.h"
 
-static GLuint hash_key( const void *key, GLuint key_size,
-			dri_bo **reloc_bufs, GLuint nr_reloc_bufs)
+
+static GLuint
+hash_key(const void *key, GLuint key_size,
+         dri_bo **reloc_bufs, GLuint nr_reloc_bufs)
 {
    GLuint *ikey = (GLuint *)key;
    GLuint hash = 0, i;
@@ -95,6 +97,7 @@ static GLuint hash_key( const void *key, GLuint key_size,
    return hash;
 }
 
+
 /**
  * Marks a new buffer as being chosen for the given cache id.
  */
@@ -111,6 +114,7 @@ update_cache_last(struct brw_cache *cache, enum brw_cache_id cache_id,
    cache->brw->state.dirty.cache |= 1 << cache_id;
 }
 
+
 static struct brw_cache_item *
 search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
 	     GLuint hash, const void *key, GLuint key_size,
@@ -143,7 +147,8 @@ search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
 }
 
 
-static void rehash( struct brw_cache *cache )
+static void
+rehash(struct brw_cache *cache)
 {
    struct brw_cache_item **items;
    struct brw_cache_item *c, *next;
@@ -164,15 +169,17 @@ static void rehash( struct brw_cache *cache )
    cache->size = size;
 }
 
+
 /**
  * Returns the buffer object matching cache_id and key, or NULL.
  */
-dri_bo *brw_search_cache( struct brw_cache *cache,
-			  enum brw_cache_id cache_id,
-			  const void *key,
-			  GLuint key_size,
-			  dri_bo **reloc_bufs, GLuint nr_reloc_bufs,
-			  void *aux_return )
+dri_bo *
+brw_search_cache(struct brw_cache *cache,
+                 enum brw_cache_id cache_id,
+                 const void *key,
+                 GLuint key_size,
+                 dri_bo **reloc_bufs, GLuint nr_reloc_bufs,
+                 void *aux_return)
 {
    struct brw_cache_item *item;
    GLuint hash = hash_key(key, key_size, reloc_bufs, nr_reloc_bufs);
@@ -192,6 +199,7 @@ dri_bo *brw_search_cache( struct brw_cache *cache,
    return item->bo;
 }
 
+
 dri_bo *
 brw_upload_cache( struct brw_cache *cache,
 		  enum brw_cache_id cache_id,
@@ -265,7 +273,9 @@ brw_upload_cache( struct brw_cache *cache,
    return bo;
 }
 
-/* This doesn't really work with aux data.  Use search/upload instead
+
+/**
+ * This doesn't really work with aux data.  Use search/upload instead
  */
 dri_bo *
 brw_cache_data_sz(struct brw_cache *cache,
@@ -296,6 +306,7 @@ brw_cache_data_sz(struct brw_cache *cache,
    return bo;
 }
 
+
 /**
  * Wrapper around brw_cache_data_sz using the cache_id's canonical key size.
  *
@@ -319,21 +330,22 @@ enum pool_type {
    DW_GENERAL_STATE
 };
 
+
 static void
-brw_init_cache_id( struct brw_context *brw,
-		const char *name,
-		enum brw_cache_id id,
-		GLuint key_size,
-		GLuint aux_size)
+brw_init_cache_id(struct brw_cache *cache,
+                  const char *name,
+                  enum brw_cache_id id,
+                  GLuint key_size,
+                  GLuint aux_size)
 {
-   struct brw_cache *cache = &brw->cache;
-
    cache->name[id] = strdup(name);
    cache->key_size[id] = key_size;
    cache->aux_size[id] = aux_size;
 }
 
-void brw_init_cache( struct brw_context *brw )
+
+static void
+brw_init_non_surface_cache(struct brw_context *brw)
 {
    struct brw_cache *cache = &brw->cache;
 
@@ -342,114 +354,136 @@ void brw_init_cache( struct brw_context *brw )
    cache->size = 7;
    cache->n_items = 0;
    cache->items = (struct brw_cache_item **)
-      _mesa_calloc(cache->size * 
-		   sizeof(struct brw_cache_item));
+      _mesa_calloc(cache->size * sizeof(struct brw_cache_item));
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "CC_VP",
 		     BRW_CC_VP,
 		     sizeof(struct brw_cc_viewport),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "CC_UNIT",
 		     BRW_CC_UNIT,
 		     sizeof(struct brw_cc_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "WM_PROG",
 		     BRW_WM_PROG,
 		     sizeof(struct brw_wm_prog_key),
 		     sizeof(struct brw_wm_prog_data));
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SAMPLER_DEFAULT_COLOR",
 		     BRW_SAMPLER_DEFAULT_COLOR,
 		     sizeof(struct brw_sampler_default_color),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SAMPLER",
 		     BRW_SAMPLER,
 		     0,		/* variable key/data size */
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "WM_UNIT",
 		     BRW_WM_UNIT,
 		     sizeof(struct brw_wm_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SF_PROG",
 		     BRW_SF_PROG,
 		     sizeof(struct brw_sf_prog_key),
 		     sizeof(struct brw_sf_prog_data));
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SF_VP",
 		     BRW_SF_VP,
 		     sizeof(struct brw_sf_viewport),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SF_UNIT",
 		     BRW_SF_UNIT,
 		     sizeof(struct brw_sf_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "VS_UNIT",
 		     BRW_VS_UNIT,
 		     sizeof(struct brw_vs_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "VS_PROG",
 		     BRW_VS_PROG,
 		     sizeof(struct brw_vs_prog_key),
 		     sizeof(struct brw_vs_prog_data));
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "CLIP_UNIT",
 		     BRW_CLIP_UNIT,
 		     sizeof(struct brw_clip_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "CLIP_PROG",
 		     BRW_CLIP_PROG,
 		     sizeof(struct brw_clip_prog_key),
 		     sizeof(struct brw_clip_prog_data));
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "GS_UNIT",
 		     BRW_GS_UNIT,
 		     sizeof(struct brw_gs_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "GS_PROG",
 		     BRW_GS_PROG,
 		     sizeof(struct brw_gs_prog_key),
 		     sizeof(struct brw_gs_prog_data));
+}
+
+
+static void
+brw_init_surface_cache(struct brw_context *brw)
+{
+   struct brw_cache *cache = &brw->surface_cache;
+
+   cache->brw = brw;
 
-   brw_init_cache_id(brw,
+   cache->size = 7;
+   cache->n_items = 0;
+   cache->items = (struct brw_cache_item **)
+      _mesa_calloc(cache->size * sizeof(struct brw_cache_item));
+
+   brw_init_cache_id(cache,
 		     "SS_SURFACE",
 		     BRW_SS_SURFACE,
 		     sizeof(struct brw_surface_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SS_SURF_BIND",
 		     BRW_SS_SURF_BIND,
 		     0,
 		     0);
 }
 
+
+void
+brw_init_caches(struct brw_context *brw)
+{
+   brw_init_non_surface_cache(brw);
+   brw_init_surface_cache(brw);
+}
+
+
 static void
-brw_clear_cache( struct brw_context *brw )
+brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
 {
    struct brw_cache_item *c, *next;
    GLuint i;
@@ -457,8 +491,8 @@ brw_clear_cache( struct brw_context *brw )
    if (INTEL_DEBUG & DEBUG_STATE)
       _mesa_printf("%s\n", __FUNCTION__);
 
-   for (i = 0; i < brw->cache.size; i++) {
-      for (c = brw->cache.items[i]; c; c = next) {
+   for (i = 0; i < cache->size; i++) {
+      for (c = cache->items[i]; c; c = next) {
 	 int j;
 
 	 next = c->next;
@@ -468,10 +502,10 @@ brw_clear_cache( struct brw_context *brw )
 	 free((void *)c->key);
 	 free(c);
       }
-      brw->cache.items[i] = NULL;
+      cache->items[i] = NULL;
    }
 
-   brw->cache.n_items = 0;
+   cache->n_items = 0;
 
    if (brw->curbe.last_buf) {
       _mesa_free(brw->curbe.last_buf);
@@ -483,25 +517,46 @@ brw_clear_cache( struct brw_context *brw )
    brw->state.dirty.cache |= ~0;
 }
 
-void brw_state_cache_check_size( struct brw_context *brw )
+
+void
+brw_state_cache_check_size(struct brw_context *brw)
 {
+   if (INTEL_DEBUG & DEBUG_STATE)
+      _mesa_printf("%s (n_items=%d)\n", __FUNCTION__, brw->cache.n_items);
+
    /* un-tuned guess.  We've got around 20 state objects for a total of around
     * 32k, so 1000 of them is around 1.5MB.
     */
    if (brw->cache.n_items > 1000)
-      brw_clear_cache(brw);
+      brw_clear_cache(brw, &brw->cache);
+
+   if (brw->surface_cache.n_items > 1000)
+      brw_clear_cache(brw, &brw->surface_cache);
 }
 
-void brw_destroy_cache( struct brw_context *brw )
+
+static void
+brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
 {
    GLuint i;
 
-   brw_clear_cache(brw);
+   if (INTEL_DEBUG & DEBUG_STATE)
+      _mesa_printf("%s\n", __FUNCTION__);
+
+   brw_clear_cache(brw, cache);
    for (i = 0; i < BRW_MAX_CACHE; i++) {
-      dri_bo_unreference(brw->cache.last_bo[i]);
-      free(brw->cache.name[i]);
+      dri_bo_unreference(cache->last_bo[i]);
+      free(cache->name[i]);
    }
-   free(brw->cache.items);
-   brw->cache.items = NULL;
-   brw->cache.size = 0;
+   free(cache->items);
+   cache->items = NULL;
+   cache->size = 0;
+}
+
+
+void
+brw_destroy_caches(struct brw_context *brw)
+{
+   brw_destroy_cache(brw, &brw->cache);
+   brw_destroy_cache(brw, &brw->surface_cache);
 }
diff --git a/i965/brw_state_dump.c b/i965/brw_state_dump.c
index a713262..e94fa7d 100644
--- a/i965/brw_state_dump.c
+++ b/i965/brw_state_dump.c
@@ -126,6 +126,8 @@ static void dump_wm_surface_state(struct brw_context *brw)
 		surf->ss3.pitch + 1, surf->ss3.tiled_surface ? "" : "not ");
       state_out(name, surf, surfoff, 4, "mip base %d\n",
 		surf->ss4.min_lod);
+      state_out(name, surf, surfoff, 5, "x,y offset: %d,%d\n",
+		surf->ss5.x_offset, surf->ss5.y_offset);
 
       dri_bo_unmap(surf_bo);
    }
diff --git a/i965/brw_state_upload.c b/i965/brw_state_upload.c
index 5de1450..b817b74 100644
--- a/i965/brw_state_upload.c
+++ b/i965/brw_state_upload.c
@@ -59,11 +59,12 @@ const struct brw_tracked_state *atoms[] =
    &brw_curbe_offsets,
    &brw_recalculate_urb_fence,
 
-
    &brw_cc_vp,
    &brw_cc_unit,
 
-   &brw_wm_surfaces,		/* must do before samplers */
+   &brw_vs_surfaces,		/* must do before unit */
+   &brw_wm_constant_surface,	/* must do before wm surfaces/bind bo */
+   &brw_wm_surfaces,		/* must do before samplers and unit */
    &brw_wm_samplers,
 
    &brw_wm_unit,
@@ -88,54 +89,27 @@ const struct brw_tracked_state *atoms[] =
 
    &brw_line_stipple,
    &brw_aa_line_parameters,
-   /* Ordering of the commands below is documented as fixed.  
-    */
-#if 0
-   &brw_pipelined_state_pointers,
-   &brw_urb_fence,
-   &brw_constant_buffer_state,
-#else
+
    &brw_psp_urb_cbs,
-#endif
 
    &brw_drawing_rect,
    &brw_indices,
+   &brw_index_buffer,
    &brw_vertices,
 
-   NULL,			/* brw_constant_buffer */
+   &brw_constant_buffer
 };
 
 
 void brw_init_state( struct brw_context *brw )
 {
-   GLuint i;
-
-   brw_init_cache(brw);
-
-   brw->state.atoms = _mesa_malloc(sizeof(atoms));
-   brw->state.nr_atoms = sizeof(atoms)/sizeof(*atoms);
-   _mesa_memcpy(brw->state.atoms, atoms, sizeof(atoms));
-
-   /* Patch in a pointer to the dynamic state atom:
-    */
-   for (i = 0; i < brw->state.nr_atoms; i++)
-      if (brw->state.atoms[i] == NULL)
-	 brw->state.atoms[i] = &brw->curbe.tracked_state;
-
-   _mesa_memcpy(&brw->curbe.tracked_state, 
-		&brw_constant_buffer,
-		sizeof(brw_constant_buffer));
+   brw_init_caches(brw);
 }
 
 
 void brw_destroy_state( struct brw_context *brw )
 {
-   if (brw->state.atoms) {
-      _mesa_free(brw->state.atoms);
-      brw->state.atoms = NULL;
-   }
-
-   brw_destroy_cache(brw);
+   brw_destroy_caches(brw);
    brw_destroy_batch_cache(brw);
 }
 
@@ -218,6 +192,7 @@ static struct dirty_bit_map mesa_bits[] = {
    DEFINE_BIT(_NEW_MULTISAMPLE),
    DEFINE_BIT(_NEW_TRACK_MATRIX),
    DEFINE_BIT(_NEW_PROGRAM),
+   DEFINE_BIT(_NEW_PROGRAM_CONSTANTS),
    {0, 0, 0}
 };
 
@@ -231,10 +206,10 @@ static struct dirty_bit_map brw_bits[] = {
    DEFINE_BIT(BRW_NEW_PRIMITIVE),
    DEFINE_BIT(BRW_NEW_CONTEXT),
    DEFINE_BIT(BRW_NEW_WM_INPUT_DIMENSIONS),
-   DEFINE_BIT(BRW_NEW_INPUT_VARYING),
    DEFINE_BIT(BRW_NEW_PSP),
    DEFINE_BIT(BRW_NEW_FENCE),
    DEFINE_BIT(BRW_NEW_INDICES),
+   DEFINE_BIT(BRW_NEW_INDEX_BUFFER),
    DEFINE_BIT(BRW_NEW_VERTICES),
    DEFINE_BIT(BRW_NEW_BATCH),
    DEFINE_BIT(BRW_NEW_DEPTH_BUFFER),
@@ -312,6 +287,7 @@ void brw_validate_state( struct brw_context *brw )
    if (brw->emit_state_always) {
       state->mesa |= ~0;
       state->brw |= ~0;
+      state->cache |= ~0;
    }
 
    if (brw->fragment_program != ctx->FragmentProgram._Current) {
@@ -330,13 +306,13 @@ void brw_validate_state( struct brw_context *brw )
       return;
 
    if (brw->state.dirty.brw & BRW_NEW_CONTEXT)
-      brw_clear_batch_cache_flush(brw);
+      brw_clear_batch_cache(brw);
 
    brw->intel.Fallback = 0;
 
    /* do prepare stage for all atoms */
    for (i = 0; i < Elements(atoms); i++) {
-      const struct brw_tracked_state *atom = brw->state.atoms[i];
+      const struct brw_tracked_state *atom = atoms[i];
 
       if (brw->intel.Fallback)
          break;
@@ -347,6 +323,19 @@ void brw_validate_state( struct brw_context *brw )
         }
       }
    }
+
+   /* Make sure that the textures which are referenced by the current
+    * brw fragment program are actually present/valid.
+    * If this fails, we can experience GPU lock-ups.
+    */
+   {
+      const struct brw_fragment_program *fp;
+      fp = brw_fragment_program_const(brw->fragment_program);
+      if (fp) {
+         assert((fp->tex_units_used & ctx->Texture._EnabledUnits)
+                == fp->tex_units_used);
+      }
+   }
 }
 
 
@@ -367,8 +356,8 @@ void brw_upload_state(struct brw_context *brw)
       _mesa_memset(&examined, 0, sizeof(examined));
       prev = *state;
 
-      for (i = 0; i < brw->state.nr_atoms; i++) {	 
-	 const struct brw_tracked_state *atom = brw->state.atoms[i];
+      for (i = 0; i < Elements(atoms); i++) {	 
+	 const struct brw_tracked_state *atom = atoms[i];
 	 struct brw_state_flags generated;
 
 	 assert(atom->dirty.mesa ||
@@ -397,7 +386,7 @@ void brw_upload_state(struct brw_context *brw)
    }
    else {
       for (i = 0; i < Elements(atoms); i++) {	 
-	 const struct brw_tracked_state *atom = brw->state.atoms[i];
+	 const struct brw_tracked_state *atom = atoms[i];
 
 	 if (brw->intel.Fallback)
 	    break;
diff --git a/i965/brw_structs.h b/i965/brw_structs.h
index 89e2981..66d4127 100644
--- a/i965/brw_structs.h
+++ b/i965/brw_structs.h
@@ -33,6 +33,14 @@
 #ifndef BRW_STRUCTS_H
 #define BRW_STRUCTS_H
 
+
+/** Number of general purpose registers (VS, WM, etc) */
+#define BRW_MAX_GRF 128
+
+/** Number of message register file registers */
+#define BRW_MAX_MRF 16
+
+
 /* Command packets:
  */
 struct header 
@@ -434,8 +442,8 @@ struct brw_urb_fence
    {
       GLuint sf_fence:10;  
       GLuint vf_fence:10;  
-      GLuint cs_fence:10;  
-      GLuint pad:2;
+      GLuint cs_fence:11;  
+      GLuint pad:1;
    } bits1;
 };
 
@@ -815,7 +823,9 @@ struct brw_gs_unit_state
 
    struct
    {
-      GLuint pad0:10;
+      GLuint pad0:8;
+      GLuint rendering_enable:1; /* for IGDNG */
+      GLuint pad4:1;
       GLuint stats_enable:1; 
       GLuint nr_urb_entries:7; 
       GLuint pad1:1;
@@ -923,6 +933,28 @@ struct brw_wm_unit_state
    
    GLfloat global_depth_offset_constant;  
    GLfloat global_depth_offset_scale;   
+   
+   /* for IGDNG only */
+   struct {
+      GLuint pad0:1;
+      GLuint grf_reg_count_1:3; 
+      GLuint pad1:2;
+      GLuint kernel_start_pointer_1:26;
+   } wm8;       
+
+   struct {
+      GLuint pad0:1;
+      GLuint grf_reg_count_2:3; 
+      GLuint pad1:2;
+      GLuint kernel_start_pointer_2:26;
+   } wm9;       
+
+   struct {
+      GLuint pad0:1;
+      GLuint grf_reg_count_3:3; 
+      GLuint pad1:2;
+      GLuint kernel_start_pointer_3:26;
+   } wm10;       
 };
 
 struct brw_sampler_default_color {
@@ -1075,7 +1107,7 @@ struct brw_surface_state
       GLuint y_offset:4;
       GLuint pad0:1;
       GLuint x_offset:7;
-   } ss5;   /* NEW in Integrated Graphics Device */
+   } ss5;   /* New in G4X */
 
 };
 
@@ -1168,7 +1200,7 @@ struct brw_instruction
       GLuint predicate_control:4;
       GLuint predicate_inverse:1;
       GLuint execution_size:3;
-      GLuint destreg__conditonalmod:4; /* destreg - send, conditionalmod - others */
+      GLuint destreg__conditionalmod:4; /* destreg - send, conditionalmod - others */
       GLuint pad0:2;
       GLuint debug_control:1;
       GLuint saturate:1;
@@ -1196,7 +1228,9 @@ struct brw_instruction
 	 GLuint dest_reg_type:3;
 	 GLuint src0_reg_file:2;
 	 GLuint src0_reg_type:3;
-	 GLuint pad:6;
+	 GLuint src1_reg_file:2;        /* 0x00000c00 */
+	 GLuint src1_reg_type:3;        /* 0x00007000 */
+	 GLuint pad:1;
 	 GLint dest_indirect_offset:10;	/* offset against the deref'd address reg */
 	 GLuint dest_subreg_nr:3; /* subnr for the address reg a0.x */
 	 GLuint dest_horiz_stride:2;
@@ -1211,7 +1245,7 @@ struct brw_instruction
 	 GLuint src0_reg_type:3;
 	 GLuint src1_reg_file:2;
 	 GLuint src1_reg_type:3;
-	 GLuint pad0:1;
+	 GLuint pad:1;
 	 GLuint dest_writemask:4;
 	 GLuint dest_subreg_nr:1;
 	 GLuint dest_reg_nr:8;
@@ -1298,6 +1332,14 @@ struct brw_instruction
 	 GLuint pad1:6;
       } ia16;
 
+       struct 
+       {
+           GLuint pad:26;
+           GLuint end_of_thread:1;
+           GLuint pad1:1;
+           GLuint sfid:4;
+       } send_igdng;  /* for IGDNG only */
+
    } bits2;
 
    union
@@ -1308,7 +1350,7 @@ struct brw_instruction
 	 GLuint src1_reg_nr:8;
 	 GLuint src1_abs:1;
 	 GLuint src1_negate:1;
-	 GLuint pad:1;
+	 GLuint src1_address_mode:1;
 	 GLuint src1_horiz_stride:2;
 	 GLuint src1_width:3;
 	 GLuint src1_vert_stride:4;
@@ -1323,7 +1365,7 @@ struct brw_instruction
 	 GLuint src1_reg_nr:8;
 	 GLuint src1_abs:1;
 	 GLuint src1_negate:1;
-	 GLuint pad0:1;
+	 GLuint src1_address_mode:1;
 	 GLuint src1_swz_z:2;
 	 GLuint src1_swz_w:2;
 	 GLuint pad1:1;
@@ -1337,7 +1379,7 @@ struct brw_instruction
 	 GLuint src1_subreg_nr:3;
 	 GLuint src1_abs:1;
 	 GLuint src1_negate:1;
-	 GLuint pad0:1;
+	 GLuint src1_address_mode:1;
 	 GLuint src1_horiz_stride:2;
 	 GLuint src1_width:3;
 	 GLuint src1_vert_stride:4;
@@ -1385,6 +1427,21 @@ struct brw_instruction
       } math;
 
       struct {
+	 GLuint function:4;
+	 GLuint int_type:1;
+	 GLuint precision:1;
+	 GLuint saturate:1;
+	 GLuint data_type:1;
+	 GLuint snapshot:1;
+	 GLuint pad0:10;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } math_igdng;
+
+      struct {
 	 GLuint binding_table_index:8;
 	 GLuint sampler:4;
 	 GLuint return_format:2; 
@@ -1407,9 +1464,38 @@ struct brw_instruction
          GLuint end_of_thread:1;
       } sampler_g4x;
 
+      struct {
+	 GLuint binding_table_index:8;
+	 GLuint sampler:4;
+	 GLuint msg_type:4;
+	 GLuint simd_mode:2;
+	 GLuint pad0:1;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } sampler_igdng;
+
       struct brw_urb_immediate urb;
 
       struct {
+	 GLuint opcode:4;
+	 GLuint offset:6;
+	 GLuint swizzle_control:2; 
+	 GLuint pad:1;
+	 GLuint allocate:1;
+	 GLuint used:1;
+	 GLuint complete:1;
+	 GLuint pad0:3;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } urb_igdng;
+
+      struct {
 	 GLuint binding_table_index:8;
 	 GLuint msg_control:4;  
 	 GLuint msg_type:2;  
@@ -1423,6 +1509,19 @@ struct brw_instruction
 
       struct {
 	 GLuint binding_table_index:8;
+	 GLuint msg_control:3;  
+	 GLuint msg_type:3;  
+	 GLuint target_cache:2;    
+	 GLuint pad0:3;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } dp_read_igdng;
+
+      struct {
+	 GLuint binding_table_index:8;
 	 GLuint msg_control:3;
 	 GLuint pixel_scoreboard_clear:1;
 	 GLuint msg_type:3;    
@@ -1435,6 +1534,20 @@ struct brw_instruction
       } dp_write;
 
       struct {
+	 GLuint binding_table_index:8;
+	 GLuint msg_control:3;
+	 GLuint pixel_scoreboard_clear:1;
+	 GLuint msg_type:3;    
+	 GLuint send_commit_msg:1;
+	 GLuint pad0:3;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } dp_write_igdng;
+
+      struct {
 	 GLuint pad:16;
 	 GLuint response_length:4;
 	 GLuint msg_length:4;
@@ -1443,8 +1556,18 @@ struct brw_instruction
 	 GLuint end_of_thread:1;
       } generic;
 
+      struct {
+	 GLuint pad:19;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } generic_igdng;
+
       GLint d;
       GLuint ud;
+      float f;
    } bits3;
 };
 
diff --git a/i965/brw_tex_layout.c b/i965/brw_tex_layout.c
index 51a617f..5986cbf 100644
--- a/i965/brw_tex_layout.c
+++ b/i965/brw_tex_layout.c
@@ -28,7 +28,6 @@
   * Authors:
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
-        
 
 /* Code to layout images in a mipmap tree for i965.
  */
@@ -37,17 +36,93 @@
 #include "intel_tex_layout.h"
 #include "intel_context.h"
 #include "main/macros.h"
+#include "intel_chipset.h"
 
 #define FILE_DEBUG_FLAG DEBUG_MIPTREE
 
-GLboolean brw_miptree_layout( struct intel_context *intel, struct intel_mipmap_tree *mt )
+GLboolean brw_miptree_layout(struct intel_context *intel,
+			     struct intel_mipmap_tree *mt,
+			     uint32_t tiling)
 {
-   /* XXX: these vary depending on image format: 
-    */
-/*    GLint align_w = 4; */
+   /* XXX: these vary depending on image format: */
+   /* GLint align_w = 4; */
 
    switch (mt->target) {
-   case GL_TEXTURE_CUBE_MAP: 
+   case GL_TEXTURE_CUBE_MAP:
+      if (IS_IGDNG(intel->intelScreen->deviceID)) {
+          GLuint align_h = 2, align_w = 4;
+          GLuint level;
+          GLuint x = 0;
+          GLuint y = 0;
+          GLuint width = mt->width0;
+          GLuint height = mt->height0;
+          GLuint qpitch = 0;
+          GLuint y_pitch = 0;
+
+          mt->pitch = mt->width0;
+          intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h);
+          y_pitch = ALIGN(height, align_h);
+
+          if (mt->compressed) {
+              mt->pitch = ALIGN(mt->width0, align_w);
+          }
+
+          if (mt->first_level != mt->last_level) {
+              GLuint mip1_width;
+
+              if (mt->compressed) {
+                  mip1_width = ALIGN(minify(mt->width0), align_w)
+                      + ALIGN(minify(minify(mt->width0)), align_w);
+              } else {
+                  mip1_width = ALIGN(minify(mt->width0), align_w)
+                      + minify(minify(mt->width0));
+              }
+
+              if (mip1_width > mt->pitch) {
+                  mt->pitch = mip1_width;
+              }
+          }
+
+          mt->pitch = intel_miptree_pitch_align(intel, mt, tiling, mt->pitch);
+
+          if (mt->compressed) {
+              qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * mt->pitch * mt->cpp;
+              mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * 6;
+          } else {
+              qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * mt->pitch * mt->cpp;
+              mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * 6;
+          }
+
+          for (level = mt->first_level; level <= mt->last_level; level++) {
+              GLuint img_height;
+              GLuint nr_images = 6;
+              GLuint q = 0;
+
+              intel_miptree_set_level_info(mt, level, nr_images, x, y, width, 
+                                           height, 1);
+
+              for (q = 0; q < nr_images; q++)
+                  intel_miptree_set_image_offset_ex(mt, level, q, x, y, q * qpitch);
+
+              if (mt->compressed)
+                  img_height = MAX2(1, height/4);
+              else
+                  img_height = ALIGN(height, align_h);
+
+              if (level == mt->first_level + 1) {
+                  x += ALIGN(width, align_w);
+              }
+              else {
+                  y += img_height;
+              }
+
+              width  = minify(width);
+              height = minify(height);
+          }
+
+          break;
+      }
+
    case GL_TEXTURE_3D: {
       GLuint width  = mt->width0;
       GLuint height = mt->height0;
@@ -59,25 +134,25 @@ GLboolean brw_miptree_layout( struct intel_context *intel, struct intel_mipmap_t
       GLuint align_w = 4;
 
       mt->total_height = 0;
-      
+      intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h);
+
       if (mt->compressed) {
-          align_w = intel_compressed_alignment(mt->internal_format);
           mt->pitch = ALIGN(width, align_w);
           pack_y_pitch = (height + 3) / 4;
       } else {
-          mt->pitch = intel_miptree_pitch_align (intel, mt, mt->width0);
-          pack_y_pitch = ALIGN(mt->height0, align_h);
+	 mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
+	 pack_y_pitch = ALIGN(mt->height0, align_h);
       }
 
-      pack_x_pitch = mt->pitch;
+      pack_x_pitch = width;
       pack_x_nr = 1;
 
-      for ( level = mt->first_level ; level <= mt->last_level ; level++ ) {
+      for (level = mt->first_level ; level <= mt->last_level ; level++) {
 	 GLuint nr_images = mt->target == GL_TEXTURE_3D ? depth : 6;
 	 GLint x = 0;
 	 GLint y = 0;
 	 GLint q, j;
-	    
+
 	 intel_miptree_set_level_info(mt, level, nr_images,
 				      0, mt->total_height,
 				      width, height, depth);
@@ -89,7 +164,7 @@ GLboolean brw_miptree_layout( struct intel_context *intel, struct intel_mipmap_t
 	    }
 
 	    x = 0;
-	    y += pack_y_pitch;	    
+	    y += pack_y_pitch;
 	 }
 
 
@@ -98,40 +173,50 @@ GLboolean brw_miptree_layout( struct intel_context *intel, struct intel_mipmap_t
 	 height = minify(height);
 	 depth  = minify(depth);
 
-    if (mt->compressed) {
-        pack_y_pitch = (height + 3) / 4;
-        
-        if (pack_x_pitch > ALIGN(width, align_w)) {
-            pack_x_pitch = ALIGN(width, align_w);
-            pack_x_nr <<= 1;
-        }
-    } else {
-        if (pack_x_pitch > 4) {
-            pack_x_pitch >>= 1;
-            pack_x_nr <<= 1;
-            assert(pack_x_pitch * pack_x_nr <= mt->pitch);
-        }
-
-        if (pack_y_pitch > 2) {
-            pack_y_pitch >>= 1;
-            pack_y_pitch = ALIGN(pack_y_pitch, align_h);
-        }
-    }
+	 if (mt->compressed) {
+	    pack_y_pitch = (height + 3) / 4;
+
+	    if (pack_x_pitch > ALIGN(width, align_w)) {
+	       pack_x_pitch = ALIGN(width, align_w);
+	       pack_x_nr <<= 1;
+	    }
+	 } else {
+	    if (pack_x_pitch > 4) {
+	       pack_x_pitch >>= 1;
+	       pack_x_nr <<= 1;
+	       assert(pack_x_pitch * pack_x_nr <= mt->pitch);
+	    }
+
+	    if (pack_y_pitch > 2) {
+	       pack_y_pitch >>= 1;
+	       pack_y_pitch = ALIGN(pack_y_pitch, align_h);
+	    }
+	 }
 
       }
+      /* The 965's sampler lays cachelines out according to how accesses
+       * in the texture surfaces run, so they may be "vertical" through
+       * memory.  As a result, the docs say in Surface Padding Requirements:
+       * Sampling Engine Surfaces that two extra rows of padding are required.
+       * We don't know of similar requirements for pre-965, but given that
+       * those docs are silent on padding requirements in general, let's play
+       * it safe.
+       */
+      if (mt->target == GL_TEXTURE_CUBE_MAP)
+	 mt->total_height += 2;
       break;
    }
 
    default:
-      i945_miptree_layout_2d(intel, mt);
+      i945_miptree_layout_2d(intel, mt, tiling);
       break;
    }
-   DBG("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__, 
-		mt->pitch, 
+   DBG("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__,
+		mt->pitch,
 		mt->total_height,
 		mt->cpp,
 		mt->pitch * mt->total_height * mt->cpp );
-		
+
    return GL_TRUE;
 }
 
diff --git a/i965/brw_urb.c b/i965/brw_urb.c
index 7673dd3..8c6f435 100644
--- a/i965/brw_urb.c
+++ b/i965/brw_urb.c
@@ -143,7 +143,29 @@ static void recalculate_urb_fence( struct brw_context *brw )
       brw->urb.nr_clip_entries = limits[CLP].preferred_nr_entries;
       brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries;	
       brw->urb.nr_cs_entries = limits[CS].preferred_nr_entries;	
-      
+
+      brw->urb.constrained = 0;
+
+      if (BRW_IS_IGDNG(brw)) {
+         brw->urb.nr_vs_entries = 128;
+         brw->urb.nr_sf_entries = 48;
+         if (check_urb_layout(brw)) {
+            goto done;
+         } else {
+            brw->urb.constrained = 1;
+            brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
+            brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries;
+         }
+      } else if (BRW_IS_G4X(brw)) {
+	 brw->urb.nr_vs_entries = 64;
+	 if (check_urb_layout(brw)) {
+	    goto done;
+	 } else {
+	    brw->urb.constrained = 1;
+	    brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
+	 }
+      }
+
       if (!check_urb_layout(brw)) {
 	 brw->urb.nr_vs_entries = limits[VS].min_nr_entries;	
 	 brw->urb.nr_gs_entries = limits[GS].min_nr_entries;	
@@ -169,9 +191,8 @@ static void recalculate_urb_fence( struct brw_context *brw )
 	 if (INTEL_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS))
 	    _mesa_printf("URB CONSTRAINED\n");
       }
-      else 
-	 brw->urb.constrained = 0;
 
+done:
       if (INTEL_DEBUG & DEBUG_URB)
 	 _mesa_printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
 		      brw->urb.vs_start,
diff --git a/i965/brw_vs.c b/i965/brw_vs.c
index e3111c6..f0c79ef 100644
--- a/i965/brw_vs.c
+++ b/i965/brw_vs.c
@@ -90,8 +90,6 @@ static void brw_upload_vs_prog(struct brw_context *brw)
    struct brw_vertex_program *vp = 
       (struct brw_vertex_program *)brw->vertex_program;
 
-   assert (vp && !vp->program.IsNVProgram);
-   
    memset(&key, 0, sizeof(key));
 
    /* Just upload the program verbatim for now.  Always send it all
diff --git a/i965/brw_vs.h b/i965/brw_vs.h
index 1e4f660..4a59136 100644
--- a/i965/brw_vs.h
+++ b/i965/brw_vs.h
@@ -58,6 +58,7 @@ struct brw_vs_compile {
 
    GLuint first_output;
    GLuint nr_outputs;
+   GLuint first_overflow_output; /**< VERT_ATTRIB_x */
 
    GLuint first_tmp;
    GLuint last_tmp;
diff --git a/i965/brw_vs_emit.c b/i965/brw_vs_emit.c
index b69616d..108e19c 100644
--- a/i965/brw_vs_emit.c
+++ b/i965/brw_vs_emit.c
@@ -68,14 +68,20 @@ static void release_tmps( struct brw_vs_compile *c )
 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 {
    GLuint i, reg = 0, mrf;
+   int attributes_in_vue;
 
-#if 0
-   if (c->vp->program.Base.Parameters->NumParameters >= 6)
-      c->vp->use_const_buffer = 1;
+   /* Determine whether to use a real constant buffer or use a block
+    * of GRF registers for constants.  The later is faster but only
+    * works if everything fits in the GRF.
+    * XXX this heuristic/check may need some fine tuning...
+    */
+   if (c->vp->program.Base.Parameters->NumParameters +
+       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
+      c->vp->use_const_buffer = GL_TRUE;
    else
-#endif
       c->vp->use_const_buffer = GL_FALSE;
-   /*printf("use_const_buffer = %d\n", c->use_const_buffer);*/
+
+   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
 
    /* r0 -- reserved as usual
     */
@@ -123,16 +129,27 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 	 reg++;
       }
    }
+   /* If there are no inputs, we'll still be reading one attribute's worth
+    * because it's required -- see urb_read_length setting.
+    */
+   if (c->nr_inputs == 0)
+      reg++;
 
-   /* Allocate outputs: TODO: could organize the non-position outputs
-    * to go straight into message regs.
+   /* Allocate outputs.  The non-position outputs go straight into message regs.
     */
    c->nr_outputs = 0;
    c->first_output = reg;
-   mrf = 4;
+   c->first_overflow_output = 0;
+
+   if (BRW_IS_IGDNG(c->func.brw))
+       mrf = 8;
+   else
+       mrf = 4;
+
    for (i = 0; i < VERT_RESULT_MAX; i++) {
       if (c->prog_data.outputs_written & (1 << i)) {
 	 c->nr_outputs++;
+         assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
 	 if (i == VERT_RESULT_HPOS) {
 	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 	    reg++;
@@ -143,8 +160,17 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 	    mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
 	 }
 	 else {
-	    c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
-	    mrf++;
+            if (mrf < 16) {
+               c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
+               mrf++;
+            }
+            else {
+               /* too many vertex results to fit in MRF, use GRF for overflow */
+               if (!c->first_overflow_output)
+                  c->first_overflow_output = i;
+               c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
+               reg++;
+            }
 	 }
       }
    }     
@@ -200,8 +226,23 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
     * vertex urb, so is half the amount:
     */
    c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
+   /* Setting this field to 0 leads to undefined behavior according to the
+    * the VS_STATE docs.  Our VUEs will always have at least one attribute
+    * sitting in them, even if it's padding.
+    */
+   if (c->prog_data.urb_read_length == 0)
+      c->prog_data.urb_read_length = 1;
+
+   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
+    * them to fit the biggest thing they need to.
+    */
+   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
+
+   if (BRW_IS_IGDNG(c->func.brw))
+       c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
+   else
+       c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
 
-   c->prog_data.urb_entry_size = (c->nr_outputs + 2 + 3) / 4;
    c->prog_data.total_grf = reg;
 
    if (INTEL_DEBUG & DEBUG_VS) {
@@ -709,10 +750,11 @@ get_constant(struct brw_vs_compile *c,
    struct brw_compile *p = &c->func;
    struct brw_reg const_reg;
    struct brw_reg const2_reg;
+   const GLboolean relAddr = src->RelAddr;
 
    assert(argIndex < 3);
 
-   if (c->current_const[argIndex].index != src->Index || src->RelAddr) {
+   if (c->current_const[argIndex].index != src->Index || relAddr) {
       struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
 
       c->current_const[argIndex].index = src->Index;
@@ -725,13 +767,13 @@ get_constant(struct brw_vs_compile *c,
       brw_dp_READ_4_vs(p,
                        c->current_const[argIndex].reg,/* writeback dest */
                        0,                             /* oword */
-                       src->RelAddr,                  /* relative indexing? */
+                       relAddr,                       /* relative indexing? */
                        addrReg,                       /* address register */
                        16 * src->Index,               /* byte offset */
                        SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                        );
 
-      if (src->RelAddr) {
+      if (relAddr) {
          /* second read */
          const2_reg = get_tmp(c);
 
@@ -742,7 +784,7 @@ get_constant(struct brw_vs_compile *c,
          brw_dp_READ_4_vs(p,
                           const2_reg,              /* writeback dest */
                           1,                       /* oword */
-                          src->RelAddr,            /* relative indexing? */
+                          relAddr,                 /* relative indexing? */
                           addrReg,                 /* address register */
                           16 * src->Index,         /* byte offset */
                           SURF_INDEX_VERT_CONST_BUFFER
@@ -752,7 +794,7 @@ get_constant(struct brw_vs_compile *c,
 
    const_reg = c->current_const[argIndex].reg;
 
-   if (src->RelAddr) {
+   if (relAddr) {
       /* merge the two Owords into the constant register */
       /* const_reg[7..4] = const2_reg[7..4] */
       brw_MOV(p,
@@ -869,6 +911,7 @@ get_src_reg( struct brw_vs_compile *c,
    case PROGRAM_STATE_VAR:
    case PROGRAM_CONSTANT:
    case PROGRAM_UNIFORM:
+   case PROGRAM_ENV_PARAM:
       if (c->vp->use_const_buffer) {
          return get_constant(c, inst, argIndex);
       }
@@ -888,7 +931,6 @@ get_src_reg( struct brw_vs_compile *c,
       return brw_null_reg();
 
    case PROGRAM_LOCAL_PARAM: 
-   case PROGRAM_ENV_PARAM: 
    case PROGRAM_WRITE_ONLY:
    default:
       assert(0);
@@ -1061,6 +1103,8 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    struct brw_reg m0 = brw_message_reg(0);
    struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
    struct brw_reg ndc;
+   int eot;
+   GLuint len_vertext_header = 2;
 
    if (c->key.copy_edgeflag) {
       brw_MOV(p, 
@@ -1070,14 +1114,16 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 
    /* Build ndc coords */
    ndc = get_tmp(c);
+   /* ndc = 1.0 / pos.w */
    emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
+   /* ndc.xyz = pos * ndc */
    brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
 
    /* Update the header for point size, user clipping flags, and -ve rhw
     * workaround.
     */
    if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
-       c->key.nr_userclip || !BRW_IS_G4X(p->brw))
+       c->key.nr_userclip || BRW_IS_965(p->brw))
    {
       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
       GLuint i;
@@ -1108,7 +1154,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * Later, clipping will detect ucp[6] and ensure the primitive is
        * clipped against all fixed planes.
        */
-      if (!BRW_IS_G4X(p->brw)) {
+      if (BRW_IS_965(p->brw)) {
 	 brw_CMP(p,
 		 vec8(brw_null_reg()),
 		 BRW_CONDITIONAL_L,
@@ -1135,7 +1181,23 @@ static void emit_vertex_write( struct brw_vs_compile *c)
     */
    brw_set_access_mode(p, BRW_ALIGN_1);
    brw_MOV(p, offset(m0, 2), ndc);
-   brw_MOV(p, offset(m0, 3), pos);
+
+   if (BRW_IS_IGDNG(p->brw)) {
+       /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
+       brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
+       /* m4, m5 contain the distances from vertex to the user clip planeXXX. 
+        * Seems it is useless for us.
+        * m6 is used for aligning, so that the remainder of vertex element is 
+        * reg-aligned.
+        */
+       brw_MOV(p, offset(m0, 7), pos); /* the remainder of vertex element */
+       len_vertext_header = 6;
+   } else {
+       brw_MOV(p, offset(m0, 3), pos);
+       len_vertext_header = 2;
+   }
+
+   eot = (c->first_overflow_output == 0);
 
    brw_urb_WRITE(p, 
 		 brw_null_reg(), /* dest */
@@ -1143,12 +1205,43 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 		 c->r0,		/* src */
 		 0,		/* allocate */
 		 1,		/* used */
-		 c->nr_outputs + 3, /* msg len */
+		 MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */
 		 0,		/* response len */
-		 1, 		/* eot */
+		 eot, 		/* eot */
 		 1, 		/* writes complete */
 		 0, 		/* urb destination offset */
 		 BRW_URB_SWIZZLE_INTERLEAVE);
+
+   if (c->first_overflow_output > 0) {
+      /* Not all of the vertex outputs/results fit into the MRF.
+       * Move the overflowed attributes from the GRF to the MRF and
+       * issue another brw_urb_WRITE().
+       */
+      /* XXX I'm not 100% sure about which MRF regs to use here.  Starting
+       * at mrf[4] atm...
+       */
+      GLuint i, mrf = 0;
+      for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
+         if (c->prog_data.outputs_written & (1 << i)) {
+            /* move from GRF to MRF */
+            brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
+            mrf++;
+         }
+      }
+
+      brw_urb_WRITE(p,
+                    brw_null_reg(), /* dest */
+                    4,              /* starting mrf reg nr */
+                    c->r0,          /* src */
+                    0,              /* allocate */
+                    1,              /* used */
+                    mrf+1,          /* msg len */
+                    0,              /* response len */
+                    1,              /* eot */
+                    1,              /* writes complete */
+                    BRW_MAX_MRF-1,  /* urb destination offset */
+                    BRW_URB_SWIZZLE_INTERLEAVE);
+   }
 }
 
 
@@ -1169,28 +1262,49 @@ post_vs_emit( struct brw_vs_compile *c,
 
    /* patch up the END code to jump past subroutines, etc */
    offset = last_inst - end_inst;
-   brw_set_src1(end_inst, brw_imm_d(offset * 16));
+   if (offset > 1) {
+      brw_set_src1(end_inst, brw_imm_d(offset * 16));
+   } else {
+      end_inst->header.opcode = BRW_OPCODE_NOP;
+   }
 }
 
+static uint32_t
+get_predicate(uint32_t swizzle)
+{
+   switch (swizzle) {
+   case SWIZZLE_XXXX:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_X;
+   case SWIZZLE_YYYY:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
+   case SWIZZLE_ZZZZ:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
+   case SWIZZLE_WWWW:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_W;
+   default:
+      _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n", swizzle);
+      return BRW_PREDICATE_NORMAL;
+   }
+}
 
 /* Emit the vertex program instructions here.
  */
 void brw_vs_emit(struct brw_vs_compile *c )
 {
-#define MAX_IFSN 32
+#define MAX_IF_DEPTH 32
+#define MAX_LOOP_DEPTH 32
    struct brw_compile *p = &c->func;
-   GLuint nr_insns = c->vp->program.Base.NumInstructions;
-   GLuint insn, if_insn = 0;
+   const GLuint nr_insns = c->vp->program.Base.NumInstructions;
+   GLuint insn, if_depth = 0, loop_depth = 0;
    GLuint end_offset = 0;
    struct brw_instruction *end_inst, *last_inst;
-   struct brw_instruction *if_inst[MAX_IFSN];
-   struct brw_indirect stack_index = brw_indirect(0, 0);   
-
+   struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
+   const struct brw_indirect stack_index = brw_indirect(0, 0);   
    GLuint index;
    GLuint file;
 
    if (INTEL_DEBUG & DEBUG_VS) {
-      _mesa_printf("vs-emit:\n");
+      _mesa_printf("vs-mesa:\n");
       _mesa_print_program(&c->vp->program.Base); 
       _mesa_printf("\n");
    }
@@ -1219,7 +1333,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
 
    for (insn = 0; insn < nr_insns; insn++) {
 
-      struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
+      const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
       struct brw_reg args[3], dst;
       GLuint i;
       
@@ -1232,7 +1346,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
        */
       if (inst->Opcode != OPCODE_SWZ)
 	  for (i = 0; i < 3; i++) {
-	      struct prog_src_register *src = &inst->SrcReg[i];
+	      const struct prog_src_register *src = &inst->SrcReg[i];
 	      index = src->Index;
 	      file = src->File;	
 	      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
@@ -1376,16 +1490,54 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 emit_xpd(p, dst, args[0], args[1]);
 	 break;
       case OPCODE_IF:
-	 assert(if_insn < MAX_IFSN);
-         if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
+	 assert(if_depth < MAX_IF_DEPTH);
+	 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
+	 if_inst[if_depth]->header.predicate_control =
+	    get_predicate(inst->DstReg.CondSwizzle);
+	 if_depth++;
 	 break;
       case OPCODE_ELSE:
-	 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
+	 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
 	 break;
       case OPCODE_ENDIF:
-         assert(if_insn > 0);
-	 brw_ENDIF(p, if_inst[--if_insn]);
+         assert(if_depth > 0);
+	 brw_ENDIF(p, if_inst[--if_depth]);
 	 break;			
+#if 0
+      case OPCODE_BGNLOOP:
+         loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
+         break;
+      case OPCODE_BRK:
+         brw_BREAK(p);
+         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+         break;
+      case OPCODE_CONT:
+         brw_CONT(p);
+         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+         break;
+      case OPCODE_ENDLOOP: 
+         {
+            struct brw_instruction *inst0, *inst1;
+            loop_depth--;
+            inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
+            /* patch all the BREAK/CONT instructions from last BEGINLOOP */
+            while (inst0 > loop_inst[loop_depth]) {
+               inst0--;
+               if (inst0->header.opcode == BRW_OPCODE_BREAK) {
+                  inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
+                  inst0->bits3.if_else.pop_count = 0;
+               }
+               else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
+                  inst0->bits3.if_else.jump_count = inst1 - inst0;
+                  inst0->bits3.if_else.pop_count = 0;
+               }
+            }
+         }
+         break;
+#else
+         (void) loop_inst;
+         (void) loop_depth;
+#endif
       case OPCODE_BRA:
          brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
@@ -1430,6 +1582,19 @@ void brw_vs_emit(struct brw_vs_compile *c )
 				    "unknown");
       }
 
+      /* Set the predication update on the last instruction of the native
+       * instruction sequence.
+       *
+       * This would be problematic if it was set on a math instruction,
+       * but that shouldn't be the case with the current GLSL compiler.
+       */
+      if (inst->CondUpdate) {
+	 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
+
+	 assert(hw_insn->header.destreg__conditionalmod == 0);
+	 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
+      }
+
       if ((inst->DstReg.File == PROGRAM_OUTPUT)
           && (inst->DstReg.Index != VERT_RESULT_HPOS)
           && c->output_regs[inst->DstReg.Index].used_in_src) {
@@ -1467,4 +1632,13 @@ void brw_vs_emit(struct brw_vs_compile *c )
    emit_vertex_write(c);
 
    post_vs_emit(c, end_inst, last_inst);
+
+   if (INTEL_DEBUG & DEBUG_VS) {
+      int i;
+
+      _mesa_printf("vs-native:\n");
+      for (i = 0; i < p->nr_insn; i++)
+	 brw_disasm(stderr, &p->store[i]);
+      _mesa_printf("\n");
+   }
 }
diff --git a/i965/brw_vs_state.c b/i965/brw_vs_state.c
index 3d29538..d790ab6 100644
--- a/i965/brw_vs_state.c
+++ b/i965/brw_vs_state.c
@@ -97,7 +97,11 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
     * brw_urb_WRITE() results.
     */
    vs.thread1.single_program_flow = 0;
-   vs.thread1.binding_table_entry_count = key->nr_surfaces;
+
+   if (BRW_IS_IGDNG(brw))
+      vs.thread1.binding_table_entry_count = 0; /* hardware requirement */
+   else
+      vs.thread1.binding_table_entry_count = key->nr_surfaces;
 
    vs.thread3.urb_entry_read_length = key->urb_entry_read_length;
    vs.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
@@ -105,10 +109,16 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
    vs.thread3.urb_entry_read_offset = 0;
    vs.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
 
-   vs.thread4.nr_urb_entries = key->nr_urb_entries;
+   if (BRW_IS_IGDNG(brw))
+       vs.thread4.nr_urb_entries = key->nr_urb_entries >> 2;
+   else
+       vs.thread4.nr_urb_entries = key->nr_urb_entries;
+
    vs.thread4.urb_entry_allocation_size = key->urb_size - 1;
 
-   if (BRW_IS_G4X(brw))
+   if (BRW_IS_IGDNG(brw))
+      chipset_max_threads = 72;
+   else if (BRW_IS_G4X(brw))
       chipset_max_threads = 32;
    else
       chipset_max_threads = 16;
@@ -120,6 +130,8 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
 
    /* No samplers for ARB_vp programs:
     */
+   /* It has to be set to 0 for IGDNG
+    */
    vs.vs5.sampler_count = 0;
 
    if (INTEL_DEBUG & DEBUG_STATS)
diff --git a/i965/brw_vs_surface_state.c b/i965/brw_vs_surface_state.c
new file mode 100644
index 0000000..89f4752
--- /dev/null
+++ b/i965/brw_vs_surface_state.c
@@ -0,0 +1,226 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "main/mtypes.h"
+#include "main/texformat.h"
+#include "main/texstore.h"
+#include "shader/prog_parameter.h"
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+
+/* Creates a new VS constant buffer reflecting the current VS program's
+ * constants, if needed by the VS program.
+ *
+ * Otherwise, constants go through the CURBEs using the brw_constant_buffer
+ * state atom.
+ */
+static drm_intel_bo *
+brw_vs_update_constant_buffer(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
+   struct brw_vertex_program *vp =
+      (struct brw_vertex_program *) brw->vertex_program;
+   const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
+   const int size = params->NumParameters * 4 * sizeof(GLfloat);
+   drm_intel_bo *const_buffer;
+
+   /* BRW_NEW_VERTEX_PROGRAM */
+   if (!vp->use_const_buffer)
+      return NULL;
+
+   const_buffer = drm_intel_bo_alloc(intel->bufmgr, "vp_const_buffer",
+				     size, 64);
+
+   /* _NEW_PROGRAM_CONSTANTS */
+   dri_bo_subdata(const_buffer, 0, size, params->ParameterValues);
+
+   return const_buffer;
+}
+
+/**
+ * Update the surface state for a VS constant buffer.
+ *
+ * Sets brw->vs.surf_bo[surf] and brw->vp->const_buffer.
+ */
+static void
+brw_update_vs_constant_surface( GLcontext *ctx,
+                                GLuint surf)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct brw_surface_key key;
+   struct brw_vertex_program *vp =
+      (struct brw_vertex_program *) brw->vertex_program;
+   const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
+
+   assert(surf == 0);
+
+   /* If we're in this state update atom, we need to update VS constants, so
+    * free the old buffer and create a new one for the new contents.
+    */
+   dri_bo_unreference(vp->const_buffer);
+   vp->const_buffer = brw_vs_update_constant_buffer(brw);
+
+   /* If there's no constant buffer, then no surface BO is needed to point at
+    * it.
+    */
+   if (vp->const_buffer == 0) {
+      drm_intel_bo_unreference(brw->vs.surf_bo[surf]);
+      brw->vs.surf_bo[surf] = NULL;
+      return;
+   }
+
+   memset(&key, 0, sizeof(key));
+
+   key.format = MESA_FORMAT_RGBA_FLOAT32;
+   key.internal_format = GL_RGBA;
+   key.bo = vp->const_buffer;
+   key.depthmode = GL_NONE;
+   key.pitch = params->NumParameters;
+   key.width = params->NumParameters;
+   key.height = 1;
+   key.depth = 1;
+   key.cpp = 16;
+
+   /*
+   printf("%s:\n", __FUNCTION__);
+   printf("  width %d  height %d  depth %d  cpp %d  pitch %d\n",
+          key.width, key.height, key.depth, key.cpp, key.pitch);
+   */
+
+   drm_intel_bo_unreference(brw->vs.surf_bo[surf]);
+   brw->vs.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
+                                            BRW_SS_SURFACE,
+                                            &key, sizeof(key),
+                                            &key.bo, key.bo ? 1 : 0,
+                                            NULL);
+   if (brw->vs.surf_bo[surf] == NULL) {
+      brw->vs.surf_bo[surf] = brw_create_constant_surface(brw, &key);
+   }
+}
+
+
+/**
+ * Constructs the binding table for the VS surface state.
+ */
+static dri_bo *
+brw_vs_get_binding_table(struct brw_context *brw)
+{
+   dri_bo *bind_bo;
+
+   bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
+			      NULL, 0,
+			      brw->vs.surf_bo, BRW_VS_MAX_SURF,
+			      NULL);
+
+   if (bind_bo == NULL) {
+      GLuint data_size = BRW_VS_MAX_SURF * sizeof(GLuint);
+      uint32_t *data = malloc(data_size);
+      int i;
+
+      for (i = 0; i < BRW_VS_MAX_SURF; i++)
+         if (brw->vs.surf_bo[i])
+            data[i] = brw->vs.surf_bo[i]->offset;
+         else
+            data[i] = 0;
+
+      bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
+				  NULL, 0,
+				  brw->vs.surf_bo, BRW_VS_MAX_SURF,
+				  data, data_size,
+				  NULL, NULL);
+
+      /* Emit binding table relocations to surface state */
+      for (i = 0; i < BRW_VS_MAX_SURF; i++) {
+	 if (brw->vs.surf_bo[i] != NULL) {
+	    /* The presumed offsets were set in the data values for
+	     * brw_upload_cache.
+	     */
+	    drm_intel_bo_emit_reloc(bind_bo, i * 4,
+				    brw->vs.surf_bo[i], 0,
+				    I915_GEM_DOMAIN_INSTRUCTION, 0);
+	 }
+      }
+
+      free(data);
+   }
+
+   return bind_bo;
+}
+
+/**
+ * Vertex shader surfaces (constant buffer).
+ *
+ * This consumes the state updates for the constant buffer needing
+ * to be updated, and produces BRW_NEW_NR_VS_SURFACES for the VS unit and
+ * CACHE_NEW_SURF_BIND for the binding table upload.
+ */
+static void prepare_vs_surfaces(struct brw_context *brw )
+{
+   GLcontext *ctx = &brw->intel.ctx;
+   int i;
+   int nr_surfaces = 0;
+
+   brw_update_vs_constant_surface(ctx, SURF_INDEX_VERT_CONST_BUFFER);
+
+   for (i = 0; i < BRW_VS_MAX_SURF; i++) {
+      if (brw->vs.surf_bo[i] != NULL) {
+	 nr_surfaces = i + 1;
+      }
+   }
+
+   if (brw->vs.nr_surfaces != nr_surfaces) {
+      brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES;
+      brw->vs.nr_surfaces = nr_surfaces;
+   }
+
+   /* Note that we don't end up updating the bind_bo if we don't have a
+    * surface to be pointing at.  This should be relatively harmless, as it
+    * just slightly increases our working set size.
+    */
+   if (brw->vs.nr_surfaces != 0) {
+      dri_bo_unreference(brw->vs.bind_bo);
+      brw->vs.bind_bo = brw_vs_get_binding_table(brw);
+   }
+}
+
+const struct brw_tracked_state brw_vs_surfaces = {
+   .dirty = {
+      .mesa = (_NEW_PROGRAM_CONSTANTS),
+      .brw = (BRW_NEW_VERTEX_PROGRAM),
+      .cache = 0
+   },
+   .prepare = prepare_vs_surfaces,
+};
+
+
+
diff --git a/i965/brw_vtbl.c b/i965/brw_vtbl.c
index ba03afd..ac11790 100644
--- a/i965/brw_vtbl.c
+++ b/i965/brw_vtbl.c
@@ -177,14 +177,6 @@ static void brw_note_fence( struct intel_context *intel, GLuint fence )
    brw_context(&intel->ctx)->state.dirty.brw |= BRW_NEW_FENCE;
 }
 
-
-static void brw_note_unlock( struct intel_context *intel )
-{
-   struct brw_context *brw = brw_context(&intel->ctx);
-   brw_state_cache_check_size(brw);
-}
-
-
 /* called from intelWaitForIdle() and intelFlush()
  *
  * For now, just flush everything.  Could be smarter later.
@@ -194,7 +186,7 @@ static GLuint brw_flush_cmd( void )
    struct brw_mi_flush flush;
    flush.opcode = CMD_MI_FLUSH;
    flush.pad = 0;
-   flush.flags = BRW_FLUSH_READ_CACHE | BRW_FLUSH_STATE_CACHE;
+   flush.flags = BRW_FLUSH_STATE_CACHE;
    return *(GLuint *)&flush;
 }
 
@@ -215,7 +207,6 @@ void brwInitVtbl( struct brw_context *brw )
 
    brw->intel.vtbl.invalidate_state = brw_invalidate_state;
    brw->intel.vtbl.note_fence = brw_note_fence;
-   brw->intel.vtbl.note_unlock = brw_note_unlock;
    brw->intel.vtbl.new_batch = brw_new_batch;
    brw->intel.vtbl.finish_batch = brw_finish_batch;
    brw->intel.vtbl.destroy = brw_destroy_context;
diff --git a/i965/brw_wm.c b/i965/brw_wm.c
index 8a3b7df..2292de9 100644
--- a/i965/brw_wm.c
+++ b/i965/brw_wm.c
@@ -41,13 +41,13 @@ GLuint brw_wm_nr_args( GLuint opcode )
 {
    switch (opcode) {
    case WM_FRONTFACING:
-      return 0;
    case WM_PIXELXY:
+      return 0;
    case WM_CINTERP:
    case WM_WPOSXY:
+   case WM_DELTAXY:
       return 1;
    case WM_LINTERP:
-   case WM_DELTAXY:
    case WM_PIXELW:
       return 2;
    case WM_FB_WRITE:
@@ -171,9 +171,11 @@ static void do_wm_prog( struct brw_context *brw,
     * differently from "simple" shaders.
     */
    if (fp->isGLSL) {
+      c->dispatch_width = 8;
       brw_wm_glsl_emit(brw, c);
    }
    else {
+      c->dispatch_width = 16;
       brw_wm_non_glsl_emit(brw, c);
    }
 
@@ -202,6 +204,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
    /* BRW_NEW_FRAGMENT_PROGRAM */
    const struct brw_fragment_program *fp = 
       (struct brw_fragment_program *)brw->fragment_program;
+   GLboolean uses_depth = (fp->program.Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
    GLuint lookup = 0;
    GLuint line_aa;
    GLuint i;
@@ -263,6 +266,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
 	 
    brw_wm_lookup_iz(line_aa,
 		    lookup,
+		    uses_depth,
 		    key);
 
 
@@ -272,6 +276,9 @@ static void brw_wm_populate_key( struct brw_context *brw,
    /* _NEW_LIGHT */
    key->flat_shade = (ctx->Light.ShadeModel == GL_FLAT);
 
+   /* _NEW_HINT */
+   key->linear_color = (ctx->Hint.PerspectiveCorrection == GL_FASTEST);
+
    /* _NEW_TEXTURE */
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
       const struct gl_texture_unit *unit = &ctx->Texture.Unit[i];
@@ -351,6 +358,7 @@ const struct brw_tracked_state brw_wm_prog = {
    .dirty = {
       .mesa  = (_NEW_COLOR |
 		_NEW_DEPTH |
+                _NEW_HINT |
 		_NEW_STENCIL |
 		_NEW_POLYGON |
 		_NEW_LINE |
diff --git a/i965/brw_wm.h b/i965/brw_wm.h
index 295fed8..ae98b54 100644
--- a/i965/brw_wm.h
+++ b/i965/brw_wm.h
@@ -63,6 +63,7 @@ struct brw_wm_prog_key {
    GLuint computes_depth:1;	/* could be derived from program string */
    GLuint source_depth_to_render_target:1;
    GLuint flat_shade:1;
+   GLuint linear_color:1;  /**< linear interpolation vs perspective interp */
    GLuint runtime_check_aads_emit:1;
    
    GLbitfield proj_attrib_mask; /**< one bit per fragment program attribute */
@@ -241,19 +242,25 @@ struct brw_wm_compile {
    GLuint max_wm_grf;
    GLuint last_scratch;
 
+   GLuint cur_inst;  /**< index of current instruction */
+
+   GLboolean out_of_regs;  /**< ran out of GRF registers? */
+
    /** Mapping from Mesa registers to hardware registers */
    struct {
       GLboolean inited;
       struct brw_reg reg;
    } wm_regs[PROGRAM_PAYLOAD+1][256][4];
 
+   GLboolean used_grf[BRW_WM_MAX_GRF];
+   GLuint first_free_grf;
    struct brw_reg stack;
    struct brw_reg emit_mask_reg;
-   GLuint reg_index;  /**< Index of next free GRF register */
    GLuint tmp_regs[BRW_WM_MAX_GRF];
    GLuint tmp_index;
    GLuint tmp_max;
    GLuint subroutines[BRW_WM_MAX_SUBROUTINE];
+   GLuint dispatch_width;
 
    /** we may need up to 3 constants per instruction (if use_const_buffer) */
    struct {
@@ -286,6 +293,7 @@ void brw_wm_print_program( struct brw_wm_compile *c,
 
 void brw_wm_lookup_iz( GLuint line_aa,
 		       GLuint lookup,
+		       GLboolean ps_uses_depth,
 		       struct brw_wm_prog_key *key );
 
 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp);
diff --git a/i965/brw_wm_emit.c b/i965/brw_wm_emit.c
index a870e75..268f796 100644
--- a/i965/brw_wm_emit.c
+++ b/i965/brw_wm_emit.c
@@ -65,8 +65,7 @@ static INLINE struct brw_reg sechalf( struct brw_reg reg )
 
 static void emit_pixel_xy(struct brw_compile *p,
 			  const struct brw_reg *dst,
-			  GLuint mask,
-			  const struct brw_reg *arg0)
+			  GLuint mask)
 {
    struct brw_reg r1 = brw_vec1_grf(1, 0);
    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
@@ -98,8 +97,7 @@ static void emit_pixel_xy(struct brw_compile *p,
 static void emit_delta_xy(struct brw_compile *p,
 			  const struct brw_reg *dst,
 			  GLuint mask,
-			  const struct brw_reg *arg0,
-			  const struct brw_reg *arg1)
+			  const struct brw_reg *arg0)
 {
    struct brw_reg r1 = brw_vec1_grf(1, 0);
 
@@ -353,6 +351,19 @@ static void emit_mad( struct brw_compile *p,
    }
 }
 
+static void emit_trunc( struct brw_compile *p,
+		      const struct brw_reg *dst,
+		      GLuint mask,
+		      const struct brw_reg *arg0)
+{
+   GLuint i;
+
+   for (i = 0; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 brw_RNDZ(p, dst[i], arg0[i]);
+      }
+   }
+}
 
 static void emit_lrp( struct brw_compile *p, 
 		      const struct brw_reg *dst,
@@ -532,16 +543,18 @@ static void emit_dp3( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
+   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
    if (!(mask & WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert((mask & WRITEMASK_XYZW) == WRITEMASK_X);
+   assert(is_power_of_two(mask & WRITEMASK_XYZW));
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 
    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
-   brw_MAC(p, dst[0], arg0[2], arg1[2]);
+   brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
    brw_set_saturate(p, 0);
 }
 
@@ -552,17 +565,19 @@ static void emit_dp4( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
+   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
    if (!(mask & WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert((mask & WRITEMASK_XYZW) == WRITEMASK_X);
+   assert(is_power_of_two(mask & WRITEMASK_XYZW));
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
    brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
 
    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
-   brw_MAC(p, dst[0], arg0[3], arg1[3]);
+   brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
    brw_set_saturate(p, 0);
 }
 
@@ -573,17 +588,19 @@ static void emit_dph( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
+   const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
    if (!(mask & WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert((mask & WRITEMASK_XYZW) == WRITEMASK_X);
+   assert(is_power_of_two(mask & WRITEMASK_XYZW));
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
-   brw_MAC(p, dst[0], arg0[2], arg1[2]);
+   brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 
    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
-   brw_ADD(p, dst[0], dst[0], arg1[3]);
+   brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
    brw_set_saturate(p, 0);
 }
 
@@ -619,18 +636,19 @@ static void emit_math1( struct brw_compile *p,
 			GLuint mask,
 			const struct brw_reg *arg0 )
 {
+   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
    if (!(mask & WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   //assert((mask & WRITEMASK_XYZW) == WRITEMASK_X ||
-   //	  function == BRW_MATH_FUNCTION_SINCOS);
-   
+   assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
    brw_MOV(p, brw_message_reg(2), arg0[0]);
 
    /* Send two messages to perform all 16 operations:
     */
    brw_math_16(p, 
-	       dst[0],
+	       dst[dst_chan],
 	       function,
 	       (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
 	       2,
@@ -646,10 +664,12 @@ static void emit_math2( struct brw_compile *p,
 			const struct brw_reg *arg0,
 			const struct brw_reg *arg1)
 {
+   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
    if (!(mask & WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert((mask & WRITEMASK_XYZW) == WRITEMASK_X);
+   assert(is_power_of_two(mask & WRITEMASK_XYZW));
 
    brw_push_insn_state(p);
 
@@ -668,7 +688,7 @@ static void emit_math2( struct brw_compile *p,
     */
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_math(p, 
-	    dst[0],
+	    dst[dst_chan],
 	    function,
 	    (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
 	    2,
@@ -678,7 +698,7 @@ static void emit_math2( struct brw_compile *p,
 
    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
    brw_math(p, 
-	    offset(dst[0],1),
+	    offset(dst[dst_chan],1),
 	    function,
 	    (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
 	    4,
@@ -701,6 +721,7 @@ static void emit_tex( struct brw_wm_compile *c,
    GLuint msgLength, responseLength;
    GLuint i, nr;
    GLuint emit;
+   GLuint msg_type;
 
    /* How many input regs are there?
     */
@@ -714,10 +735,14 @@ static void emit_tex( struct brw_wm_compile *c,
       emit = WRITEMASK_XY;
       nr = 2;
       break;
-   default:
+   case TEXTURE_3D_INDEX:
+   case TEXTURE_CUBE_INDEX:
       emit = WRITEMASK_XYZ;
       nr = 3;
       break;
+   default:
+      /* unexpected target */
+      abort();
    }
 
    if (inst->tex_shadow) {
@@ -738,6 +763,18 @@ static void emit_tex( struct brw_wm_compile *c,
 
    responseLength = 8;		/* always */
 
+   if (BRW_IS_IGDNG(p->brw)) {
+       if (inst->tex_shadow)
+           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE_IGDNG;
+       else
+           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_IGDNG;
+   } else {
+       if (inst->tex_shadow)
+           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
+       else
+           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
+   }
+
    brw_SAMPLE(p, 
 	      retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
 	      1,
@@ -745,12 +782,12 @@ static void emit_tex( struct brw_wm_compile *c,
               SURF_INDEX_TEXTURE(inst->tex_unit),
 	      inst->tex_unit,	  /* sampler */
 	      inst->writemask,
-	      (inst->tex_shadow ? 
-	       BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE : 
-	       BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE),
+	      msg_type, 
 	      responseLength,
 	      msgLength,
-	      0);	
+	      0,	
+	      1,
+	      BRW_SAMPLER_SIMD_MODE_SIMD16);	
 }
 
 
@@ -762,7 +799,7 @@ static void emit_txb( struct brw_wm_compile *c,
 {
    struct brw_compile *p = &c->func;
    GLuint msgLength;
-
+   GLuint msg_type;
    /* Shadow ignored for txb.
     */
    switch (inst->tex_idx) {
@@ -777,16 +814,25 @@ static void emit_txb( struct brw_wm_compile *c,
       brw_MOV(p, brw_message_reg(4), arg[1]);
       brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
       break;
-   default:
+   case TEXTURE_3D_INDEX:
+   case TEXTURE_CUBE_INDEX:
       brw_MOV(p, brw_message_reg(2), arg[0]);
       brw_MOV(p, brw_message_reg(4), arg[1]);
       brw_MOV(p, brw_message_reg(6), arg[2]);
       break;
+   default:
+      /* unexpected target */
+      abort();
    }
 
    brw_MOV(p, brw_message_reg(8), arg[3]);
    msgLength = 9;
 
+   if (BRW_IS_IGDNG(p->brw))
+       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS_IGDNG;
+   else
+       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
+
    brw_SAMPLE(p, 
 	      retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
 	      1,
@@ -794,10 +840,12 @@ static void emit_txb( struct brw_wm_compile *c,
               SURF_INDEX_TEXTURE(inst->tex_unit),
 	      inst->tex_unit,	  /* sampler */
 	      inst->writemask,
-	      BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,
+	      msg_type,
 	      8,		/* responseLength */
 	      msgLength,
-	      0);	
+	      0,	
+	      1,
+	      BRW_SAMPLER_SIMD_MODE_SIMD16);	
 }
 
 
@@ -1009,7 +1057,7 @@ static void emit_fb_write( struct brw_wm_compile *c,
 	      get_element_ud(brw_vec8_grf(1,0), 6), 
 	      brw_imm_ud(1<<26)); 
 
-      jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
+      jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
       {
 	 emit_aa(c, arg1, 2);
 	 fire_fb_write(c, 0, nr, target, eot);
@@ -1161,11 +1209,11 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 /* Generated instructions for calculating triangle interpolants:
 	  */
       case WM_PIXELXY:
-	 emit_pixel_xy(p, dst, dst_flags, args[0]);
+	 emit_pixel_xy(p, dst, dst_flags);
 	 break;
 
       case WM_DELTAXY:
-	 emit_delta_xy(p, dst, dst_flags, args[0], args[1]);
+	 emit_delta_xy(p, dst, dst_flags, args[0]);
 	 break;
 
       case WM_WPOSXY:
@@ -1222,6 +1270,10 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 emit_dph(p, dst, dst_flags, args[0], args[1]);
 	 break;
 
+      case OPCODE_TRUNC:
+	 emit_trunc(p, dst, dst_flags, args[0]);
+	 break;
+
       case OPCODE_LRP:
 	 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
 	 break;
@@ -1348,4 +1400,13 @@ void brw_wm_emit( struct brw_wm_compile *c )
 		      inst->dst[i]->hw_reg, 
 		      inst->dst[i]->spill_slot);
    }
+
+   if (INTEL_DEBUG & DEBUG_WM) {
+      int i;
+
+      _mesa_printf("wm-native:\n");
+      for (i = 0; i < p->nr_insn; i++)
+	 brw_disasm(stderr, &p->store[i]);
+      _mesa_printf("\n");
+   }
 }
diff --git a/i965/brw_wm_fp.c b/i965/brw_wm_fp.c
index 49aad28..123fe84 100644
--- a/i965/brw_wm_fp.c
+++ b/i965/brw_wm_fp.c
@@ -42,6 +42,12 @@
 #include "shader/prog_statevars.h"
 
 
+/** An invalid texture target */
+#define TEX_TARGET_NONE NUM_TEXTURE_TARGETS
+
+/** An invalid texture unit */
+#define TEX_UNIT_NONE BRW_MAX_TEX_UNIT
+
 #define FIRST_INTERNAL_TEMP MAX_NV_FRAGMENT_PROGRAM_TEMPS
 
 #define X    0
@@ -199,6 +205,15 @@ static struct prog_instruction * emit_tex_op(struct brw_wm_compile *c,
 {
    struct prog_instruction *inst = get_fp_inst(c);
       
+   assert(tex_src_unit < BRW_MAX_TEX_UNIT ||
+          tex_src_unit == TEX_UNIT_NONE);
+   assert(tex_src_target < NUM_TEXTURE_TARGETS ||
+          tex_src_target == TEX_TARGET_NONE);
+
+   /* update mask of which texture units are referenced by this program */
+   if (tex_src_unit != TEX_UNIT_NONE)
+      c->fp->tex_units_used |= (1 << tex_src_unit);
+
    memset(inst, 0, sizeof(*inst));
 
    inst->Opcode = op;
@@ -223,12 +238,45 @@ static struct prog_instruction * emit_op(struct brw_wm_compile *c,
 				       struct prog_src_register src2 )
 {
    return emit_tex_op(c, op, dest, saturate,
-                      0, 0, 0,  /* tex unit, target, shadow */
+                      TEX_UNIT_NONE, TEX_TARGET_NONE, 0,  /* unit, tgt, shadow */
                       src0, src1, src2);
 }
-   
 
 
+/* Many Mesa opcodes produce the same value across all the result channels.
+ * We'd rather not have to support that splatting in the opcode implementations,
+ * and brw_wm_pass*.c wants to optimize them out by shuffling references around
+ * anyway.  We can easily get both by emitting the opcode to one channel, and
+ * then MOVing it to the others, which brw_wm_pass*.c already understands.
+ */
+static struct prog_instruction *emit_scalar_insn(struct brw_wm_compile *c,
+						 const struct prog_instruction *inst0)
+{
+   struct prog_instruction *inst;
+   unsigned int dst_chan;
+   unsigned int other_channel_mask;
+
+   if (inst0->DstReg.WriteMask == 0)
+      return NULL;
+
+   dst_chan = _mesa_ffs(inst0->DstReg.WriteMask) - 1;
+   inst = get_fp_inst(c);
+   *inst = *inst0;
+   inst->DstReg.WriteMask = 1 << dst_chan;
+
+   other_channel_mask = inst0->DstReg.WriteMask & ~(1 << dst_chan);
+   if (other_channel_mask != 0) {
+      inst = emit_op(c,
+		     OPCODE_MOV,
+		     dst_mask(inst0->DstReg, other_channel_mask),
+		     0,
+		     src_swizzle1(src_reg_from_dst(inst0->DstReg), dst_chan),
+		     src_undef(),
+		     src_undef());
+   }
+   return inst;
+}
+
 
 /***********************************************************************
  * Special instructions for interpolation and other tasks
@@ -354,24 +402,28 @@ static void emit_interp( struct brw_wm_compile *c,
 		 src_undef());
       }
       else {
-	 emit_op(c,
-		 WM_LINTERP,
-		 dst,
-		 0,
-		 interp,
-		 deltas,
-		 src_undef());
+         if (c->key.linear_color) {
+            emit_op(c,
+                    WM_LINTERP,
+                    dst,
+                    0,
+                    interp,
+                    deltas,
+                    src_undef());
+         }
+         else {
+            /* perspective-corrected color interpolation */
+            emit_op(c,
+                    WM_PINTERP,
+                    dst,
+                    0,
+                    interp,
+                    deltas,
+                    get_pixel_w(c));
+         }
       }
       break;
    case FRAG_ATTRIB_FOGC:
-      /* The FOGC input is really special.  When a program uses glFogFragCoord,
-       * the results returned are supposed to be (f,0,0,1).  But for Mesa GLSL,
-       * the glFrontFacing and glPointCoord values are also stashed in FOGC.
-       * So, write the interpolated fog value to X, then either 0, 1, or the
-       * stashed values to Y, Z, W.  Note that this means that
-       * glFogFragCoord.yzw can be wrong in those cases!
-       */
-
       /* Interpolate the fog coordinate */
       emit_op(c,
 	      WM_PINTERP,
@@ -381,26 +433,40 @@ static void emit_interp( struct brw_wm_compile *c,
 	      deltas,
 	      get_pixel_w(c));
 
-      /* Move the front facing value into FOGC.y if it's needed. */
-      if (c->fp->program.UsesFrontFacing) {
-	 emit_op(c,
-		 WM_FRONTFACING,
-		 dst_mask(dst, WRITEMASK_Y),
-		 0,
-		 src_undef(),
-		 src_undef(),
-		 src_undef());
-      } else {
-	 emit_op(c,
-		 OPCODE_MOV,
-		 dst_mask(dst, WRITEMASK_Y),
-		 0,
-		 src_swizzle1(interp, SWIZZLE_ZERO),
-		 src_undef(),
-		 src_undef());
-      }
+      emit_op(c,
+	      OPCODE_MOV,
+	      dst_mask(dst, WRITEMASK_YZW),
+	      0,
+	      src_swizzle(interp,
+			  SWIZZLE_ZERO,
+			  SWIZZLE_ZERO,
+			  SWIZZLE_ZERO,
+			  SWIZZLE_ONE),
+	      src_undef(),
+	      src_undef());
+      break;
+
+   case FRAG_ATTRIB_FACE:
+      /* XXX review/test this case */
+      emit_op(c,
+              WM_FRONTFACING,
+              dst_mask(dst, WRITEMASK_X),
+              0,
+              src_undef(),
+              src_undef(),
+              src_undef());
+      break;
+
+   case FRAG_ATTRIB_PNTC:
+      /* XXX review/test this case */
+      emit_op(c,
+	      WM_PINTERP,
+	      dst_mask(dst, WRITEMASK_XY),
+	      0,
+	      interp,
+	      deltas,
+	      get_pixel_w(c));
 
-      /* Should do the PointCoord thing here. */
       emit_op(c,
 	      OPCODE_MOV,
 	      dst_mask(dst, WRITEMASK_ZW),
@@ -413,6 +479,7 @@ static void emit_interp( struct brw_wm_compile *c,
 	      src_undef(),
 	      src_undef());
       break;
+
    default:
       emit_op(c,
 	      WM_PINTERP,
@@ -631,6 +698,8 @@ static void precalc_tex( struct brw_wm_compile *c,
    struct prog_dst_register tmpcoord;
    const GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
 
+   assert(unit < BRW_MAX_TEX_UNIT);
+
    if (inst->TexSrcTarget == TEXTURE_CUBE_INDEX) {
        struct prog_instruction *out;
        struct prog_dst_register tmp0 = get_temp(c);
@@ -671,7 +740,7 @@ static void precalc_tex( struct brw_wm_compile *c,
 
        /* tmp0 = 1 / tmp1 */
        emit_op(c, OPCODE_RCP,
-               tmp0,
+               dst_mask(tmp0, WRITEMASK_X),
                0,
                tmp1src,
                src_undef(),
@@ -682,7 +751,7 @@ static void precalc_tex( struct brw_wm_compile *c,
                tmpcoord,
                0,
                src0,
-               tmp0src,
+               src_swizzle1(tmp0src, SWIZZLE_X),
                src_undef());
 
        release_temp(c, tmp0);
@@ -705,7 +774,11 @@ static void precalc_tex( struct brw_wm_compile *c,
 	      tmpcoord,
 	      0,
 	      inst->SrcReg[0],
-	      scale,
+	      src_swizzle(scale,
+			  SWIZZLE_X,
+			  SWIZZLE_Y,
+			  SWIZZLE_ONE,
+			  SWIZZLE_ONE),
 	      src_undef());
 
       coord = src_reg_from_dst(tmpcoord);
@@ -1029,6 +1102,7 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
    c->delta_xy = src_undef();
    c->pixel_w = src_undef();
    c->nr_fp_insns = 0;
+   c->fp->tex_units_used = 0x0;
 
    /* Emit preamble instructions.  This is where special instructions such as
     * WM_CINTERP, WM_LINTERP, WM_PINTERP and WM_WPOSXY are emitted to
@@ -1096,6 +1170,7 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
       case OPCODE_TXB:
 	 out = emit_insn(c, inst);
 	 out->TexSrcUnit = fp->program.Base.SamplerUnits[inst->TexSrcUnit];
+         assert(out->TexSrcUnit < BRW_MAX_TEX_UNIT);
 	 break;
 
       case OPCODE_XPD: 
@@ -1122,9 +1197,11 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
 	 break;
       case OPCODE_PRINT:
 	 break;
-	 
       default:
-	 emit_insn(c, inst);
+	 if (brw_wm_is_scalar_result(inst->Opcode))
+	    emit_scalar_insn(c, inst);
+	 else
+	    emit_insn(c, inst);
 	 break;
       }
    }
diff --git a/i965/brw_wm_glsl.c b/i965/brw_wm_glsl.c
index a907e1b..7c210ab 100644
--- a/i965/brw_wm_glsl.c
+++ b/i965/brw_wm_glsl.c
@@ -1,5 +1,7 @@
 #include "main/macros.h"
 #include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
+#include "shader/prog_optimize.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 #include "brw_wm.h"
@@ -8,6 +10,9 @@ enum _subroutine {
     SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
 };
 
+static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
+                                  const struct prog_instruction *inst,
+                                  GLuint component);
 
 /**
  * Determine if the given fragment program uses GLSL features such
@@ -20,8 +25,8 @@ GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
     for (i = 0; i < fp->Base.NumInstructions; i++) {
 	const struct prog_instruction *inst = &fp->Base.Instructions[i];
 	switch (inst->Opcode) {
+	    case OPCODE_ARL:
 	    case OPCODE_IF:
-	    case OPCODE_TRUNC:
 	    case OPCODE_ENDIF:
 	    case OPCODE_CAL:
 	    case OPCODE_BRK:
@@ -42,6 +47,83 @@ GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
 }
 
 
+
+static void
+reclaim_temps(struct brw_wm_compile *c);
+
+
+/** Mark GRF register as used. */
+static void
+prealloc_grf(struct brw_wm_compile *c, int r)
+{
+   c->used_grf[r] = GL_TRUE;
+}
+
+
+/** Mark given GRF register as not in use. */
+static void
+release_grf(struct brw_wm_compile *c, int r)
+{
+   /*assert(c->used_grf[r]);*/
+   c->used_grf[r] = GL_FALSE;
+   c->first_free_grf = MIN2(c->first_free_grf, r);
+}
+
+
+/** Return index of a free GRF, mark it as used. */
+static int
+alloc_grf(struct brw_wm_compile *c)
+{
+   GLuint r;
+   for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
+      if (!c->used_grf[r]) {
+         c->used_grf[r] = GL_TRUE;
+         c->first_free_grf = r + 1;  /* a guess */
+         return r;
+      }
+   }
+
+   /* no free temps, try to reclaim some */
+   reclaim_temps(c);
+   c->first_free_grf = 0;
+
+   /* try alloc again */
+   for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
+      if (!c->used_grf[r]) {
+         c->used_grf[r] = GL_TRUE;
+         c->first_free_grf = r + 1;  /* a guess */
+         return r;
+      }
+   }
+
+   for (r = 0; r < BRW_WM_MAX_GRF; r++) {
+      assert(c->used_grf[r]);
+   }
+
+   /* really, no free GRF regs found */
+   if (!c->out_of_regs) {
+      /* print warning once per compilation */
+      _mesa_warning(NULL, "i965: ran out of registers for fragment program");
+      c->out_of_regs = GL_TRUE;
+   }
+
+   return -1;
+}
+
+
+/** Return number of GRF registers used */
+static int
+num_grf_used(const struct brw_wm_compile *c)
+{
+   int r;
+   for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
+      if (c->used_grf[r])
+         return r + 1;
+   return 0;
+}
+
+
+
 /**
  * Record the mapping of a Mesa register to a hardware register.
  */
@@ -52,27 +134,26 @@ static void set_reg(struct brw_wm_compile *c, int file, int index,
     c->wm_regs[file][index][component].inited = GL_TRUE;
 }
 
-/**
- * Examine instruction's write mask to find index of first component
- * enabled for writing.
- */
-static int get_scalar_dst_index(const struct prog_instruction *inst)
-{
-    int i;
-    for (i = 0; i < 4; i++)
-	if (inst->DstReg.WriteMask & (1<<i))
-	    break;
-    return i;
-}
-
 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
 {
     struct brw_reg reg;
-    if(c->tmp_index == c->tmp_max)
-	c->tmp_regs[ c->tmp_max++ ] = c->reg_index++;
-    
+
+    /* if we need to allocate another temp, grow the tmp_regs[] array */
+    if (c->tmp_index == c->tmp_max) {
+       int r = alloc_grf(c);
+       if (r < 0) {
+          /*printf("Out of temps in %s\n", __FUNCTION__);*/
+          r = 50; /* XXX random register! */
+       }
+       c->tmp_regs[ c->tmp_max++ ] = r;
+    }
+
+    /* form the GRF register */
     reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
+    /*printf("alloc_temp %d\n", reg.nr);*/
+    assert(reg.nr < BRW_WM_MAX_GRF);
     return reg;
+
 }
 
 /**
@@ -130,35 +211,29 @@ get_reg(struct brw_wm_compile *c, int file, int index, int component,
 	    return brw_null_reg();
     }
 
+    assert(index < 256);
+    assert(component < 4);
+
     /* see if we've already allocated a HW register for this Mesa register */
     if (c->wm_regs[file][index][component].inited) {
-	/* yes, re-use */
-	reg = c->wm_regs[file][index][component].reg;
+       /* yes, re-use */
+       reg = c->wm_regs[file][index][component].reg;
     }
     else {
 	/* no, allocate new register */
-	reg = brw_vec8_grf(c->reg_index, 0);
-    }
+       int grf = alloc_grf(c);
+       /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
+       if (grf < 0) {
+          /* totally out of temps */
+          grf = 51; /* XXX random register! */
+       }
 
-    /* if this is a new register allocation, record it in the table */
-    if (!c->wm_regs[file][index][component].inited) {
-	set_reg(c, file, index, component, reg);
-	c->reg_index++;
-    }
+       reg = brw_vec8_grf(grf, 0);
+       /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
 
-    if (c->reg_index >= BRW_WM_MAX_GRF - 12) {
-	/* ran out of temporary registers! */
-#if 1
-        /* This is a big hack for now.
-         * Return bad register index, just don't hang the GPU.
-         */
-        _mesa_fprintf(stderr, "out of regs %d\n", c->reg_index);
-        c->reg_index = BRW_WM_MAX_GRF - 13;
-#else
-	return brw_null_reg();
-#endif
+       set_reg(c, file, index, component, reg);
     }
- 
+
     if (neg & (1 << component)) {
 	reg = negate(reg);
     }
@@ -168,6 +243,46 @@ get_reg(struct brw_wm_compile *c, int file, int index, int component,
 }
 
 
+
+/**
+ * This is called if we run out of GRF registers.  Examine the live intervals
+ * of temp regs in the program and free those which won't be used again.
+ */
+static void
+reclaim_temps(struct brw_wm_compile *c)
+{
+   GLint intBegin[MAX_PROGRAM_TEMPS];
+   GLint intEnd[MAX_PROGRAM_TEMPS];
+   int index;
+
+   /*printf("Reclaim temps:\n");*/
+
+   _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
+                             intBegin, intEnd);
+
+   for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
+      if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
+         /* program temp[i] can be freed */
+         int component;
+         /*printf("  temp[%d] is dead\n", index);*/
+         for (component = 0; component < 4; component++) {
+            if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
+               int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
+               release_grf(c, r);
+               /*
+               printf("  Reclaim temp %d, reg %d at inst %d\n",
+                      index, r, c->cur_inst);
+               */
+               c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
+            }
+         }
+      }
+   }
+}
+
+
+
+
 /**
  * Preallocate registers.  This sets up the Mesa to hardware register
  * mapping for certain registers, such as constants (uniforms/state vars)
@@ -179,6 +294,10 @@ static void prealloc_reg(struct brw_wm_compile *c)
     struct brw_reg reg;
     int urb_read_length = 0;
     GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
+    GLuint reg_index = 0;
+
+    memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
+    c->first_free_grf = 0;
 
     for (i = 0; i < 4; i++) {
         if (i < c->key.nr_depth_regs) 
@@ -187,14 +306,20 @@ static void prealloc_reg(struct brw_wm_compile *c)
             reg = brw_vec8_grf(0, 0);
 	set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
     }
-    c->reg_index += 2 * c->key.nr_depth_regs;
+    reg_index += 2 * c->key.nr_depth_regs;
 
     /* constants */
     {
-        const int nr_params = c->fp->program.Base.Parameters->NumParameters;
+        const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
+        const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
 
         /* use a real constant buffer, or just use a section of the GRF? */
-        c->fp->use_const_buffer = GL_FALSE; /* (nr_params > 8);*/
+        /* XXX this heuristic may need adjustment... */
+        if ((nr_params + nr_temps) * 4 + reg_index > 80)
+           c->fp->use_const_buffer = GL_TRUE;
+        else
+           c->fp->use_const_buffer = GL_FALSE;
+        /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
 
         if (c->fp->use_const_buffer) {
            /* We'll use a real constant buffer and fetch constants from
@@ -216,7 +341,7 @@ static void prealloc_reg(struct brw_wm_compile *c)
            for (i = 0; i < nr_params; i++) {
               /* loop over XYZW channels */
               for (j = 0; j < 4; j++, index++) {
-                 reg = brw_vec1_grf(c->reg_index + index / 8, index % 8);
+                 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
                  /* Save pointer to parameter/constant value.
                   * Constants will be copied in prepare_constant_buffer()
                   */
@@ -226,7 +351,7 @@ static void prealloc_reg(struct brw_wm_compile *c)
            }
            /* number of constant regs used (each reg is float[8]) */
            c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
-           c->reg_index += c->nr_creg;
+           reg_index += c->nr_creg;
         }
     }
 
@@ -242,23 +367,52 @@ static void prealloc_reg(struct brw_wm_compile *c)
 	  fp_input = -1;
 
        if (fp_input >= 0 && inputs & (1 << fp_input)) {
-	  urb_read_length = c->reg_index;
-	  reg = brw_vec8_grf(c->reg_index, 0);
+	  urb_read_length = reg_index;
+	  reg = brw_vec8_grf(reg_index, 0);
 	  for (j = 0; j < 4; j++)
 	     set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
        }
        if (c->key.vp_outputs_written & (1 << i)) {
-	  c->reg_index += 2;
+	  reg_index += 2;
        }
     }
 
     c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
     c->prog_data.urb_read_length = urb_read_length;
     c->prog_data.curb_read_length = c->nr_creg;
-    c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
-    c->reg_index++;
-    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
-    c->reg_index += 2;
+    c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
+    reg_index++;
+    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
+    reg_index += 2;
+
+    /* mark GRF regs [0..reg_index-1] as in-use */
+    for (i = 0; i < reg_index; i++)
+       prealloc_grf(c, i);
+
+    /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
+    prealloc_grf(c, 126);
+    prealloc_grf(c, 127);
+
+    for (i = 0; i < c->nr_fp_insns; i++) {
+	const struct prog_instruction *inst = &c->prog_instructions[i];
+	struct brw_reg dst[4];
+
+	switch (inst->Opcode) {
+	case OPCODE_TEX:
+	case OPCODE_TXB:
+	    /* Allocate the channels of texture results contiguously,
+	     * since they are written out that way by the sampler unit.
+	     */
+	    for (j = 0; j < 4; j++) {
+		dst[j] = get_dst_reg(c, inst, j);
+		if (j != 0)
+		    assert(dst[j].nr == dst[j - 1].nr + 1);
+	    }
+	    break;
+	default:
+	    break;
+	}
+    }
 
     /* An instruction may reference up to three constants.
      * They'll be found in these registers.
@@ -267,12 +421,12 @@ static void prealloc_reg(struct brw_wm_compile *c)
     if (c->fp->use_const_buffer) {
        for (i = 0; i < 3; i++) {
           c->current_const[i].index = -1;
-          c->current_const[i].reg = alloc_tmp(c);
+          c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
        }
     }
 #if 0
     printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
-    printf("AFTER PRE_ALLOC, reg_index = %d\n", c->reg_index);
+    printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
 #endif
 }
 
@@ -376,6 +530,14 @@ static struct brw_reg get_src_reg(struct brw_wm_compile *c,
     const GLuint nr = 1;
     const GLuint component = GET_SWZ(src->Swizzle, channel);
 
+    /* Extended swizzle terms */
+    if (component == SWIZZLE_ZERO) {
+       return brw_imm_f(0.0F);
+    }
+    else if (component == SWIZZLE_ONE) {
+       return brw_imm_f(1.0F);
+    }
+
     if (c->fp->use_const_buffer &&
         (src->File == PROGRAM_STATE_VAR ||
          src->File == PROGRAM_CONSTANT ||
@@ -489,23 +651,6 @@ static void invoke_subroutine( struct brw_wm_compile *c,
     }
 }
 
-static void emit_abs( struct brw_wm_compile *c,
-                      const struct prog_instruction *inst)
-{
-    int i;
-    struct brw_compile *p = &c->func;
-    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
-    for (i = 0; i < 4; i++) {
-	if (inst->DstReg.WriteMask & (1<<i)) {
-	    struct brw_reg src, dst;
-	    dst = get_dst_reg(c, inst, i);
-	    src = get_src_reg(c, inst, 0, i);
-	    brw_MOV(p, dst, brw_abs(src));
-	}
-    }
-    brw_set_saturate(p, 0);
-}
-
 static void emit_trunc( struct brw_wm_compile *c,
                         const struct prog_instruction *inst)
 {
@@ -673,27 +818,26 @@ static void emit_fb_write(struct brw_wm_compile *c,
     }
 
     if (c->key.dest_depth_reg) {
-        GLuint comp = c->key.dest_depth_reg / 2;
-        GLuint off = c->key.dest_depth_reg % 2;
+        const GLuint comp = c->key.dest_depth_reg / 2;
+        const GLuint off = c->key.dest_depth_reg % 2;
 
-        assert(comp == 1);
-        assert(off == 0);
-#if 0
-        /* XXX do we need this code?   comp always 1, off always 0, it seems */
         if (off != 0) {
+            /* XXX this code needs review/testing */
+            struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
+            struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
+
             brw_push_insn_state(p);
             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 
-            brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
+            brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
             /* 2nd half? */
-            brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
+            brw_MOV(p, brw_message_reg(nr+1), arg1_1);
             brw_pop_insn_state(p);
         }
         else
-#endif
         {
-           struct brw_reg src =  get_src_reg(c, inst, 1, 1);
-           brw_MOV(p, brw_message_reg(nr), src);
+            struct brw_reg src =  get_src_reg(c, inst, 1, 1);
+            brw_MOV(p, brw_message_reg(nr), src);
         }
         nr += 2;
    }
@@ -882,12 +1026,20 @@ static void emit_dp3(struct brw_wm_compile *c,
     struct brw_reg src0[3], src1[3], dst;
     int i;
     struct brw_compile *p = &c->func;
+    GLuint mask = inst->DstReg.WriteMask;
+    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
+    if (!(mask & WRITEMASK_XYZW))
+	return;
+
+    assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
     for (i = 0; i < 3; i++) {
 	src0[i] = get_src_reg(c, inst, 0, i);
 	src1[i] = get_src_reg_imm(c, inst, 1, i);
     }
 
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
+    dst = get_dst_reg(c, inst, dst_chan);
     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
@@ -901,11 +1053,19 @@ static void emit_dp4(struct brw_wm_compile *c,
     struct brw_reg src0[4], src1[4], dst;
     int i;
     struct brw_compile *p = &c->func;
+    GLuint mask = inst->DstReg.WriteMask;
+    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
+    if (!(mask & WRITEMASK_XYZW))
+	return;
+
+    assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
     for (i = 0; i < 4; i++) {
 	src0[i] = get_src_reg(c, inst, 0, i);
 	src1[i] = get_src_reg_imm(c, inst, 1, i);
     }
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
+    dst = get_dst_reg(c, inst, dst_chan);
     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
     brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
@@ -920,11 +1080,19 @@ static void emit_dph(struct brw_wm_compile *c,
     struct brw_reg src0[4], src1[4], dst;
     int i;
     struct brw_compile *p = &c->func;
+    GLuint mask = inst->DstReg.WriteMask;
+    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
+    if (!(mask & WRITEMASK_XYZW))
+	return;
+
+    assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
     for (i = 0; i < 4; i++) {
 	src0[i] = get_src_reg(c, inst, 0, i);
 	src1[i] = get_src_reg_imm(c, inst, 1, i);
     }
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
+    dst = get_dst_reg(c, inst, dst_chan);
     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
     brw_MAC(p, dst, src0[2], src1[2]);
@@ -942,37 +1110,28 @@ static void emit_math1(struct brw_wm_compile *c,
                        const struct prog_instruction *inst, GLuint func)
 {
     struct brw_compile *p = &c->func;
-    struct brw_reg src0, dst, tmp;
-    const int mark = mark_tmps( c );
-    int i;
+    struct brw_reg src0, dst;
+    GLuint mask = inst->DstReg.WriteMask;
+    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
+    if (!(mask & WRITEMASK_XYZW))
+	return;
 
-    tmp = alloc_tmp(c);
+    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 
     /* Get first component of source register */
+    dst = get_dst_reg(c, inst, dst_chan);
     src0 = get_src_reg(c, inst, 0, 0);
 
-    /* tmp = func(src0) */
     brw_MOV(p, brw_message_reg(2), src0);
     brw_math(p,
-             tmp,
+             dst,
              func,
              (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
              2,
              brw_null_reg(),
              BRW_MATH_DATA_VECTOR,
              BRW_MATH_PRECISION_FULL);
-
-    /*tmp.dw1.bits.swizzle = SWIZZLE_XXXX;*/
-
-    /* replicate tmp value across enabled dest channels */
-    for (i = 0; i < 4; i++) {
-       if (inst->DstReg.WriteMask & (1 << i)) {
-          dst = get_dst_reg(c, inst, i);
-          brw_MOV(p, dst, tmp);
-       }
-    }
-
-    release_tmps(c, mark);
 }
 
 static void emit_rcp(struct brw_wm_compile *c,
@@ -1043,24 +1202,6 @@ static void emit_arl(struct brw_wm_compile *c,
     brw_set_saturate(p, 0);
 }
 
-static void emit_sub(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, src1, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    src0 = get_src_reg(c, inst, 0, i);
-	    src1 = get_src_reg_imm(c, inst, 1, i);
-	    brw_ADD(p, dst, src0, negate(src1));
-	}
-    }
-    brw_set_saturate(p, 0);
-}
 
 static void emit_mul(struct brw_wm_compile *c,
                      const struct prog_instruction *inst)
@@ -1172,7 +1313,15 @@ static void emit_pow(struct brw_wm_compile *c,
 {
     struct brw_compile *p = &c->func;
     struct brw_reg dst, src0, src1;
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
+    GLuint mask = inst->DstReg.WriteMask;
+    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
+    if (!(mask & WRITEMASK_XYZW))
+	return;
+
+    assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
+    dst = get_dst_reg(c, inst, dst_chan);
     src0 = get_src_reg_imm(c, inst, 0, 0);
     src1 = get_src_reg_imm(c, inst, 1, 0);
 
@@ -2474,8 +2623,12 @@ static void emit_txb(struct brw_wm_compile *c,
 {
     struct brw_compile *p = &c->func;
     struct brw_reg dst[4], src[4], payload_reg;
-    GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
+    /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
+    const GLuint unit = inst->TexSrcUnit;
     GLuint i;
+    GLuint msg_type;
+
+    assert(unit < BRW_MAX_TEX_UNIT);
 
     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
 
@@ -2496,14 +2649,26 @@ static void emit_txb(struct brw_wm_compile *c,
 	    brw_MOV(p, brw_message_reg(3), src[1]);
 	    brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
 	    break;
-	default:
+	case TEXTURE_3D_INDEX:
+	case TEXTURE_CUBE_INDEX:
 	    brw_MOV(p, brw_message_reg(2), src[0]);
 	    brw_MOV(p, brw_message_reg(3), src[1]);
 	    brw_MOV(p, brw_message_reg(4), src[2]);
 	    break;
+	default:
+            /* invalid target */
+            abort();
     }
     brw_MOV(p, brw_message_reg(5), src[3]);          /* bias */
     brw_MOV(p, brw_message_reg(6), brw_imm_f(0));    /* ref (unused?) */
+
+    if (BRW_IS_IGDNG(p->brw)) {
+        msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
+    } else {
+        /* Does it work well on SIMD8? */
+        msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
+    }
+
     brw_SAMPLE(p,
                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),  /* dest */
                1,                                           /* msg_reg_nr */
@@ -2511,10 +2676,12 @@ static void emit_txb(struct brw_wm_compile *c,
                SURF_INDEX_TEXTURE(unit),
                unit,                                        /* sampler */
                inst->DstReg.WriteMask,                      /* writemask */
-               BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,      /* msg_type */
+               msg_type,                                    /* msg_type */
                4,                                           /* response_length */
                4,                                           /* msg_length */
-               0);                                          /* eot */
+               0,                                           /* eot */
+               1,
+               BRW_SAMPLER_SIMD_MODE_SIMD8);	
 }
 
 
@@ -2523,11 +2690,15 @@ static void emit_tex(struct brw_wm_compile *c,
 {
     struct brw_compile *p = &c->func;
     struct brw_reg dst[4], src[4], payload_reg;
-    GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
+    /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
+    const GLuint unit = inst->TexSrcUnit;
     GLuint msg_len;
     GLuint i, nr;
     GLuint emit;
     GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
+    GLuint msg_type;
+
+    assert(unit < BRW_MAX_TEX_UNIT);
 
     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
 
@@ -2546,10 +2717,14 @@ static void emit_tex(struct brw_wm_compile *c,
 	    emit = WRITEMASK_XY;
 	    nr = 2;
 	    break;
-	default:
+	case TEXTURE_3D_INDEX:
+	case TEXTURE_CUBE_INDEX:
 	    emit = WRITEMASK_XYZ;
 	    nr = 3;
 	    break;
+	default:
+           /* invalid target */
+           abort();
     }
     msg_len = 1;
 
@@ -2568,6 +2743,16 @@ static void emit_tex(struct brw_wm_compile *c,
        brw_MOV(p, brw_message_reg(6), src[2]);        /* ref value / R coord */
     }
 
+    if (BRW_IS_IGDNG(p->brw)) {
+        if (shadow)
+            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
+        else
+            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
+    } else {
+        /* Does it work for shadow on SIMD8 ? */
+        msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
+    }
+    
     brw_SAMPLE(p,
                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
                1,                                          /* msg_reg_nr */
@@ -2575,10 +2760,12 @@ static void emit_tex(struct brw_wm_compile *c,
                SURF_INDEX_TEXTURE(unit),
                unit,                                       /* sampler */
                inst->DstReg.WriteMask,                     /* writemask */
-               BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,           /* msg_type */
+               msg_type,                                   /* msg_type */
                4,                                          /* response_length */
                shadow ? 6 : 4,                             /* msg_length */
-               0);                                         /* eot */
+               0,                                          /* eot */
+               1,
+               BRW_SAMPLER_SIMD_MODE_SIMD8);	
 
     if (shadow)
 	brw_MOV(p, dst[3], brw_imm_f(1.0));
@@ -2595,15 +2782,15 @@ static void post_wm_emit( struct brw_wm_compile *c )
 
 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 {
-#define MAX_IFSN 32
+#define MAX_IF_DEPTH 32
 #define MAX_LOOP_DEPTH 32
-    struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
-    struct brw_instruction *inst0, *inst1;
-    int i, if_insn = 0, loop_insn = 0;
+    struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
+    GLuint i, if_depth = 0, loop_depth = 0;
     struct brw_compile *p = &c->func;
     struct brw_indirect stack_index = brw_indirect(0, 0);
 
-    c->reg_index = 0;
+    c->out_of_regs = GL_FALSE;
+
     prealloc_reg(c);
     brw_set_compression_control(p, BRW_COMPRESSION_NONE);
     brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
@@ -2611,6 +2798,8 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
     for (i = 0; i < c->nr_fp_insns; i++) {
         const struct prog_instruction *inst = &c->prog_instructions[i];
 
+        c->cur_inst = i;
+
 #if 0
         _mesa_printf("Inst %d: ", i);
         _mesa_print_instruction(inst);
@@ -2653,18 +2842,12 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 	    case WM_FRONTFACING:
 		emit_frontfacing(c, inst);
 		break;
-	    case OPCODE_ABS:
-		emit_abs(c, inst);
-		break;
 	    case OPCODE_ADD:
 		emit_add(c, inst);
 		break;
 	    case OPCODE_ARL:
 		emit_arl(c, inst);
 		break;
-	    case OPCODE_SUB:
-		emit_sub(c, inst);
-		break;
 	    case OPCODE_FRC:
 		emit_frc(c, inst);
 		break;
@@ -2770,15 +2953,15 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 		emit_kil(c);
 		break;
 	    case OPCODE_IF:
-		assert(if_insn < MAX_IFSN);
-		if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
+		assert(if_depth < MAX_IF_DEPTH);
+		if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
 		break;
 	    case OPCODE_ELSE:
-		if_inst[if_insn-1]  = brw_ELSE(p, if_inst[if_insn-1]);
+		if_inst[if_depth-1]  = brw_ELSE(p, if_inst[if_depth-1]);
 		break;
 	    case OPCODE_ENDIF:
-		assert(if_insn > 0);
-		brw_ENDIF(p, if_inst[--if_insn]);
+		assert(if_depth > 0);
+		brw_ENDIF(p, if_inst[--if_depth]);
 		break;
 	    case OPCODE_BGNSUB:
 		brw_save_label(p, inst->Comment, p->nr_insn);
@@ -2812,7 +2995,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 		break;
 	    case OPCODE_BGNLOOP:
                 /* XXX may need to invalidate the current_constant regs */
-		loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
+		loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
 		break;
 	    case OPCODE_BRK:
 		brw_BREAK(p);
@@ -2823,25 +3006,34 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 		brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 		break;
 	    case OPCODE_ENDLOOP: 
-		loop_insn--;
-		inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
-		/* patch all the BREAK instructions from
-		   last BEGINLOOP */
-		while (inst0 > loop_inst[loop_insn]) {
-		    inst0--;
-		    if (inst0->header.opcode == BRW_OPCODE_BREAK) {
-			inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
+               {
+                  struct brw_instruction *inst0, *inst1;
+                  GLuint br = 1;
+
+                  if (BRW_IS_IGDNG(brw))
+                     br = 2;
+ 
+                  loop_depth--;
+                  inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
+                  /* patch all the BREAK/CONT instructions from last BGNLOOP */
+                  while (inst0 > loop_inst[loop_depth]) {
+                     inst0--;
+                     if (inst0->header.opcode == BRW_OPCODE_BREAK) {
+			inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
 			inst0->bits3.if_else.pop_count = 0;
-		    } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
-                        inst0->bits3.if_else.jump_count = inst1 - inst0;
+                     }
+                     else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
+                        inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
                         inst0->bits3.if_else.pop_count = 0;
-                    }
-		}
-		break;
+                     }
+                  }
+               }
+               break;
 	    default:
 		_mesa_printf("unsupported IR in fragment shader %d\n",
 			inst->Opcode);
 	}
+
 	if (inst->CondUpdate)
 	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 	else
@@ -2849,13 +3041,14 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
     }
     post_wm_emit(c);
 
-    if (c->reg_index >= BRW_WM_MAX_GRF) {
-        _mesa_problem(NULL, "Ran out of registers in brw_wm_emit_glsl()");
-        /* XXX we need to do some proper error recovery here */
+    if (INTEL_DEBUG & DEBUG_WM) {
+      _mesa_printf("wm-native:\n");
+      for (i = 0; i < p->nr_insn; i++)
+	 brw_disasm(stderr, &p->store[i]);
+      _mesa_printf("\n");
     }
 }
 
-
 /**
  * Do GPU code generation for shaders that use GLSL features such as
  * flow control.  Other shaders will be compiled with the 
@@ -2876,6 +3069,6 @@ void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
         brw_wm_print_program(c, "brw_wm_glsl_emit done");
     }
 
-    c->prog_data.total_grf = c->reg_index;
+    c->prog_data.total_grf = num_grf_used(c);
     c->prog_data.total_scratch = 0;
 }
diff --git a/i965/brw_wm_iz.c b/i965/brw_wm_iz.c
index bd60ac9..5e399ac 100644
--- a/i965/brw_wm_iz.c
+++ b/i965/brw_wm_iz.c
@@ -116,8 +116,13 @@ const struct {
  { C, 0, 1, 1, 1 } 
 };
 
+/**
+ * \param line_aa  AA_NEVER, AA_ALWAYS or AA_SOMETIMES
+ * \param lookup  bitmask of IZ_* flags
+ */
 void brw_wm_lookup_iz( GLuint line_aa,
 		       GLuint lookup,
+		       GLboolean ps_uses_depth,
 		       struct brw_wm_prog_key *key )
 {
    GLuint reg = 2;
@@ -127,7 +132,7 @@ void brw_wm_lookup_iz( GLuint line_aa,
    if (lookup & IZ_PS_COMPUTES_DEPTH_BIT)
       key->computes_depth = 1;
 
-   if (wm_iz_table[lookup].sd_present) {
+   if (wm_iz_table[lookup].sd_present || ps_uses_depth) {
       key->source_depth_reg = reg;
       reg += 2;
    }
diff --git a/i965/brw_wm_pass0.c b/i965/brw_wm_pass0.c
index 9214276..6279258 100644
--- a/i965/brw_wm_pass0.c
+++ b/i965/brw_wm_pass0.c
@@ -257,34 +257,6 @@ static void pass0_set_dst( struct brw_wm_compile *c,
 }
 
 
-static void pass0_set_dst_scalar( struct brw_wm_compile *c,
-				  struct brw_wm_instruction *out,
-                                  const struct prog_instruction *inst,
-				  GLuint writemask )
-{
-   if (writemask) {
-      const struct prog_dst_register *dst = &inst->DstReg;
-      GLuint i;
-
-      /* Compute only the first (X) value:
-       */
-      out->writemask = WRITEMASK_X;
-      out->dst[0] = get_value(c);
-
-      /* Update our tracking register file for all the components in
-       * writemask:
-       */
-      for (i = 0; i < 4; i++) {
-	 if (writemask & (1<<i)) {
-	    pass0_set_fpreg_value(c, dst->File, dst->Index, i, out->dst[0]);
-	 }
-      }
-   }
-   else
-      out->writemask = 0;
-}
-
-
 static const struct brw_wm_ref *get_fp_src_reg_ref( struct brw_wm_compile *c,
 						    struct prog_src_register src,
 						    GLuint i )
@@ -363,10 +335,7 @@ translate_insn(struct brw_wm_compile *c,
 
    /* Dst:
     */
-   if (brw_wm_is_scalar_result(out->opcode)) 
-      pass0_set_dst_scalar(c, out, inst, writemask);
-   else 
-      pass0_set_dst(c, out, inst, writemask);
+   pass0_set_dst(c, out, inst, writemask);
 }
 
 
diff --git a/i965/brw_wm_pass1.c b/i965/brw_wm_pass1.c
index ab9aa2f..3436a24 100644
--- a/i965/brw_wm_pass1.c
+++ b/i965/brw_wm_pass1.c
@@ -159,6 +159,7 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       case OPCODE_FRC:
       case OPCODE_MOV:
       case OPCODE_SWZ:
+      case OPCODE_TRUNC:
 	 read0 = writemask;
 	 break;
 
diff --git a/i965/brw_wm_sampler_state.c b/i965/brw_wm_sampler_state.c
index 3fc18ff..dff4665 100644
--- a/i965/brw_wm_sampler_state.c
+++ b/i965/brw_wm_sampler_state.c
@@ -103,6 +103,10 @@ struct wm_sampler_key {
       GLenum minfilter, magfilter;
       GLenum comparemode, comparefunc;
       dri_bo *sdc_bo;
+
+      /** If target is cubemap, take context setting.
+       */
+      GLboolean seamless_cube_map;
    } sampler[BRW_MAX_TEX_UNIT];
 };
 
@@ -169,30 +173,33 @@ static void brw_update_sampler_state(struct wm_sampler_entry *key,
       }  
    }
 
-   if (key->tex_target == GL_TEXTURE_CUBE_MAP &&
-       (key->minfilter != GL_NEAREST || key->magfilter != GL_NEAREST)) {
-      /* If we're using anything but nearest sampling for a cube map, we
-       * need to set this wrap mode to avoid GPU lock-ups.
-       */
-      sampler->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CUBE;
-      sampler->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CUBE;
-      sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CUBE;
-   }
-   else if (key->tex_target == GL_TEXTURE_1D) {
+   sampler->ss1.r_wrap_mode = translate_wrap_mode(key->wrap_r);
+   sampler->ss1.s_wrap_mode = translate_wrap_mode(key->wrap_s);
+   sampler->ss1.t_wrap_mode = translate_wrap_mode(key->wrap_t);
+
+   /* Cube-maps on 965 and later must use the same wrap mode for all 3
+    * coordinate dimensions.  Futher, only CUBE and CLAMP are valid.
+    */
+   if (key->tex_target == GL_TEXTURE_CUBE_MAP) {
+      if (key->seamless_cube_map &&
+	  (key->minfilter != GL_NEAREST || key->magfilter != GL_NEAREST)) {
+	 sampler->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CUBE;
+	 sampler->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CUBE;
+	 sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CUBE;
+      } else {
+	 sampler->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+	 sampler->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+	 sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+      }
+   } else if (key->tex_target == GL_TEXTURE_1D) {
       /* There's a bug in 1D texture sampling - it actually pays
        * attention to the wrap_t value, though it should not.
        * Override the wrap_t value here to GL_REPEAT to keep
        * any nonexistent border pixels from floating in.
        */
-      sampler->ss1.r_wrap_mode = translate_wrap_mode(key->wrap_r);
-      sampler->ss1.s_wrap_mode = translate_wrap_mode(key->wrap_s);
       sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_WRAP;
    }
-   else {
-      sampler->ss1.r_wrap_mode = translate_wrap_mode(key->wrap_r);
-      sampler->ss1.s_wrap_mode = translate_wrap_mode(key->wrap_s);
-      sampler->ss1.t_wrap_mode = translate_wrap_mode(key->wrap_t);
-   }
+
 
    /* Set shadow function: 
     */
@@ -249,6 +256,9 @@ brw_wm_sampler_populate_key(struct brw_context *brw,
 
          entry->tex_target = texObj->Target;
 
+	 entry->seamless_cube_map = (texObj->Target == GL_TEXTURE_CUBE_MAP)
+	    ? ctx->Texture.CubeMapSeamless : GL_FALSE;
+
 	 entry->wrap_r = texObj->WrapR;
 	 entry->wrap_s = texObj->WrapS;
 	 entry->wrap_t = texObj->WrapT;
diff --git a/i965/brw_wm_state.c b/i965/brw_wm_state.c
index 67b4117..39f8c6d 100644
--- a/i965/brw_wm_state.c
+++ b/i965/brw_wm_state.c
@@ -71,7 +71,9 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
       key->max_threads = 1;
    else {
       /* WM maximum threads is number of EUs times number of threads per EU. */
-      if (BRW_IS_G4X(brw))
+      if (BRW_IS_IGDNG(brw))
+         key->max_threads = 12 * 6;
+      else if (BRW_IS_G4X(brw))
 	 key->max_threads = 10 * 5;
       else
 	 key->max_threads = 8 * 4;
@@ -141,7 +143,11 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
    wm.thread0.kernel_start_pointer = brw->wm.prog_bo->offset >> 6; /* reloc */
    wm.thread1.depth_coef_urb_read_offset = 1;
    wm.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
-   wm.thread1.binding_table_entry_count = key->nr_surfaces;
+
+   if (BRW_IS_IGDNG(brw))
+      wm.thread1.binding_table_entry_count = 0; /* hardware requirement */
+   else
+      wm.thread1.binding_table_entry_count = key->nr_surfaces;
 
    if (key->total_scratch != 0) {
       wm.thread2.scratch_space_base_pointer =
@@ -158,7 +164,11 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
    wm.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
    wm.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
 
-   wm.wm4.sampler_count = (key->sampler_count + 1) / 4;
+   if (BRW_IS_IGDNG(brw)) 
+      wm.wm4.sampler_count = 0; /* hardware requirement */
+   else
+      wm.wm4.sampler_count = (key->sampler_count + 1) / 4;
+
    if (brw->wm.sampler_bo != NULL) {
       /* reloc */
       wm.wm4.sampler_state_pointer = brw->wm.sampler_bo->offset >> 5;
diff --git a/i965/brw_wm_surface_state.c b/i965/brw_wm_surface_state.c
index 805df8a..3dcc592 100644
--- a/i965/brw_wm_surface_state.c
+++ b/i965/brw_wm_surface_state.c
@@ -176,22 +176,6 @@ static GLuint translate_tex_format( GLuint mesa_format, GLenum internal_format,
    }
 }
 
-
-/**
- * Use same key for WM and VS surfaces.
- */
-struct brw_surface_key {
-   GLenum target, depthmode;
-   dri_bo *bo;
-   GLint format, internal_format;
-   GLint first_level, last_level;
-   GLint width, height, depth;
-   GLint pitch, cpp;
-   uint32_t tiling;
-   GLuint offset;
-};
-
-
 static void
 brw_set_surface_tiling(struct brw_surface_state *surf, uint32_t tiling)
 {
@@ -268,7 +252,7 @@ brw_create_texture_surface( struct brw_context *brw,
       surf.ss0.cube_neg_z = 1;
    }
 
-   bo = brw_upload_cache(&brw->cache, BRW_SS_SURFACE,
+   bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
 			 key, sizeof(*key),
 			 &key->bo, key->bo ? 1 : 0,
 			 &surf, sizeof(surf),
@@ -321,10 +305,11 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit )
    key.tiling = intelObj->mt->region->tiling;
 
    dri_bo_unreference(brw->wm.surf_bo[surf]);
-   brw->wm.surf_bo[surf] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
-                                         &key, sizeof(key),
-                                         &key.bo, key.bo ? 1 : 0,
-                                         NULL);
+   brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
+                                            BRW_SS_SURFACE,
+                                            &key, sizeof(key),
+                                            &key.bo, key.bo ? 1 : 0,
+                                            NULL);
    if (brw->wm.surf_bo[surf] == NULL) {
       brw->wm.surf_bo[surf] = brw_create_texture_surface(brw, &key);
    }
@@ -336,7 +321,7 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit )
  * Create the constant buffer surface.  Vertex/fragment shader constants will be
  * read from this buffer with Data Port Read instructions/messages.
  */
-static dri_bo *
+dri_bo *
 brw_create_constant_surface( struct brw_context *brw,
                              struct brw_surface_key *key )
 {
@@ -362,7 +347,7 @@ brw_create_constant_surface( struct brw_context *brw,
    surf.ss3.pitch = (key->pitch * key->cpp) - 1; /* ignored?? */
    brw_set_surface_tiling(&surf, key->tiling); /* tiling now allowed */
  
-   bo = brw_upload_cache(&brw->cache, BRW_SS_SURFACE,
+   bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
 			 key, sizeof(*key),
 			 &key->bo, key->bo ? 1 : 0,
 			 &surf, sizeof(surf),
@@ -380,39 +365,70 @@ brw_create_constant_surface( struct brw_context *brw,
    return bo;
 }
 
+/* Creates a new WM constant buffer reflecting the current fragment program's
+ * constants, if needed by the fragment program.
+ *
+ * Otherwise, constants go through the CURBEs using the brw_constant_buffer
+ * state atom.
+ */
+static drm_intel_bo *
+brw_wm_update_constant_buffer(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
+   struct brw_fragment_program *fp =
+      (struct brw_fragment_program *) brw->fragment_program;
+   const struct gl_program_parameter_list *params = fp->program.Base.Parameters;
+   const int size = params->NumParameters * 4 * sizeof(GLfloat);
+   drm_intel_bo *const_buffer;
+
+   /* BRW_NEW_FRAGMENT_PROGRAM */
+   if (!fp->use_const_buffer)
+      return NULL;
+
+   const_buffer = drm_intel_bo_alloc(intel->bufmgr, "fp_const_buffer",
+				     size, 64);
+
+   /* _NEW_PROGRAM_CONSTANTS */
+   dri_bo_subdata(const_buffer, 0, size, params->ParameterValues);
+
+   return const_buffer;
+}
 
 /**
  * Update the surface state for a WM constant buffer.
  * The constant buffer will be (re)allocated here if needed.
  */
-static dri_bo *
+static void
 brw_update_wm_constant_surface( GLcontext *ctx,
-                                GLuint surf,
-                                dri_bo *const_buffer,
-                                const struct gl_program_parameter_list *params)
+                                GLuint surf)
 {
    struct brw_context *brw = brw_context(ctx);
    struct brw_surface_key key;
-   struct intel_context *intel = &brw->intel;
-   const int size = params->NumParameters * 4 * sizeof(GLfloat);
+   struct brw_fragment_program *fp =
+      (struct brw_fragment_program *) brw->fragment_program;
+   const struct gl_program_parameter_list *params =
+      fp->program.Base.Parameters;
 
-   /* free old const buffer if too small */
-   if (const_buffer && const_buffer->size < size) {
-      dri_bo_unreference(const_buffer);
-      const_buffer = NULL;
-   }
+   /* If we're in this state update atom, we need to update WM constants, so
+    * free the old buffer and create a new one for the new contents.
+    */
+   dri_bo_unreference(fp->const_buffer);
+   fp->const_buffer = brw_wm_update_constant_buffer(brw);
 
-   /* alloc new buffer if needed */
-   if (!const_buffer) {
-      const_buffer =
-         drm_intel_bo_alloc(intel->bufmgr, "fp_const_buffer", size, 64);
+   /* If there's no constant buffer, then no surface BO is needed to point at
+    * it.
+    */
+   if (fp->const_buffer == 0) {
+      drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
+      brw->wm.surf_bo[surf] = NULL;
+      return;
    }
 
    memset(&key, 0, sizeof(key));
 
    key.format = MESA_FORMAT_RGBA_FLOAT32;
    key.internal_format = GL_RGBA;
-   key.bo = const_buffer;
+   key.bo = fp->const_buffer;
    key.depthmode = GL_NONE;
    key.pitch = params->NumParameters;
    key.width = params->NumParameters;
@@ -427,77 +443,59 @@ brw_update_wm_constant_surface( GLcontext *ctx,
    */
 
    dri_bo_unreference(brw->wm.surf_bo[surf]);
-   brw->wm.surf_bo[surf] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
+   brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
+                                            BRW_SS_SURFACE,
                                             &key, sizeof(key),
                                             &key.bo, key.bo ? 1 : 0,
                                             NULL);
    if (brw->wm.surf_bo[surf] == NULL) {
       brw->wm.surf_bo[surf] = brw_create_constant_surface(brw, &key);
    }
-
-   return const_buffer;
+   brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
 }
 
-
 /**
- * Update the surface state for a VS constant buffer.
- * The constant buffer will be (re)allocated here if needed.
+ * Updates surface / buffer for fragment shader constant buffer, if
+ * one is required.
+ *
+ * This consumes the state updates for the constant buffer, and produces
+ * BRW_NEW_WM_SURFACES to get picked up by brw_prepare_wm_surfaces for
+ * inclusion in the binding table.
  */
-static dri_bo *
-brw_update_vs_constant_surface( GLcontext *ctx,
-                                GLuint surf,
-                                dri_bo *const_buffer,
-                                const struct gl_program_parameter_list *params)
+static void prepare_wm_constant_surface(struct brw_context *brw )
 {
-   struct brw_context *brw = brw_context(ctx);
-   struct brw_surface_key key;
-   struct intel_context *intel = &brw->intel;
-   const int size = params->NumParameters * 4 * sizeof(GLfloat);
-
-   assert(surf == 0);
-
-   /* free old const buffer if too small */
-   if (const_buffer && const_buffer->size < size) {
-      dri_bo_unreference(const_buffer);
-      const_buffer = NULL;
-   }
-
-   /* alloc new buffer if needed */
-   if (!const_buffer) {
-      const_buffer =
-         drm_intel_bo_alloc(intel->bufmgr, "vp_const_buffer", size, 64);
-   }
-
-   memset(&key, 0, sizeof(key));
-
-   key.format = MESA_FORMAT_RGBA_FLOAT32;
-   key.internal_format = GL_RGBA;
-   key.bo = const_buffer;
-   key.depthmode = GL_NONE;
-   key.pitch = params->NumParameters;
-   key.width = params->NumParameters;
-   key.height = 1;
-   key.depth = 1;
-   key.cpp = 16;
+   GLcontext *ctx = &brw->intel.ctx;
+   struct brw_fragment_program *fp =
+      (struct brw_fragment_program *) brw->fragment_program;
+   GLuint surf = SURF_INDEX_FRAG_CONST_BUFFER;
 
-   /*
-   printf("%s:\n", __FUNCTION__);
-   printf("  width %d  height %d  depth %d  cpp %d  pitch %d\n",
-          key.width, key.height, key.depth, key.cpp, key.pitch);
-   */
+   drm_intel_bo_unreference(fp->const_buffer);
+   fp->const_buffer = brw_wm_update_constant_buffer(brw);
 
-   dri_bo_unreference(brw->vs.surf_bo[surf]);
-   brw->vs.surf_bo[surf] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
-                                            &key, sizeof(key),
-                                            &key.bo, key.bo ? 1 : 0,
-                                            NULL);
-   if (brw->vs.surf_bo[surf] == NULL) {
-      brw->vs.surf_bo[surf] = brw_create_constant_surface(brw, &key);
+   /* If there's no constant buffer, then no surface BO is needed to point at
+    * it.
+    */
+   if (fp->const_buffer == 0) {
+      if (brw->wm.surf_bo[surf] != NULL) {
+	 drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
+	 brw->wm.surf_bo[surf] = NULL;
+	 brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
+      }
+      return;
    }
 
-   return const_buffer;
+   brw_update_wm_constant_surface(ctx, surf);
 }
 
+const struct brw_tracked_state brw_wm_constant_surface = {
+   .dirty = {
+      .mesa = (_NEW_PROGRAM_CONSTANTS),
+      .brw = (BRW_NEW_FRAGMENT_PROGRAM),
+      .cache = 0
+   },
+   .prepare = prepare_wm_constant_surface,
+};
+
 
 /**
  * Sets up a surface state structure to point at the given region.
@@ -507,7 +505,7 @@ brw_update_vs_constant_surface( GLcontext *ctx,
 static void
 brw_update_renderbuffer_surface(struct brw_context *brw,
 				struct gl_renderbuffer *rb,
-				unsigned int unit, GLboolean cached)
+				unsigned int unit)
 {
    GLcontext *ctx = &brw->intel.ctx;
    dri_bo *region_bo = NULL;
@@ -567,12 +565,11 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
 		      ctx->Color.BlendEnabled);
 
    dri_bo_unreference(brw->wm.surf_bo[unit]);
-   brw->wm.surf_bo[unit] = NULL;
-   if (cached) 
-       brw->wm.surf_bo[unit] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
-	       &key, sizeof(key),
-	       &region_bo, 1,
-	       NULL);
+   brw->wm.surf_bo[unit] = brw_search_cache(&brw->surface_cache,
+					    BRW_SS_SURFACE,
+					    &key, sizeof(key),
+					    &region_bo, 1,
+					    NULL);
 
    if (brw->wm.surf_bo[unit] == NULL) {
       struct brw_surface_state surf;
@@ -581,7 +578,27 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
 
       surf.ss0.surface_format = key.surface_format;
       surf.ss0.surface_type = key.surface_type;
-      surf.ss1.base_addr =  key.draw_offset;
+      if (key.tiling == I915_TILING_NONE) {
+	 surf.ss1.base_addr = key.draw_offset;
+      } else {
+	 uint32_t tile_offset = key.draw_offset % 4096;
+
+	 surf.ss1.base_addr = key.draw_offset - tile_offset;
+
+	 assert(BRW_IS_G4X(brw) || tile_offset == 0);
+	 if (BRW_IS_G4X(brw)) {
+	    if (key.tiling == I915_TILING_X) {
+	       /* Note that the low bits of these fields are missing, so
+		* there's the possibility of getting in trouble.
+		*/
+	       surf.ss5.x_offset = (tile_offset % 512) / key.cpp / 4;
+	       surf.ss5.y_offset = tile_offset / 512 / 2;
+	    } else {
+	       surf.ss5.x_offset = (tile_offset % 128) / key.cpp / 4;
+	       surf.ss5.y_offset = tile_offset / 128 / 2;
+	    }
+	 }
+      }
       if (region_bo != NULL)
 	 surf.ss1.base_addr += region_bo->offset; /* reloc */
 
@@ -598,7 +615,8 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
       surf.ss0.writedisable_alpha = !key.color_mask[3];
 
       /* Key size will never match key size for textures, so we're safe. */
-      brw->wm.surf_bo[unit] = brw_upload_cache(&brw->cache, BRW_SS_SURFACE,
+      brw->wm.surf_bo[unit] = brw_upload_cache(&brw->surface_cache,
+                                               BRW_SS_SURFACE,
                                                &key, sizeof(key),
 					       &region_bo, 1,
 					       &surf, sizeof(surf),
@@ -611,7 +629,7 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
 	 drm_intel_bo_emit_reloc(brw->wm.surf_bo[unit],
 				 offsetof(struct brw_surface_state, ss1),
 				 region_bo,
-				 key.draw_offset,
+				 surf.ss1.base_addr - region_bo->offset,
 				 I915_GEM_DOMAIN_RENDER,
 				 I915_GEM_DOMAIN_RENDER);
       }
@@ -630,7 +648,7 @@ brw_wm_get_binding_table(struct brw_context *brw)
 
    assert(brw->wm.nr_surfaces <= BRW_WM_MAX_SURF);
 
-   bind_bo = brw_search_cache(&brw->cache, BRW_SS_SURF_BIND,
+   bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
 			      NULL, 0,
 			      brw->wm.surf_bo, brw->wm.nr_surfaces,
 			      NULL);
@@ -646,7 +664,7 @@ brw_wm_get_binding_table(struct brw_context *brw)
          else
             data[i] = 0;
 
-      bind_bo = brw_upload_cache( &brw->cache, BRW_SS_SURF_BIND,
+      bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
 				  NULL, 0,
 				  brw->wm.surf_bo, brw->wm.nr_surfaces,
 				  data, data_size,
@@ -682,27 +700,17 @@ static void prepare_wm_surfaces(struct brw_context *brw )
       for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
          brw_update_renderbuffer_surface(brw,
 					 ctx->DrawBuffer->_ColorDrawBuffers[i],
-					 i,
-					 GL_FALSE);
+					 i);
       }
    } else {
-      brw_update_renderbuffer_surface(brw, NULL, 0, GL_TRUE);
+      brw_update_renderbuffer_surface(brw, NULL, 0);
    }
 
    old_nr_surfaces = brw->wm.nr_surfaces;
    brw->wm.nr_surfaces = MAX_DRAW_BUFFERS;
 
-   /* Update surface / buffer for fragment shader constant buffer */
-   {
-      const GLuint surf = SURF_INDEX_FRAG_CONST_BUFFER;
-      struct brw_fragment_program *fp =
-         (struct brw_fragment_program *) brw->fragment_program;
-      fp->const_buffer =
-         brw_update_wm_constant_surface(ctx, surf, fp->const_buffer,
-                                     fp->program.Base.Parameters);
-
-      brw->wm.nr_surfaces = surf + 1;
-   }
+   if (brw->wm.surf_bo[SURF_INDEX_FRAG_CONST_BUFFER] != NULL)
+       brw->wm.nr_surfaces = SURF_INDEX_FRAG_CONST_BUFFER + 1;
 
    /* Update surfaces for textures */
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
@@ -735,100 +743,16 @@ static void prepare_wm_surfaces(struct brw_context *brw )
       brw->state.dirty.brw |= BRW_NEW_NR_WM_SURFACES;
 }
 
-
-/**
- * Constructs the binding table for the VS surface state.
- */
-static dri_bo *
-brw_vs_get_binding_table(struct brw_context *brw)
-{
-   dri_bo *bind_bo;
-
-   assert(brw->vs.nr_surfaces <= BRW_VS_MAX_SURF);
-
-   bind_bo = brw_search_cache(&brw->cache, BRW_SS_SURF_BIND,
-			      NULL, 0,
-			      brw->vs.surf_bo, brw->vs.nr_surfaces,
-			      NULL);
-
-   if (bind_bo == NULL) {
-      GLuint data_size = brw->vs.nr_surfaces * sizeof(GLuint);
-      uint32_t *data = malloc(data_size);
-      int i;
-
-      for (i = 0; i < brw->vs.nr_surfaces; i++)
-         if (brw->vs.surf_bo[i])
-            data[i] = brw->vs.surf_bo[i]->offset;
-         else
-            data[i] = 0;
-
-      bind_bo = brw_upload_cache( &brw->cache, BRW_SS_SURF_BIND,
-				  NULL, 0,
-				  brw->vs.surf_bo, brw->vs.nr_surfaces,
-				  data, data_size,
-				  NULL, NULL);
-
-      /* Emit binding table relocations to surface state */
-      for (i = 0; i < BRW_VS_MAX_SURF; i++) {
-	 if (brw->vs.surf_bo[i] != NULL) {
-	    dri_bo_emit_reloc(bind_bo,
-			      I915_GEM_DOMAIN_INSTRUCTION, 0,
-			      0,
-			      i * sizeof(GLuint),
-			      brw->vs.surf_bo[i]);
-	 }
-      }
-
-      free(data);
-   }
-
-   return bind_bo;
-}
-
-
-/**
- * Vertex shader surfaces.  Just constant buffer for now.  Could add vertex 
- * shader textures in the future.
- */
-static void prepare_vs_surfaces(struct brw_context *brw )
-{
-   GLcontext *ctx = &brw->intel.ctx;
-
-   /* Update surface / buffer for vertex shader constant buffer */
-   {
-      const GLuint surf = SURF_INDEX_VERT_CONST_BUFFER;
-      struct brw_vertex_program *vp =
-         (struct brw_vertex_program *) brw->vertex_program;
-      vp->const_buffer =
-         brw_update_vs_constant_surface(ctx, surf, vp->const_buffer,
-                                        vp->program.Base.Parameters);
-
-      brw->vs.nr_surfaces = 1;
-   }
-
-   dri_bo_unreference(brw->vs.bind_bo);
-   brw->vs.bind_bo = brw_vs_get_binding_table(brw);
-
-   if (1)
-      brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES;
-}
-
-
-static void
-prepare_surfaces(struct brw_context *brw)
-{
-   prepare_wm_surfaces(brw);
-   prepare_vs_surfaces(brw);
-}
-
-
 const struct brw_tracked_state brw_wm_surfaces = {
    .dirty = {
-      .mesa = _NEW_COLOR | _NEW_TEXTURE | _NEW_BUFFERS | _NEW_PROGRAM,
-      .brw = BRW_NEW_CONTEXT,
+      .mesa = (_NEW_COLOR |
+               _NEW_TEXTURE |
+               _NEW_BUFFERS),
+      .brw = (BRW_NEW_CONTEXT |
+	      BRW_NEW_WM_SURFACES),
       .cache = 0
    },
-   .prepare = prepare_surfaces,
+   .prepare = prepare_wm_surfaces,
 };
 
 
diff --git a/shared/intel_batchbuffer.c b/shared/intel_batchbuffer.c
index 29dc05c..6aa36d1 100644
--- a/shared/intel_batchbuffer.c
+++ b/shared/intel_batchbuffer.c
@@ -195,7 +195,16 @@ _intel_batchbuffer_flush(struct intel_batchbuffer *batch, const char *file,
 {
    struct intel_context *intel = batch->intel;
    GLuint used = batch->ptr - batch->map;
-   GLboolean was_locked = intel->locked;
+
+   if (intel->first_post_swapbuffers_batch == NULL) {
+      intel->first_post_swapbuffers_batch = intel->batch->buf;
+      drm_intel_bo_reference(intel->first_post_swapbuffers_batch);
+   }
+
+   if (intel->first_post_swapbuffers_batch == NULL) {
+      intel->first_post_swapbuffers_batch = intel->batch->buf;
+      drm_intel_bo_reference(intel->first_post_swapbuffers_batch);
+   }
 
    if (used == 0) {
       batch->cliprect_mode = IGNORE_CLIPRECTS;
@@ -243,13 +252,9 @@ _intel_batchbuffer_flush(struct intel_batchbuffer *batch, const char *file,
    /* TODO: Just pass the relocation list and dma buffer up to the
     * kernel.
     */
-   if (!was_locked)
-      LOCK_HARDWARE(intel);
-
+   LOCK_HARDWARE(intel);
    do_flush_locked(batch, used, GL_FALSE);
-
-   if (!was_locked)
-      UNLOCK_HARDWARE(intel);
+   UNLOCK_HARDWARE(intel);
 
    if (INTEL_DEBUG & DEBUG_SYNC) {
       fprintf(stderr, "waiting for idle\n");
diff --git a/shared/intel_blit.c b/shared/intel_blit.c
index 4919828..0c5be4c 100644
--- a/shared/intel_blit.c
+++ b/shared/intel_blit.c
@@ -108,6 +108,8 @@ intelCopyBuffer(const __DRIdrawablePrivate * dPriv,
 	 CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
       }
 
+      assert(src->tiling != I915_TILING_Y);
+      assert(dst->tiling != I915_TILING_Y);
 #ifndef I915
       if (src->tiling != I915_TILING_NONE) {
 	 CMD |= XY_SRC_TILED;
@@ -175,66 +177,6 @@ intelCopyBuffer(const __DRIdrawablePrivate * dPriv,
    UNLOCK_HARDWARE(intel);
 }
 
-
-
-
-void
-intelEmitFillBlit(struct intel_context *intel,
-		  GLuint cpp,
-		  GLshort dst_pitch,
-		  dri_bo *dst_buffer,
-		  GLuint dst_offset,
-		  uint32_t dst_tiling,
-		  GLshort x, GLshort y,
-		  GLshort w, GLshort h,
-		  GLuint color)
-{
-   GLuint BR13, CMD;
-   BATCH_LOCALS;
-
-   dst_pitch *= cpp;
-
-   switch (cpp) {
-   case 1:
-      BR13 = (0xF0 << 16);
-      CMD = XY_COLOR_BLT_CMD;
-      break;
-   case 2:
-      BR13 = (0xF0 << 16) | BR13_565;
-      CMD = XY_COLOR_BLT_CMD;
-      break;
-   case 4:
-      BR13 = (0xF0 << 16) | BR13_8888;
-      CMD = XY_COLOR_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
-      break;
-   default:
-      return;
-   }
-#ifndef I915
-   if (dst_tiling != I915_TILING_NONE) {
-      CMD |= XY_DST_TILED;
-      dst_pitch /= 4;
-   }
-#endif
-
-   DBG("%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
-       __FUNCTION__, dst_buffer, dst_pitch, dst_offset, x, y, w, h);
-
-   assert(w > 0);
-   assert(h > 0);
-
-   BEGIN_BATCH(6, NO_LOOP_CLIPRECTS);
-   OUT_BATCH(CMD);
-   OUT_BATCH(BR13 | dst_pitch);
-   OUT_BATCH((y << 16) | x);
-   OUT_BATCH(((y + h) << 16) | (x + w));
-   OUT_RELOC(dst_buffer,
-	     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-	     dst_offset);
-   OUT_BATCH(color);
-   ADVANCE_BATCH();
-}
-
 static GLuint translate_raster_op(GLenum logicop)
 {
    switch(logicop) {
@@ -261,7 +203,7 @@ static GLuint translate_raster_op(GLenum logicop)
 
 /* Copy BitBlt
  */
-void
+GLboolean
 intelEmitCopyBlit(struct intel_context *intel,
 		  GLuint cpp,
 		  GLshort src_pitch,
@@ -283,6 +225,19 @@ intelEmitCopyBlit(struct intel_context *intel,
    dri_bo *aper_array[3];
    BATCH_LOCALS;
 
+   if (dst_tiling != I915_TILING_NONE) {
+      if (dst_offset & 4095)
+	 return GL_FALSE;
+      if (dst_tiling == I915_TILING_Y)
+	 return GL_FALSE;
+   }
+   if (src_tiling != I915_TILING_NONE) {
+      if (src_offset & 4095)
+	 return GL_FALSE;
+      if (src_tiling == I915_TILING_Y)
+	 return GL_FALSE;
+   }
+
    /* do space/cliprects check before going any further */
    do {
        aper_array[0] = intel->batch->buf;
@@ -297,12 +252,7 @@ intelEmitCopyBlit(struct intel_context *intel,
    } while (pass < 2);
 
    if (pass >= 2) {
-       GLboolean locked = GL_FALSE;       
-       if (!intel->locked) {
-           LOCK_HARDWARE(intel);
-           locked = GL_TRUE;
-       }
-
+       LOCK_HARDWARE(intel);
        dri_bo_map(dst_buffer, GL_TRUE);
        dri_bo_map(src_buffer, GL_FALSE);
        _mesa_copy_rect((GLubyte *)dst_buffer->virtual + dst_offset,
@@ -316,11 +266,9 @@ intelEmitCopyBlit(struct intel_context *intel,
        
        dri_bo_unmap(src_buffer);
        dri_bo_unmap(dst_buffer);
-       
-       if (locked)
-           UNLOCK_HARDWARE(intel);
+       UNLOCK_HARDWARE(intel);
 
-       return;
+       return GL_TRUE;
    }
 
    intel_batchbuffer_require_space(intel->batch, 8 * 4, NO_LOOP_CLIPRECTS);
@@ -347,7 +295,7 @@ intelEmitCopyBlit(struct intel_context *intel,
       CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
       break;
    default:
-      return;
+      return GL_FALSE;
    }
 
 #ifndef I915
@@ -362,7 +310,7 @@ intelEmitCopyBlit(struct intel_context *intel,
 #endif
 
    if (dst_y2 <= dst_y || dst_x2 <= dst_x) {
-      return;
+      return GL_TRUE;
    }
 
    assert(dst_x < dst_x2);
@@ -384,6 +332,8 @@ intelEmitCopyBlit(struct intel_context *intel,
    ADVANCE_BATCH();
 
    intel_batchbuffer_emit_mi_flush(intel->batch);
+
+   return GL_TRUE;
 }
 
 
@@ -527,6 +477,8 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
                   BR13 |= BR13_565;
                }
 
+	       assert(irb->region->tiling != I915_TILING_Y);
+
 #ifndef I915
 	       if (irb->region->tiling != I915_TILING_NONE) {
 		  CMD |= XY_DST_TILED;
@@ -596,7 +548,7 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
    UNLOCK_HARDWARE(intel);
 }
 
-void
+GLboolean
 intelEmitImmediateColorExpandBlit(struct intel_context *intel,
 				  GLuint cpp,
 				  GLubyte *src_bits, GLuint src_size,
@@ -612,11 +564,19 @@ intelEmitImmediateColorExpandBlit(struct intel_context *intel,
    int dwords = ALIGN(src_size, 8) / 4;
    uint32_t opcode, br13, blit_cmd;
 
+   if (dst_tiling != I915_TILING_NONE) {
+      if (dst_offset & 4095)
+	 return GL_FALSE;
+      if (dst_tiling == I915_TILING_Y)
+	 return GL_FALSE;
+   }
+
    assert( logic_op - GL_CLEAR >= 0 );
    assert( logic_op - GL_CLEAR < 0x10 );
+   assert(dst_pitch > 0);
 
    if (w < 0 || h < 0)
-      return;
+      return GL_TRUE;
 
    dst_pitch *= cpp;
 
@@ -673,4 +633,46 @@ intelEmitImmediateColorExpandBlit(struct intel_context *intel,
 			   REFERENCES_CLIPRECTS );
 
    intel_batchbuffer_emit_mi_flush(intel->batch);
+
+   return GL_TRUE;
+}
+
+/* We don't have a memmove-type blit like some other hardware, so we'll do a
+ * rectangular blit covering a large space, then emit 1-scanline blit at the
+ * end to cover the last if we need.
+ */
+void
+intel_emit_linear_blit(struct intel_context *intel,
+		       drm_intel_bo *dst_bo,
+		       unsigned int dst_offset,
+		       drm_intel_bo *src_bo,
+		       unsigned int src_offset,
+		       unsigned int size)
+{
+   GLuint pitch, height;
+
+   /* The pitch is a signed value. */
+   pitch = MIN2(size, (1 << 15) - 1);
+   height = size / pitch;
+   intelEmitCopyBlit(intel, 1,
+		     pitch, src_bo, src_offset, I915_TILING_NONE,
+		     pitch, dst_bo, dst_offset, I915_TILING_NONE,
+		     0, 0, /* src x/y */
+		     0, 0, /* dst x/y */
+		     pitch, height, /* w, h */
+		     GL_COPY);
+
+   src_offset += pitch * height;
+   dst_offset += pitch * height;
+   size -= pitch * height;
+   assert (size < (1 << 15));
+   if (size != 0) {
+      intelEmitCopyBlit(intel, 1,
+			size, src_bo, src_offset, I915_TILING_NONE,
+			size, dst_bo, dst_offset, I915_TILING_NONE,
+			0, 0, /* src x/y */
+			0, 0, /* dst x/y */
+			size, 1, /* w, h */
+			GL_COPY);
+   }
 }
diff --git a/shared/intel_blit.h b/shared/intel_blit.h
index 52065b1..240cb7c 100644
--- a/shared/intel_blit.h
+++ b/shared/intel_blit.h
@@ -35,7 +35,8 @@ extern void intelCopyBuffer(const __DRIdrawablePrivate * dpriv,
 
 extern void intelClearWithBlit(GLcontext * ctx, GLbitfield mask);
 
-extern void intelEmitCopyBlit(struct intel_context *intel,
+GLboolean
+intelEmitCopyBlit(struct intel_context *intel,
                               GLuint cpp,
                               GLshort src_pitch,
                               dri_bo *src_buffer,
@@ -50,16 +51,7 @@ extern void intelEmitCopyBlit(struct intel_context *intel,
                               GLshort w, GLshort h,
 			      GLenum logicop );
 
-extern void intelEmitFillBlit(struct intel_context *intel,
-                              GLuint cpp,
-                              GLshort dst_pitch,
-                              dri_bo *dst_buffer,
-                              GLuint dst_offset,
-			      uint32_t dst_tiling,
-                              GLshort x, GLshort y,
-                              GLshort w, GLshort h, GLuint color);
-
-void
+GLboolean
 intelEmitImmediateColorExpandBlit(struct intel_context *intel,
 				  GLuint cpp,
 				  GLubyte *src_bits, GLuint src_size,
@@ -71,5 +63,11 @@ intelEmitImmediateColorExpandBlit(struct intel_context *intel,
 				  GLshort x, GLshort y,
 				  GLshort w, GLshort h,
 				  GLenum logic_op);
+void intel_emit_linear_blit(struct intel_context *intel,
+			    drm_intel_bo *dst_bo,
+			    unsigned int dst_offset,
+			    drm_intel_bo *src_bo,
+			    unsigned int src_offset,
+			    unsigned int size);
 
 #endif
diff --git a/shared/intel_buffer_objects.c b/shared/intel_buffer_objects.c
index 2e6b778..a022593 100644
--- a/shared/intel_buffer_objects.c
+++ b/shared/intel_buffer_objects.c
@@ -28,13 +28,18 @@
 
 #include "main/imports.h"
 #include "main/mtypes.h"
+#include "main/macros.h"
 #include "main/bufferobj.h"
 
 #include "intel_context.h"
+#include "intel_blit.h"
 #include "intel_buffer_objects.h"
 #include "intel_batchbuffer.h"
 #include "intel_regions.h"
 
+static GLboolean
+intel_bufferobj_unmap(GLcontext * ctx,
+                      GLenum target, struct gl_buffer_object *obj);
 
 /** Allocates a new dri_bo to store the data for the buffer object. */
 static void
@@ -100,7 +105,13 @@ intel_bufferobj_free(GLcontext * ctx, struct gl_buffer_object *obj)
    struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
 
    assert(intel_obj);
-   assert(!obj->Pointer); /* Mesa should have unmapped it */
+
+   /* Buffer objects are automatically unmapped when deleting according
+    * to the spec, but Mesa doesn't do UnmapBuffer for us at context destroy
+    * (though it does if you call glDeleteBuffers)
+    */
+   if (obj->Pointer)
+      intel_bufferobj_unmap(ctx, 0, obj);
 
    _mesa_free(intel_obj->sys_buffer);
    if (intel_obj->region) {
@@ -119,9 +130,10 @@ intel_bufferobj_free(GLcontext * ctx, struct gl_buffer_object *obj)
  * Allocate space for and store data in a buffer object.  Any data that was
  * previously stored in the buffer object is lost.  If data is NULL,
  * memory will be allocated, but no copy will occur.
- * Called via glBufferDataARB().
+ * Called via ctx->Driver.BufferData().
+ * \return GL_TRUE for success, GL_FALSE if out of memory
  */
-static void
+static GLboolean
 intel_bufferobj_data(GLcontext * ctx,
                      GLenum target,
                      GLsizeiptrARB size,
@@ -156,15 +168,19 @@ intel_bufferobj_data(GLcontext * ctx,
 	 if (intel_obj->sys_buffer != NULL) {
 	    if (data != NULL)
 	       memcpy(intel_obj->sys_buffer, data, size);
-	    return;
+	    return GL_TRUE;
 	 }
       }
 #endif
       intel_bufferobj_alloc_buffer(intel, intel_obj);
+      if (!intel_obj->buffer)
+         return GL_FALSE;
 
       if (data != NULL)
 	 dri_bo_subdata(intel_obj->buffer, 0, size, data);
    }
+
+   return GL_TRUE;
 }
 
 
@@ -191,8 +207,12 @@ intel_bufferobj_subdata(GLcontext * ctx,
 
    if (intel_obj->sys_buffer)
       memcpy((char *)intel_obj->sys_buffer + offset, data, size);
-   else
+   else {
+      /* Flush any existing batchbuffer that might reference this data. */
+      intelFlush(ctx);
+
       dri_bo_subdata(intel_obj->buffer, offset, size, data);
+   }
 }
 
 
@@ -209,7 +229,10 @@ intel_bufferobj_get_subdata(GLcontext * ctx,
    struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
 
    assert(intel_obj);
-   dri_bo_get_subdata(intel_obj->buffer, offset, size, data);
+   if (intel_obj->sys_buffer)
+      memcpy(data, (char *)intel_obj->sys_buffer + offset, size);
+   else
+      dri_bo_get_subdata(intel_obj->buffer, offset, size, data);
 }
 
 
@@ -234,6 +257,11 @@ intel_bufferobj_map(GLcontext * ctx,
       return obj->Pointer;
    }
 
+   /* Flush any existing batchbuffer that might have written to this
+    * buffer.
+    */
+   intelFlush(ctx);
+
    if (intel_obj->region)
       intel_bufferobj_cow(intel, intel_obj);
 
@@ -251,32 +279,205 @@ intel_bufferobj_map(GLcontext * ctx,
    }
 
    obj->Pointer = intel_obj->buffer->virtual;
+   obj->Length = obj->Size;
+   obj->Offset = 0;
+
    return obj->Pointer;
 }
 
+/**
+ * Called via glMapBufferRange().
+ *
+ * The goal of this extension is to allow apps to accumulate their rendering
+ * at the same time as they accumulate their buffer object.  Without it,
+ * you'd end up blocking on execution of rendering every time you mapped
+ * the buffer to put new data in.
+ *
+ * We support it in 3 ways: If unsynchronized, then don't bother
+ * flushing the batchbuffer before mapping the buffer, which can save blocking
+ * in many cases.  If we would still block, and they allow the whole buffer
+ * to be invalidated, then just allocate a new buffer to replace the old one.
+ * If not, and we'd block, and they allow the subrange of the buffer to be
+ * invalidated, then we can make a new little BO, let them write into that,
+ * and blit it into the real BO at unmap time.
+ */
+static void *
+intel_bufferobj_map_range(GLcontext * ctx,
+			  GLenum target, GLintptr offset, GLsizeiptr length,
+			  GLbitfield access, struct gl_buffer_object *obj)
+{
+   struct intel_context *intel = intel_context(ctx);
+   struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
+
+   assert(intel_obj);
+
+   /* _mesa_MapBufferRange (GL entrypoint) sets these, but the vbo module also
+    * internally uses our functions directly.
+    */
+   obj->Offset = offset;
+   obj->Length = length;
+   obj->AccessFlags = access;
+
+   if (intel_obj->sys_buffer) {
+      obj->Pointer = intel_obj->sys_buffer + offset;
+      return obj->Pointer;
+   }
+
+   if (intel_obj->region)
+      intel_bufferobj_cow(intel, intel_obj);
+
+   /* If the mapping is synchronized with other GL operations, flush
+    * the batchbuffer so that GEM knows about the buffer access for later
+    * syncing.
+    */
+   if (!(access & GL_MAP_UNSYNCHRONIZED_BIT))
+      intelFlush(ctx);
+
+   if (intel_obj->buffer == NULL) {
+      obj->Pointer = NULL;
+      return NULL;
+   }
+
+   /* If the user doesn't care about existing buffer contents and mapping
+    * would cause us to block, then throw out the old buffer.
+    */
+   if (!(access & GL_MAP_UNSYNCHRONIZED_BIT) &&
+       (access & GL_MAP_INVALIDATE_BUFFER_BIT) &&
+       drm_intel_bo_busy(intel_obj->buffer)) {
+      drm_intel_bo_unreference(intel_obj->buffer);
+      intel_obj->buffer = dri_bo_alloc(intel->bufmgr, "bufferobj",
+				       intel_obj->Base.Size, 64);
+   }
+
+   /* If the user is mapping a range of an active buffer object but
+    * doesn't require the current contents of that range, make a new
+    * BO, and we'll copy what they put in there out at unmap or
+    * FlushRange time.
+    */
+   if ((access & GL_MAP_INVALIDATE_RANGE_BIT) &&
+       drm_intel_bo_busy(intel_obj->buffer)) {
+      if (access & GL_MAP_FLUSH_EXPLICIT_BIT) {
+	 intel_obj->range_map_buffer = _mesa_malloc(length);
+	 obj->Pointer = intel_obj->range_map_buffer;
+      } else {
+	 intel_obj->range_map_bo = drm_intel_bo_alloc(intel->bufmgr,
+						      "range map",
+						      length, 64);
+	 if (!(access & GL_MAP_READ_BIT) &&
+	     intel->intelScreen->kernel_exec_fencing) {
+	    drm_intel_gem_bo_map_gtt(intel_obj->range_map_bo);
+	    intel_obj->mapped_gtt = GL_TRUE;
+	 } else {
+	    drm_intel_bo_map(intel_obj->range_map_bo,
+			     (access & GL_MAP_WRITE_BIT) != 0);
+	    intel_obj->mapped_gtt = GL_FALSE;
+	 }
+	 obj->Pointer = intel_obj->range_map_bo->virtual;
+      }
+      return obj->Pointer;
+   }
+
+   if (!(access & GL_MAP_READ_BIT) &&
+       intel->intelScreen->kernel_exec_fencing) {
+      drm_intel_gem_bo_map_gtt(intel_obj->buffer);
+      intel_obj->mapped_gtt = GL_TRUE;
+   } else {
+      drm_intel_bo_map(intel_obj->buffer, (access & GL_MAP_WRITE_BIT) != 0);
+      intel_obj->mapped_gtt = GL_FALSE;
+   }
+
+   obj->Pointer = intel_obj->buffer->virtual + offset;
+   return obj->Pointer;
+}
+
+/* Ideally we'd use a BO to avoid taking up cache space for the temporary
+ * data, but FlushMappedBufferRange may be followed by further writes to
+ * the pointer, so we would have to re-map after emitting our blit, which
+ * would defeat the point.
+ */
+static void
+intel_bufferobj_flush_mapped_range(GLcontext *ctx, GLenum target,
+				   GLintptr offset, GLsizeiptr length,
+				   struct gl_buffer_object *obj)
+{
+   struct intel_context *intel = intel_context(ctx);
+   struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
+   drm_intel_bo *temp_bo;
+
+   /* Unless we're in the range map using a temporary system buffer,
+    * there's no work to do.
+    */
+   if (intel_obj->range_map_buffer == NULL)
+      return;
+
+   temp_bo = drm_intel_bo_alloc(intel->bufmgr, "range map flush", length, 64);
+
+   drm_intel_bo_subdata(temp_bo, 0, length, intel_obj->range_map_buffer);
+
+   intel_emit_linear_blit(intel,
+			  intel_obj->buffer, obj->Offset + offset,
+			  temp_bo, 0,
+			  length);
+
+   drm_intel_bo_unreference(temp_bo);
+}
+
 
 /**
- * Called via glMapBufferARB().
+ * Called via glUnmapBuffer().
  */
 static GLboolean
 intel_bufferobj_unmap(GLcontext * ctx,
                       GLenum target, struct gl_buffer_object *obj)
 {
+   struct intel_context *intel = intel_context(ctx);
    struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
 
    assert(intel_obj);
+   assert(obj->Pointer);
    if (intel_obj->sys_buffer != NULL) {
-      assert(obj->Pointer);
-      obj->Pointer = NULL;
+      /* always keep the mapping around. */
+   } else if (intel_obj->range_map_buffer != NULL) {
+      /* Since we've emitted some blits to buffers that will (likely) be used
+       * in rendering operations in other cache domains in this batch, emit a
+       * flush.  Once again, we wish for a domain tracker in libdrm to cover
+       * usage inside of a batchbuffer.
+       */
+      intel_batchbuffer_emit_mi_flush(intel->batch);
+      free(intel_obj->range_map_buffer);
+      intel_obj->range_map_buffer = NULL;
+   } else if (intel_obj->range_map_bo != NULL) {
+      if (intel_obj->mapped_gtt) {
+	 drm_intel_gem_bo_unmap_gtt(intel_obj->range_map_bo);
+      } else {
+	 drm_intel_bo_unmap(intel_obj->range_map_bo);
+      }
+
+      intel_emit_linear_blit(intel,
+			     intel_obj->buffer, obj->Offset,
+			     intel_obj->range_map_bo, 0,
+			     obj->Length);
+
+      /* Since we've emitted some blits to buffers that will (likely) be used
+       * in rendering operations in other cache domains in this batch, emit a
+       * flush.  Once again, we wish for a domain tracker in libdrm to cover
+       * usage inside of a batchbuffer.
+       */
+      intel_batchbuffer_emit_mi_flush(intel->batch);
+
+      drm_intel_bo_unreference(intel_obj->range_map_bo);
+      intel_obj->range_map_bo = NULL;
    } else if (intel_obj->buffer != NULL) {
-      assert(obj->Pointer);
       if (intel_obj->mapped_gtt) {
 	 drm_intel_gem_bo_unmap_gtt(intel_obj->buffer);
       } else {
 	 drm_intel_bo_unmap(intel_obj->buffer);
       }
-      obj->Pointer = NULL;
    }
+   obj->Pointer = NULL;
+   obj->Offset = 0;
+   obj->Length = 0;
+
    return GL_TRUE;
 }
 
@@ -294,30 +495,94 @@ intel_bufferobj_buffer(struct intel_context *intel,
    }
 
    if (intel_obj->buffer == NULL) {
+      void *sys_buffer = intel_obj->sys_buffer;
+
+      /* only one of buffer and sys_buffer could be non-NULL */
       intel_bufferobj_alloc_buffer(intel, intel_obj);
+      intel_obj->sys_buffer = NULL;
+
       intel_bufferobj_subdata(&intel->ctx,
 			      GL_ARRAY_BUFFER_ARB,
 			      0,
 			      intel_obj->Base.Size,
-			      intel_obj->sys_buffer,
+			      sys_buffer,
 			      &intel_obj->Base);
-      _mesa_free(intel_obj->sys_buffer);
+      _mesa_free(sys_buffer);
       intel_obj->sys_buffer = NULL;
    }
 
    return intel_obj->buffer;
 }
 
+static void
+intel_bufferobj_copy_subdata(GLcontext *ctx,
+			     struct gl_buffer_object *src,
+			     struct gl_buffer_object *dst,
+			     GLintptr read_offset, GLintptr write_offset,
+			     GLsizeiptr size)
+{
+   struct intel_context *intel = intel_context(ctx);
+   struct intel_buffer_object *intel_src = intel_buffer_object(src);
+   struct intel_buffer_object *intel_dst = intel_buffer_object(dst);
+   drm_intel_bo *src_bo, *dst_bo;
+
+   if (size == 0)
+      return;
+
+   /* If we're in system memory, just map and memcpy. */
+   if (intel_src->sys_buffer || intel_dst->sys_buffer) {
+      /* The same buffer may be used, but note that regions copied may
+       * not overlap.
+       */
+      if (src == dst) {
+	 char *ptr = intel_bufferobj_map(ctx, GL_COPY_WRITE_BUFFER,
+					 GL_READ_WRITE, dst);
+	 memcpy(ptr + write_offset, ptr + read_offset, size);
+	 intel_bufferobj_unmap(ctx, GL_COPY_WRITE_BUFFER, dst);
+      } else {
+	 const char *src_ptr;
+	 char *dst_ptr;
+
+	 src_ptr =  intel_bufferobj_map(ctx, GL_COPY_READ_BUFFER,
+					GL_READ_ONLY, src);
+	 dst_ptr =  intel_bufferobj_map(ctx, GL_COPY_WRITE_BUFFER,
+					GL_WRITE_ONLY, dst);
+
+	 memcpy(dst_ptr + write_offset, src_ptr + read_offset, size);
+
+	 intel_bufferobj_unmap(ctx, GL_COPY_READ_BUFFER, src);
+	 intel_bufferobj_unmap(ctx, GL_COPY_WRITE_BUFFER, dst);
+      }
+   }
+
+   /* Otherwise, we have real BOs, so blit them. */
+
+   dst_bo = intel_bufferobj_buffer(intel, intel_dst, INTEL_WRITE_PART);
+   src_bo = intel_bufferobj_buffer(intel, intel_src, INTEL_READ);
+
+   intel_emit_linear_blit(intel,
+			  dst_bo, write_offset,
+			  src_bo, read_offset, size);
+
+   /* Since we've emitted some blits to buffers that will (likely) be used
+    * in rendering operations in other cache domains in this batch, emit a
+    * flush.  Once again, we wish for a domain tracker in libdrm to cover
+    * usage inside of a batchbuffer.
+    */
+   intel_batchbuffer_emit_mi_flush(intel->batch);
+}
+
 void
-intel_bufferobj_init(struct intel_context *intel)
+intelInitBufferObjectFuncs(struct dd_function_table *functions)
 {
-   GLcontext *ctx = &intel->ctx;
-
-   ctx->Driver.NewBufferObject = intel_bufferobj_alloc;
-   ctx->Driver.DeleteBuffer = intel_bufferobj_free;
-   ctx->Driver.BufferData = intel_bufferobj_data;
-   ctx->Driver.BufferSubData = intel_bufferobj_subdata;
-   ctx->Driver.GetBufferSubData = intel_bufferobj_get_subdata;
-   ctx->Driver.MapBuffer = intel_bufferobj_map;
-   ctx->Driver.UnmapBuffer = intel_bufferobj_unmap;
+   functions->NewBufferObject = intel_bufferobj_alloc;
+   functions->DeleteBuffer = intel_bufferobj_free;
+   functions->BufferData = intel_bufferobj_data;
+   functions->BufferSubData = intel_bufferobj_subdata;
+   functions->GetBufferSubData = intel_bufferobj_get_subdata;
+   functions->MapBuffer = intel_bufferobj_map;
+   functions->MapBufferRange = intel_bufferobj_map_range;
+   functions->FlushMappedBufferRange = intel_bufferobj_flush_mapped_range;
+   functions->UnmapBuffer = intel_bufferobj_unmap;
+   functions->CopyBufferSubData = intel_bufferobj_copy_subdata;
 }
diff --git a/shared/intel_buffer_objects.h b/shared/intel_buffer_objects.h
index 0431015..bf3e08a 100644
--- a/shared/intel_buffer_objects.h
+++ b/shared/intel_buffer_objects.h
@@ -48,6 +48,12 @@ struct intel_buffer_object
    struct intel_region *region; /* Is there a zero-copy texture
                                    associated with this (pixel)
                                    buffer object? */
+
+   drm_intel_bo *range_map_bo;
+   void *range_map_buffer;
+   unsigned int range_map_offset;
+   GLsizei range_map_size;
+
    GLboolean mapped_gtt;
 };
 
@@ -60,7 +66,7 @@ dri_bo *intel_bufferobj_buffer(struct intel_context *intel,
 
 /* Hook the bufferobject implementation into mesa: 
  */
-void intel_bufferobj_init(struct intel_context *intel);
+void intelInitBufferObjectFuncs(struct dd_function_table *functions);
 
 
 
@@ -72,10 +78,7 @@ void intel_bufferobj_init(struct intel_context *intel);
 static INLINE struct intel_buffer_object *
 intel_buffer_object(struct gl_buffer_object *obj)
 {
-   if (obj->Name)
-      return (struct intel_buffer_object *) obj;
-   else
-      return NULL;
+   return (struct intel_buffer_object *) obj;
 }
 
 /* Helpers for zerocopy image uploads.  See also intel_regions.h:
diff --git a/shared/intel_buffers.c b/shared/intel_buffers.c
index d2fad9e..e7357e7 100644
--- a/shared/intel_buffers.c
+++ b/shared/intel_buffers.c
@@ -157,7 +157,7 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
    /* Do this here, not core Mesa, since this function is called from
     * many places within the driver.
     */
-   if (ctx->NewState & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
+   if (ctx->NewState & _NEW_BUFFERS) {
       /* this updates the DrawBuffer->_NumColorDrawBuffers fields, etc */
       _mesa_update_framebuffer(ctx);
       /* this updates the DrawBuffer's Width/Height if it's a FBO */
@@ -345,6 +345,23 @@ intelDrawBuffer(GLcontext * ctx, GLenum mode)
 static void
 intelReadBuffer(GLcontext * ctx, GLenum mode)
 {
+   if ((ctx->DrawBuffer != NULL) && (ctx->DrawBuffer->Name == 0)) {
+      struct intel_context *const intel = intel_context(ctx);
+      const GLboolean was_front_buffer_reading =
+	intel->is_front_buffer_reading;
+
+      intel->is_front_buffer_reading = (mode == GL_FRONT_LEFT)
+	|| (mode == GL_FRONT);
+
+      /* If we weren't front-buffer reading before but we are now, make sure
+       * that the front-buffer has actually been allocated.
+       */
+      if (!was_front_buffer_reading && intel->is_front_buffer_reading) {
+	 intel_update_renderbuffers(intel->driContext,
+				    intel->driContext->driDrawablePriv);
+      }
+   }
+
    if (ctx->ReadBuffer == ctx->DrawBuffer) {
       /* This will update FBO completeness status.
        * A framebuffer will be incomplete if the GL_READ_BUFFER setting
diff --git a/shared/intel_chipset.h b/shared/intel_chipset.h
index 4593d90..3dc8653 100644
--- a/shared/intel_chipset.h
+++ b/shared/intel_chipset.h
@@ -66,6 +66,10 @@
 #define PCI_CHIP_Q45_G                  0x2E12
 #define PCI_CHIP_G45_G                  0x2E22
 #define PCI_CHIP_G41_G                  0x2E32
+#define PCI_CHIP_B43_G                  0x2E42
+
+#define PCI_CHIP_ILD_G                  0x0042
+#define PCI_CHIP_ILM_G                  0x0046
 
 #define IS_MOBILE(devid)	(devid == PCI_CHIP_I855_GM || \
 				 devid == PCI_CHIP_I915_GM || \
@@ -73,15 +77,22 @@
 				 devid == PCI_CHIP_I945_GME || \
 				 devid == PCI_CHIP_I965_GM || \
 				 devid == PCI_CHIP_I965_GME || \
-				 devid == PCI_CHIP_GM45_GM || IS_IGD(devid))
+				 devid == PCI_CHIP_GM45_GM || \
+				 IS_IGD(devid) || \
+				 devid == PCI_CHIP_ILM_G)
 
 #define IS_G45(devid)           (devid == PCI_CHIP_IGD_E_G || \
                                  devid == PCI_CHIP_Q45_G || \
                                  devid == PCI_CHIP_G45_G || \
-                                 devid == PCI_CHIP_G41_G)
+                                 devid == PCI_CHIP_G41_G || \
+                                 devid == PCI_CHIP_B43_G)
 #define IS_GM45(devid)          (devid == PCI_CHIP_GM45_GM)
 #define IS_G4X(devid)		(IS_G45(devid) || IS_GM45(devid))
 
+#define IS_ILD(devid)           (devid == PCI_CHIP_ILD_G)
+#define IS_ILM(devid)           (devid == PCI_CHIP_ILM_G)
+#define IS_IGDNG(devid)           (IS_ILD(devid) || IS_ILM(devid))
+
 #define IS_915(devid)		(devid == PCI_CHIP_I915_G || \
 				 devid == PCI_CHIP_E7221_G || \
 				 devid == PCI_CHIP_I915_GM)
@@ -99,7 +110,8 @@
 				 devid == PCI_CHIP_I965_GM || \
 				 devid == PCI_CHIP_I965_GME || \
 				 devid == PCI_CHIP_I946_GZ || \
-				 IS_G4X(devid))
+				 IS_G4X(devid) || \
+				 IS_IGDNG(devid))
 
 #define IS_9XX(devid)		(IS_915(devid) || \
 				 IS_945(devid) || \
diff --git a/shared/intel_clear.c b/shared/intel_clear.c
index 19f4763..1b0e221 100644
--- a/shared/intel_clear.c
+++ b/shared/intel_clear.c
@@ -27,25 +27,9 @@
  **************************************************************************/
 
 #include "main/glheader.h"
-#include "main/enums.h"
-#include "main/image.h"
 #include "main/mtypes.h"
-#include "main/arrayobj.h"
-#include "main/attrib.h"
-#include "main/blend.h"
-#include "main/bufferobj.h"
-#include "main/buffers.h"
-#include "main/depth.h"
-#include "main/enable.h"
-#include "main/macros.h"
-#include "main/matrix.h"
-#include "main/polygon.h"
-#include "main/texstate.h"
-#include "main/shaders.h"
-#include "main/stencil.h"
-#include "main/varray.h"
-#include "glapi/dispatch.h"
 #include "swrast/swrast.h"
+#include "drivers/common/meta.h"
 
 #include "intel_context.h"
 #include "intel_blit.h"
@@ -53,240 +37,11 @@
 #include "intel_clear.h"
 #include "intel_fbo.h"
 #include "intel_pixel.h"
+#include "intel_regions.h"
+#include "intel_batchbuffer.h"
 
 #define FILE_DEBUG_FLAG DEBUG_BLIT
 
-#define TRI_CLEAR_COLOR_BITS (BUFFER_BIT_BACK_LEFT |			\
-			      BUFFER_BIT_FRONT_LEFT |			\
-			      BUFFER_BIT_COLOR0 |			\
-			      BUFFER_BIT_COLOR1 |			\
-			      BUFFER_BIT_COLOR2 |			\
-			      BUFFER_BIT_COLOR3 |			\
-			      BUFFER_BIT_COLOR4 |			\
-			      BUFFER_BIT_COLOR5 |			\
-			      BUFFER_BIT_COLOR6 |			\
-			      BUFFER_BIT_COLOR7)
-
-
-/**
- * Per-context one-time init of things for intl_clear_tris().
- * Basically set up a private array object for vertex/color arrays.
- */
-static void
-init_clear(GLcontext *ctx)
-{
-   struct intel_context *intel = intel_context(ctx);
-   struct gl_array_object *arraySave = NULL;
-   const GLuint arrayBuffer = ctx->Array.ArrayBufferObj->Name;
-   const GLuint elementBuffer = ctx->Array.ElementArrayBufferObj->Name;
-
-   /* create new array object */
-   intel->clear.arrayObj = _mesa_new_array_object(ctx, ~0);
-
-   /* save current array object, bind new one */
-   _mesa_reference_array_object(ctx, &arraySave, ctx->Array.ArrayObj);
-   _mesa_reference_array_object(ctx, &ctx->Array.ArrayObj, intel->clear.arrayObj);
-
-   /* one-time setup of vertex arrays (pos, color) */
-   _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
-   _mesa_BindBufferARB(GL_ELEMENT_ARRAY_BUFFER_ARB, 0);
-   _mesa_ColorPointer(4, GL_FLOAT, 4 * sizeof(GLfloat), intel->clear.color);
-   _mesa_VertexPointer(3, GL_FLOAT, 3 * sizeof(GLfloat), intel->clear.vertices);
-   _mesa_Enable(GL_COLOR_ARRAY);
-   _mesa_Enable(GL_VERTEX_ARRAY);
-
-   /* restore original array object */
-   _mesa_reference_array_object(ctx, &ctx->Array.ArrayObj, arraySave);
-   _mesa_reference_array_object(ctx, &arraySave, NULL);
-
-   /* restore original buffer objects */
-   _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, arrayBuffer);
-   _mesa_BindBufferARB(GL_ELEMENT_ARRAY_BUFFER_ARB, elementBuffer);
-}
-
-
-
-/**
- * Perform glClear where mask contains only color, depth, and/or stencil.
- *
- * The implementation is based on calling into Mesa to set GL state and
- * performing normal triangle rendering.  The intent of this path is to
- * have as generic a path as possible, so that any driver could make use of
- * it.
- */
-void
-intel_clear_tris(GLcontext *ctx, GLbitfield mask)
-{
-   struct intel_context *intel = intel_context(ctx);
-   GLfloat dst_z;
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-   int i;
-   GLboolean saved_fp_enable = GL_FALSE, saved_vp_enable = GL_FALSE;
-   GLuint saved_shader_program = 0;
-   unsigned int saved_active_texture;
-   struct gl_array_object *arraySave = NULL;
-
-   if (!intel->clear.arrayObj)
-      init_clear(ctx);
-
-   assert((mask & ~(TRI_CLEAR_COLOR_BITS | BUFFER_BIT_DEPTH |
-		    BUFFER_BIT_STENCIL)) == 0);
-
-   _mesa_PushAttrib(GL_COLOR_BUFFER_BIT |
-		    GL_CURRENT_BIT |
-		    GL_DEPTH_BUFFER_BIT |
-		    GL_ENABLE_BIT |
-		    GL_POLYGON_BIT |
-		    GL_STENCIL_BUFFER_BIT |
-		    GL_TRANSFORM_BIT |
-		    GL_CURRENT_BIT);
-   saved_active_texture = ctx->Texture.CurrentUnit;
-
-   /* Disable existing GL state we don't want to apply to a clear. */
-   _mesa_Disable(GL_ALPHA_TEST);
-   _mesa_Disable(GL_BLEND);
-   _mesa_Disable(GL_CULL_FACE);
-   _mesa_Disable(GL_FOG);
-   _mesa_Disable(GL_POLYGON_SMOOTH);
-   _mesa_Disable(GL_POLYGON_STIPPLE);
-   _mesa_Disable(GL_POLYGON_OFFSET_FILL);
-   _mesa_Disable(GL_LIGHTING);
-   _mesa_Disable(GL_CLIP_PLANE0);
-   _mesa_Disable(GL_CLIP_PLANE1);
-   _mesa_Disable(GL_CLIP_PLANE2);
-   _mesa_Disable(GL_CLIP_PLANE3);
-   _mesa_Disable(GL_CLIP_PLANE4);
-   _mesa_Disable(GL_CLIP_PLANE5);
-   _mesa_PolygonMode(GL_FRONT_AND_BACK, GL_FILL);
-   if (ctx->Extensions.ARB_fragment_program && ctx->FragmentProgram.Enabled) {
-      saved_fp_enable = GL_TRUE;
-      _mesa_Disable(GL_FRAGMENT_PROGRAM_ARB);
-   }
-   if (ctx->Extensions.ARB_vertex_program && ctx->VertexProgram.Enabled) {
-      saved_vp_enable = GL_TRUE;
-      _mesa_Disable(GL_VERTEX_PROGRAM_ARB);
-   }
-   if (ctx->Extensions.ARB_shader_objects && ctx->Shader.CurrentProgram) {
-      saved_shader_program = ctx->Shader.CurrentProgram->Name;
-      _mesa_UseProgramObjectARB(0);
-   }
-
-   if (ctx->Texture._EnabledUnits != 0) {
-      int i;
-
-      for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-	 _mesa_ActiveTextureARB(GL_TEXTURE0 + i);
-	 _mesa_Disable(GL_TEXTURE_1D);
-	 _mesa_Disable(GL_TEXTURE_2D);
-	 _mesa_Disable(GL_TEXTURE_3D);
-	 if (ctx->Extensions.ARB_texture_cube_map)
-	    _mesa_Disable(GL_TEXTURE_CUBE_MAP_ARB);
-	 if (ctx->Extensions.NV_texture_rectangle)
-	    _mesa_Disable(GL_TEXTURE_RECTANGLE_NV);
-	 if (ctx->Extensions.MESA_texture_array) {
-	    _mesa_Disable(GL_TEXTURE_1D_ARRAY_EXT);
-	    _mesa_Disable(GL_TEXTURE_2D_ARRAY_EXT);
-	 }
-      }
-   }
-
-   /* save current array object, bind our private one */
-   _mesa_reference_array_object(ctx, &arraySave, ctx->Array.ArrayObj);
-   _mesa_reference_array_object(ctx, &ctx->Array.ArrayObj, intel->clear.arrayObj);
-
-   intel_meta_set_passthrough_transform(intel);
-
-   for (i = 0; i < 4; i++) {
-      COPY_4FV(intel->clear.color[i], ctx->Color.ClearColor);
-   }
-
-   /* convert clear Z from [0,1] to NDC coord in [-1,1] */
-   dst_z = -1.0 + 2.0 * ctx->Depth.Clear;
-
-   /* Prepare the vertices, which are the same regardless of which buffer we're
-    * drawing to.
-    */
-   intel->clear.vertices[0][0] = fb->_Xmin;
-   intel->clear.vertices[0][1] = fb->_Ymin;
-   intel->clear.vertices[0][2] = dst_z;
-   intel->clear.vertices[1][0] = fb->_Xmax;
-   intel->clear.vertices[1][1] = fb->_Ymin;
-   intel->clear.vertices[1][2] = dst_z;
-   intel->clear.vertices[2][0] = fb->_Xmax;
-   intel->clear.vertices[2][1] = fb->_Ymax;
-   intel->clear.vertices[2][2] = dst_z;
-   intel->clear.vertices[3][0] = fb->_Xmin;
-   intel->clear.vertices[3][1] = fb->_Ymax;
-   intel->clear.vertices[3][2] = dst_z;
-
-   while (mask != 0) {
-      GLuint this_mask = 0;
-      GLuint color_bit;
-
-      color_bit = _mesa_ffs(mask & TRI_CLEAR_COLOR_BITS);
-      if (color_bit != 0)
-	 this_mask |= (1 << (color_bit - 1));
-
-      /* Clear depth/stencil in the same pass as color. */
-      this_mask |= (mask & (BUFFER_BIT_DEPTH | BUFFER_BIT_STENCIL));
-
-      /* Select the current color buffer and use the color write mask if
-       * we have one, otherwise don't write any color channels.
-       */
-      if (this_mask & BUFFER_BIT_FRONT_LEFT)
-	 _mesa_DrawBuffer(GL_FRONT_LEFT);
-      else if (this_mask & BUFFER_BIT_BACK_LEFT)
-	 _mesa_DrawBuffer(GL_BACK_LEFT);
-      else if (color_bit != 0)
-	 _mesa_DrawBuffer(GL_COLOR_ATTACHMENT0 +
-			  (color_bit - BUFFER_COLOR0 - 1));
-      else
-	 _mesa_ColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE);
-
-      /* Control writing of the depth clear value to depth. */
-      if (this_mask & BUFFER_BIT_DEPTH) {
-	 _mesa_DepthFunc(GL_ALWAYS);
-	 _mesa_Enable(GL_DEPTH_TEST);
-      } else {
-	 _mesa_Disable(GL_DEPTH_TEST);
-	 _mesa_DepthMask(GL_FALSE);
-      }
-
-      /* Control writing of the stencil clear value to stencil. */
-      if (this_mask & BUFFER_BIT_STENCIL) {
-	 _mesa_Enable(GL_STENCIL_TEST);
-	 _mesa_StencilOpSeparate(GL_FRONT_AND_BACK,
-				 GL_REPLACE, GL_REPLACE, GL_REPLACE);
-	 _mesa_StencilFuncSeparate(GL_FRONT_AND_BACK, GL_ALWAYS,
-				   ctx->Stencil.Clear,
-				   ctx->Stencil.WriteMask[0]);
-      } else {
-	 _mesa_Disable(GL_STENCIL_TEST);
-      }
-
-      _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
-
-      mask &= ~this_mask;
-   }
-
-   intel_meta_restore_transform(intel);
-
-   _mesa_ActiveTextureARB(GL_TEXTURE0 + saved_active_texture);
-   if (saved_fp_enable)
-      _mesa_Enable(GL_FRAGMENT_PROGRAM_ARB);
-   if (saved_vp_enable)
-      _mesa_Enable(GL_VERTEX_PROGRAM_ARB);
-
-   if (saved_shader_program)
-      _mesa_UseProgramObjectARB(saved_shader_program);
-
-   _mesa_PopAttrib();
-
-   /* restore current array object */
-   _mesa_reference_array_object(ctx, &ctx->Array.ArrayObj, arraySave);
-   _mesa_reference_array_object(ctx, &arraySave, NULL);
-}
-
 static const char *buffer_names[] = {
    [BUFFER_FRONT_LEFT] = "front",
    [BUFFER_BACK_LEFT] = "back",
@@ -340,7 +95,7 @@ intelClear(GLcontext *ctx, GLbitfield mask)
          = intel_get_rb_region(fb, BUFFER_STENCIL);
       if (stencilRegion) {
          /* have hw stencil */
-         if (IS_965(intel->intelScreen->deviceID) ||
+         if (stencilRegion->tiling == I915_TILING_Y ||
 	     (ctx->Stencil.WriteMask[0] & 0xff) != 0xff) {
 	    /* We have to use the 3D engine if we're clearing a partial mask
 	     * of the stencil buffer, or if we're on a 965 which has a tiled
@@ -357,9 +112,10 @@ intelClear(GLcontext *ctx, GLbitfield mask)
 
    /* HW depth */
    if (mask & BUFFER_BIT_DEPTH) {
+      const struct intel_region *irb = intel_get_rb_region(fb, BUFFER_DEPTH);
+
       /* clear depth with whatever method is used for stencil (see above) */
-      if (IS_965(intel->intelScreen->deviceID) ||
-	  tri_mask & BUFFER_BIT_STENCIL)
+      if (irb->tiling == I915_TILING_Y || tri_mask & BUFFER_BIT_STENCIL)
          tri_mask |= BUFFER_BIT_DEPTH;
       else
          blit_mask |= BUFFER_BIT_DEPTH;
@@ -369,7 +125,7 @@ intelClear(GLcontext *ctx, GLbitfield mask)
     * buffer with it.
     */
    if (mask & (BUFFER_BIT_DEPTH | BUFFER_BIT_STENCIL)) {
-      int color_bit = _mesa_ffs(mask & TRI_CLEAR_COLOR_BITS);
+      int color_bit = _mesa_ffs(mask & BUFFER_BITS_COLOR);
       if (color_bit != 0) {
 	 tri_mask |= blit_mask & (1 << (color_bit - 1));
 	 blit_mask &= ~(1 << (color_bit - 1));
@@ -379,14 +135,18 @@ intelClear(GLcontext *ctx, GLbitfield mask)
    /* SW fallback clearing */
    swrast_mask = mask & ~tri_mask & ~blit_mask;
 
-   for (i = 0; i < BUFFER_COUNT; i++) {
-      GLuint bufBit = 1 << i;
-      if ((blit_mask | tri_mask) & bufBit) {
+   {
+      /* look for non-Intel renderbuffers (clear them with swrast) */
+      GLbitfield blit_or_tri = blit_mask | tri_mask;
+      while (blit_or_tri) {
+         GLuint i = _mesa_ffs(blit_or_tri) - 1;
+         GLbitfield bufBit = 1 << i;
          if (!fb->Attachment[i].Renderbuffer->ClassID) {
             blit_mask &= ~bufBit;
             tri_mask &= ~bufBit;
             swrast_mask |= bufBit;
          }
+         blit_or_tri ^= bufBit;
       }
    }
 
@@ -411,7 +171,8 @@ intelClear(GLcontext *ctx, GLbitfield mask)
 	 }
 	 DBG("\n");
       }
-      intel_clear_tris(ctx, tri_mask);
+
+      _mesa_meta_clear(&intel->ctx, tri_mask);
    }
 
    if (swrast_mask) {
diff --git a/shared/intel_context.c b/shared/intel_context.c
index cfd983d..fce42e9 100644
--- a/shared/intel_context.c
+++ b/shared/intel_context.c
@@ -38,6 +38,7 @@
 #include "swrast_setup/swrast_setup.h"
 #include "tnl/tnl.h"
 #include "drivers/common/driverfuncs.h"
+#include "drivers/common/meta.h"
 
 #include "i830_dri.h"
 
@@ -161,6 +162,15 @@ intelGetString(GLcontext * ctx, GLenum name)
       case PCI_CHIP_G41_G:
          chipset = "Intel(R) G41";
          break;
+      case PCI_CHIP_B43_G:
+         chipset = "Intel(R) B43";
+         break;
+      case PCI_CHIP_ILD_G:
+         chipset = "Intel(R) IGDNG_D";
+         break;
+      case PCI_CHIP_ILM_G:
+         chipset = "Intel(R) IGDNG_M";
+         break;
       default:
          chipset = "Unknown Intel Chipset";
          break;
@@ -220,7 +230,9 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
       struct intel_renderbuffer *stencil_rb;
 
       i = 0;
-      if ((intel->is_front_buffer_rendering || !intel_fb->color_rb[1])
+      if ((intel->is_front_buffer_rendering ||
+	   intel->is_front_buffer_reading ||
+	   !intel_fb->color_rb[1])
 	   && intel_fb->color_rb[0]) {
 	 attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
 	 attachments[i++] = intel_bits_per_pixel(intel_fb->color_rb[0]);
@@ -396,7 +408,7 @@ intel_viewport(GLcontext *ctx, GLint x, GLint y, GLsizei w, GLsizei h)
     if (!driContext->driScreenPriv->dri2.enabled)
 	return;
 
-    if (!intel->internal_viewport_call && ctx->DrawBuffer->Name == 0) {
+    if (!intel->meta.internal_viewport_call && ctx->DrawBuffer->Name == 0) {
        /* If we're rendering to the fake front buffer, make sure all the pending
 	* drawing has landed on the real front buffer.  Otherwise when we
 	* eventually get to DRI2GetBuffersWithFormat the stale real front
@@ -495,7 +507,8 @@ intel_flush(GLcontext *ctx, GLboolean needs_mi_flush)
 
       if (screen->dri2.loader &&
           (screen->dri2.loader->base.version >= 2)
-	  && (screen->dri2.loader->flushFrontBuffer != NULL)) {
+	  && (screen->dri2.loader->flushFrontBuffer != NULL) &&
+          intel->driDrawable && intel->driDrawable->loaderPrivate) {
 	 (*screen->dri2.loader->flushFrontBuffer)(intel->driDrawable,
 						  intel->driDrawable->loaderPrivate);
 
@@ -505,7 +518,7 @@ intel_flush(GLcontext *ctx, GLboolean needs_mi_flush)
 	  * each of N places that do rendering.  This has worse performances,
 	  * but it is much easier to get correct.
 	  */
-	 if (intel->is_front_buffer_rendering) {
+	 if (!intel->is_front_buffer_rendering) {
 	    intel->front_buffer_dirty = GL_FALSE;
 	 }
       }
@@ -521,7 +534,27 @@ intelFlush(GLcontext * ctx)
 static void
 intel_glFlush(GLcontext *ctx)
 {
+   struct intel_context *intel = intel_context(ctx);
+
    intel_flush(ctx, GL_TRUE);
+
+   /* We're using glFlush as an indicator that a frame is done, which is
+    * what DRI2 does before calling SwapBuffers (and means we should catch
+    * people doing front-buffer rendering, as well)..
+    *
+    * Wait for the swapbuffers before the one we just emitted, so we don't
+    * get too many swaps outstanding for apps that are GPU-heavy but not
+    * CPU-heavy.
+    *
+    * Unfortunately, we don't have a handle to the batch containing the swap,
+    * and getting our hands on that doesn't seem worth it, so we just us the
+    * first batch we emitted after the last swap.
+    */
+   if (intel->first_post_swapbuffers_batch != NULL) {
+      drm_intel_bo_wait_rendering(intel->first_post_swapbuffers_batch);
+      drm_intel_bo_unreference(intel->first_post_swapbuffers_batch);
+      intel->first_post_swapbuffers_batch = NULL;
+   }
 }
 
 void
@@ -561,10 +594,15 @@ intelInitDriverFunctions(struct dd_function_table *functions)
    functions->CopyConvolutionFilter2D = _swrast_CopyConvolutionFilter2D;
 
    intelInitTextureFuncs(functions);
+   intelInitTextureImageFuncs(functions);
+   intelInitTextureSubImageFuncs(functions);
+   intelInitTextureCopyImageFuncs(functions);
    intelInitStateFuncs(functions);
    intelInitClearFuncs(functions);
    intelInitBufferFuncs(functions);
    intelInitPixelFuncs(functions);
+   intelInitBufferObjectFuncs(functions);
+   intel_init_syncobj_functions(functions);
 }
 
 
@@ -607,6 +645,10 @@ intelInitContext(struct intel_context *intel,
       intel->maxBatchSize = BATCH_SZ;
 
    intel->bufmgr = intelScreen->bufmgr;
+
+   if (0) /* for debug */
+      drm_intel_bufmgr_set_debug(intel->bufmgr, 1);
+
    intel->ttm = intelScreen->ttm;
    if (intel->ttm) {
       int bo_reuse_mode;
@@ -660,7 +702,15 @@ intelInitContext(struct intel_context *intel,
     */
    _mesa_init_point(ctx);
 
+   meta_init_metaops(ctx, &intel->meta);
    ctx->Const.MaxColorAttachments = 4;  /* XXX FBO: review this */
+   if (IS_965(intelScreen->deviceID)) {
+      if (MAX_WIDTH > 8192)
+	 ctx->Const.MaxRenderbufferSize = 8192;
+   } else {
+      if (MAX_WIDTH > 2048)
+	 ctx->Const.MaxRenderbufferSize = 2048;
+   }
 
    /* Initialize the software rasterizer and helper modules. */
    _swrast_CreateContext(ctx);
@@ -672,6 +722,8 @@ intelInitContext(struct intel_context *intel,
    _swrast_allow_pixel_fog(ctx, GL_FALSE);
    _swrast_allow_vertex_fog(ctx, GL_TRUE);
 
+   _mesa_meta_init(ctx);
+
    intel->hw_stencil = mesaVis->stencilBits && mesaVis->depthBits == 24;
    intel->hw_stipple = 1;
 
@@ -718,7 +770,6 @@ intelInitContext(struct intel_context *intel,
 
    intel->batch = intel_batchbuffer_alloc(intel);
 
-   intel_bufferobj_init(intel);
    intel_fbo_init(intel);
 
    if (intel->ctx.Mesa_DXTn) {
@@ -728,6 +779,15 @@ intelInitContext(struct intel_context *intel,
    else if (driQueryOptionb(&intel->optionCache, "force_s3tc_enable")) {
       _mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
    }
+   intel->use_texture_tiling = driQueryOptionb(&intel->optionCache,
+					       "texture_tiling");
+   if (intel->use_texture_tiling &&
+       !intel->intelScreen->kernel_exec_fencing) {
+      fprintf(stderr, "No kernel support for execution fencing, "
+	      "disabling texture tiling\n");
+      intel->use_texture_tiling = GL_FALSE;
+   }
+   intel->use_early_z = driQueryOptionb(&intel->optionCache, "early_z");
 
    intel->prim.primitive = ~0;
 
@@ -767,8 +827,9 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
 
       INTEL_FIREVERTICES(intel);
 
-      if (intel->clear.arrayObj)
-         _mesa_delete_array_object(&intel->ctx, intel->clear.arrayObj);
+      _mesa_meta_free(&intel->ctx);
+
+      meta_destroy_metaops(&intel->meta);
 
       intel->vtbl.destroy(intel);
 
@@ -787,15 +848,68 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
       intel->prim.vb = NULL;
       dri_bo_unreference(intel->prim.vb_bo);
       intel->prim.vb_bo = NULL;
+      dri_bo_unreference(intel->first_post_swapbuffers_batch);
+      intel->first_post_swapbuffers_batch = NULL;
 
       if (release_texture_heaps) {
-         /* This share group is about to go away, free our private
-          * texture object data.
+         /* Nothing is currently done here to free texture heaps;
+          * but we're not using the texture heap utilities, so I
+          * rather think we shouldn't.  I've taken a look, and can't
+          * find any private texture data hanging around anywhere, but
+          * I'm not yet certain there isn't any at all...
           */
-         if (INTEL_DEBUG & DEBUG_TEXTURE)
+         /* if (INTEL_DEBUG & DEBUG_TEXTURE)
             fprintf(stderr, "do something to free texture heaps\n");
+          */
       }
 
+      /* XXX In intelMakeCurrent() below, the context's static regions are 
+       * referenced inside the frame buffer; it's listed as a hack,
+       * with a comment of "XXX FBO temporary fix-ups!", but
+       * as long as it's there, we should release the regions here.
+       * The do/while loop around the block is used to allow the
+       * "continue" statements inside the block to exit the block,
+       * to avoid many layers of "if" constructs.
+       */
+      do {
+         __DRIdrawablePrivate * driDrawPriv = intel->driDrawable;
+         struct intel_framebuffer *intel_fb;
+         struct intel_renderbuffer *irbDepth, *irbStencil;
+         if (!driDrawPriv) {
+            /* We're already detached from the drawable; exit this block. */
+            continue;
+         }
+         intel_fb = (struct intel_framebuffer *) driDrawPriv->driverPrivate;
+         if (!intel_fb) {
+            /* The frame buffer is already gone; exit this block. */
+            continue;
+         }
+         irbDepth = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
+         irbStencil = intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL);
+
+         /* If the regions of the frame buffer still match the regions
+          * of the context, release them.  If they've changed somehow,
+          * leave them alone.
+          */
+         if (intel_fb->color_rb[0] && intel_fb->color_rb[0]->region == intel->front_region) {
+	    intel_renderbuffer_set_region(intel_fb->color_rb[0], NULL);
+         }
+         if (intel_fb->color_rb[1] && intel_fb->color_rb[1]->region == intel->back_region) {
+	    intel_renderbuffer_set_region(intel_fb->color_rb[1], NULL);
+         }
+
+         if (irbDepth && irbDepth->region == intel->depth_region) {
+	    intel_renderbuffer_set_region(irbDepth, NULL);
+         }
+         /* Usually, the stencil buffer is the same as the depth buffer;
+          * but they're handled separately in MakeCurrent, so we'll
+          * handle them separately here.
+          */
+         if (irbStencil && irbStencil->region == intel->depth_region) {
+	    intel_renderbuffer_set_region(irbStencil, NULL);
+         }
+      } while (0);
+
       intel_region_release(&intel->front_region);
       intel_region_release(&intel->back_region);
       intel_region_release(&intel->depth_region);
@@ -804,12 +918,23 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
 
       /* free the Mesa context */
       _mesa_free_context_data(&intel->ctx);
+
+      FREE(intel);
+      driContextPriv->driverPrivate = NULL;
    }
 }
 
 GLboolean
 intelUnbindContext(__DRIcontextPrivate * driContextPriv)
 {
+   struct intel_context *intel =
+      (struct intel_context *) driContextPriv->driverPrivate;
+
+   /* Deassociate the context with the drawables.
+    */
+   intel->driDrawable = NULL;
+   intel->driReadDrawable = NULL;
+
    return GL_TRUE;
 }
 
@@ -832,7 +957,10 @@ intelMakeCurrent(__DRIcontextPrivate * driContextPriv,
           if (driDrawPriv != driReadPriv)
               intel_update_renderbuffers(driContextPriv, driReadPriv);
       } else {
-          /* XXX FBO temporary fix-ups! */
+          /* XXX FBO temporary fix-ups!  These are released in 
+           * intelDextroyContext(), above.  Changes here should be
+           * reflected there.
+           */
           /* if the renderbuffers don't have regions, init them from the context */
          struct intel_renderbuffer *irbDepth
             = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
@@ -917,7 +1045,6 @@ intelContendedLock(struct intel_context *intel, GLuint flags)
    int me = intel->hHWContext;
 
    drmGetLock(intel->driFd, intel->hHWContext, flags);
-   intel->locked = 1;
 
    if (INTEL_DEBUG & DEBUG_LOCK)
       _mesa_printf("%s - got contended lock\n", __progname);
@@ -974,9 +1101,12 @@ void LOCK_HARDWARE( struct intel_context *intel )
     struct intel_framebuffer *intel_fb = NULL;
     struct intel_renderbuffer *intel_rb = NULL;
 
-    _glthread_LOCK_MUTEX(lockMutex);
-    assert(!intel->locked);
-    intel->locked = 1;
+    intel->locked++;
+    if (intel->locked >= 2)
+       return;
+
+    if (!sPriv->dri2.enabled)
+       _glthread_LOCK_MUTEX(lockMutex);
 
     if (intel->driDrawable) {
        intel_fb = intel->driDrawable->driverPrivate;
@@ -1023,13 +1153,16 @@ void UNLOCK_HARDWARE( struct intel_context *intel )
 {
     __DRIscreen *sPriv = intel->driScreen;
 
-   intel->vtbl.note_unlock( intel );
-   intel->locked = 0;
+   intel->locked--;
+   if (intel->locked > 0)
+      return;
 
-   if (!sPriv->dri2.enabled)
-      DRM_UNLOCK(intel->driFd, intel->driHwLock, intel->hHWContext);
+   assert(intel->locked == 0);
 
-   _glthread_UNLOCK_MUTEX(lockMutex);
+   if (!sPriv->dri2.enabled) {
+      DRM_UNLOCK(intel->driFd, intel->driHwLock, intel->hHWContext);
+      _glthread_UNLOCK_MUTEX(lockMutex);
+   }
 
    if (INTEL_DEBUG & DEBUG_LOCK)
       _mesa_printf("%s - unlocked\n", __progname);
diff --git a/shared/intel_context.h b/shared/intel_context.h
index b5cf7e6..03e7cf3 100644
--- a/shared/intel_context.h
+++ b/shared/intel_context.h
@@ -33,6 +33,7 @@
 #include "main/mtypes.h"
 #include "main/mm.h"
 #include "texmem.h"
+#include "dri_metaops.h"
 #include "drm.h"
 #include "intel_bufmgr.h"
 
@@ -79,9 +80,19 @@ extern void intelFallback(struct intel_context *intel, GLuint bit,
 
 #define INTEL_MAX_FIXUP 64
 
+struct intel_sync_object {
+   struct gl_sync_object Base;
+
+   /** Batch associated with this sync object */
+   drm_intel_bo *bo;
+};
+
+/**
+ * intel_context is derived from Mesa's context class: GLcontext.
+ */
 struct intel_context
 {
-   GLcontext ctx;               /* the parent class */
+   GLcontext ctx;  /**< base class, must be first field */
 
    struct
    {
@@ -91,7 +102,6 @@ struct intel_context
       void (*new_batch) (struct intel_context * intel);
       void (*emit_invarient_state) (struct intel_context * intel);
       void (*note_fence) (struct intel_context *intel, GLuint fence);
-      void (*note_unlock) (struct intel_context *intel);
       void (*update_texture_state) (struct intel_context * intel);
 
       void (*render_start) (struct intel_context * intel);
@@ -158,19 +168,7 @@ struct intel_context
       void (*debug_batch)(struct intel_context *intel);
    } vtbl;
 
-   struct {
-      struct gl_fragment_program *bitmap_fp;
-      struct gl_vertex_program *passthrough_vp;
-
-      struct gl_fragment_program *saved_fp;
-      GLboolean saved_fp_enable;
-      struct gl_vertex_program *saved_vp;
-      GLboolean saved_vp_enable;
-
-      GLint saved_vp_x, saved_vp_y;
-      GLsizei saved_vp_width, saved_vp_height;
-      GLenum saved_matrix_mode;
-   } meta;
+   struct dri_metaops meta;
 
    GLint refcount;
    GLuint Fallback;
@@ -182,7 +180,6 @@ struct intel_context
    struct intel_region *front_region;
    struct intel_region *back_region;
    struct intel_region *depth_region;
-   GLboolean internal_viewport_call;
 
    /**
     * This value indicates that the kernel memory manager is being used
@@ -191,6 +188,7 @@ struct intel_context
    GLboolean ttm;
 
    struct intel_batchbuffer *batch;
+   drm_intel_bo *first_post_swapbuffers_batch;
    GLboolean no_batch_wrap;
    unsigned batch_id;
 
@@ -215,13 +213,6 @@ struct intel_context
    GLuint ClearColor565;
    GLuint ClearColor8888;
 
-   /* info for intel_clear_tris() */
-   struct
-   {
-      struct gl_array_object *arrayObj;
-      GLfloat vertices[4][3];
-      GLfloat color[4][4];
-   } clear;
 
    /* Offsets of fields within the current vertex:
     */
@@ -294,6 +285,17 @@ struct intel_context
     * easily.
     */
    GLboolean is_front_buffer_rendering;
+   /**
+    * Track whether front-buffer is the current read target.
+    *
+    * This is closely associated with is_front_buffer_rendering, but may
+    * be set separately.  The DRI2 fake front buffer must be referenced
+    * either way.
+    */
+   GLboolean is_front_buffer_reading;
+
+   GLboolean use_texture_tiling;
+   GLboolean use_early_z;
 
    drm_clip_rect_t fboRect;     /**< cliprect for FBO rendering */
 
@@ -316,7 +318,7 @@ struct intel_context
    __DRIdrawablePrivate *driReadDrawable;
    __DRIscreenPrivate *driScreen;
    intelScreenPrivate *intelScreen;
-   volatile struct drm_i915_sarea *sarea;
+   volatile drm_i915_sarea_t *sarea;
 
    GLuint lastStamp;
 
@@ -474,6 +476,8 @@ extern void intelFlush(GLcontext * ctx);
 
 extern void intelInitDriverFunctions(struct dd_function_table *functions);
 
+void intel_init_syncobj_functions(struct dd_function_table *functions);
+
 
 /* ================================================================
  * intel_state.c:
@@ -549,6 +553,9 @@ void intel_viewport(GLcontext * ctx, GLint x, GLint y,
 void intel_update_renderbuffers(__DRIcontext *context,
 				__DRIdrawable *drawable);
 
+void i915_set_buf_info_for_region(uint32_t *state, struct intel_region *region,
+				  uint32_t buffer_id);
+
 /*======================================================================
  * Inline conversion functions.  
  * These are better-typed than the macros used previously:
diff --git a/shared/intel_extensions.c b/shared/intel_extensions.c
index 9ec1b4e..2e61c55 100644
--- a/shared/intel_extensions.c
+++ b/shared/intel_extensions.c
@@ -30,10 +30,14 @@
 #include "intel_extensions.h"
 
 
+#define need_GL_ARB_copy_buffer
 #define need_GL_ARB_framebuffer_object
+#define need_GL_ARB_map_buffer_range
 #define need_GL_ARB_occlusion_query
 #define need_GL_ARB_point_parameters
 #define need_GL_ARB_shader_objects
+#define need_GL_ARB_sync
+#define need_GL_ARB_vertex_array_object
 #define need_GL_ARB_vertex_program
 #define need_GL_ARB_vertex_shader
 #define need_GL_ARB_window_pos
@@ -45,9 +49,12 @@
 #define need_GL_EXT_fog_coord
 #define need_GL_EXT_framebuffer_object
 #define need_GL_EXT_framebuffer_blit
+#define need_GL_EXT_gpu_program_parameters
 #define need_GL_EXT_point_parameters
+#define need_GL_EXT_provoking_vertex
 #define need_GL_EXT_secondary_color
 #define need_GL_EXT_stencil_two_side
+#define need_GL_APPLE_vertex_array_object
 #define need_GL_ATI_separate_stencil
 #define need_GL_ATI_envmap_bumpmap
 #define need_GL_NV_point_sprite
@@ -65,8 +72,13 @@
  * i965_dri.
  */
 static const struct dri_extension card_extensions[] = {
+   { "GL_ARB_copy_buffer",                GL_ARB_copy_buffer_functions },
+   { "GL_ARB_half_float_pixel",           NULL },
+   { "GL_ARB_map_buffer_range",           GL_ARB_map_buffer_range_functions },
    { "GL_ARB_multitexture",               NULL },
    { "GL_ARB_point_parameters",           GL_ARB_point_parameters_functions },
+   { "GL_ARB_point_sprite",               NULL },
+   { "GL_ARB_sync",                       GL_ARB_sync_functions },
    { "GL_ARB_texture_border_clamp",       NULL },
    { "GL_ARB_texture_cube_map",           NULL },
    { "GL_ARB_texture_env_add",            NULL },
@@ -75,6 +87,7 @@ static const struct dri_extension card_extensions[] = {
    { "GL_ARB_texture_env_dot3",           NULL },
    { "GL_ARB_texture_mirrored_repeat",    NULL },
    { "GL_ARB_texture_rectangle",          NULL },
+   { "GL_ARB_vertex_array_object",        GL_ARB_vertex_array_object_functions},
    { "GL_ARB_vertex_program",             GL_ARB_vertex_program_functions },
    { "GL_ARB_window_pos",                 GL_ARB_window_pos_functions },
    { "GL_EXT_blend_color",                GL_EXT_blend_color_functions },
@@ -85,7 +98,9 @@ static const struct dri_extension card_extensions[] = {
    { "GL_EXT_blend_subtract",             NULL },
    { "GL_EXT_cull_vertex",                GL_EXT_cull_vertex_functions },
    { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
+   { "GL_EXT_gpu_program_parameters",     GL_EXT_gpu_program_parameters_functions },
    { "GL_EXT_packed_depth_stencil",       NULL },
+   { "GL_EXT_provoking_vertex",           GL_EXT_provoking_vertex_functions },
    { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
    { "GL_EXT_stencil_wrap",               NULL },
    { "GL_EXT_texture_edge_clamp",         NULL },
@@ -95,6 +110,7 @@ static const struct dri_extension card_extensions[] = {
    { "GL_EXT_texture_lod_bias",           NULL },
    { "GL_3DFX_texture_compression_FXT1",  NULL },
    { "GL_APPLE_client_storage",           NULL },
+   { "GL_APPLE_vertex_array_object",      GL_APPLE_vertex_array_object_functions},
    { "GL_MESA_pack_invert",               NULL },
    { "GL_MESA_ycbcr_texture",             NULL },
    { "GL_NV_blend_square",                NULL },
@@ -112,8 +128,10 @@ static const struct dri_extension i915_extensions[] = {
    { "GL_ARB_fragment_program",           NULL },
    { "GL_ARB_shadow",                     NULL },
    { "GL_ARB_texture_non_power_of_two",   NULL },
+   { "GL_ATI_separate_stencil",           GL_ATI_separate_stencil_functions },
    { "GL_ATI_texture_env_combine3",       NULL },
    { "GL_EXT_shadow_funcs",               NULL },
+   { "GL_EXT_stencil_two_side",           GL_EXT_stencil_two_side_functions },
    { "GL_NV_texture_env_combine4",        NULL },
    { NULL,                                NULL }
 };
@@ -128,6 +146,7 @@ static const struct dri_extension brw_extensions[] = {
    { "GL_ARB_framebuffer_object",         GL_ARB_framebuffer_object_functions},
    { "GL_ARB_occlusion_query",            GL_ARB_occlusion_query_functions },
    { "GL_ARB_point_sprite", 		  NULL },
+   { "GL_ARB_seamless_cube_map",          NULL },
    { "GL_ARB_shader_objects",             GL_ARB_shader_objects_functions },
    { "GL_ARB_shading_language_100",       GL_VERSION_2_0_functions },
    { "GL_ARB_shading_language_120",       GL_VERSION_2_1_functions },
diff --git a/shared/intel_fbo.c b/shared/intel_fbo.c
index 30f58b1..804c034 100644
--- a/shared/intel_fbo.c
+++ b/shared/intel_fbo.c
@@ -35,6 +35,7 @@
 #include "main/context.h"
 #include "main/texformat.h"
 #include "main/texrender.h"
+#include "drivers/common/meta.h"
 
 #include "intel_context.h"
 #include "intel_buffers.h"
@@ -217,7 +218,8 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       DBG("Allocating %d x %d Intel RBO (pitch %d)\n", width,
 	  height, pitch);
 
-      irb->region = intel_region_alloc(intel, cpp, width, height, pitch,
+      irb->region = intel_region_alloc(intel, I915_TILING_NONE,
+				       cpp, width, height, pitch,
 				       GL_TRUE);
       if (!irb->region)
          return GL_FALSE;       /* out of memory? */
@@ -575,9 +577,10 @@ intel_render_texture(GLcontext * ctx,
 
    ASSERT(newImage);
 
-   if (newImage->Border != 0) {
-      /* Fallback on drawing to a texture with a border, which won't have a
-       * miptree.
+   intel_image = intel_texture_image(newImage);
+   if (!intel_image->mt) {
+      /* Fallback on drawing to a texture that doesn't have a miptree
+       * (has a border, width/height 0, etc.)
        */
       _mesa_reference_renderbuffer(&att->Renderbuffer, NULL);
       _mesa_render_texture(ctx, fb, att);
@@ -608,7 +611,6 @@ intel_render_texture(GLcontext * ctx,
        irb->Base.RefCount);
 
    /* point the renderbufer's region to the texture image region */
-   intel_image = intel_texture_image(newImage);
    if (irb->region != intel_image->mt->region) {
       if (irb->region)
 	 intel_region_release(&irb->region);
@@ -680,6 +682,11 @@ intel_validate_framebuffer(GLcontext *ctx, struct gl_framebuffer *fb)
       if (rb == NULL)
 	 continue;
 
+      if (irb == NULL) {
+	 fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
+	 continue;
+      }
+
       switch (irb->texformat->MesaFormat) {
       case MESA_FORMAT_ARGB8888:
       case MESA_FORMAT_RGB565:
@@ -694,74 +701,6 @@ intel_validate_framebuffer(GLcontext *ctx, struct gl_framebuffer *fb)
 
 
 /**
- * Called from glBlitFramebuffer().
- * For now, we're doing an approximation with glCopyPixels().
- * XXX we need to bypass all the per-fragment operations, except scissor.
- */
-static void
-intel_blit_framebuffer(GLcontext *ctx,
-                       GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
-                       GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
-                       GLbitfield mask, GLenum filter)
-{
-   const GLfloat xZoomSave = ctx->Pixel.ZoomX;
-   const GLfloat yZoomSave = ctx->Pixel.ZoomY;
-   GLsizei width, height;
-   GLfloat xFlip = 1.0F, yFlip = 1.0F;
-
-   if (srcX1 < srcX0) {
-      GLint tmp = srcX1;
-      srcX1 = srcX0;
-      srcX0 = tmp;
-      xFlip = -1.0F;
-   }
-
-   if (srcY1 < srcY0) {
-      GLint tmp = srcY1;
-      srcY1 = srcY0;
-      srcY0 = tmp;
-      yFlip = -1.0F;
-   }
-
-   width = srcX1 - srcX0;
-   height = srcY1 - srcY0;
-
-   ctx->Pixel.ZoomX = xFlip * (dstX1 - dstX0) / (srcX1 - srcY0);
-   ctx->Pixel.ZoomY = yFlip * (dstY1 - dstY0) / (srcY1 - srcY0);
-
-   if (ctx->Pixel.ZoomX < 0.0F) {
-      dstX0 = MAX2(dstX0, dstX1);
-   }
-   else {
-      dstX0 = MIN2(dstX0, dstX1);
-   }
-
-   if (ctx->Pixel.ZoomY < 0.0F) {
-      dstY0 = MAX2(dstY0, dstY1);
-   }
-   else {
-      dstY0 = MIN2(dstY0, dstY1);
-   }
-
-   if (mask & GL_COLOR_BUFFER_BIT) {
-      ctx->Driver.CopyPixels(ctx, srcX0, srcY0, width, height,
-                             dstX0, dstY0, GL_COLOR);
-   }
-   if (mask & GL_DEPTH_BUFFER_BIT) {
-      ctx->Driver.CopyPixels(ctx, srcX0, srcY0, width, height,
-                             dstX0, dstY0, GL_DEPTH);
-   }
-   if (mask & GL_STENCIL_BUFFER_BIT) {
-      ctx->Driver.CopyPixels(ctx, srcX0, srcY0, width, height,
-                             dstX0, dstY0, GL_STENCIL);
-   }
-      
-   ctx->Pixel.ZoomX = xZoomSave;
-   ctx->Pixel.ZoomY = yZoomSave;
-}
-
-
-/**
  * Do one-time context initializations related to GL_EXT_framebuffer_object.
  * Hook in device driver functions.
  */
@@ -776,5 +715,5 @@ intel_fbo_init(struct intel_context *intel)
    intel->ctx.Driver.FinishRenderTexture = intel_finish_render_texture;
    intel->ctx.Driver.ResizeBuffers = intel_resize_buffers;
    intel->ctx.Driver.ValidateFramebuffer = intel_validate_framebuffer;
-   intel->ctx.Driver.BlitFramebuffer = intel_blit_framebuffer;
+   intel->ctx.Driver.BlitFramebuffer = _mesa_meta_blit_framebuffer;
 }
diff --git a/shared/intel_generatemipmap.c b/shared/intel_generatemipmap.c
new file mode 100644
index 0000000..12059e1
--- /dev/null
+++ b/shared/intel_generatemipmap.c
@@ -0,0 +1,304 @@
+/*
+ * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
+ * Copyright © 2009 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+#include "main/glheader.h"
+#include "main/enums.h"
+#include "main/image.h"
+#include "main/mtypes.h"
+#include "main/macros.h"
+#include "main/bufferobj.h"
+#include "main/teximage.h"
+#include "main/texenv.h"
+#include "main/texobj.h"
+#include "main/texstate.h"
+#include "main/texparam.h"
+#include "main/varray.h"
+#include "main/attrib.h"
+#include "main/enable.h"
+#include "main/buffers.h"
+#include "main/fbobject.h"
+#include "main/framebuffer.h"
+#include "main/renderbuffer.h"
+#include "main/depth.h"
+#include "main/hash.h"
+#include "main/mipmap.h"
+#include "main/blend.h"
+#include "glapi/dispatch.h"
+#include "swrast/swrast.h"
+
+#include "intel_screen.h"
+#include "intel_context.h"
+#include "intel_batchbuffer.h"
+#include "intel_pixel.h"
+#include "intel_tex.h"
+#include "intel_mipmap_tree.h"
+
+static const char *intel_fp_tex2d =
+      "!!ARBfp1.0\n"
+      "TEX result.color, fragment.texcoord[0], texture[0], 2D;\n"
+      "END\n";
+
+static GLboolean
+intel_generate_mipmap_level(GLcontext *ctx, GLuint tex_name,
+			    int level, int width, int height)
+{
+   struct intel_context *intel = intel_context(ctx);
+   GLfloat vertices[4][2];
+   GLint status;
+
+   /* Set to source from the previous level */
+   _mesa_TexParameterf(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, level - 1);
+   _mesa_TexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, level - 1);
+
+   /* Set to draw into the current level */
+   _mesa_FramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT,
+				 GL_COLOR_ATTACHMENT0_EXT,
+				 GL_TEXTURE_2D,
+				 tex_name,
+				 level);
+   /* Choose to render to the color attachment. */
+   _mesa_DrawBuffer(GL_COLOR_ATTACHMENT0_EXT);
+
+   status = _mesa_CheckFramebufferStatusEXT (GL_FRAMEBUFFER_EXT);
+   if (status != GL_FRAMEBUFFER_COMPLETE_EXT)
+      return GL_FALSE;
+
+   meta_set_passthrough_transform(&intel->meta);
+
+   /* XXX: Doing it right would involve setting up the transformation to do
+    * 0-1 mapping or something, and not changing the vertex data.
+    */
+   vertices[0][0] = 0;
+   vertices[0][1] = 0;
+   vertices[1][0] = width;
+   vertices[1][1] = 0;
+   vertices[2][0] = width;
+   vertices[2][1] = height;
+   vertices[3][0] = 0;
+   vertices[3][1] = height;
+
+   _mesa_VertexPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &vertices);
+   _mesa_Enable(GL_VERTEX_ARRAY);
+   meta_set_default_texrect(&intel->meta);
+
+   _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+   meta_restore_texcoords(&intel->meta);
+   meta_restore_transform(&intel->meta);
+
+   return GL_TRUE;
+}
+
+static GLboolean
+intel_generate_mipmap_2d(GLcontext *ctx,
+			 GLenum target,
+			 struct gl_texture_object *texObj)
+{
+   struct intel_context *intel = intel_context(ctx);
+   GLint old_active_texture;
+   int level, max_levels, start_level, end_level;
+   GLuint fb_name;
+   GLboolean success = GL_FALSE;
+   struct gl_framebuffer *saved_fbo = NULL;
+   struct gl_buffer_object *saved_array_buffer = NULL;
+   struct gl_buffer_object *saved_element_buffer = NULL;
+
+   _mesa_PushAttrib(GL_ENABLE_BIT | GL_TEXTURE_BIT |
+		    GL_CURRENT_BIT | GL_COLOR_BUFFER_BIT |
+		    GL_DEPTH_BUFFER_BIT);
+   _mesa_PushClientAttrib(GL_CLIENT_VERTEX_ARRAY_BIT);
+   old_active_texture = ctx->Texture.CurrentUnit;
+   _mesa_reference_framebuffer(&saved_fbo, ctx->DrawBuffer);
+
+   /* use default array/index buffers */
+   _mesa_reference_buffer_object(ctx, &saved_array_buffer,
+                                 ctx->Array.ArrayBufferObj);
+   _mesa_reference_buffer_object(ctx, &ctx->Array.ArrayBufferObj,
+                                 ctx->Shared->NullBufferObj);   
+   _mesa_reference_buffer_object(ctx, &saved_element_buffer,
+                                 ctx->Array.ElementArrayBufferObj);
+   _mesa_reference_buffer_object(ctx, &ctx->Array.ElementArrayBufferObj,
+                                 ctx->Shared->NullBufferObj);   
+
+   _mesa_Disable(GL_POLYGON_STIPPLE);
+   _mesa_Disable(GL_DEPTH_TEST);
+   _mesa_Disable(GL_STENCIL_TEST);
+   _mesa_ColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
+   _mesa_DepthMask(GL_FALSE);
+
+   /* Bind the given texture to GL_TEXTURE_2D with linear filtering for our
+    * minification.
+    */
+   _mesa_ActiveTextureARB(GL_TEXTURE0_ARB);
+   _mesa_Enable(GL_TEXTURE_2D);
+   _mesa_BindTexture(GL_TEXTURE_2D, texObj->Name);
+   _mesa_TexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER,
+		       GL_LINEAR_MIPMAP_NEAREST);
+   _mesa_TexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+
+   /* Bind the new renderbuffer to the color attachment point. */
+   _mesa_GenFramebuffersEXT(1, &fb_name);
+   _mesa_BindFramebufferEXT(GL_FRAMEBUFFER_EXT, fb_name);
+
+   meta_set_fragment_program(&intel->meta, &intel->meta.tex2d_fp,
+			     intel_fp_tex2d);
+   meta_set_passthrough_vertex_program(&intel->meta);
+
+   max_levels = _mesa_max_texture_levels(ctx, texObj->Target);
+   start_level = texObj->BaseLevel;
+   end_level = texObj->MaxLevel;
+
+   /* Loop generating level+1 from level. */
+   for (level = start_level; level < end_level && level < max_levels - 1; level++) {
+      const struct gl_texture_image *srcImage;
+      int width, height;
+
+      srcImage = _mesa_select_tex_image(ctx, texObj, target, level);
+      if (srcImage->Border != 0)
+	 goto fail;
+
+      width = srcImage->Width / 2;
+      if (width < 1)
+	 width = 1;
+      height = srcImage->Height / 2;
+      if (height < 1)
+	 height = 1;
+
+      if (width == srcImage->Width &&
+	  height == srcImage->Height) {
+	 /* Neither _mesa_max_texture_levels nor texObj->MaxLevel are the
+	  * maximum texture level for the object, so break out when we've gone
+	  * over the edge.
+	  */
+	 break;
+      }
+
+      /* Make sure that there's space allocated for the target level.
+       * We could skip this if there's already space allocated and save some
+       * time.
+       */
+      _mesa_TexImage2D(GL_TEXTURE_2D, level + 1, srcImage->InternalFormat,
+		       width, height, 0,
+		       GL_RGBA, GL_UNSIGNED_INT, NULL);
+
+      if (!intel_generate_mipmap_level(ctx, texObj->Name, level + 1,
+				       width, height))
+	 goto fail;
+   }
+
+   success = GL_TRUE;
+
+fail:
+   meta_restore_fragment_program(&intel->meta);
+   meta_restore_vertex_program(&intel->meta);
+
+   /* restore array/index buffers */
+   _mesa_reference_buffer_object(ctx, &ctx->Array.ArrayBufferObj,
+                                 saved_array_buffer);
+   _mesa_reference_buffer_object(ctx, &saved_array_buffer, NULL);
+   _mesa_reference_buffer_object(ctx, &ctx->Array.ElementArrayBufferObj,
+                                 saved_element_buffer);
+   _mesa_reference_buffer_object(ctx, &saved_element_buffer, NULL);
+
+
+   _mesa_DeleteFramebuffersEXT(1, &fb_name);
+   _mesa_ActiveTextureARB(GL_TEXTURE0_ARB + old_active_texture);
+   if (saved_fbo)
+      _mesa_BindFramebufferEXT(GL_FRAMEBUFFER_EXT, saved_fbo->Name);
+   _mesa_reference_framebuffer(&saved_fbo, NULL);
+   _mesa_PopClientAttrib();
+   _mesa_PopAttrib();
+
+   return success;
+}
+
+
+/**
+ * Generate new mipmap data from BASE+1 to BASE+p (the minimally-sized mipmap
+ * level).
+ *
+ * The texture object's miptree must be mapped.
+ *
+ * It would be really nice if this was just called by Mesa whenever mipmaps
+ * needed to be regenerated, rather than us having to remember to do so in
+ * each texture image modification path.
+ *
+ * This function should also include an accelerated path.
+ */
+void
+intel_generate_mipmap(GLcontext *ctx, GLenum target,
+                      struct gl_texture_object *texObj)
+{
+   struct intel_context *intel = intel_context(ctx);
+   struct intel_texture_object *intelObj = intel_texture_object(texObj);
+   GLuint nr_faces = (intelObj->base.Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
+   int face, i;
+
+   /* HW path */
+   if (target == GL_TEXTURE_2D &&
+       ctx->Extensions.EXT_framebuffer_object &&
+       ctx->Extensions.ARB_fragment_program &&
+       ctx->Extensions.ARB_vertex_program) {
+      GLboolean success;
+
+      /* We'll be accessing this texture using GL entrypoints, which should
+       * be resilient against other access to this texture.
+       */
+      _mesa_unlock_texture(ctx, texObj);
+      success = intel_generate_mipmap_2d(ctx, target, texObj);
+      _mesa_lock_texture(ctx, texObj);
+
+      if (success)
+	 return;
+   }
+
+   /* SW path */
+   intel_tex_map_level_images(intel, intelObj, texObj->BaseLevel);
+   _mesa_generate_mipmap(ctx, target, texObj);
+   intel_tex_unmap_level_images(intel, intelObj, texObj->BaseLevel);
+
+   /* Update the level information in our private data in the new images, since
+    * it didn't get set as part of a normal TexImage path.
+    */
+   for (face = 0; face < nr_faces; face++) {
+      for (i = texObj->BaseLevel + 1; i < texObj->MaxLevel; i++) {
+         struct intel_texture_image *intelImage;
+
+	 intelImage = intel_texture_image(texObj->Image[face][i]);
+	 if (intelImage == NULL)
+	    break;
+
+	 intelImage->level = i;
+	 intelImage->face = face;
+	 /* Unreference the miptree to signal that the new Data is a bare
+	  * pointer from mesa.
+	  */
+	 intel_miptree_release(intel, &intelImage->mt);
+      }
+   }
+}
diff --git a/shared/intel_mipmap_tree.c b/shared/intel_mipmap_tree.c
index 6e1e034..c985da5 100644
--- a/shared/intel_mipmap_tree.c
+++ b/shared/intel_mipmap_tree.c
@@ -57,14 +57,16 @@ intel_miptree_create_internal(struct intel_context *intel,
 			      GLuint last_level,
 			      GLuint width0,
 			      GLuint height0,
-			      GLuint depth0, GLuint cpp, GLuint compress_byte)
+			      GLuint depth0, GLuint cpp, GLuint compress_byte,
+			      uint32_t tiling)
 {
    GLboolean ok;
    struct intel_mipmap_tree *mt = calloc(sizeof(*mt), 1);
 
-   DBG("%s target %s format %s level %d..%d\n", __FUNCTION__,
+   DBG("%s target %s format %s level %d..%d <-- %p\n", __FUNCTION__,
        _mesa_lookup_enum_by_nr(target),
-       _mesa_lookup_enum_by_nr(internal_format), first_level, last_level);
+       _mesa_lookup_enum_by_nr(internal_format), 
+       first_level, last_level, mt);
 
    mt->target = target_to_target(target);
    mt->internal_format = internal_format;
@@ -80,15 +82,16 @@ intel_miptree_create_internal(struct intel_context *intel,
 
 #ifdef I915
    if (IS_945(intel->intelScreen->deviceID))
-      ok = i945_miptree_layout(intel, mt);
+      ok = i945_miptree_layout(intel, mt, tiling);
    else
-      ok = i915_miptree_layout(intel, mt);
+      ok = i915_miptree_layout(intel, mt, tiling);
 #else
-   ok = brw_miptree_layout(intel, mt);
+   ok = brw_miptree_layout(intel, mt, tiling);
 #endif
 
    if (!ok) {
       free(mt);
+      DBG("%s not okay - returning NULL\n", __FUNCTION__);
       return NULL;
    }
 
@@ -98,6 +101,7 @@ intel_miptree_create_internal(struct intel_context *intel,
 struct intel_mipmap_tree *
 intel_miptree_create(struct intel_context *intel,
 		     GLenum target,
+		     GLenum base_format,
 		     GLenum internal_format,
 		     GLuint first_level,
 		     GLuint last_level,
@@ -107,10 +111,23 @@ intel_miptree_create(struct intel_context *intel,
 		     GLboolean expect_accelerated_upload)
 {
    struct intel_mipmap_tree *mt;
+   uint32_t tiling;
+
+   if (intel->use_texture_tiling && compress_byte == 0 &&
+       intel->intelScreen->kernel_exec_fencing) {
+      if (IS_965(intel->intelScreen->deviceID) &&
+	  (base_format == GL_DEPTH_COMPONENT ||
+	   base_format == GL_DEPTH_STENCIL_EXT))
+	 tiling = I915_TILING_Y;
+      else
+	 tiling = I915_TILING_X;
+   } else
+      tiling = I915_TILING_NONE;
 
    mt = intel_miptree_create_internal(intel, target, internal_format,
 				      first_level, last_level, width0,
-				      height0, depth0, cpp, compress_byte);
+				      height0, depth0, cpp, compress_byte,
+				      tiling);
    /*
     * pitch == 0 || height == 0  indicates the null texture
     */
@@ -118,6 +135,7 @@ intel_miptree_create(struct intel_context *intel,
       return NULL;
 
    mt->region = intel_region_alloc(intel,
+				   tiling,
 				   mt->cpp,
 				   mt->pitch,
 				   mt->total_height,
@@ -147,7 +165,8 @@ intel_miptree_create_for_region(struct intel_context *intel,
    mt = intel_miptree_create_internal(intel, target, internal_format,
 				      first_level, last_level,
 				      region->width, region->height, 1,
-				      region->cpp, compress_byte);
+				      region->cpp, compress_byte,
+				      I915_TILING_NONE);
    if (!mt)
       return mt;
 #if 0
@@ -185,6 +204,7 @@ intel_miptree_create_for_region(struct intel_context *intel,
 
 int intel_miptree_pitch_align (struct intel_context *intel,
 			       struct intel_mipmap_tree *mt,
+			       uint32_t tiling,
 			       int pitch)
 {
 #ifdef I915
@@ -205,6 +225,11 @@ int intel_miptree_pitch_align (struct intel_context *intel,
 	 pitch_align = 4;
       }
 
+      if (tiling == I915_TILING_X)
+	 pitch_align = 512;
+      else if (tiling == I915_TILING_Y)
+	 pitch_align = 128;
+
       pitch = ALIGN(pitch * mt->cpp, pitch_align);
 
 #ifdef I915
@@ -328,23 +353,31 @@ intel_miptree_set_level_info(struct intel_mipmap_tree *mt,
 }
 
 
-
 void
-intel_miptree_set_image_offset(struct intel_mipmap_tree *mt,
-			       GLuint level, GLuint img,
-			       GLuint x, GLuint y)
+intel_miptree_set_image_offset_ex(struct intel_mipmap_tree *mt,
+                                  GLuint level, GLuint img,
+                                  GLuint x, GLuint y, 
+                                  GLuint offset)
 {
    if (img == 0 && level == 0)
       assert(x == 0 && y == 0);
 
    assert(img < mt->level[level].nr_images);
 
-   mt->level[level].image_offset[img] = (x + y * mt->pitch) * mt->cpp;
+   mt->level[level].image_offset[img] = (x + y * mt->pitch) * mt->cpp + offset;
 
    DBG("%s level %d img %d pos %d,%d image_offset %x\n",
        __FUNCTION__, level, img, x, y, mt->level[level].image_offset[img]);
 }
 
+void
+intel_miptree_set_image_offset(struct intel_mipmap_tree *mt,
+			       GLuint level, GLuint img,
+			       GLuint x, GLuint y)
+{
+    intel_miptree_set_image_offset_ex(mt, level, img, x, y, 0);
+}
+
 
 /* Although we use the image_offset[] array to store relative offsets
  * to cube faces, Mesa doesn't know anything about this and expects
@@ -454,11 +487,11 @@ intel_miptree_image_data(struct intel_context *intel,
 			0, 0,                             /* source x, y */
 			dst->level[level].width, height); /* width, height */
 
-      src += src_image_pitch * dst->cpp;
+      src = (char *)src + src_image_pitch * dst->cpp;
    }
 }
 
-extern GLuint intel_compressed_alignment(GLenum);
+extern void intel_get_texture_alignment_unit(GLenum, GLuint *, GLuint *);
 /* Copy mipmap image between trees
  */
 void
@@ -475,20 +508,37 @@ intel_miptree_image_copy(struct intel_context *intel,
    const GLuint *dst_depth_offset = intel_miptree_depth_offsets(dst, level);
    const GLuint *src_depth_offset = intel_miptree_depth_offsets(src, level);
    GLuint i;
+   GLboolean success;
 
    if (dst->compressed) {
-       GLuint alignment = intel_compressed_alignment(dst->internal_format);
+       GLuint align_w, align_h;
+
+       intel_get_texture_alignment_unit(dst->internal_format, &align_w, &align_h);
        height = (height + 3) / 4;
-       width = ((width + alignment - 1) & ~(alignment - 1));
+       width = ALIGN(width, align_w);
    }
 
    for (i = 0; i < depth; i++) {
-      intel_region_copy(intel,
-                        dst->region, dst_offset + dst_depth_offset[i],
-                        0,
-                        0,
-                        src->region, src_offset + src_depth_offset[i],
-                        0, 0, width, height);
+      success = intel_region_copy(intel,
+				  dst->region, dst_offset + dst_depth_offset[i],
+				  0, 0,
+				  src->region, src_offset + src_depth_offset[i],
+				  0, 0, width, height, GL_COPY);
+      if (!success) {
+	 GLubyte *src_ptr, *dst_ptr;
+
+	 src_ptr = intel_region_map(intel, src->region);
+	 dst_ptr = intel_region_map(intel, dst->region);
+
+	 _mesa_copy_rect(dst_ptr + dst_offset + dst_depth_offset[i],
+			 dst->cpp,
+			 dst->pitch,
+			 0, 0, width, height,
+			 src_ptr + src_offset + src_depth_offset[i],
+			 src->pitch,
+			 0, 0);
+	 intel_region_unmap(intel, src->region);
+	 intel_region_unmap(intel, dst->region);
+      }
    }
-
 }
diff --git a/shared/intel_mipmap_tree.h b/shared/intel_mipmap_tree.h
index 4060b9d..c890b2a 100644
--- a/shared/intel_mipmap_tree.h
+++ b/shared/intel_mipmap_tree.h
@@ -126,6 +126,7 @@ struct intel_mipmap_tree
 
 struct intel_mipmap_tree *intel_miptree_create(struct intel_context *intel,
                                                GLenum target,
+                                               GLenum base_format,
                                                GLenum internal_format,
                                                GLuint first_level,
                                                GLuint last_level,
@@ -148,6 +149,7 @@ intel_miptree_create_for_region(struct intel_context *intel,
 
 int intel_miptree_pitch_align (struct intel_context *intel,
 			       struct intel_mipmap_tree *mt,
+			       uint32_t tiling,
 			       int pitch);
 
 void intel_miptree_reference(struct intel_mipmap_tree **dst,
@@ -194,6 +196,11 @@ void intel_miptree_set_level_info(struct intel_mipmap_tree *mt,
                                   GLuint x, GLuint y,
                                   GLuint w, GLuint h, GLuint d);
 
+void intel_miptree_set_image_offset_ex(struct intel_mipmap_tree *mt,
+                                       GLuint level,
+                                       GLuint img, GLuint x, GLuint y,
+                                       GLuint offset);
+
 void intel_miptree_set_image_offset(struct intel_mipmap_tree *mt,
                                     GLuint level,
                                     GLuint img, GLuint x, GLuint y);
@@ -218,10 +225,13 @@ void intel_miptree_image_copy(struct intel_context *intel,
 /* i915_mipmap_tree.c:
  */
 GLboolean i915_miptree_layout(struct intel_context *intel,
-			      struct intel_mipmap_tree *mt);
+			      struct intel_mipmap_tree *mt,
+			      uint32_t tiling);
 GLboolean i945_miptree_layout(struct intel_context *intel,
-			      struct intel_mipmap_tree *mt);
+			      struct intel_mipmap_tree *mt,
+			      uint32_t tiling);
 GLboolean brw_miptree_layout(struct intel_context *intel,
-			     struct intel_mipmap_tree *mt);
+			     struct intel_mipmap_tree *mt,
+			     uint32_t tiling);
 
 #endif
diff --git a/shared/intel_pixel.c b/shared/intel_pixel.c
index fc0ac0b..a300141 100644
--- a/shared/intel_pixel.c
+++ b/shared/intel_pixel.c
@@ -27,9 +27,12 @@
 
 #include "main/enums.h"
 #include "main/state.h"
+#include "main/bufferobj.h"
 #include "main/context.h"
 #include "main/enable.h"
 #include "main/matrix.h"
+#include "main/texstate.h"
+#include "main/varray.h"
 #include "main/viewport.h"
 #include "swrast/swrast.h"
 #include "shader/arbprogram.h"
@@ -174,167 +177,6 @@ intel_check_blit_format(struct intel_region * region,
 }
 
 void
-intel_meta_set_passthrough_transform(struct intel_context *intel)
-{
-   GLcontext *ctx = &intel->ctx;
-
-   intel->meta.saved_vp_x = ctx->Viewport.X;
-   intel->meta.saved_vp_y = ctx->Viewport.Y;
-   intel->meta.saved_vp_width = ctx->Viewport.Width;
-   intel->meta.saved_vp_height = ctx->Viewport.Height;
-   intel->meta.saved_matrix_mode = ctx->Transform.MatrixMode;
-
-   intel->internal_viewport_call = GL_TRUE;
-   _mesa_Viewport(0, 0, ctx->DrawBuffer->Width, ctx->DrawBuffer->Height);
-   intel->internal_viewport_call = GL_FALSE;
-
-   _mesa_MatrixMode(GL_PROJECTION);
-   _mesa_PushMatrix();
-   _mesa_LoadIdentity();
-   _mesa_Ortho(0, ctx->DrawBuffer->Width, 0, ctx->DrawBuffer->Height, 1, -1);
-
-   _mesa_MatrixMode(GL_MODELVIEW);
-   _mesa_PushMatrix();
-   _mesa_LoadIdentity();
-}
-
-void
-intel_meta_restore_transform(struct intel_context *intel)
-{
-   _mesa_MatrixMode(GL_PROJECTION);
-   _mesa_PopMatrix();
-   _mesa_MatrixMode(GL_MODELVIEW);
-   _mesa_PopMatrix();
-
-   _mesa_MatrixMode(intel->meta.saved_matrix_mode);
-
-   intel->internal_viewport_call = GL_TRUE;
-   _mesa_Viewport(intel->meta.saved_vp_x, intel->meta.saved_vp_y,
-		  intel->meta.saved_vp_width, intel->meta.saved_vp_height);
-   intel->internal_viewport_call = GL_FALSE;
-}
-
-/**
- * Set up a vertex program to pass through the position and first texcoord
- * for pixel path.
- */
-void
-intel_meta_set_passthrough_vertex_program(struct intel_context *intel)
-{
-   GLcontext *ctx = &intel->ctx;
-   static const char *vp =
-      "!!ARBvp1.0\n"
-      "TEMP vertexClip;\n"
-      "DP4 vertexClip.x, state.matrix.mvp.row[0], vertex.position;\n"
-      "DP4 vertexClip.y, state.matrix.mvp.row[1], vertex.position;\n"
-      "DP4 vertexClip.z, state.matrix.mvp.row[2], vertex.position;\n"
-      "DP4 vertexClip.w, state.matrix.mvp.row[3], vertex.position;\n"
-      "MOV result.position, vertexClip;\n"
-      "MOV result.texcoord[0], vertex.texcoord[0];\n"
-      "MOV result.color, vertex.color;\n"
-      "END\n";
-
-   assert(intel->meta.saved_vp == NULL);
-
-   _mesa_reference_vertprog(ctx, &intel->meta.saved_vp,
-			    ctx->VertexProgram.Current);
-   if (intel->meta.passthrough_vp == NULL) {
-      GLuint prog_name;
-      _mesa_GenPrograms(1, &prog_name);
-      _mesa_BindProgram(GL_VERTEX_PROGRAM_ARB, prog_name);
-      _mesa_ProgramStringARB(GL_VERTEX_PROGRAM_ARB,
-			     GL_PROGRAM_FORMAT_ASCII_ARB,
-			     strlen(vp), (const GLubyte *)vp);
-      _mesa_reference_vertprog(ctx, &intel->meta.passthrough_vp,
-			       ctx->VertexProgram.Current);
-      _mesa_DeletePrograms(1, &prog_name);
-   }
-
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
-   _mesa_reference_vertprog(ctx, &ctx->VertexProgram.Current,
-			    intel->meta.passthrough_vp);
-   ctx->Driver.BindProgram(ctx, GL_VERTEX_PROGRAM_ARB,
-			   &intel->meta.passthrough_vp->Base);
-
-   intel->meta.saved_vp_enable = ctx->VertexProgram.Enabled;
-   _mesa_Enable(GL_VERTEX_PROGRAM_ARB);
-}
-
-/**
- * Restores the previous vertex program after
- * intel_meta_set_passthrough_vertex_program()
- */
-void
-intel_meta_restore_vertex_program(struct intel_context *intel)
-{
-   GLcontext *ctx = &intel->ctx;
-
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
-   _mesa_reference_vertprog(ctx, &ctx->VertexProgram.Current,
-			    intel->meta.saved_vp);
-   _mesa_reference_vertprog(ctx, &intel->meta.saved_vp, NULL);
-   ctx->Driver.BindProgram(ctx, GL_VERTEX_PROGRAM_ARB,
-			   &ctx->VertexProgram.Current->Base);
-
-   if (!intel->meta.saved_vp_enable)
-      _mesa_Disable(GL_VERTEX_PROGRAM_ARB);
-}
-
-/**
- * Binds the given program string to GL_FRAGMENT_PROGRAM_ARB, caching the
- * program object.
- */
-void
-intel_meta_set_fragment_program(struct intel_context *intel,
-				struct gl_fragment_program **prog,
-				const char *prog_string)
-{
-   GLcontext *ctx = &intel->ctx;
-   assert(intel->meta.saved_fp == NULL);
-
-   _mesa_reference_fragprog(ctx, &intel->meta.saved_fp,
-			    ctx->FragmentProgram.Current);
-   if (*prog == NULL) {
-      GLuint prog_name;
-      _mesa_GenPrograms(1, &prog_name);
-      _mesa_BindProgram(GL_FRAGMENT_PROGRAM_ARB, prog_name);
-      _mesa_ProgramStringARB(GL_FRAGMENT_PROGRAM_ARB,
-			     GL_PROGRAM_FORMAT_ASCII_ARB,
-			     strlen(prog_string), (const GLubyte *)prog_string);
-      _mesa_reference_fragprog(ctx, prog, ctx->FragmentProgram.Current);
-      /* Note that DeletePrograms unbinds the program on us */
-      _mesa_DeletePrograms(1, &prog_name);
-   }
-
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
-   _mesa_reference_fragprog(ctx, &ctx->FragmentProgram.Current, *prog);
-   ctx->Driver.BindProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, &((*prog)->Base));
-
-   intel->meta.saved_fp_enable = ctx->FragmentProgram.Enabled;
-   _mesa_Enable(GL_FRAGMENT_PROGRAM_ARB);
-}
-
-/**
- * Restores the previous fragment program after
- * intel_meta_set_fragment_program()
- */
-void
-intel_meta_restore_fragment_program(struct intel_context *intel)
-{
-   GLcontext *ctx = &intel->ctx;
-
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
-   _mesa_reference_fragprog(ctx, &ctx->FragmentProgram.Current,
-			    intel->meta.saved_fp);
-   _mesa_reference_fragprog(ctx, &intel->meta.saved_fp, NULL);
-   ctx->Driver.BindProgram(ctx, GL_FRAGMENT_PROGRAM_ARB,
-			   &ctx->FragmentProgram.Current->Base);
-
-   if (!intel->meta.saved_fp_enable)
-      _mesa_Disable(GL_FRAGMENT_PROGRAM_ARB);
-}
-
-void
 intelInitPixelFuncs(struct dd_function_table *functions)
 {
    functions->Accum = _swrast_Accum;
@@ -342,18 +184,7 @@ intelInitPixelFuncs(struct dd_function_table *functions)
       functions->Bitmap = intelBitmap;
       functions->CopyPixels = intelCopyPixels;
       functions->DrawPixels = intelDrawPixels;
-#ifdef I915
-      functions->ReadPixels = intelReadPixels;
-#endif
    }
-}
-
-void
-intel_free_pixel_state(struct intel_context *intel)
-{
-   GLcontext *ctx = &intel->ctx;
-
-   _mesa_reference_vertprog(ctx, &intel->meta.passthrough_vp, NULL);
-   _mesa_reference_fragprog(ctx, &intel->meta.bitmap_fp, NULL);
+   functions->ReadPixels = intelReadPixels;
 }
 
diff --git a/shared/intel_pixel.h b/shared/intel_pixel.h
index cb41fa1..96a6dd1 100644
--- a/shared/intel_pixel.h
+++ b/shared/intel_pixel.h
@@ -31,16 +31,6 @@
 #include "main/mtypes.h"
 
 void intelInitPixelFuncs(struct dd_function_table *functions);
-void intel_meta_set_passthrough_transform(struct intel_context *intel);
-void intel_meta_restore_transform(struct intel_context *intel);
-void intel_meta_set_passthrough_vertex_program(struct intel_context *intel);
-void intel_meta_restore_vertex_program(struct intel_context *intel);
-void intel_meta_set_fragment_program(struct intel_context *intel,
-				     struct gl_fragment_program **prog,
-				     const char *prog_string);
-void intel_meta_restore_fragment_program(struct intel_context *intel);
-void intel_free_pixel_state(struct intel_context *intel);
-
 GLboolean intel_check_blit_fragment_ops(GLcontext * ctx,
 					GLboolean src_alpha_is_one);
 
@@ -76,6 +66,4 @@ void intelBitmap(GLcontext * ctx,
 		 const struct gl_pixelstore_attrib *unpack,
 		 const GLubyte * pixels);
 
-void intel_clear_tris(GLcontext *ctx, GLbitfield mask);
-
 #endif
diff --git a/shared/intel_pixel_bitmap.c b/shared/intel_pixel_bitmap.c
index a2ccae1..b543a0b 100644
--- a/shared/intel_pixel_bitmap.c
+++ b/shared/intel_pixel_bitmap.c
@@ -42,6 +42,7 @@
 #include "main/varray.h"
 #include "main/attrib.h"
 #include "main/enable.h"
+#include "main/viewport.h"
 #include "shader/arbprogram.h"
 #include "glapi/dispatch.h"
 #include "swrast/swrast.h"
@@ -92,19 +93,12 @@ static const GLubyte *map_pbo( GLcontext *ctx,
    return ADD_POINTERS(buf, bitmap);
 }
 
-static GLboolean test_bit( const GLubyte *src,
-			    GLuint bit )
+static GLboolean test_bit( const GLubyte *src, GLuint bit )
 {
    return (src[bit/8] & (1<<(bit % 8))) ? 1 : 0;
 }
 
-static GLboolean test_msb_bit(const GLubyte *src, GLuint bit)
-{
-   return (src[bit/8] & (1<<(7 - (bit % 8)))) ? 1 : 0;
-}
-
-static void set_bit( GLubyte *dest,
-			  GLuint bit )
+static void set_bit( GLubyte *dest, GLuint bit )
 {
    dest[bit/8] |= 1 << (bit % 8);
 }
@@ -194,7 +188,7 @@ do_blit_bitmap( GLcontext *ctx,
    struct gl_framebuffer *fb = ctx->DrawBuffer;
    GLfloat tmpColor[4];
    GLubyte ubcolor[4];
-   GLuint color8888, color565;
+   GLuint color;
    unsigned int num_cliprects;
    drm_clip_rect_t *cliprects;
    int x_off, y_off;
@@ -232,8 +226,11 @@ do_blit_bitmap( GLcontext *ctx,
    UNCLAMPED_FLOAT_TO_UBYTE(ubcolor[2], tmpColor[2]);
    UNCLAMPED_FLOAT_TO_UBYTE(ubcolor[3], tmpColor[3]);
 
-   color8888 = INTEL_PACKCOLOR8888(ubcolor[0], ubcolor[1], ubcolor[2], ubcolor[3]);
-   color565 = INTEL_PACKCOLOR565(ubcolor[0], ubcolor[1], ubcolor[2]);
+   if (dst->cpp == 2)
+      color = INTEL_PACKCOLOR565(ubcolor[0], ubcolor[1], ubcolor[2]);
+   else
+      color = INTEL_PACKCOLOR8888(ubcolor[0], ubcolor[1],
+				  ubcolor[2], ubcolor[3]);
 
    if (!intel_check_blit_fragment_ops(ctx, tmpColor[3] == 1.0F))
       return GL_FALSE;
@@ -307,21 +304,21 @@ do_blit_bitmap( GLcontext *ctx,
 				   fb->Name == 0 ? GL_TRUE : GL_FALSE) == 0)
 		  continue;
 
-	       /* 
-		*/
-	       intelEmitImmediateColorExpandBlit( intel,
-						  dst->cpp,
-						  (GLubyte *)stipple, 
-						  sz,
-						  (dst->cpp == 2) ? color565 : color8888,
-						  dst->pitch,
-						  dst->buffer,
-						  0,
-						  dst->tiling,
-						  box_x + px,
-						  box_y + py,
-						  w, h,
-						  logic_op);
+	       if (!intelEmitImmediateColorExpandBlit(intel,
+						      dst->cpp,
+						      (GLubyte *)stipple,
+						      sz,
+						      color,
+						      dst->pitch,
+						      dst->buffer,
+						      0,
+						      dst->tiling,
+						      box_x + px,
+						      box_y + py,
+						      w, h,
+						      logic_op)) {
+		  return GL_FALSE;
+	       }
 	    } 
 	 } 
       }
@@ -360,11 +357,8 @@ intel_texture_bitmap(GLcontext * ctx,
       "END\n";
    GLuint texname;
    GLfloat vertices[4][4];
-   GLfloat texcoords[4][2];
    GLint old_active_texture;
-   GLubyte *unpacked_bitmap;
    GLubyte *a8_bitmap;
-   int x, y;
    GLfloat dst_z;
 
    /* We need a fragment program for the KIL effect */
@@ -409,6 +403,12 @@ intel_texture_bitmap(GLcontext * ctx,
       return GL_FALSE;
    }
 
+   if (ctx->Fog.Enabled) {
+      if (INTEL_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "glBitmap() fallback: fog\n");
+      return GL_FALSE;
+   }
+
    /* Check that we can load in a texture this big. */
    if (width > (1 << (ctx->Const.MaxTextureLevels - 1)) ||
        height > (1 << (ctx->Const.MaxTextureLevels - 1))) {
@@ -418,22 +418,16 @@ intel_texture_bitmap(GLcontext * ctx,
       return GL_FALSE;
    }
 
-   /* Convert the A1 bitmap to an A8 format suitable for glTexImage */
    if (unpack->BufferObj->Name) {
       bitmap = map_pbo(ctx, width, height, unpack, bitmap);
       if (bitmap == NULL)
 	 return GL_TRUE;	/* even though this is an error, we're done */
    }
-   unpacked_bitmap = _mesa_unpack_bitmap(width, height, bitmap,
-					 unpack);
+
+   /* Convert the A1 bitmap to an A8 format suitable for glTexImage */
    a8_bitmap = _mesa_calloc(width * height);
-   for (y = 0; y < height; y++) {
-      for (x = 0; x < width; x++) {
-	 if (test_msb_bit(unpacked_bitmap, ALIGN(width, 8) * y + x))
-	    a8_bitmap[y * width + x] = 0xff;
-      }
-   }
-   _mesa_free(unpacked_bitmap);
+   _mesa_expand_bitmap(width, height, unpack, bitmap, a8_bitmap, width, 0xff);
+
    if (unpack->BufferObj->Name) {
       /* done with PBO so unmap it now */
       ctx->Driver.UnmapBuffer(ctx, GL_PIXEL_UNPACK_BUFFER_EXT,
@@ -467,15 +461,18 @@ intel_texture_bitmap(GLcontext * ctx,
 		    GL_ALPHA, GL_UNSIGNED_BYTE, a8_bitmap);
    _mesa_free(a8_bitmap);
 
-   intel_meta_set_fragment_program(intel, &intel->meta.bitmap_fp, fp);
+   meta_set_fragment_program(&intel->meta, &intel->meta.bitmap_fp, fp);
    _mesa_ProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 0,
 				     ctx->Current.RasterColor);
-   intel_meta_set_passthrough_vertex_program(intel);
-   intel_meta_set_passthrough_transform(intel);
+   meta_set_passthrough_vertex_program(&intel->meta);
+   meta_set_passthrough_transform(&intel->meta);
 
    /* convert rasterpos Z from [0,1] to NDC coord in [-1,1] */
    dst_z = -1.0 + 2.0 * ctx->Current.RasterPos[2];
 
+   /* RasterPos[2] already takes into account the DepthRange mapping. */
+   _mesa_DepthRange(0.0, 1.0);
+
    vertices[0][0] = dst_x;
    vertices[0][1] = dst_y;
    vertices[0][2] = dst_z;
@@ -493,25 +490,15 @@ intel_texture_bitmap(GLcontext * ctx,
    vertices[3][2] = dst_z;
    vertices[3][3] = 1.0;
 
-   texcoords[0][0] = 0.0;
-   texcoords[0][1] = 0.0;
-   texcoords[1][0] = 1.0;
-   texcoords[1][1] = 0.0;
-   texcoords[2][0] = 1.0;
-   texcoords[2][1] = 1.0;
-   texcoords[3][0] = 0.0;
-   texcoords[3][1] = 1.0;
-
    _mesa_VertexPointer(4, GL_FLOAT, 4 * sizeof(GLfloat), &vertices);
-   _mesa_ClientActiveTextureARB(GL_TEXTURE0);
-   _mesa_TexCoordPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &texcoords);
    _mesa_Enable(GL_VERTEX_ARRAY);
-   _mesa_Enable(GL_TEXTURE_COORD_ARRAY);
+   meta_set_default_texrect(&intel->meta);
    _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
 
-   intel_meta_restore_transform(intel);
-   intel_meta_restore_fragment_program(intel);
-   intel_meta_restore_vertex_program(intel);
+   meta_restore_texcoords(&intel->meta);
+   meta_restore_transform(&intel->meta);
+   meta_restore_fragment_program(&intel->meta);
+   meta_restore_vertex_program(&intel->meta);
 
    _mesa_PopClientAttrib();
    _mesa_Disable(GL_TEXTURE_2D); /* asserted that it was disabled at entry */
diff --git a/shared/intel_pixel_copy.c b/shared/intel_pixel_copy.c
index d50dd68..07ca8f7 100644
--- a/shared/intel_pixel_copy.c
+++ b/shared/intel_pixel_copy.c
@@ -26,18 +26,13 @@
  **************************************************************************/
 
 #include "main/glheader.h"
-#include "main/enums.h"
 #include "main/image.h"
 #include "main/state.h"
 #include "main/mtypes.h"
-#include "main/macros.h"
-#include "swrast/swrast.h"
+#include "drivers/common/meta.h"
 
-#include "intel_screen.h"
 #include "intel_context.h"
-#include "intel_batchbuffer.h"
 #include "intel_buffers.h"
-#include "intel_blit.h"
 #include "intel_regions.h"
 #include "intel_pixel.h"
 
@@ -97,162 +92,6 @@ intel_check_copypixel_blit_fragment_ops(GLcontext * ctx)
 	    ctx->Color.BlendEnabled);
 }
 
-#ifdef I915
-/* Doesn't work for overlapping regions.  Could do a double copy or
- * just fallback.
- */
-static GLboolean
-do_texture_copypixels(GLcontext * ctx,
-                      GLint srcx, GLint srcy,
-                      GLsizei width, GLsizei height,
-                      GLint dstx, GLint dsty, GLenum type)
-{
-   struct intel_context *intel = intel_context(ctx);
-   struct intel_region *dst = intel_drawbuf_region(intel);
-   struct intel_region *src = copypix_src_region(intel, type);
-   GLenum src_format;
-   GLenum src_type;
-
-   DBG("%s %d,%d %dx%d --> %d,%d\n", __FUNCTION__, 
-       srcx, srcy, width, height, dstx, dsty);
-
-   if (!src || !dst || type != GL_COLOR)
-      return GL_FALSE;
-
-   if (ctx->_ImageTransferState) {
-      if (INTEL_DEBUG & DEBUG_PIXEL)
-         fprintf(stderr, "%s: check_color failed\n", __FUNCTION__);
-      return GL_FALSE;
-   }
-
-   /* Can't handle overlapping regions.  Don't have sufficient control
-    * over rasterization to pull it off in-place.  Punt on these for
-    * now.
-    * 
-    * XXX: do a copy to a temporary. 
-    */
-   if (src->buffer == dst->buffer) {
-      drm_clip_rect_t srcbox;
-      drm_clip_rect_t dstbox;
-      drm_clip_rect_t tmp;
-
-      srcbox.x1 = srcx;
-      srcbox.y1 = srcy;
-      srcbox.x2 = srcx + width;
-      srcbox.y2 = srcy + height;
-
-      if (ctx->Pixel.ZoomX > 0) {
-	 dstbox.x1 = dstx;
-	 dstbox.x2 = dstx + width * ctx->Pixel.ZoomX;
-      } else {
-	 dstbox.x1 = dstx + width * ctx->Pixel.ZoomX;
-	 dstbox.x2 = dstx;
-      }
-      if (ctx->Pixel.ZoomY > 0) {
-	 dstbox.y1 = dsty;
-	 dstbox.y2 = dsty + height * ctx->Pixel.ZoomY;
-      } else {
-	 dstbox.y1 = dsty + height * ctx->Pixel.ZoomY;
-	 dstbox.y2 = dsty;
-      }
-
-      DBG("src %d,%d %d,%d\n", srcbox.x1, srcbox.y1, srcbox.x2, srcbox.y2);
-      DBG("dst %d,%d %d,%d (%dx%d) (%f,%f)\n", dstbox.x1, dstbox.y1, dstbox.x2, dstbox.y2,
-	  width, height, ctx->Pixel.ZoomX, ctx->Pixel.ZoomY);
-
-      if (intel_intersect_cliprects(&tmp, &srcbox, &dstbox)) {
-         DBG("%s: regions overlap\n", __FUNCTION__);
-         return GL_FALSE;
-      }
-   }
-
-   intelFlush(&intel->ctx);
-
-   intel->vtbl.install_meta_state(intel);
-
-   /* Is this true?  Also will need to turn depth testing on according
-    * to state:
-    */
-   intel->vtbl.meta_no_stencil_write(intel);
-   intel->vtbl.meta_no_depth_write(intel);
-
-   /* Set the 3d engine to draw into the destination region:
-    */
-   intel->vtbl.meta_draw_region(intel, dst, intel->depth_region);
-
-   intel->vtbl.meta_import_pixel_state(intel);
-
-   if (src->cpp == 2) {
-      src_format = GL_RGB;
-      src_type = GL_UNSIGNED_SHORT_5_6_5;
-   }
-   else {
-      src_format = GL_BGRA;
-      src_type = GL_UNSIGNED_BYTE;
-   }
-
-   /* Set the frontbuffer up as a large rectangular texture.
-    */
-   if (!intel->vtbl.meta_tex_rect_source(intel, src->buffer, 0,
-                                         src->pitch,
-                                         src->height, src_format, src_type)) {
-      intel->vtbl.leave_meta_state(intel);
-      return GL_FALSE;
-   }
-
-
-   intel->vtbl.meta_texture_blend_replace(intel);
-
-   LOCK_HARDWARE(intel);
-
-   if (intel->driDrawable->numClipRects) {
-      __DRIdrawablePrivate *dPriv = intel->driDrawable;
-
-
-      srcy = dPriv->h - srcy - height;  /* convert from gl to hardware coords */
-
-      srcx += dPriv->x;
-      srcy += dPriv->y;
-
-      /* Clip against the source region.  This is the only source
-       * clipping we do.  XXX: Just set the texcord wrap mode to clamp
-       * or similar.
-       *
-       */
-      if (0) {
-         GLint orig_x = srcx;
-         GLint orig_y = srcy;
-
-         if (!_mesa_clip_to_region(0, 0, src->pitch, src->height,
-                                   &srcx, &srcy, &width, &height))
-            goto out;
-
-         dstx += srcx - orig_x;
-         dsty += (srcy - orig_y) * ctx->Pixel.ZoomY;
-      }
-
-      /* Just use the regular cliprect mechanism...  Does this need to
-       * even hold the lock???
-       */
-      intel->vtbl.meta_draw_quad(intel,
-				 dstx,
-				 dstx + width * ctx->Pixel.ZoomX,
-				 dPriv->h - (dsty + height * ctx->Pixel.ZoomY),
-				 dPriv->h - (dsty), 0, /* XXX: what z value? */
-				 0x00ff00ff,
-				 srcx, srcx + width, srcy, srcy + height);
-
-    out:
-      intel->vtbl.leave_meta_state(intel);
-      intel_batchbuffer_emit_mi_flush(intel->batch);
-   }
-   UNLOCK_HARDWARE(intel);
-
-   DBG("%s: success\n", __FUNCTION__);
-   return GL_TRUE;
-}
-#endif /* I915 */
-
 
 /**
  * CopyPixels with the blitter.  Don't support zooming, pixel transfer, etc.
@@ -272,6 +111,12 @@ do_blit_copypixels(GLcontext * ctx,
    drm_clip_rect_t *cliprects;
    int x_off, y_off;
 
+   if (type == GL_DEPTH || type == GL_STENCIL) {
+      if (INTEL_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "glCopyPixels() fallback: GL_DEPTH || GL_STENCIL\n");
+      return GL_FALSE;
+   }
+
    /* Update draw buffer bounds */
    _mesa_update_state(ctx);
 
@@ -362,14 +207,16 @@ do_blit_copypixels(GLcontext * ctx,
 				   &clip_x, &clip_y, &clip_w, &clip_h))
             continue;
 
-         intelEmitCopyBlit(intel, dst->cpp,
-			   src->pitch, src->buffer, 0, src->tiling,
-			   dst->pitch, dst->buffer, 0, dst->tiling,
-			   clip_x + delta_x, clip_y + delta_y, /* srcx, srcy */
-			   clip_x, clip_y, /* dstx, dsty */
-			   clip_w, clip_h,
-			   ctx->Color.ColorLogicOpEnabled ?
-			   ctx->Color.LogicOp : GL_COPY);
+	 if (!intel_region_copy(intel,
+				dst, 0, clip_x, clip_y,
+				src, 0, clip_x + delta_x, clip_y + delta_y,
+				clip_w, clip_h,
+				ctx->Color.ColorLogicOpEnabled ?
+				ctx->Color.LogicOp : GL_COPY)) {
+	    DBG("%s: blit failure\n", __FUNCTION__);
+	    UNLOCK_HARDWARE(intel);
+	    return GL_FALSE;
+	 }
       }
    }
 out:
@@ -392,12 +239,6 @@ intelCopyPixels(GLcontext * ctx,
    if (do_blit_copypixels(ctx, srcx, srcy, width, height, destx, desty, type))
       return;
 
-#ifdef I915
-   if (do_texture_copypixels(ctx, srcx, srcy, width, height, destx, desty, type))
-      return;
-#endif
-
-   DBG("fallback to _swrast_CopyPixels\n");
-
-   _swrast_CopyPixels(ctx, srcx, srcy, width, height, destx, desty, type);
+   /* this will use swrast if needed */
+   _mesa_meta_copy_pixels(ctx, srcx, srcy, width, height, destx, desty, type);
 }
diff --git a/shared/intel_pixel_draw.c b/shared/intel_pixel_draw.c
index d80069d..7fbb89f 100644
--- a/shared/intel_pixel_draw.c
+++ b/shared/intel_pixel_draw.c
@@ -29,8 +29,6 @@
 #include "main/enums.h"
 #include "main/image.h"
 #include "main/mtypes.h"
-#include "main/macros.h"
-#include "main/bufferobj.h"
 #include "main/teximage.h"
 #include "main/texenv.h"
 #include "main/texobj.h"
@@ -41,169 +39,22 @@
 #include "main/enable.h"
 #include "main/buffers.h"
 #include "main/fbobject.h"
-#include "main/renderbuffer.h"
 #include "main/depth.h"
 #include "main/hash.h"
 #include "main/blend.h"
-#include "glapi/dispatch.h"
 #include "swrast/swrast.h"
+#include "drivers/common/meta.h"
 
-#include "intel_screen.h"
 #include "intel_context.h"
 #include "intel_batchbuffer.h"
 #include "intel_blit.h"
 #include "intel_buffers.h"
 #include "intel_regions.h"
 #include "intel_pixel.h"
-#include "intel_buffer_objects.h"
 #include "intel_fbo.h"
 
-static GLboolean
-intel_texture_drawpixels(GLcontext * ctx,
-			 GLint x, GLint y,
-			 GLsizei width, GLsizei height,
-			 GLenum format,
-			 GLenum type,
-			 const struct gl_pixelstore_attrib *unpack,
-			 const GLvoid *pixels)
-{
-   struct intel_context *intel = intel_context(ctx);
-   GLuint texname;
-   GLfloat vertices[4][4];
-   GLfloat texcoords[4][2];
-   GLfloat z;
-   GLint old_active_texture;
-   GLenum internalFormat;
-
-   /* We're going to mess with texturing with no regard to existing texture
-    * state, so if there is some set up we have to bail.
-    */
-   if (ctx->Texture._EnabledUnits != 0) {
-      if (INTEL_DEBUG & DEBUG_FALLBACKS)
-	 fprintf(stderr, "glDrawPixels() fallback: texturing enabled\n");
-      return GL_FALSE;
-   }
-
-   /* Can't do textured DrawPixels with a fragment program, unless we were
-    * to generate a new program that sampled our texture and put the results
-    * in the fragment color before the user's program started.
-    */
-   if (ctx->FragmentProgram.Enabled) {
-      if (INTEL_DEBUG & DEBUG_FALLBACKS)
-	 fprintf(stderr, "glDrawPixels() fallback: fragment program enabled\n");
-      return GL_FALSE;
-   }
-
-   /* We don't have a way to generate fragments with stencil values which
-    * will set the resulting stencil value.
-    */
-   if (format == GL_STENCIL_INDEX || format == GL_DEPTH_STENCIL)
-      return GL_FALSE;
-
-   /* Check that we can load in a texture this big. */
-   if (width > (1 << (ctx->Const.MaxTextureLevels - 1)) ||
-       height > (1 << (ctx->Const.MaxTextureLevels - 1))) {
-      if (INTEL_DEBUG & DEBUG_FALLBACKS)
-	 fprintf(stderr, "glDrawPixels() fallback: bitmap too large (%dx%d)\n",
-		 width, height);
-      return GL_FALSE;
-   }
-
-   /* To do DEPTH_COMPONENT, we would need to change our setup to not draw to
-    * the color buffer, and sample the texture values into the fragment depth
-    * in a program.
-    */
-   if (format == GL_DEPTH_COMPONENT) {
-      if (INTEL_DEBUG & DEBUG_FALLBACKS)
-	 fprintf(stderr,
-		 "glDrawPixels() fallback: format == GL_DEPTH_COMPONENT\n");
-      return GL_FALSE;
-   }
-
-   if (!ctx->Extensions.ARB_texture_non_power_of_two &&
-       (!is_power_of_two(width) || !is_power_of_two(height))) {
-      if (INTEL_DEBUG & DEBUG_FALLBACKS)
-	 fprintf(stderr,
-		 "glDrawPixels() fallback: NPOT texture\n");
-      return GL_FALSE;
-   }
-
-   _mesa_PushAttrib(GL_ENABLE_BIT | GL_TEXTURE_BIT |
-		    GL_CURRENT_BIT);
-   _mesa_PushClientAttrib(GL_CLIENT_VERTEX_ARRAY_BIT);
-
-   /* XXX: pixel store stuff */
-   _mesa_Disable(GL_POLYGON_STIPPLE);
-
-   old_active_texture = ctx->Texture.CurrentUnit;
-   _mesa_ActiveTextureARB(GL_TEXTURE0_ARB);
-   _mesa_Enable(GL_TEXTURE_2D);
-   _mesa_GenTextures(1, &texname);
-   _mesa_BindTexture(GL_TEXTURE_2D, texname);
-   _mesa_TexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-   _mesa_TexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-   _mesa_TexEnvf(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
-   if (type == GL_ALPHA)
-      internalFormat = GL_ALPHA;
-   else
-      internalFormat = GL_RGBA;
-   _mesa_TexImage2D(GL_TEXTURE_2D, 0, internalFormat, width, height, 0, format,
-		    type, pixels);
-
-   intel_meta_set_passthrough_transform(intel);
-
-   /* convert rasterpos Z from [0,1] to NDC coord in [-1,1] */
-   z = -1.0 + 2.0 * ctx->Current.RasterPos[2];
-
-   /* Create the vertex buffer based on the current raster pos.  The x and y
-    * we're handed are ctx->Current.RasterPos[0,1] rounded to integers.
-    * We also apply the depth.  However, the W component is already multiplied
-    * into ctx->Current.RasterPos[0,1,2] and we can ignore it at this point.
-    */
-   vertices[0][0] = x;
-   vertices[0][1] = y;
-   vertices[0][2] = z;
-   vertices[0][3] = 1.0;
-   vertices[1][0] = x + width * ctx->Pixel.ZoomX;
-   vertices[1][1] = y;
-   vertices[1][2] = z;
-   vertices[1][3] = 1.0;
-   vertices[2][0] = x + width * ctx->Pixel.ZoomX;
-   vertices[2][1] = y + height * ctx->Pixel.ZoomY;
-   vertices[2][2] = z;
-   vertices[2][3] = 1.0;
-   vertices[3][0] = x;
-   vertices[3][1] = y + height * ctx->Pixel.ZoomY;
-   vertices[3][2] = z;
-   vertices[3][3] = 1.0;
-
-   texcoords[0][0] = 0.0;
-   texcoords[0][1] = 0.0;
-   texcoords[1][0] = 1.0;
-   texcoords[1][1] = 0.0;
-   texcoords[2][0] = 1.0;
-   texcoords[2][1] = 1.0;
-   texcoords[3][0] = 0.0;
-   texcoords[3][1] = 1.0;
-
-   _mesa_VertexPointer(4, GL_FLOAT, 4 * sizeof(GLfloat), &vertices);
-   _mesa_ClientActiveTextureARB(GL_TEXTURE0);
-   _mesa_TexCoordPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &texcoords);
-   _mesa_Enable(GL_VERTEX_ARRAY);
-   _mesa_Enable(GL_TEXTURE_COORD_ARRAY);
-   _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
-
-   intel_meta_restore_transform(intel);
-
-   _mesa_ActiveTextureARB(GL_TEXTURE0_ARB + old_active_texture);
-   _mesa_PopClientAttrib();
-   _mesa_PopAttrib();
-
-   _mesa_DeleteTextures(1, &texname);
-
-   return GL_TRUE;
-}
 
+/** XXX compare perf of this vs. _mesa_meta_draw_pixels(STENCIL) */
 static GLboolean
 intel_stencil_drawpixels(GLcontext * ctx,
 			 GLint x, GLint y,
@@ -216,14 +67,14 @@ intel_stencil_drawpixels(GLcontext * ctx,
    struct intel_context *intel = intel_context(ctx);
    GLuint texname, rb_name, fb_name, old_fb_name;
    GLfloat vertices[4][2];
-   GLfloat texcoords[4][2];
    struct intel_renderbuffer *irb;
    struct intel_renderbuffer *depth_irb;
    struct gl_renderbuffer *rb;
    struct gl_pixelstore_attrib old_unpack;
    GLstencil *stencil_pixels;
-   int row;
+   int row, y1, y2;
    GLint old_active_texture;
+   GLboolean rendering_to_fbo = ctx->DrawBuffer->Name != 0;
 
    if (format != GL_STENCIL_INDEX)
       return GL_FALSE;
@@ -358,34 +209,35 @@ intel_stencil_drawpixels(GLcontext * ctx,
    ctx->Unpack = old_unpack;
    _mesa_free(stencil_pixels);
 
-   intel_meta_set_passthrough_transform(intel);
+   meta_set_passthrough_transform(&intel->meta);
 
+   /* Since we're rendering to the framebuffer as if it was an FBO,
+    * if it's the window system we have to flip the coordinates.
+    */
+   if (rendering_to_fbo) {
+      y1 = y;
+      y2 = y + height * ctx->Pixel.ZoomY;
+   } else {
+      y1 = irb->Base.Height - (y + height * ctx->Pixel.ZoomY);
+      y2 = irb->Base.Height - y;
+   }
    vertices[0][0] = x;
-   vertices[0][1] = y;
+   vertices[0][1] = y1;
    vertices[1][0] = x + width * ctx->Pixel.ZoomX;
-   vertices[1][1] = y;
+   vertices[1][1] = y1;
    vertices[2][0] = x + width * ctx->Pixel.ZoomX;
-   vertices[2][1] = y + height * ctx->Pixel.ZoomY;
+   vertices[2][1] = y2;
    vertices[3][0] = x;
-   vertices[3][1] = y + height * ctx->Pixel.ZoomY;
-
-   texcoords[0][0] = 0.0;
-   texcoords[0][1] = 0.0;
-   texcoords[1][0] = 1.0;
-   texcoords[1][1] = 0.0;
-   texcoords[2][0] = 1.0;
-   texcoords[2][1] = 1.0;
-   texcoords[3][0] = 0.0;
-   texcoords[3][1] = 1.0;
+   vertices[3][1] = y2;
 
    _mesa_VertexPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &vertices);
-   _mesa_ClientActiveTextureARB(GL_TEXTURE0);
-   _mesa_TexCoordPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &texcoords);
    _mesa_Enable(GL_VERTEX_ARRAY);
-   _mesa_Enable(GL_TEXTURE_COORD_ARRAY);
+   meta_set_default_texrect(&intel->meta);
+
    _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
 
-   intel_meta_restore_transform(intel);
+   meta_restore_texcoords(&intel->meta);
+   meta_restore_transform(&intel->meta);
 
    _mesa_ActiveTextureARB(GL_TEXTURE0_ARB + old_active_texture);
    _mesa_BindFramebufferEXT(GL_FRAMEBUFFER_EXT, old_fb_name);
@@ -409,17 +261,25 @@ intelDrawPixels(GLcontext * ctx,
                 const struct gl_pixelstore_attrib *unpack,
                 const GLvoid * pixels)
 {
-   if (intel_texture_drawpixels(ctx, x, y, width, height, format, type,
-				unpack, pixels))
-      return;
-
+#if 0
+   /* XXX this function doesn't seem to work reliably even when all
+    * the pre-requisite conditions are met.
+    * Note that this function is never hit with conform.
+    * Fall back to swrast because even the _mesa_meta_draw_pixels() approach
+    * isn't working because of an apparent stencil bug.
+    */
    if (intel_stencil_drawpixels(ctx, x, y, width, height, format, type,
 				unpack, pixels))
       return;
+#else
+   (void) intel_stencil_drawpixels; /* silence warning */
+   if (format == GL_STENCIL_INDEX) {
+      _swrast_DrawPixels(ctx, x, y, width, height, format, type,
+                         unpack, pixels);
+      return;
+   }
+#endif
 
-   if (INTEL_DEBUG & DEBUG_PIXEL)
-      _mesa_printf("%s: fallback to swrast\n", __FUNCTION__);
-
-   _swrast_DrawPixels(ctx, x, y, width, height, format, type,
-		      unpack, pixels);
+   _mesa_meta_draw_pixels(ctx, x, y, width, height, format, type,
+                          unpack, pixels);
 }
diff --git a/i915/intel_pixel_read.c b/shared/intel_pixel_read.c
index 56087aa..8713463 100644
--- a/i915/intel_pixel_read.c
+++ b/shared/intel_pixel_read.c
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,7 +22,7 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 #include "main/glheader.h"
@@ -31,6 +31,7 @@
 #include "main/macros.h"
 #include "main/image.h"
 #include "main/bufferobj.h"
+#include "main/state.h"
 #include "swrast/swrast.h"
 
 #include "intel_screen.h"
@@ -45,10 +46,10 @@
 /* For many applications, the new ability to pull the source buffers
  * back out of the GTT and then do the packing/conversion operations
  * in software will be as much of an improvement as trying to get the
- * blitter and/or texture engine to do the work. 
+ * blitter and/or texture engine to do the work.
  *
  * This step is gated on private backbuffers.
- * 
+ *
  * Obviously the frontbuffer can't be pulled back, so that is either
  * an argument for blit/texture readpixels, or for blitting to a
  * temporary and then pulling that back.
@@ -179,7 +180,7 @@ do_blit_readpixels(GLcontext * ctx,
    if (!src)
       return GL_FALSE;
 
-   if (dst) {
+   if (pack->BufferObj->Name) {
       /* XXX This validation should be done by core mesa:
        */
       if (!_mesa_validate_pbo_access(2, pack, width, height, 1,
@@ -260,16 +261,19 @@ do_blit_readpixels(GLcontext * ctx,
          if (!intel_intersect_cliprects(&rect, &src_rect, &box[i]))
             continue;
 
-         intelEmitCopyBlit(intel,
-                           src->cpp,
-                           src->pitch, src->buffer, 0, src->tiling,
-                           rowLength, dst_buffer, dst_offset, GL_FALSE,
-                           rect.x1,
-                           rect.y1,
-                           rect.x1 - src_rect.x1,
-                           rect.y2 - src_rect.y2,
-                           rect.x2 - rect.x1, rect.y2 - rect.y1,
-			   GL_COPY);
+         if (!intelEmitCopyBlit(intel,
+				src->cpp,
+				src->pitch, src->buffer, 0, src->tiling,
+				rowLength, dst_buffer, dst_offset, GL_FALSE,
+				rect.x1,
+				rect.y1,
+				rect.x1 - src_rect.x1,
+				rect.y2 - src_rect.y2,
+				rect.x2 - rect.x1, rect.y2 - rect.y1,
+				GL_COPY)) {
+	    UNLOCK_HARDWARE(intel);
+	    return GL_FALSE;
+	 }
       }
    }
    UNLOCK_HARDWARE(intel);
@@ -291,6 +295,7 @@ intelReadPixels(GLcontext * ctx,
 
    intelFlush(ctx);
 
+#ifdef I915
    if (do_blit_readpixels
        (ctx, x, y, width, height, format, type, pack, pixels))
       return;
@@ -298,9 +303,22 @@ intelReadPixels(GLcontext * ctx,
    if (do_texture_readpixels
        (ctx, x, y, width, height, format, type, pack, pixels))
       return;
+#else
+   (void)do_blit_readpixels;
+   (void)do_texture_readpixels;
+#endif
 
    if (INTEL_DEBUG & DEBUG_PIXEL)
       _mesa_printf("%s: fallback to swrast\n", __FUNCTION__);
 
+   /* Update Mesa state before calling down into _swrast_ReadPixels, as
+    * the spans code requires the computed buffer states to be up to date,
+    * but _swrast_ReadPixels only updates Mesa state after setting up
+    * the spans code.
+    */
+
+   if (ctx->NewState)
+      _mesa_update_state(ctx);
+
    _swrast_ReadPixels(ctx, x, y, width, height, format, type, pack, pixels);
 }
diff --git a/shared/intel_reg.h b/shared/intel_reg.h
index 57ac8f0..d19f1ba 100644
--- a/shared/intel_reg.h
+++ b/shared/intel_reg.h
@@ -189,6 +189,19 @@
 
 #define S7_DEPTH_OFFSET_CONST_MASK     ~0
 
+/* p143 */
+#define _3DSTATE_BUF_INFO_CMD	(CMD_3D | (0x1d<<24) | (0x8e<<16) | 1)
+/* Dword 1 */
+#define BUF_3D_ID_COLOR_BACK	(0x3<<24)
+#define BUF_3D_ID_DEPTH 	(0x7<<24)
+#define BUF_3D_USE_FENCE	(1<<23)
+#define BUF_3D_TILED_SURFACE	(1<<22)
+#define BUF_3D_TILE_WALK_X	0
+#define BUF_3D_TILE_WALK_Y	(1<<21)
+#define BUF_3D_PITCH(x)         (((x)/4)<<2)
+/* Dword 2 */
+#define BUF_3D_ADDR(x)		((x) & ~0x3)
+
 /* Primitive dispatch on 830-945 */
 #define _3DPRIMITIVE			(CMD_3D | (0x1f << 24))
 #define PRIM_INDIRECT            (1<<23)
diff --git a/shared/intel_regions.c b/shared/intel_regions.c
index 0aa5b8c..a86c66a 100644
--- a/shared/intel_regions.c
+++ b/shared/intel_regions.c
@@ -52,17 +52,77 @@
 
 #define FILE_DEBUG_FLAG DEBUG_REGION
 
+/* This should be set to the maximum backtrace size desired.
+ * Set it to 0 to disable backtrace debugging.
+ */
+#define DEBUG_BACKTRACE_SIZE 0
+
+#if DEBUG_BACKTRACE_SIZE == 0
+/* Use the standard debug output */
+#define _DBG(...) DBG(__VA_ARGS__)
+#else
+/* Use backtracing debug output */
+#define _DBG(...) {debug_backtrace(); DBG(__VA_ARGS__);}
+
+/* Backtracing debug support */
+#include <execinfo.h>
+
+static void
+debug_backtrace(void)
+{
+   void *trace[DEBUG_BACKTRACE_SIZE];
+   char **strings = NULL;
+   int traceSize;
+   register int i;
+
+   traceSize = backtrace(trace, DEBUG_BACKTRACE_SIZE);
+   strings = backtrace_symbols(trace, traceSize);
+   if (strings == NULL) {
+      DBG("no backtrace:");
+      return;
+   }
+
+   /* Spit out all the strings with a colon separator.  Ignore
+    * the first, since we don't really care about the call
+    * to debug_backtrace() itself.  Skip until the final "/" in
+    * the trace to avoid really long lines.
+    */
+   for (i = 1; i < traceSize; i++) {
+      char *p = strings[i], *slash = strings[i];
+      while (*p) {
+         if (*p++ == '/') {
+            slash = p;
+         }
+      }
+
+      DBG("%s:", slash);
+   }
+
+   /* Free up the memory, and we're done */
+   free(strings);
+}
+
+#endif
+
+
+
 /* XXX: Thread safety?
  */
 GLubyte *
 intel_region_map(struct intel_context *intel, struct intel_region *region)
 {
-   DBG("%s\n", __FUNCTION__);
+   intelFlush(&intel->ctx);
+
+   _DBG("%s %p\n", __FUNCTION__, region);
    if (!region->map_refcount++) {
       if (region->pbo)
          intel_region_cow(intel, region);
 
-      dri_bo_map(region->buffer, GL_TRUE);
+      if (region->tiling != I915_TILING_NONE &&
+	  intel->intelScreen->kernel_exec_fencing)
+	 drm_intel_gem_bo_map_gtt(region->buffer);
+      else
+	 dri_bo_map(region->buffer, GL_TRUE);
       region->map = region->buffer->virtual;
    }
 
@@ -72,9 +132,13 @@ intel_region_map(struct intel_context *intel, struct intel_region *region)
 void
 intel_region_unmap(struct intel_context *intel, struct intel_region *region)
 {
-   DBG("%s\n", __FUNCTION__);
+   _DBG("%s %p\n", __FUNCTION__, region);
    if (!--region->map_refcount) {
-      dri_bo_unmap(region->buffer);
+      if (region->tiling != I915_TILING_NONE &&
+	  intel->intelScreen->kernel_exec_fencing)
+	 drm_intel_gem_bo_unmap_gtt(region->buffer);
+      else
+	 dri_bo_unmap(region->buffer);
       region->map = NULL;
    }
 }
@@ -87,10 +151,10 @@ intel_region_alloc_internal(struct intel_context *intel,
 {
    struct intel_region *region;
 
-   DBG("%s\n", __FUNCTION__);
-
-   if (buffer == NULL)
+   if (buffer == NULL) {
+      _DBG("%s <-- NULL\n", __FUNCTION__);
       return NULL;
+   }
 
    region = calloc(sizeof(*region), 1);
    region->cpp = cpp;
@@ -104,15 +168,40 @@ intel_region_alloc_internal(struct intel_context *intel,
    region->tiling = I915_TILING_NONE;
    region->bit_6_swizzle = I915_BIT_6_SWIZZLE_NONE;
 
+   _DBG("%s <-- %p\n", __FUNCTION__, region);
    return region;
 }
 
 struct intel_region *
 intel_region_alloc(struct intel_context *intel,
+		   uint32_t tiling,
                    GLuint cpp, GLuint width, GLuint height, GLuint pitch,
 		   GLboolean expect_accelerated_upload)
 {
    dri_bo *buffer;
+   struct intel_region *region;
+
+   /* If we're tiled, our allocations are in 8 or 32-row blocks, so
+    * failure to align our height means that we won't allocate enough pages.
+    *
+    * If we're untiled, we still have to align to 2 rows high because the
+    * data port accesses 2x2 blocks even if the bottom row isn't to be
+    * rendered, so failure to align means we could walk off the end of the
+    * GTT and fault.
+    */
+   if (tiling == I915_TILING_X)
+      height = ALIGN(height, 8);
+   else if (tiling == I915_TILING_Y)
+      height = ALIGN(height, 32);
+   else
+      height = ALIGN(height, 2);
+
+   /* If we're untiled, we have to align to 2 rows high because the
+    * data port accesses 2x2 blocks even if the bottom row isn't to be
+    * rendered, so failure to align means we could walk off the end of the
+    * GTT and fault.
+    */
+   height = ALIGN(height, 2);
 
    if (expect_accelerated_upload) {
       buffer = drm_intel_bo_alloc_for_render(intel->bufmgr, "region",
@@ -122,7 +211,16 @@ intel_region_alloc(struct intel_context *intel,
 				  pitch * cpp * height, 64);
    }
 
-   return intel_region_alloc_internal(intel, cpp, width, height, pitch, buffer);
+   region = intel_region_alloc_internal(intel, cpp, width, height,
+					pitch, buffer);
+
+   if (tiling != I915_TILING_NONE) {
+      assert(((pitch * cpp) & 127) == 0);
+      drm_intel_bo_set_tiling(buffer, &tiling, pitch * cpp);
+      drm_intel_bo_get_tiling(buffer, &region->tiling, &region->bit_6_swizzle);
+   }
+
+   return region;
 }
 
 struct intel_region *
@@ -158,7 +256,7 @@ void
 intel_region_reference(struct intel_region **dst, struct intel_region *src)
 {
    if (src)
-      DBG("%s %p %d\n", __FUNCTION__, src, src->refcount);
+      _DBG("%s %p %d\n", __FUNCTION__, src, src->refcount);
 
    assert(*dst == NULL);
    if (src) {
@@ -172,10 +270,12 @@ intel_region_release(struct intel_region **region_handle)
 {
    struct intel_region *region = *region_handle;
 
-   if (region == NULL)
+   if (region == NULL) {
+      _DBG("%s NULL\n", __FUNCTION__);
       return;
+   }
 
-   DBG("%s %p %d\n", __FUNCTION__, region, region->refcount - 1);
+   _DBG("%s %p %d\n", __FUNCTION__, region, region->refcount - 1);
 
    ASSERT(region->refcount > 0);
    region->refcount--;
@@ -249,9 +349,7 @@ intel_region_data(struct intel_context *intel,
                   const void *src, GLuint src_pitch,
                   GLuint srcx, GLuint srcy, GLuint width, GLuint height)
 {
-   GLboolean locked = GL_FALSE;
-
-   DBG("%s\n", __FUNCTION__);
+   _DBG("%s\n", __FUNCTION__);
 
    if (intel == NULL)
       return;
@@ -264,39 +362,33 @@ intel_region_data(struct intel_context *intel,
          intel_region_cow(intel, dst);
    }
 
-   if (!intel->locked) {
-      LOCK_HARDWARE(intel);
-      locked = GL_TRUE;
-   }
-
+   LOCK_HARDWARE(intel);
    _mesa_copy_rect(intel_region_map(intel, dst) + dst_offset,
                    dst->cpp,
                    dst->pitch,
                    dstx, dsty, width, height, src, src_pitch, srcx, srcy);
 
    intel_region_unmap(intel, dst);
-
-   if (locked)
-      UNLOCK_HARDWARE(intel);
-
+   UNLOCK_HARDWARE(intel);
 }
 
 /* Copy rectangular sub-regions. Need better logic about when to
  * push buffers into AGP - will currently do so whenever possible.
  */
-void
+GLboolean
 intel_region_copy(struct intel_context *intel,
                   struct intel_region *dst,
                   GLuint dst_offset,
                   GLuint dstx, GLuint dsty,
                   struct intel_region *src,
                   GLuint src_offset,
-                  GLuint srcx, GLuint srcy, GLuint width, GLuint height)
+                  GLuint srcx, GLuint srcy, GLuint width, GLuint height,
+		  GLenum logicop)
 {
-   DBG("%s\n", __FUNCTION__);
+   _DBG("%s\n", __FUNCTION__);
 
    if (intel == NULL)
-      return;
+      return GL_FALSE;
 
    if (dst->pbo) {
       if (dstx == 0 &&
@@ -308,41 +400,12 @@ intel_region_copy(struct intel_context *intel,
 
    assert(src->cpp == dst->cpp);
 
-   intelEmitCopyBlit(intel,
-                     dst->cpp,
-                     src->pitch, src->buffer, src_offset, src->tiling,
-                     dst->pitch, dst->buffer, dst_offset, dst->tiling,
-                     srcx, srcy, dstx, dsty, width, height,
-		     GL_COPY);
-}
-
-/* Fill a rectangular sub-region.  Need better logic about when to
- * push buffers into AGP - will currently do so whenever possible.
- */
-void
-intel_region_fill(struct intel_context *intel,
-                  struct intel_region *dst,
-                  GLuint dst_offset,
-                  GLuint dstx, GLuint dsty,
-                  GLuint width, GLuint height, GLuint color)
-{
-   DBG("%s\n", __FUNCTION__);
-
-   if (intel == NULL)
-      return;   
-
-   if (dst->pbo) {
-      if (dstx == 0 &&
-          dsty == 0 && width == dst->pitch && height == dst->height)
-         intel_region_release_pbo(intel, dst);
-      else
-         intel_region_cow(intel, dst);
-   }
-
-   intelEmitFillBlit(intel,
-                     dst->cpp,
-                     dst->pitch, dst->buffer, dst_offset, dst->tiling,
-                     dstx, dsty, width, height, color);
+   return intelEmitCopyBlit(intel,
+			    dst->cpp,
+			    src->pitch, src->buffer, src_offset, src->tiling,
+			    dst->pitch, dst->buffer, dst_offset, dst->tiling,
+			    srcx, srcy, dstx, dsty, width, height,
+			    logicop);
 }
 
 /* Attach to a pbo, discarding our data.  Effectively zero-copy upload
@@ -353,9 +416,13 @@ intel_region_attach_pbo(struct intel_context *intel,
                         struct intel_region *region,
                         struct intel_buffer_object *pbo)
 {
+   dri_bo *buffer;
+
    if (region->pbo == pbo)
       return;
 
+   _DBG("%s %p %p\n", __FUNCTION__, region, pbo);
+
    /* If there is already a pbo attached, break the cow tie now.
     * Don't call intel_region_release_pbo() as that would
     * unnecessarily allocate a new buffer we would have to immediately
@@ -371,10 +438,13 @@ intel_region_attach_pbo(struct intel_context *intel,
       region->buffer = NULL;
    }
 
+   /* make sure pbo has a buffer of its own */
+   buffer = intel_bufferobj_buffer(intel, pbo, INTEL_WRITE_FULL);
+
    region->pbo = pbo;
    region->pbo->region = region;
-   dri_bo_reference(pbo->buffer);
-   region->buffer = pbo->buffer;
+   dri_bo_reference(buffer);
+   region->buffer = buffer;
 }
 
 
@@ -385,6 +455,7 @@ void
 intel_region_release_pbo(struct intel_context *intel,
                          struct intel_region *region)
 {
+   _DBG("%s %p\n", __FUNCTION__, region);
    assert(region->buffer == region->pbo->buffer);
    region->pbo->region = NULL;
    region->pbo = NULL;
@@ -403,34 +474,27 @@ void
 intel_region_cow(struct intel_context *intel, struct intel_region *region)
 {
    struct intel_buffer_object *pbo = region->pbo;
-   GLboolean was_locked = intel->locked;
-
-   if (intel == NULL)
-      return;
+   GLboolean ok;
 
    intel_region_release_pbo(intel, region);
 
    assert(region->cpp * region->pitch * region->height == pbo->Base.Size);
 
-   DBG("%s (%d bytes)\n", __FUNCTION__, pbo->Base.Size);
+   _DBG("%s %p (%d bytes)\n", __FUNCTION__, region, pbo->Base.Size);
 
    /* Now blit from the texture buffer to the new buffer: 
     */
 
-   was_locked = intel->locked;
-   if (!was_locked)
-      LOCK_HARDWARE(intel);
-
-   intelEmitCopyBlit(intel,
-		     region->cpp,
-		     region->pitch, region->buffer, 0, region->tiling,
-		     region->pitch, pbo->buffer, 0, region->tiling,
-		     0, 0, 0, 0,
-		     region->pitch, region->height,
-		     GL_COPY);
-
-   if (!was_locked)
-      UNLOCK_HARDWARE(intel);
+   LOCK_HARDWARE(intel);
+   ok = intelEmitCopyBlit(intel,
+                          region->cpp,
+                          region->pitch, pbo->buffer, 0, region->tiling,
+                          region->pitch, region->buffer, 0, region->tiling,
+                          0, 0, 0, 0,
+                          region->pitch, region->height,
+                          GL_COPY);
+   assert(ok);
+   UNLOCK_HARDWARE(intel);
 }
 
 dri_bo *
@@ -459,6 +523,10 @@ intel_recreate_static(struct intel_context *intel,
    if (region == NULL) {
       region = calloc(sizeof(*region), 1);
       region->refcount = 1;
+      _DBG("%s creating new region %p\n", __FUNCTION__, region);
+   }
+   else {
+      _DBG("%s %p\n", __FUNCTION__, region);
    }
 
    if (intel->ctx.Visual.rgbBits == 24)
diff --git a/shared/intel_regions.h b/shared/intel_regions.h
index 45e2bf4..0d379bd 100644
--- a/shared/intel_regions.h
+++ b/shared/intel_regions.h
@@ -73,7 +73,8 @@ struct intel_region
  * copied by calling intel_reference_region().
  */
 struct intel_region *intel_region_alloc(struct intel_context *intel,
-                                        GLuint cpp, GLuint width,
+                                        uint32_t tiling,
+					GLuint cpp, GLuint width,
                                         GLuint height, GLuint pitch,
 					GLboolean expect_accelerated_upload);
 
@@ -109,21 +110,15 @@ void intel_region_data(struct intel_context *intel,
 
 /* Copy rectangular sub-regions
  */
-void intel_region_copy(struct intel_context *intel,
-                       struct intel_region *dest,
-                       GLuint dest_offset,
-                       GLuint destx, GLuint desty,
-                       struct intel_region *src,
-                       GLuint src_offset,
-                       GLuint srcx, GLuint srcy, GLuint width, GLuint height);
-
-/* Fill a rectangular sub-region
- */
-void intel_region_fill(struct intel_context *intel,
-                       struct intel_region *dest,
-                       GLuint dest_offset,
-                       GLuint destx, GLuint desty,
-                       GLuint width, GLuint height, GLuint color);
+GLboolean
+intel_region_copy(struct intel_context *intel,
+		  struct intel_region *dest,
+		  GLuint dest_offset,
+		  GLuint destx, GLuint desty,
+		  struct intel_region *src,
+		  GLuint src_offset,
+		  GLuint srcx, GLuint srcy, GLuint width, GLuint height,
+		  GLenum logicop);
 
 /* Helpers for zerocopy uploads, particularly texture image uploads:
  */
diff --git a/shared/intel_screen.c b/shared/intel_screen.c
index 0f278b3..1b8c56e 100644
--- a/shared/intel_screen.c
+++ b/shared/intel_screen.c
@@ -49,6 +49,10 @@
 #include "i915_drm.h"
 #include "i830_dri.h"
 
+#define DRI_CONF_TEXTURE_TILING(def) \
+	DRI_CONF_OPT_BEGIN(texture_tiling, bool, def)		\
+		DRI_CONF_DESC(en, "Enable texture tiling")	\
+	DRI_CONF_OPT_END					\
 
 PUBLIC const char __driConfigOptions[] =
    DRI_CONF_BEGIN
@@ -64,6 +68,17 @@ PUBLIC const char __driConfigOptions[] =
 	    DRI_CONF_ENUM(1, "Enable reuse of all sizes of buffer objects")
 	 DRI_CONF_DESC_END
       DRI_CONF_OPT_END
+
+#ifdef I915
+     DRI_CONF_TEXTURE_TILING(false)
+#else
+     DRI_CONF_TEXTURE_TILING(true)
+#endif
+
+      DRI_CONF_OPT_BEGIN(early_z, bool, false)
+	 DRI_CONF_DESC(en, "Enable early Z in classic mode (unstable, 945-only).")
+      DRI_CONF_OPT_END
+
    DRI_CONF_SECTION_END
    DRI_CONF_SECTION_QUALITY
       DRI_CONF_FORCE_S3TC_ENABLE(false)
@@ -76,7 +91,7 @@ PUBLIC const char __driConfigOptions[] =
    DRI_CONF_SECTION_END
 DRI_CONF_END;
 
-const GLuint __driNConfigOptions = 8;
+const GLuint __driNConfigOptions = 10;
 
 #ifdef USE_NEW_INTERFACE
 static PFNGLXCREATECONTEXTMODES create_context_modes = NULL;
@@ -305,7 +320,7 @@ intelDestroyScreen(__DRIscreenPrivate * sPriv)
 
    dri_bufmgr_destroy(intelScreen->bufmgr);
    intelUnmapScreenRegions(intelScreen);
-   driDestroyOptionCache(&intelScreen->optionCache);
+   driDestroyOptionInfo(&intelScreen->optionCache);
 
    FREE(intelScreen);
    sPriv->private = NULL;
@@ -587,7 +602,7 @@ intel_init_bufmgr(intelScreenPrivate *intelScreen)
    GLboolean gem_supported;
    struct drm_i915_getparam gp;
    __DRIscreenPrivate *spriv = intelScreen->driScrnPriv;
-   int num_fences;
+   int num_fences = 0;
 
    intelScreen->no_hw = getenv("INTEL_NO_HW") != NULL;
 
@@ -617,10 +632,10 @@ intel_init_bufmgr(intelScreenPrivate *intelScreen)
    /* Otherwise, use the classic buffer manager. */
    if (intelScreen->bufmgr == NULL) {
       if (gem_disable) {
-	 fprintf(stderr, "GEM disabled.  Using classic.\n");
+	 _mesa_warning(NULL, "GEM disabled.  Using classic.");
       } else {
-	 fprintf(stderr, "Failed to initialize GEM.  "
-		 "Falling back to classic.\n");
+	 _mesa_warning(NULL,
+                       "Failed to initialize GEM.  Falling back to classic.");
       }
 
       if (intelScreen->tex.size == 0) {
diff --git a/shared/intel_span.c b/shared/intel_span.c
index 34b78eb..8df4990 100644
--- a/shared/intel_span.c
+++ b/shared/intel_span.c
@@ -501,7 +501,7 @@ intel_map_unmap_buffers(struct intel_context *intel, GLboolean map)
 
 
 /**
- * Prepare for softare rendering.  Map current read/draw framebuffers'
+ * Prepare for software rendering.  Map current read/draw framebuffers'
  * renderbuffes and all currently bound texture objects.
  *
  * Old note: Moved locking out to get reasonable span performance.
@@ -526,7 +526,7 @@ intelSpanRenderStart(GLcontext * ctx)
 }
 
 /**
- * Called when done softare rendering.  Unmap the buffers we mapped in
+ * Called when done software rendering.  Unmap the buffers we mapped in
  * the above function.
  */
 void
diff --git a/shared/intel_syncobj.c b/shared/intel_syncobj.c
new file mode 100644
index 0000000..1286fe9
--- /dev/null
+++ b/shared/intel_syncobj.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright © 2008 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+/** @file intel_syncobj.c
+ *
+ * Support for ARB_sync
+ *
+ * ARB_sync is implemented by flushing the current batchbuffer and keeping a
+ * reference on it.  We can then check for completion or wait for compeltion
+ * using the normal buffer object mechanisms.  This does mean that if an
+ * application is using many sync objects, it will emit small batchbuffers
+ * which may end up being a significant overhead.  In other tests of removing
+ * gratuitous batchbuffer syncs in Mesa, it hasn't appeared to be a significant
+ * performance bottleneck, though.
+ */
+
+#include "main/simple_list.h"
+#include "main/imports.h"
+
+#include "intel_context.h"
+#include "intel_batchbuffer.h"
+#include "intel_reg.h"
+
+static struct gl_sync_object *
+intel_new_sync_object(GLcontext *ctx, GLuint id)
+{
+   struct intel_sync_object *sync;
+
+   sync = _mesa_calloc(sizeof(struct intel_sync_object));
+
+   return &sync->Base;
+}
+
+static void
+intel_delete_sync_object(GLcontext *ctx, struct gl_sync_object *s)
+{
+   struct intel_sync_object *sync = (struct intel_sync_object *)s;
+
+   drm_intel_bo_unreference(sync->bo);
+   _mesa_free(sync);
+}
+
+static void
+intel_fence_sync(GLcontext *ctx, struct gl_sync_object *s,
+	       GLenum condition, GLbitfield flags)
+{
+   struct intel_context *intel = intel_context(ctx);
+   struct intel_sync_object *sync = (struct intel_sync_object *)s;
+
+   assert(condition == GL_SYNC_GPU_COMMANDS_COMPLETE);
+   intel_batchbuffer_emit_mi_flush(intel->batch);
+
+   sync->bo = intel->batch->buf;
+   drm_intel_bo_reference(sync->bo);
+
+   intelFlush(ctx);
+}
+
+/* We ignore the user-supplied timeout.  This is weaselly -- we're allowed to
+ * round to an implementation-dependent accuracy, and right now our
+ * implementation "rounds" to the wait-forever value.
+ *
+ * The fix would be a new kernel function to do the GTT transition with a
+ * timeout.
+ */
+static void intel_client_wait_sync(GLcontext *ctx, struct gl_sync_object *s,
+				 GLbitfield flags, GLuint64 timeout)
+{
+   struct intel_sync_object *sync = (struct intel_sync_object *)s;
+
+   if (sync->bo) {
+      drm_intel_bo_wait_rendering(sync->bo);
+      s->StatusFlag = 1;
+      drm_intel_bo_unreference(sync->bo);
+      sync->bo = NULL;
+   }
+}
+
+/* We have nothing to do for WaitSync.  Our GL command stream is sequential,
+ * so given that the sync object has already flushed the batchbuffer,
+ * any batchbuffers coming after this waitsync will naturally not occur until
+ * the previous one is done.
+ */
+static void intel_server_wait_sync(GLcontext *ctx, struct gl_sync_object *s,
+				 GLbitfield flags, GLuint64 timeout)
+{
+}
+
+static void intel_check_sync(GLcontext *ctx, struct gl_sync_object *s)
+{
+   struct intel_sync_object *sync = (struct intel_sync_object *)s;
+
+   if (sync->bo && drm_intel_bo_busy(sync->bo)) {
+      drm_intel_bo_unreference(sync->bo);
+      sync->bo = NULL;
+      s->StatusFlag = 1;
+   }
+}
+
+void intel_init_syncobj_functions(struct dd_function_table *functions)
+{
+   functions->NewSyncObject = intel_new_sync_object;
+   functions->DeleteSyncObject = intel_delete_sync_object;
+   functions->FenceSync = intel_fence_sync;
+   functions->CheckSync = intel_check_sync;
+   functions->ClientWaitSync = intel_client_wait_sync;
+   functions->ServerWaitSync = intel_server_wait_sync;
+}
diff --git a/shared/intel_tex.c b/shared/intel_tex.c
index ae0994b..df63f29 100644
--- a/shared/intel_tex.c
+++ b/shared/intel_tex.c
@@ -158,81 +158,11 @@ timed_memcpy(void *dest, const void *src, size_t n)
 }
 #endif /* DO_DEBUG */
 
-/**
- * Generate new mipmap data from BASE+1 to BASE+p (the minimally-sized mipmap
- * level).
- *
- * The texture object's miptree must be mapped.
- *
- * It would be really nice if this was just called by Mesa whenever mipmaps
- * needed to be regenerated, rather than us having to remember to do so in
- * each texture image modification path.
- *
- * This function should also include an accelerated path.
- */
-void
-intel_generate_mipmap(GLcontext *ctx, GLenum target,
-                      struct gl_texture_object *texObj)
-{
-   struct intel_context *intel = intel_context(ctx);
-   struct intel_texture_object *intelObj = intel_texture_object(texObj);
-   GLuint nr_faces = (intelObj->base.Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
-   int face, i;
-
-   _mesa_generate_mipmap(ctx, target, texObj);
-
-   /* Update the level information in our private data in the new images, since
-    * it didn't get set as part of a normal TexImage path.
-    */
-   for (face = 0; face < nr_faces; face++) {
-      for (i = texObj->BaseLevel + 1; i < texObj->MaxLevel; i++) {
-         struct intel_texture_image *intelImage;
-
-	 intelImage = intel_texture_image(texObj->Image[face][i]);
-	 if (intelImage == NULL)
-	    break;
-
-	 intelImage->level = i;
-	 intelImage->face = face;
-	 /* Unreference the miptree to signal that the new Data is a bare
-	  * pointer from mesa.
-	  */
-	 intel_miptree_release(intel, &intelImage->mt);
-      }
-   }
-}
-
-static void intelGenerateMipmap(GLcontext *ctx, GLenum target, struct gl_texture_object *texObj)
-{
-   struct intel_context *intel = intel_context(ctx);
-   struct intel_texture_object *intelObj = intel_texture_object(texObj);
-
-   intel_tex_map_level_images(intel, intelObj, texObj->BaseLevel);
-   intel_generate_mipmap(ctx, target, texObj);
-   intel_tex_unmap_level_images(intel, intelObj, texObj->BaseLevel);
-}
-
 void
 intelInitTextureFuncs(struct dd_function_table *functions)
 {
    functions->ChooseTextureFormat = intelChooseTextureFormat;
-   functions->TexImage1D = intelTexImage1D;
-   functions->TexImage2D = intelTexImage2D;
-   functions->TexImage3D = intelTexImage3D;
-   functions->TexSubImage1D = intelTexSubImage1D;
-   functions->TexSubImage2D = intelTexSubImage2D;
-   functions->TexSubImage3D = intelTexSubImage3D;
-   functions->CopyTexImage1D = intelCopyTexImage1D;
-   functions->CopyTexImage2D = intelCopyTexImage2D;
-   functions->CopyTexSubImage1D = intelCopyTexSubImage1D;
-   functions->CopyTexSubImage2D = intelCopyTexSubImage2D;
-   functions->GetTexImage = intelGetTexImage;
-   functions->GenerateMipmap = intelGenerateMipmap;
-
-   /* compressed texture functions */
-   functions->CompressedTexImage2D = intelCompressedTexImage2D;
-   functions->CompressedTexSubImage2D = intelCompressedTexSubImage2D;
-   functions->GetCompressedTexImage = intelGetCompressedTexImage;
+   functions->GenerateMipmap = intel_generate_mipmap;
 
    functions->NewTextureObject = intelNewTextureObject;
    functions->NewTextureImage = intelNewTextureImage;
diff --git a/shared/intel_tex.h b/shared/intel_tex.h
index f5372d8..471aa2a 100644
--- a/shared/intel_tex.h
+++ b/shared/intel_tex.h
@@ -35,116 +35,17 @@
 
 void intelInitTextureFuncs(struct dd_function_table *functions);
 
+void intelInitTextureImageFuncs(struct dd_function_table *functions);
+
+void intelInitTextureSubImageFuncs(struct dd_function_table *functions);
+
+void intelInitTextureCopyImageFuncs(struct dd_function_table *functions);
+
 const struct gl_texture_format *intelChooseTextureFormat(GLcontext * ctx,
                                                          GLint internalFormat,
                                                          GLenum format,
                                                          GLenum type);
 
-
-void intelTexImage3D(GLcontext * ctx,
-                     GLenum target, GLint level,
-                     GLint internalFormat,
-                     GLint width, GLint height, GLint depth,
-                     GLint border,
-                     GLenum format, GLenum type, const void *pixels,
-                     const struct gl_pixelstore_attrib *packing,
-                     struct gl_texture_object *texObj,
-                     struct gl_texture_image *texImage);
-
-void intelTexSubImage3D(GLcontext * ctx,
-                        GLenum target,
-                        GLint level,
-                        GLint xoffset, GLint yoffset, GLint zoffset,
-                        GLsizei width, GLsizei height, GLsizei depth,
-                        GLenum format, GLenum type,
-                        const GLvoid * pixels,
-                        const struct gl_pixelstore_attrib *packing,
-                        struct gl_texture_object *texObj,
-                        struct gl_texture_image *texImage);
-
-void intelTexImage2D(GLcontext * ctx,
-                     GLenum target, GLint level,
-                     GLint internalFormat,
-                     GLint width, GLint height, GLint border,
-                     GLenum format, GLenum type, const void *pixels,
-                     const struct gl_pixelstore_attrib *packing,
-                     struct gl_texture_object *texObj,
-                     struct gl_texture_image *texImage);
-
-void intelTexSubImage2D(GLcontext * ctx,
-                        GLenum target,
-                        GLint level,
-                        GLint xoffset, GLint yoffset,
-                        GLsizei width, GLsizei height,
-                        GLenum format, GLenum type,
-                        const GLvoid * pixels,
-                        const struct gl_pixelstore_attrib *packing,
-                        struct gl_texture_object *texObj,
-                        struct gl_texture_image *texImage);
-
-void intelTexImage1D(GLcontext * ctx,
-                     GLenum target, GLint level,
-                     GLint internalFormat,
-                     GLint width, GLint border,
-                     GLenum format, GLenum type, const void *pixels,
-                     const struct gl_pixelstore_attrib *packing,
-                     struct gl_texture_object *texObj,
-                     struct gl_texture_image *texImage);
-
-void intelTexSubImage1D(GLcontext * ctx,
-                        GLenum target,
-                        GLint level,
-                        GLint xoffset,
-                        GLsizei width,
-                        GLenum format, GLenum type,
-                        const GLvoid * pixels,
-                        const struct gl_pixelstore_attrib *packing,
-                        struct gl_texture_object *texObj,
-                        struct gl_texture_image *texImage);
-
-void intelCopyTexImage1D(GLcontext * ctx, GLenum target, GLint level,
-                         GLenum internalFormat,
-                         GLint x, GLint y, GLsizei width, GLint border);
-
-void intelCopyTexImage2D(GLcontext * ctx, GLenum target, GLint level,
-                         GLenum internalFormat,
-                         GLint x, GLint y, GLsizei width, GLsizei height,
-                         GLint border);
-
-void intelCopyTexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
-                            GLint xoffset, GLint x, GLint y, GLsizei width);
-
-void intelCopyTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
-                            GLint xoffset, GLint yoffset,
-                            GLint x, GLint y, GLsizei width, GLsizei height);
-
-void intelGetTexImage(GLcontext * ctx, GLenum target, GLint level,
-                      GLenum format, GLenum type, GLvoid * pixels,
-                      struct gl_texture_object *texObj,
-                      struct gl_texture_image *texImage);
-
-void intelCompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
-				GLint internalFormat,
-				GLint width, GLint height, GLint border,
-				GLsizei imageSize, const GLvoid *data,
-				struct gl_texture_object *texObj,
-				struct gl_texture_image *texImage );
-
-void intelCompressedTexSubImage2D(GLcontext * ctx,
-				  GLenum target,
-				  GLint level,
-				  GLint xoffset, GLint yoffset,
-				  GLsizei width, GLsizei height,
-				  GLenum format, GLsizei imageSize,
-				  const GLvoid * pixels,
-				  struct gl_texture_object *texObj,
-				  struct gl_texture_image *texImage);
-
-void intelGetCompressedTexImage(GLcontext *ctx, GLenum target, GLint level,
-				GLvoid *pixels,
-				struct gl_texture_object *texObj,
-				struct gl_texture_image *texImage);
-
 void intelSetTexOffset(__DRIcontext *pDRICtx, GLint texname,
 		       unsigned long long offset, GLint depth, GLuint pitch);
 void intelSetTexBuffer(__DRIcontext *pDRICtx,
diff --git a/shared/intel_tex_copy.c b/shared/intel_tex_copy.c
index 90bbb8c..74f7f58 100644
--- a/shared/intel_tex_copy.c
+++ b/shared/intel_tex_copy.c
@@ -73,11 +73,9 @@ get_teximage_source(struct intel_context *intel, GLenum internalFormat)
       return NULL;
    case GL_RGBA:
    case GL_RGBA8:
-      return intel_readbuf_region(intel);
    case GL_RGB:
-      if (intel->ctx.Visual.rgbBits == 16)
-         return intel_readbuf_region(intel);
-      return NULL;
+   case GL_RGB8:
+      return intel_readbuf_region(intel);
    default:
       return NULL;
    }
@@ -99,14 +97,24 @@ do_copy_texsubimage(struct intel_context *intel,
 
    if (!intelImage->mt || !src) {
       if (INTEL_DEBUG & DEBUG_FALLBACKS)
-	 fprintf(stderr, "%s fail %p %p\n",
-		 __FUNCTION__, intelImage->mt, src);
+	 fprintf(stderr, "%s fail %p %p (0x%08x)\n",
+		 __FUNCTION__, intelImage->mt, src, internalFormat);
+      return GL_FALSE;
+   }
+
+   if (intelImage->mt->cpp != src->cpp) {
+      if (INTEL_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "%s fail %d vs %d cpp\n",
+		 __FUNCTION__, intelImage->mt->cpp, src->cpp);
       return GL_FALSE;
    }
 
    intelFlush(ctx);
    LOCK_HARDWARE(intel);
    {
+      drm_intel_bo *dst_bo = intel_region_buffer(intel,
+						 intelImage->mt->region,
+						 INTEL_WRITE_PART);
       GLuint image_offset = intel_miptree_image_offset(intelImage->mt,
                                                        intelImage->face,
                                                        intelImage->level);
@@ -118,8 +126,12 @@ do_copy_texsubimage(struct intel_context *intel,
       dstx += x - orig_x;
       dsty += y - orig_y;
 
-      /* image_offset may be non-page-aligned, but that's illegal for tiling. */
-      assert(intelImage->mt->region->tiling == I915_TILING_NONE);
+      /* Can't blit to tiled buffers with non-tile-aligned offset. */
+      if (intelImage->mt->region->tiling != I915_TILING_NONE &&
+	  (image_offset & 4095) != 0) {
+	 UNLOCK_HARDWARE(intel);
+	 return GL_FALSE;
+      }
 
       if (ctx->ReadBuffer->Name == 0) {
 	 /* reading from a window, adjust x, y */
@@ -140,35 +152,35 @@ do_copy_texsubimage(struct intel_context *intel,
 	 src_pitch = src->pitch;
       }
 
-      intelEmitCopyBlit(intel,
-			intelImage->mt->cpp,
-			src_pitch,
-			src->buffer,
-			0,
-			src->tiling,
-			intelImage->mt->pitch,
-			intelImage->mt->region->buffer,
-			image_offset,
-			intelImage->mt->region->tiling,
-			x, y, dstx, dsty, width, height,
-			GL_COPY);
+      if (!intelEmitCopyBlit(intel,
+			     intelImage->mt->cpp,
+			     src_pitch,
+			     src->buffer,
+			     0,
+			     src->tiling,
+			     intelImage->mt->pitch,
+			     dst_bo,
+			     image_offset,
+			     intelImage->mt->region->tiling,
+			     x, y, dstx, dsty, width, height,
+			     GL_COPY)) {
+	 UNLOCK_HARDWARE(intel);
+	 return GL_FALSE;
+      }
    }
 
    UNLOCK_HARDWARE(intel);
 
    /* GL_SGIS_generate_mipmap */
    if (intelImage->level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      ctx->Driver.GenerateMipmap(ctx, target, texObj);
+      intel_generate_mipmap(ctx, target, texObj);
    }
 
    return GL_TRUE;
 }
 
 
-
-
-
-void
+static void
 intelCopyTexImage1D(GLcontext * ctx, GLenum target, GLint level,
                     GLenum internalFormat,
                     GLint x, GLint y, GLsizei width, GLint border)
@@ -214,7 +226,8 @@ intelCopyTexImage1D(GLcontext * ctx, GLenum target, GLint level,
                            width, border);
 }
 
-void
+
+static void
 intelCopyTexImage2D(GLcontext * ctx, GLenum target, GLint level,
                     GLenum internalFormat,
                     GLint x, GLint y, GLsizei width, GLsizei height,
@@ -262,7 +275,7 @@ intelCopyTexImage2D(GLcontext * ctx, GLenum target, GLint level,
 }
 
 
-void
+static void
 intelCopyTexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
                        GLint xoffset, GLint x, GLint y, GLsizei width)
 {
@@ -287,8 +300,7 @@ intelCopyTexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
 }
 
 
-
-void
+static void
 intelCopyTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
                        GLint xoffset, GLint yoffset,
                        GLint x, GLint y, GLsizei width, GLsizei height)
@@ -301,7 +313,6 @@ intelCopyTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
       _mesa_select_tex_image(ctx, texObj, target, level);
    GLenum internalFormat = texImage->InternalFormat;
 
-
    /* Need to check texture is compatible with source format. 
     */
 
@@ -316,3 +327,13 @@ intelCopyTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
                                  xoffset, yoffset, x, y, width, height);
    }
 }
+
+
+void
+intelInitTextureCopyImageFuncs(struct dd_function_table *functions)
+{
+   functions->CopyTexImage1D = intelCopyTexImage1D;
+   functions->CopyTexImage2D = intelCopyTexImage2D;
+   functions->CopyTexSubImage1D = intelCopyTexSubImage1D;
+   functions->CopyTexSubImage2D = intelCopyTexSubImage2D;
+}
diff --git a/shared/intel_tex_image.c b/shared/intel_tex_image.c
index 5e61e9e..c5f5220 100644
--- a/shared/intel_tex_image.c
+++ b/shared/intel_tex_image.c
@@ -131,6 +131,7 @@ guess_and_alloc_mipmap_tree(struct intel_context *intel,
       comp_byte = intel_compressed_num_bytes(intelImage->base.TexFormat->MesaFormat);
    intelObj->mt = intel_miptree_create(intel,
                                        intelObj->base.Target,
+                                       intelImage->base._BaseFormat,
                                        intelImage->base.InternalFormat,
                                        firstLevel,
                                        lastLevel,
@@ -205,7 +206,7 @@ try_pbo_upload(struct intel_context *intel,
    GLuint src_offset, src_stride;
    GLuint dst_offset, dst_stride;
 
-   if (!pbo ||
+   if (unpack->BufferObj->Name == 0 ||
        intel->ctx._ImageTransferState ||
        unpack->SkipPixels || unpack->SkipRows) {
       DBG("%s: failure 1\n", __FUNCTION__);
@@ -235,12 +236,15 @@ try_pbo_upload(struct intel_context *intel,
 					       INTEL_WRITE_FULL);
 
 
-      intelEmitCopyBlit(intel,
-                        intelImage->mt->cpp,
-                        src_stride, src_buffer, src_offset, GL_FALSE,
-                        dst_stride, dst_buffer, dst_offset, GL_FALSE,
-                        0, 0, 0, 0, width, height,
-			GL_COPY);
+      if (!intelEmitCopyBlit(intel,
+			     intelImage->mt->cpp,
+			     src_stride, src_buffer, src_offset, GL_FALSE,
+			     dst_stride, dst_buffer, dst_offset, GL_FALSE,
+			     0, 0, 0, 0, width, height,
+			     GL_COPY)) {
+	 UNLOCK_HARDWARE(intel);
+	 return GL_FALSE;
+      }
    }
    UNLOCK_HARDWARE(intel);
 
@@ -248,7 +252,6 @@ try_pbo_upload(struct intel_context *intel,
 }
 
 
-
 static GLboolean
 try_pbo_zcopy(struct intel_context *intel,
               struct intel_texture_image *intelImage,
@@ -261,7 +264,7 @@ try_pbo_zcopy(struct intel_context *intel,
    GLuint src_offset, src_stride;
    GLuint dst_offset, dst_stride;
 
-   if (!pbo ||
+   if (unpack->BufferObj->Name == 0 ||
        intel->ctx._ImageTransferState ||
        unpack->SkipPixels || unpack->SkipRows) {
       DBG("%s: failure 1\n", __FUNCTION__);
@@ -293,10 +296,6 @@ try_pbo_zcopy(struct intel_context *intel,
 }
 
 
-
-
-
-
 static void
 intelTexImage(GLcontext * ctx,
               GLint dims,
@@ -307,7 +306,8 @@ intelTexImage(GLcontext * ctx,
               GLenum format, GLenum type, const void *pixels,
               const struct gl_pixelstore_attrib *unpack,
               struct gl_texture_object *texObj,
-              struct gl_texture_image *texImage, GLsizei imageSize, int compressed)
+              struct gl_texture_image *texImage, GLsizei imageSize,
+              GLboolean compressed)
 {
    struct intel_context *intel = intel_context(ctx);
    struct intel_texture_object *intelObj = intel_texture_object(texObj);
@@ -316,7 +316,6 @@ intelTexImage(GLcontext * ctx,
    GLint postConvHeight = height;
    GLint texelBytes, sizeInBytes;
    GLuint dstRowStride = 0, srcRowStride = texImage->RowStride;
-   GLboolean needs_map;
 
    DBG("%s target %s level %d %dx%dx%d border %d\n", __FUNCTION__,
        _mesa_lookup_enum_by_nr(target), level, width, height, depth, border);
@@ -414,7 +413,9 @@ intelTexImage(GLcontext * ctx,
        * a miptree, so create one just for our level and store it in the image.
        * It'll get moved into the object miptree at validate time.
        */
-      intelImage->mt = intel_miptree_create(intel, target, internalFormat,
+      intelImage->mt = intel_miptree_create(intel, target,
+					    intelImage->base.TexFormat->BaseFormat,
+					    internalFormat,
 					    level, level,
 					    width, height, depth,
 					    intelImage->base.TexFormat->TexelBytes,
@@ -426,7 +427,7 @@ intelTexImage(GLcontext * ctx,
     */
    if (dims <= 2 &&
        intelImage->mt &&
-       intel_buffer_object(unpack->BufferObj) &&
+       unpack->BufferObj->Name != 0 &&
        check_pbo_format(internalFormat, format,
                         type, intelImage->base.TexFormat)) {
 
@@ -464,8 +465,6 @@ intelTexImage(GLcontext * ctx,
       DBG("pbo upload failed\n");
    }
 
-
-
    /* intelCopyTexImage calls this function with pixels == NULL, with
     * the expectation that the mipmap tree will be set up but nothing
     * more will be done.  This is where those calls return:
@@ -482,15 +481,8 @@ intelTexImage(GLcontext * ctx,
 
    LOCK_HARDWARE(intel);
 
-   /* Two cases where we need a mapping of the miptree: when the user supplied
-    * data is mapped as well (non-PBO, memcpy upload) or when we're going to do
-    * (software) mipmap generation.
-    */
-   needs_map = (pixels != NULL) || (level == texObj->BaseLevel &&
-				  texObj->GenerateMipmap);
-
    if (intelImage->mt) {
-      if (needs_map)
+      if (pixels != NULL)
          texImage->Data = intel_miptree_image_map(intel,
                                                   intelImage->mt,
                                                   intelImage->face,
@@ -547,25 +539,26 @@ intelTexImage(GLcontext * ctx,
 						   format, type, pixels, unpack)) {
 	   _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage");
        }
-
-       /* GL_SGIS_generate_mipmap */
-       if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-	  intel_generate_mipmap(ctx, target, texObj);
-       }
    }
 
    _mesa_unmap_teximage_pbo(ctx, unpack);
 
    if (intelImage->mt) {
-      if (needs_map)
+      if (pixels != NULL)
          intel_miptree_image_unmap(intel, intelImage->mt);
       texImage->Data = NULL;
    }
 
    UNLOCK_HARDWARE(intel);
+
+   /* GL_SGIS_generate_mipmap */
+   if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
+      intel_generate_mipmap(ctx, target, texObj);
+   }
 }
 
-void
+
+static void
 intelTexImage3D(GLcontext * ctx,
                 GLenum target, GLint level,
                 GLint internalFormat,
@@ -578,11 +571,11 @@ intelTexImage3D(GLcontext * ctx,
 {
    intelTexImage(ctx, 3, target, level,
                  internalFormat, width, height, depth, border,
-                 format, type, pixels, unpack, texObj, texImage, 0, 0);
+                 format, type, pixels, unpack, texObj, texImage, 0, GL_FALSE);
 }
 
 
-void
+static void
 intelTexImage2D(GLcontext * ctx,
                 GLenum target, GLint level,
                 GLint internalFormat,
@@ -594,10 +587,11 @@ intelTexImage2D(GLcontext * ctx,
 {
    intelTexImage(ctx, 2, target, level,
                  internalFormat, width, height, 1, border,
-                 format, type, pixels, unpack, texObj, texImage, 0, 0);
+                 format, type, pixels, unpack, texObj, texImage, 0, GL_FALSE);
 }
 
-void
+
+static void
 intelTexImage1D(GLcontext * ctx,
                 GLenum target, GLint level,
                 GLint internalFormat,
@@ -609,21 +603,24 @@ intelTexImage1D(GLcontext * ctx,
 {
    intelTexImage(ctx, 1, target, level,
                  internalFormat, width, 1, 1, border,
-                 format, type, pixels, unpack, texObj, texImage, 0, 0);
+                 format, type, pixels, unpack, texObj, texImage, 0, GL_FALSE);
 }
 
-void intelCompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
-				GLint internalFormat,
-				GLint width, GLint height, GLint border,
-				GLsizei imageSize, const GLvoid *data,
-				struct gl_texture_object *texObj,
-				struct gl_texture_image *texImage )
+
+static void
+intelCompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
+                           GLint internalFormat,
+                           GLint width, GLint height, GLint border,
+                           GLsizei imageSize, const GLvoid *data,
+                           struct gl_texture_object *texObj,
+                           struct gl_texture_image *texImage )
 {
    intelTexImage(ctx, 2, target, level,
 		 internalFormat, width, height, 1, border,
-		 0, 0, data, &ctx->Unpack, texObj, texImage, imageSize, 1);
+		 0, 0, data, &ctx->Unpack, texObj, texImage, imageSize, GL_TRUE);
 }
 
+
 /**
  * Need to map texture image into memory before copying image data,
  * then unmap it.
@@ -632,7 +629,7 @@ static void
 intel_get_tex_image(GLcontext * ctx, GLenum target, GLint level,
 		    GLenum format, GLenum type, GLvoid * pixels,
 		    struct gl_texture_object *texObj,
-		    struct gl_texture_image *texImage, int compressed)
+		    struct gl_texture_image *texImage, GLboolean compressed)
 {
    struct intel_context *intel = intel_context(ctx);
    struct intel_texture_image *intelImage = intel_texture_image(texImage);
@@ -686,28 +683,29 @@ intel_get_tex_image(GLcontext * ctx, GLenum target, GLint level,
    }
 }
 
-void
+
+static void
 intelGetTexImage(GLcontext * ctx, GLenum target, GLint level,
                  GLenum format, GLenum type, GLvoid * pixels,
                  struct gl_texture_object *texObj,
                  struct gl_texture_image *texImage)
 {
    intel_get_tex_image(ctx, target, level, format, type, pixels,
-		       texObj, texImage, 0);
-
-
+		       texObj, texImage, GL_FALSE);
 }
 
-void
+
+static void
 intelGetCompressedTexImage(GLcontext *ctx, GLenum target, GLint level,
 			   GLvoid *pixels,
 			   struct gl_texture_object *texObj,
 			   struct gl_texture_image *texImage)
 {
    intel_get_tex_image(ctx, target, level, 0, 0, pixels,
-		       texObj, texImage, 1);
+		       texObj, texImage, GL_TRUE);
 }
 
+
 void
 intelSetTexOffset(__DRIcontext *pDRICtx, GLint texname,
 		  unsigned long long offset, GLint depth, GLuint pitch)
@@ -816,3 +814,16 @@ intelSetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
     */
    intelSetTexBuffer2(pDRICtx, target, GLX_TEXTURE_FORMAT_RGBA_EXT, dPriv);
 }
+
+
+void
+intelInitTextureImageFuncs(struct dd_function_table *functions)
+{
+   functions->TexImage1D = intelTexImage1D;
+   functions->TexImage2D = intelTexImage2D;
+   functions->TexImage3D = intelTexImage3D;
+   functions->GetTexImage = intelGetTexImage;
+
+   functions->CompressedTexImage2D = intelCompressedTexImage2D;
+   functions->GetCompressedTexImage = intelGetCompressedTexImage;
+}
diff --git a/shared/intel_tex_layout.c b/shared/intel_tex_layout.c
index e6f9a41..7d69ea4 100644
--- a/shared/intel_tex_layout.c
+++ b/shared/intel_tex_layout.c
@@ -35,26 +35,39 @@
 #include "intel_context.h"
 #include "main/macros.h"
 
-GLuint intel_compressed_alignment(GLenum internalFormat)
+void intel_get_texture_alignment_unit(GLenum internalFormat, GLuint *w, GLuint *h)
 {
-    GLuint alignment = 4;
-
     switch (internalFormat) {
     case GL_COMPRESSED_RGB_FXT1_3DFX:
     case GL_COMPRESSED_RGBA_FXT1_3DFX:
-        alignment = 8;
+        *w = 8;
+        *h = 4;
+        break;
+
+    case GL_RGB_S3TC:
+    case GL_RGB4_S3TC:
+    case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
+    case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
+    case GL_RGBA_S3TC:
+    case GL_RGBA4_S3TC:
+    case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
+    case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
+        *w = 4;
+        *h = 4;
         break;
 
     default:
+        *w = 4;
+        *h = 2;
         break;
     }
-
-    return alignment;
 }
 
-void i945_miptree_layout_2d( struct intel_context *intel, struct intel_mipmap_tree *mt )
+void i945_miptree_layout_2d( struct intel_context *intel,
+			     struct intel_mipmap_tree *mt,
+			     uint32_t tiling )
 {
-   GLint align_h = 2, align_w = 4;
+   GLuint align_h = 2, align_w = 4;
    GLuint level;
    GLuint x = 0;
    GLuint y = 0;
@@ -62,9 +75,9 @@ void i945_miptree_layout_2d( struct intel_context *intel, struct intel_mipmap_tr
    GLuint height = mt->height0;
 
    mt->pitch = mt->width0;
+   intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h);
 
    if (mt->compressed) {
-       align_w = intel_compressed_alignment(mt->internal_format);
        mt->pitch = ALIGN(mt->width0, align_w);
    }
 
@@ -92,7 +105,7 @@ void i945_miptree_layout_2d( struct intel_context *intel, struct intel_mipmap_tr
    /* Pitch must be a whole number of dwords, even though we
     * express it in texels.
     */
-   mt->pitch = intel_miptree_pitch_align (intel, mt, mt->pitch);
+   mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->pitch);
    mt->total_height = 0;
 
    for ( level = mt->first_level ; level <= mt->last_level ; level++ ) {
diff --git a/shared/intel_tex_layout.h b/shared/intel_tex_layout.h
index dbc90e6..c9de9b5 100644
--- a/shared/intel_tex_layout.h
+++ b/shared/intel_tex_layout.h
@@ -38,5 +38,7 @@ static GLuint minify( GLuint d )
    return MAX2(1, d>>1);
 }
 
-extern void i945_miptree_layout_2d( struct intel_context *intel, struct intel_mipmap_tree *mt );
-extern GLuint intel_compressed_alignment(GLenum);
+extern void i945_miptree_layout_2d(struct intel_context *intel,
+				   struct intel_mipmap_tree *mt,
+				   uint32_t tiling);
+extern void intel_get_texture_alignment_unit(GLenum, GLuint *, GLuint *);
diff --git a/shared/intel_tex_subimage.c b/shared/intel_tex_subimage.c
index f86de56..8903707 100644
--- a/shared/intel_tex_subimage.c
+++ b/shared/intel_tex_subimage.c
@@ -44,10 +44,12 @@ intelTexSubimage(GLcontext * ctx,
                  GLenum target, GLint level,
                  GLint xoffset, GLint yoffset, GLint zoffset,
                  GLint width, GLint height, GLint depth,
+                 GLsizei imageSize,
                  GLenum format, GLenum type, const void *pixels,
                  const struct gl_pixelstore_attrib *packing,
                  struct gl_texture_object *texObj,
-                 struct gl_texture_image *texImage)
+                 struct gl_texture_image *texImage,
+                 GLboolean compressed)
 {
    struct intel_context *intel = intel_context(ctx);
    struct intel_texture_image *intelImage = intel_texture_image(texImage);
@@ -59,9 +61,14 @@ intelTexSubimage(GLcontext * ctx,
 
    intelFlush(ctx);
 
-   pixels =
-      _mesa_validate_pbo_teximage(ctx, dims, width, height, depth, format,
-                                  type, pixels, packing, "glTexSubImage2D");
+   if (compressed)
+      pixels = _mesa_validate_pbo_compressed_teximage(ctx, imageSize,
+                                                      pixels, packing,
+                                                      "glCompressedTexImage");
+   else
+      pixels = _mesa_validate_pbo_teximage(ctx, dims, width, height, depth,
+                                           format, type, pixels, packing,
+                                           "glTexSubImage");
    if (!pixels)
       return;
 
@@ -90,20 +97,28 @@ intelTexSubimage(GLcontext * ctx,
 
    assert(dstRowStride);
 
-   if (!texImage->TexFormat->StoreImage(ctx, dims, texImage->_BaseFormat,
-                                        texImage->TexFormat,
-                                        texImage->Data,
-                                        xoffset, yoffset, zoffset,
-                                        dstRowStride,
-                                        texImage->ImageOffsets,
-                                        width, height, depth,
-                                        format, type, pixels, packing)) {
-      _mesa_error(ctx, GL_OUT_OF_MEMORY, "intelTexSubImage");
+   if (compressed) {
+      if (intelImage->mt) {
+         struct intel_region *dst = intelImage->mt->region;
+         
+         _mesa_copy_rect(texImage->Data, dst->cpp, dst->pitch,
+                         xoffset, yoffset / 4,
+                         (width + 3)  & ~3, (height + 3) / 4,
+                         pixels, (width + 3) & ~3, 0, 0);
+      } else
+        memcpy(texImage->Data, pixels, imageSize);
    }
-
-   /* GL_SGIS_generate_mipmap */
-   if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      intel_generate_mipmap(ctx, target, texObj);
+   else {
+      if (!texImage->TexFormat->StoreImage(ctx, dims, texImage->_BaseFormat,
+                                           texImage->TexFormat,
+                                           texImage->Data,
+                                           xoffset, yoffset, zoffset,
+                                           dstRowStride,
+                                           texImage->ImageOffsets,
+                                           width, height, depth,
+                                           format, type, pixels, packing)) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "intelTexSubImage");
+      }
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -114,13 +129,15 @@ intelTexSubimage(GLcontext * ctx,
    }
 
    UNLOCK_HARDWARE(intel);
-}
-
-
 
+   /* GL_SGIS_generate_mipmap */
+   if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
+      intel_generate_mipmap(ctx, target, texObj);
+   }
+}
 
 
-void
+static void
 intelTexSubImage3D(GLcontext * ctx,
                    GLenum target,
                    GLint level,
@@ -132,18 +149,15 @@ intelTexSubImage3D(GLcontext * ctx,
                    struct gl_texture_object *texObj,
                    struct gl_texture_image *texImage)
 {
-
    intelTexSubimage(ctx, 3,
                     target, level,
                     xoffset, yoffset, zoffset,
-                    width, height, depth,
-                    format, type, pixels, packing, texObj, texImage);
-
+                    width, height, depth, 0,
+                    format, type, pixels, packing, texObj, texImage, GL_FALSE);
 }
 
 
-
-void
+static void
 intelTexSubImage2D(GLcontext * ctx,
                    GLenum target,
                    GLint level,
@@ -155,17 +169,15 @@ intelTexSubImage2D(GLcontext * ctx,
                    struct gl_texture_object *texObj,
                    struct gl_texture_image *texImage)
 {
-
    intelTexSubimage(ctx, 2,
                     target, level,
                     xoffset, yoffset, 0,
-                    width, height, 1,
-                    format, type, pixels, packing, texObj, texImage);
-
+                    width, height, 1, 0,
+                    format, type, pixels, packing, texObj, texImage, GL_FALSE);
 }
 
 
-void
+static void
 intelTexSubImage1D(GLcontext * ctx,
                    GLenum target,
                    GLint level,
@@ -180,12 +192,11 @@ intelTexSubImage1D(GLcontext * ctx,
    intelTexSubimage(ctx, 1,
                     target, level,
                     xoffset, 0, 0,
-                    width, 1, 1,
-                    format, type, pixels, packing, texObj, texImage);
-
+                    width, 1, 1, 0,
+                    format, type, pixels, packing, texObj, texImage, GL_FALSE);
 }
 
-void
+static void
 intelCompressedTexSubImage2D(GLcontext * ctx,
 			     GLenum target,
 			     GLint level,
@@ -196,6 +207,20 @@ intelCompressedTexSubImage2D(GLcontext * ctx,
 			     struct gl_texture_object *texObj,
 			     struct gl_texture_image *texImage)
 {
-   fprintf(stderr, "stubbed CompressedTexSubImage2D: %dx%d@%dx%d\n",
-	   width, height, xoffset, yoffset);
+   intelTexSubimage(ctx, 2,
+                    target, level,
+                    xoffset, yoffset, 0,
+                    width, height, 1, imageSize,
+                    format, 0, pixels, &ctx->Unpack, texObj, texImage, GL_TRUE);
+}
+
+
+
+void
+intelInitTextureSubImageFuncs(struct dd_function_table *functions)
+{
+   functions->TexSubImage1D = intelTexSubImage1D;
+   functions->TexSubImage2D = intelTexSubImage2D;
+   functions->TexSubImage3D = intelTexSubImage3D;
+   functions->CompressedTexSubImage2D = intelCompressedTexSubImage2D;
 }
diff --git a/shared/intel_tex_validate.c b/shared/intel_tex_validate.c
index 05a375e..a284d54 100644
--- a/shared/intel_tex_validate.c
+++ b/shared/intel_tex_validate.c
@@ -199,6 +199,7 @@ intel_finalize_mipmap_tree(struct intel_context *intel, GLuint unit)
    if (!intelObj->mt) {
       intelObj->mt = intel_miptree_create(intel,
                                           intelObj->base.Target,
+                                          firstImage->base._BaseFormat,
                                           firstImage->base.InternalFormat,
                                           intelObj->firstLevel,
                                           intelObj->lastLevel,
@@ -241,7 +242,7 @@ intel_tex_map_level_images(struct intel_context *intel,
       struct intel_texture_image *intelImage =
 	 intel_texture_image(intelObj->base.Image[face][level]);
 
-      if (intelImage->mt) {
+      if (intelImage && intelImage->mt) {
 	 intelImage->base.Data =
 	    intel_miptree_image_map(intel,
 				    intelImage->mt,
@@ -268,7 +269,7 @@ intel_tex_unmap_level_images(struct intel_context *intel,
       struct intel_texture_image *intelImage =
 	 intel_texture_image(intelObj->base.Image[face][level]);
 
-      if (intelImage->mt) {
+      if (intelImage && intelImage->mt) {
 	 intel_miptree_image_unmap(intel, intelImage->mt);
 	 intelImage->base.Data = NULL;
       }
author	Luc Verhaegen <libv@skynet.be>	2010-03-14 05:54:10 +0100
committer	Luc Verhaegen <libv@skynet.be>	2010-03-14 05:54:10 +0100
commit	4cfe98db595fd3876e252bd23ed48bf82836f795 (patch)
tree	4a05e56b9c3ebba1f42460ffe2e6cf6e7e08eafe
parent	da7796a7d40ca2cd4a03e99ddf38e3a7256a401f (diff)