83 files changed, 3984 insertions, 2383 deletions
diff --git a/configure.ac b/configure.ac
index bd9ecb7..b73fe88 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,7 +1,7 @@
 # Process this file with autoconf to produce a configure script
 
 AC_PREREQ(2.57)
-AC_INIT([mesa-dri-i9xx], 7.4.0, [], mesa-dri-i9xx)
+AC_INIT([mesa-dri-i9xx], 7.5.0, [], mesa-dri-i9xx)
 
 AM_INIT_AUTOMAKE([dist-bzip2])
 
@@ -16,8 +16,9 @@ AC_PROG_CC
 AC_HEADER_STDC
 
 PKG_CHECK_MODULES([DRM], [libdrm >= 2.4.3])
-PKG_CHECK_MODULES([DRI], [libmesadri >= 7.4.0 libmesadri < 7.5.0
-			  libmesadricommon >= 7.4.0 libmesadricommon < 7.5.0])
+# Require Mesa >= 7.5.0: this driver depends on changes in struct gl_fragment_program.
+PKG_CHECK_MODULES([DRI], [libmesadri >= 7.5.0 libmesadri < 7.6.0
+			  libmesadricommon >= 7.5.0 libmesadricommon < 7.6.0])
 
 AC_OUTPUT([
 	Makefile
diff --git a/i915/Makefile.am b/i915/Makefile.am
index b28f2ba..2dc608c 100644
--- a/i915/Makefile.am
+++ b/i915/Makefile.am
@@ -12,7 +12,6 @@ i915_dri_la_SOURCES = \
 	i830_metaops.c \
 	i830_state.c \
 	i830_texblend.c \
-	i830_tex.c \
 	i830_texstate.c \
 	i830_vtbl.c \
 	intel_render.c \
@@ -20,8 +19,8 @@ i915_dri_la_SOURCES = \
 	../shared/intel_buffer_objects.c \
 	../shared/intel_batchbuffer.c \
 	../shared/intel_clear.c \
+	../shared/intel_extensions.c \
 	../shared/intel_mipmap_tree.c \
-	i915_tex_layout.c \
 	../shared/intel_tex_layout.c \
 	../shared/intel_tex_image.c \
 	../shared/intel_tex_subimage.c \
@@ -37,7 +36,7 @@ i915_dri_la_SOURCES = \
 	../shared/intel_buffers.c \
 	../shared/intel_blit.c \
 	../shared/intel_swapbuffers.c \
-	i915_tex.c \
+	i915_tex_layout.c \
 	i915_texstate.c \
 	i915_context.c \
 	i915_debug.c \
@@ -51,7 +50,6 @@ i915_dri_la_SOURCES = \
 	../shared/intel_decode.c \
 	../shared/intel_screen.c \
 	../shared/intel_span.c \
-	intel_state.c \
+	../shared/intel_state.c \
 	intel_tris.c \
-	../shared/intel_fbo.c \
-	../shared/intel_depthstencil.c
+	../shared/intel_fbo.c
diff --git a/i915/i830_context.c b/i915/i830_context.c
index 09b1ec9..840946f 100644
--- a/i915/i830_context.c
+++ b/i915/i830_context.c
@@ -47,7 +47,6 @@ i830InitDriverFunctions(struct dd_function_table *functions)
 {
    intelInitDriverFunctions(functions);
    i830InitStateFuncs(functions);
-   i830InitTextureFuncs(functions);
 }
 
 extern const struct tnl_pipeline_stage *intel_pipeline[];
@@ -73,6 +72,8 @@ i830CreateContext(const __GLcontextModes * mesaVis,
       return GL_FALSE;
    }
 
+   _math_matrix_ctr(&intel->ViewportMatrix);
+
    /* Initialize swrast, tnl driver tables: */
    intelInitSpanFuncs(ctx);
    intelInitTriFuncs(ctx);
@@ -97,6 +98,10 @@ i830CreateContext(const __GLcontextModes * mesaVis,
    ctx->Const.MaxTextureRectSize = (1 << 11);
    ctx->Const.MaxTextureUnits = I830_TEX_UNITS;
 
+   ctx->Const.MaxTextureMaxAnisotropy = 2.0;
+
+   ctx->Const.MaxDrawBuffers = 1;
+
    _tnl_init_vertices(ctx, ctx->Const.MaxArrayLockSize + 12,
                       18 * sizeof(GLfloat));
 
diff --git a/i915/i830_state.c b/i915/i830_state.c
index d9cad0c..8ef6c91 100644
--- a/i915/i830_state.c
+++ b/i915/i830_state.c
@@ -39,6 +39,7 @@
 #include "intel_screen.h"
 #include "intel_batchbuffer.h"
 #include "intel_fbo.h"
+#include "intel_buffers.h"
 
 #include "i830_context.h"
 #include "i830_reg.h"
@@ -446,6 +447,24 @@ i830DepthMask(GLcontext * ctx, GLboolean flag)
       i830->state.Ctx[I830_CTXREG_ENABLES_2] |= DISABLE_DEPTH_WRITE;
 }
 
+/** Called from ctx->Driver.Viewport() */
+static void
+i830Viewport(GLcontext * ctx,
+              GLint x, GLint y, GLsizei width, GLsizei height)
+{
+   intelCalcViewport(ctx);
+
+   intel_viewport(ctx, x, y, width, height);
+}
+
+
+/** Called from ctx->Driver.DepthRange() */
+static void
+i830DepthRange(GLcontext * ctx, GLclampd nearval, GLclampd farval)
+{
+   intelCalcViewport(ctx);
+}
+
 /* =============================================================
  * Polygon stipple
  *
@@ -1064,6 +1083,8 @@ i830InitStateFuncs(struct dd_function_table *functions)
    functions->StencilFuncSeparate = i830StencilFuncSeparate;
    functions->StencilMaskSeparate = i830StencilMaskSeparate;
    functions->StencilOpSeparate = i830StencilOpSeparate;
+   functions->DepthRange = i830DepthRange;
+   functions->Viewport = i830Viewport;
 }
 
 void
diff --git a/i915/i830_tex.c b/i915/i830_tex.c
deleted file mode 100644
index 34ac42a..0000000
--- a/i915/i830_tex.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "main/glheader.h"
-#include "main/mtypes.h"
-#include "main/imports.h"
-#include "main/simple_list.h"
-#include "main/enums.h"
-#include "main/image.h"
-#include "main/mm.h"
-#include "main/texstore.h"
-#include "main/texformat.h"
-#include "swrast/swrast.h"
-
-#include "texmem.h"
-
-#include "i830_context.h"
-#include "i830_reg.h"
-
-
-
-static void
-i830TexEnv(GLcontext * ctx, GLenum target,
-           GLenum pname, const GLfloat * param)
-{
-
-   switch (pname) {
-   case GL_TEXTURE_ENV_COLOR:
-   case GL_TEXTURE_ENV_MODE:
-   case GL_COMBINE_RGB:
-   case GL_COMBINE_ALPHA:
-   case GL_SOURCE0_RGB:
-   case GL_SOURCE1_RGB:
-   case GL_SOURCE2_RGB:
-   case GL_SOURCE0_ALPHA:
-   case GL_SOURCE1_ALPHA:
-   case GL_SOURCE2_ALPHA:
-   case GL_OPERAND0_RGB:
-   case GL_OPERAND1_RGB:
-   case GL_OPERAND2_RGB:
-   case GL_OPERAND0_ALPHA:
-   case GL_OPERAND1_ALPHA:
-   case GL_OPERAND2_ALPHA:
-   case GL_RGB_SCALE:
-   case GL_ALPHA_SCALE:
-      break;
-
-   case GL_TEXTURE_LOD_BIAS:{
-         struct i830_context *i830 = i830_context(ctx);
-         GLuint unit = ctx->Texture.CurrentUnit;
-         int b = (int) ((*param) * 16.0);
-         if (b > 63)
-            b = 63;
-         if (b < -64)
-            b = -64;
-         I830_STATECHANGE(i830, I830_UPLOAD_TEX(unit));
-         i830->lodbias_tm0s3[unit] =
-            ((b << TM0S3_LOD_BIAS_SHIFT) & TM0S3_LOD_BIAS_MASK);
-         break;
-      }
-
-   default:
-      break;
-   }
-}
-
-
-
-
-void
-i830InitTextureFuncs(struct dd_function_table *functions)
-{
-/*
-   functions->TexEnv = i830TexEnv;
-*/
-}
diff --git a/i915/i830_texstate.c b/i915/i830_texstate.c
index c718bb0..753c25b 100644
--- a/i915/i830_texstate.c
+++ b/i915/i830_texstate.c
@@ -38,7 +38,7 @@
 
 
 static GLuint
-translate_texture_format(GLuint mesa_format)
+translate_texture_format(GLuint mesa_format, GLuint internal_format)
 {
    switch (mesa_format) {
    case MESA_FORMAT_L8:
@@ -56,7 +56,10 @@ translate_texture_format(GLuint mesa_format)
    case MESA_FORMAT_ARGB4444:
       return MAPSURF_16BIT | MT_16BIT_ARGB4444;
    case MESA_FORMAT_ARGB8888:
-      return MAPSURF_32BIT | MT_32BIT_ARGB8888;
+      if (internal_format == GL_RGB)
+	 return MAPSURF_32BIT | MT_32BIT_XRGB8888;
+      else
+	 return MAPSURF_32BIT | MT_32BIT_ARGB8888;
    case MESA_FORMAT_YCBCR_REV:
       return (MAPSURF_422 | MT_422_YCRCB_NORMAL);
    case MESA_FORMAT_YCBCR:
@@ -119,6 +122,7 @@ i830_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
    struct gl_texture_image *firstImage;
    GLuint *state = i830->state.Tex[unit], format, pitch;
    GLint lodbias;
+   GLubyte border[4];
 
    memset(state, 0, sizeof(state));
 
@@ -162,7 +166,8 @@ i830_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
 								0, intelObj->
 								firstLevel);
 
-      format = translate_texture_format(firstImage->TexFormat->MesaFormat);
+      format = translate_texture_format(firstImage->TexFormat->MesaFormat,
+					firstImage->InternalFormat);
       pitch = intelObj->mt->pitch * intelObj->mt->cpp;
    }
 
@@ -290,11 +295,16 @@ i830_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
                                                      (ws)));
    }
 
+   /* convert border color from float to ubyte */
+   CLAMPED_FLOAT_TO_UBYTE(border[0], tObj->BorderColor[0]);
+   CLAMPED_FLOAT_TO_UBYTE(border[1], tObj->BorderColor[1]);
+   CLAMPED_FLOAT_TO_UBYTE(border[2], tObj->BorderColor[2]);
+   CLAMPED_FLOAT_TO_UBYTE(border[3], tObj->BorderColor[3]);
 
-   state[I830_TEXREG_TM0S4] = INTEL_PACKCOLOR8888(tObj->_BorderChan[0],
-                                                  tObj->_BorderChan[1],
-                                                  tObj->_BorderChan[2],
-                                                  tObj->_BorderChan[3]);
+   state[I830_TEXREG_TM0S4] = INTEL_PACKCOLOR8888(border[0],
+                                                  border[1],
+                                                  border[2],
+                                                  border[3]);
 
 
    I830_ACTIVESTATE(i830, I830_UPLOAD_TEX(unit), GL_TRUE);
diff --git a/i915/i830_vtbl.c b/i915/i830_vtbl.c
index 8fc8aa5..3bf02de 100644
--- a/i915/i830_vtbl.c
+++ b/i915/i830_vtbl.c
@@ -26,12 +26,14 @@
  **************************************************************************/
 
 #include "glapi/glapi.h"
+#include "main/texformat.h"
 
 #include "i830_context.h"
 #include "i830_reg.h"
 #include "intel_batchbuffer.h"
 #include "intel_regions.h"
 #include "intel_tris.h"
+#include "intel_fbo.h"
 #include "tnl/t_context.h"
 #include "tnl/t_vertex.h"
 
@@ -422,10 +424,10 @@ i830_emit_state(struct intel_context *intel)
    struct i830_hw_state *state = i830->current;
    int i, count;
    GLuint dirty;
-   GET_CURRENT_CONTEXT(ctx);
-   BATCH_LOCALS;
    dri_bo *aper_array[3 + I830_TEX_UNITS];
    int aper_count;
+   GET_CURRENT_CONTEXT(ctx);
+   BATCH_LOCALS;
 
    /* We don't hold the lock at this point, so want to make sure that
     * there won't be a buffer wrap between the state emits and the primitive
@@ -614,6 +616,8 @@ i830_state_draw_region(struct intel_context *intel,
 {
    struct i830_context *i830 = i830_context(&intel->ctx);
    GLcontext *ctx = &intel->ctx;
+   struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
+   struct intel_renderbuffer *irb = intel_renderbuffer(rb);
    GLuint value;
 
    ASSERT(state == &i830->state || state == &i830->meta);
@@ -651,13 +655,27 @@ i830_state_draw_region(struct intel_context *intel,
     */
    value = (DSTORG_HORT_BIAS(0x8) |     /* .5 */
             DSTORG_VERT_BIAS(0x8) | DEPTH_IS_Z);    /* .5 */
-            
-   if (color_region && color_region->cpp == 4) {
-      value |= DV_PF_8888;
-   }
-   else {
-      value |= DV_PF_565;
+
+   if (irb != NULL) {
+      switch (irb->texformat->MesaFormat) {
+      case MESA_FORMAT_ARGB8888:
+	 value |= DV_PF_8888;
+	 break;
+      case MESA_FORMAT_RGB565:
+	 value |= DV_PF_565;
+	 break;
+      case MESA_FORMAT_ARGB1555:
+	 value |= DV_PF_1555;
+	 break;
+      case MESA_FORMAT_ARGB4444:
+	 value |= DV_PF_4444;
+	 break;
+      default:
+	 _mesa_problem(ctx, "Bad renderbuffer format: %d\n",
+		       irb->texformat->MesaFormat);
+      }
    }
+
    if (depth_region && depth_region->cpp == 4) {
       value |= DEPTH_FRMT_24_FIXED_8_OTHER;
    }
diff --git a/i915/i915_context.c b/i915/i915_context.c
index 3d6af38..1f9f363 100644
--- a/i915/i915_context.c
+++ b/i915/i915_context.c
@@ -50,16 +50,6 @@
  * Mesa's Driver Functions
  ***************************************/
 
-static const struct dri_extension i915_extensions[] = {
-   {"GL_ARB_depth_texture", NULL},
-   {"GL_ARB_fragment_program", NULL},
-   {"GL_ARB_shadow", NULL},
-   {"GL_ARB_texture_non_power_of_two", NULL},
-   {"GL_ATI_texture_env_combine3",       NULL},
-   {"GL_EXT_shadow_funcs", NULL},
-   {NULL, NULL}
-};
-
 /* Override intel default.
  */
 static void
@@ -93,7 +83,6 @@ i915InitDriverFunctions(struct dd_function_table *functions)
 {
    intelInitDriverFunctions(functions);
    i915InitStateFunctions(functions);
-   i915InitTextureFuncs(functions);
    i915InitFragProgFuncs(functions);
    functions->UpdateState = i915InvalidateState;
 }
@@ -129,6 +118,8 @@ i915CreateContext(const __GLcontextModes * mesaVis,
       return GL_FALSE;
    }
 
+   _math_matrix_ctr(&intel->ViewportMatrix);
+
    /* Initialize swrast, tnl driver tables: */
    intelInitSpanFuncs(ctx);
    intelInitTriFuncs(ctx);
@@ -154,6 +145,8 @@ i915CreateContext(const __GLcontextModes * mesaVis,
    ctx->Const.MaxTextureRectSize = (1 << 11);
    ctx->Const.MaxTextureUnits = I915_TEX_UNITS;
 
+   ctx->Const.MaxTextureMaxAnisotropy = 4.0;
+
    /* GL_ARB_fragment_program limits - don't think Mesa actually
     * validates programs against these, and in any case one ARB
     * instruction can translate to more than one HW instruction, so
@@ -172,8 +165,7 @@ i915CreateContext(const __GLcontextModes * mesaVis,
 
    ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
 
-   driInitExtensions(ctx, i915_extensions, GL_FALSE);
-
+   ctx->Const.MaxDrawBuffers = 1;
 
    _tnl_init_vertices(ctx, ctx->Const.MaxArrayLockSize + 12,
                       36 * sizeof(GLfloat));
diff --git a/i915/i915_fragprog.c b/i915/i915_fragprog.c
index f091d60..2db10c6 100644
--- a/i915/i915_fragprog.c
+++ b/i915/i915_fragprog.c
@@ -162,12 +162,12 @@ src_vector(struct i915_fragment_program *p,
                  GET_SWZ(source->Swizzle, 1),
                  GET_SWZ(source->Swizzle, 2), GET_SWZ(source->Swizzle, 3));
 
-   if (source->NegateBase)
+   if (source->Negate)
       src = negate(src,
-                   GET_BIT(source->NegateBase, 0),
-                   GET_BIT(source->NegateBase, 1),
-                   GET_BIT(source->NegateBase, 2),
-                   GET_BIT(source->NegateBase, 3));
+                   GET_BIT(source->Negate, 0),
+                   GET_BIT(source->Negate, 1),
+                   GET_BIT(source->Negate, 2),
+                   GET_BIT(source->Negate, 3));
 
    return src;
 }
@@ -180,9 +180,9 @@ get_result_vector(struct i915_fragment_program *p,
    switch (inst->DstReg.File) {
    case PROGRAM_OUTPUT:
       switch (inst->DstReg.Index) {
-      case FRAG_RESULT_COLR:
+      case FRAG_RESULT_COLOR:
          return UREG(REG_TYPE_OC, 0);
-      case FRAG_RESULT_DEPR:
+      case FRAG_RESULT_DEPTH:
          p->depth_written = 1;
          return UREG(REG_TYPE_OD, 0);
       default:
@@ -323,7 +323,8 @@ upload_program(struct i915_fragment_program *p)
       p->ctx->FragmentProgram._Current;
    const struct prog_instruction *inst = program->Base.Instructions;
 
-/*    _mesa_debug_fp_inst(program->Base.NumInstructions, inst); */
+   if (INTEL_DEBUG & DEBUG_WM)
+      _mesa_print_program(&program->Base);
 
    /* Is this a parse-failed program?  Ensure a valid program is
     * loaded, as the flagging of an error isn't sufficient to stop
@@ -1049,9 +1050,6 @@ i915ProgramStringNotify(GLcontext * ctx,
          _mesa_append_fog_code(ctx, &p->FragProg);
          p->FragProg.FogOption = GL_NONE;
       }
-
-      if (INTEL_DEBUG & DEBUG_STATE)
-	 _mesa_print_program(prog);
    }
 
    _tnl_program_string(ctx, target, prog);
diff --git a/i915/i915_state.c b/i915/i915_state.c
index 8347deb..814fb59 100644
--- a/i915/i915_state.c
+++ b/i915/i915_state.c
@@ -321,18 +321,9 @@ intelCalcViewport(GLcontext * ctx)
 
    if (ctx->DrawBuffer->Name) {
       /* User created FBO */
-      struct intel_renderbuffer *irb
-         = intel_renderbuffer(ctx->DrawBuffer->_ColorDrawBuffers[0]);
-      if (irb && !irb->RenderToTexture) {
-         /* y=0=top */
-         yScale = -1.0;
-         yBias = irb->Base.Height;
-      }
-      else {
-         /* y=0=bottom */
-         yScale = 1.0;
-         yBias = 0.0;
-      }
+      /* y=0=bottom */
+      yScale = 1.0;
+      yBias = 0.0;
    }
    else {
       /* window buffer, y=0=top */
@@ -353,7 +344,7 @@ intelCalcViewport(GLcontext * ctx)
 
 /** Called from ctx->Driver.Viewport() */
 static void
-intelViewport(GLcontext * ctx,
+i915Viewport(GLcontext * ctx,
               GLint x, GLint y, GLsizei width, GLsizei height)
 {
    intelCalcViewport(ctx);
@@ -364,7 +355,7 @@ intelViewport(GLcontext * ctx,
 
 /** Called from ctx->Driver.DepthRange() */
 static void
-intelDepthRange(GLcontext * ctx, GLclampd nearval, GLclampd farval)
+i915DepthRange(GLcontext * ctx, GLclampd nearval, GLclampd farval)
 {
    intelCalcViewport(ctx);
 }
@@ -1033,8 +1024,8 @@ i915InitStateFunctions(struct dd_function_table *functions)
    functions->StencilFuncSeparate = i915StencilFuncSeparate;
    functions->StencilMaskSeparate = i915StencilMaskSeparate;
    functions->StencilOpSeparate = i915StencilOpSeparate;
-   functions->DepthRange = intelDepthRange;
-   functions->Viewport = intelViewport;
+   functions->DepthRange = i915DepthRange;
+   functions->Viewport = i915Viewport;
 }
 
 
diff --git a/i915/i915_tex_layout.c b/i915/i915_tex_layout.c
index d44a2f4..7cc1c09 100644
--- a/i915/i915_tex_layout.c
+++ b/i915/i915_tex_layout.c
@@ -454,7 +454,10 @@ i945_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt)
 {
    switch (mt->target) {
    case GL_TEXTURE_CUBE_MAP:
-      i945_miptree_layout_cube(intel, mt);
+      if (mt->compressed)
+	 i945_miptree_layout_cube(intel, mt);
+      else
+	 i915_miptree_layout_cube(intel, mt);
       break;
    case GL_TEXTURE_3D:
       i945_miptree_layout_3d(intel, mt);
diff --git a/i915/i915_texstate.c b/i915/i915_texstate.c
index adbb52a..a37dd7f 100644
--- a/i915/i915_texstate.c
+++ b/i915/i915_texstate.c
@@ -37,7 +37,8 @@
 
 
 static GLuint
-translate_texture_format(GLuint mesa_format, GLenum DepthMode)
+translate_texture_format(GLuint mesa_format, GLuint internal_format,
+			 GLenum DepthMode)
 {
    switch (mesa_format) {
    case MESA_FORMAT_L8:
@@ -55,7 +56,10 @@ translate_texture_format(GLuint mesa_format, GLenum DepthMode)
    case MESA_FORMAT_ARGB4444:
       return MAPSURF_16BIT | MT_16BIT_ARGB4444;
    case MESA_FORMAT_ARGB8888:
-      return MAPSURF_32BIT | MT_32BIT_ARGB8888;
+      if (internal_format == GL_RGB)
+	 return MAPSURF_32BIT | MT_32BIT_XRGB8888;
+      else
+	 return MAPSURF_32BIT | MT_32BIT_ARGB8888;
    case MESA_FORMAT_YCBCR_REV:
       return (MAPSURF_422 | MT_422_YCRCB_NORMAL);
    case MESA_FORMAT_YCBCR:
@@ -128,7 +132,8 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
    struct intel_texture_object *intelObj = intel_texture_object(tObj);
    struct gl_texture_image *firstImage;
    GLuint *state = i915->state.Tex[unit], format, pitch;
-   GLint lodbias;
+   GLint lodbias, aniso = 0;
+   GLubyte border[4];
 
    memset(state, 0, sizeof(state));
 
@@ -173,7 +178,8 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
 								 firstLevel);
 
       format = translate_texture_format(firstImage->TexFormat->MesaFormat, 
-		tObj->DepthMode);
+					firstImage->InternalFormat,
+					tObj->DepthMode);
       pitch = intelObj->mt->pitch * intelObj->mt->cpp;
    }
 
@@ -224,6 +230,10 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
       if (tObj->MaxAnisotropy > 1.0) {
          minFilt = FILTER_ANISOTROPIC;
          magFilt = FILTER_ANISOTROPIC;
+         if (tObj->MaxAnisotropy > 2.0)
+            aniso = SS2_MAX_ANISO_4;
+         else
+            aniso = SS2_MAX_ANISO_2;
       }
       else {
          switch (tObj->MagFilter) {
@@ -269,7 +279,8 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
 
       state[I915_TEXREG_SS2] |= ((minFilt << SS2_MIN_FILTER_SHIFT) |
                                  (mipFilt << SS2_MIP_FILTER_SHIFT) |
-                                 (magFilt << SS2_MAG_FILTER_SHIFT));
+                                 (magFilt << SS2_MAG_FILTER_SHIFT) |
+                                 aniso);
    }
 
    {
@@ -313,21 +324,26 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
       state[I915_TEXREG_SS3] |= (unit << SS3_TEXTUREMAP_INDEX_SHIFT);
    }
 
+   /* convert border color from float to ubyte */
+   CLAMPED_FLOAT_TO_UBYTE(border[0], tObj->BorderColor[0]);
+   CLAMPED_FLOAT_TO_UBYTE(border[1], tObj->BorderColor[1]);
+   CLAMPED_FLOAT_TO_UBYTE(border[2], tObj->BorderColor[2]);
+   CLAMPED_FLOAT_TO_UBYTE(border[3], tObj->BorderColor[3]);
 
    if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) {
       /* GL specs that border color for depth textures is taken from the
        * R channel, while the hardware uses A.  Spam R into all the channels
        * for safety.
        */
-      state[I915_TEXREG_SS4] = INTEL_PACKCOLOR8888(tObj->_BorderChan[0],
-						   tObj->_BorderChan[0],
-						   tObj->_BorderChan[0],
-						   tObj->_BorderChan[0]);
+      state[I915_TEXREG_SS4] = INTEL_PACKCOLOR8888(border[0],
+						   border[0],
+						   border[0],
+						   border[0]);
    } else {
-      state[I915_TEXREG_SS4] = INTEL_PACKCOLOR8888(tObj->_BorderChan[0],
-						   tObj->_BorderChan[1],
-						   tObj->_BorderChan[2],
-						   tObj->_BorderChan[3]);
+      state[I915_TEXREG_SS4] = INTEL_PACKCOLOR8888(border[0],
+						   border[1],
+						   border[2],
+						   border[3]);
    }
 
 
diff --git a/i915/i915_vtbl.c b/i915/i915_vtbl.c
index 3f6d282..1150046 100644
--- a/i915/i915_vtbl.c
+++ b/i915/i915_vtbl.c
@@ -32,6 +32,7 @@
 #include "main/imports.h"
 #include "main/macros.h"
 #include "main/colormac.h"
+#include "main/texformat.h"
 
 #include "tnl/t_context.h"
 #include "tnl/t_vertex.h"
@@ -40,6 +41,7 @@
 #include "intel_tex.h"
 #include "intel_regions.h"
 #include "intel_tris.h"
+#include "intel_fbo.h"
 
 #include "i915_reg.h"
 #include "i915_context.h"
@@ -542,6 +544,8 @@ i915_state_draw_region(struct intel_context *intel,
 {
    struct i915_context *i915 = i915_context(&intel->ctx);
    GLcontext *ctx = &intel->ctx;
+   struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
+   struct intel_renderbuffer *irb = intel_renderbuffer(rb);
    GLuint value;
 
    ASSERT(state == &i915->state || state == &i915->meta);
@@ -580,12 +584,26 @@ i915_state_draw_region(struct intel_context *intel,
    value = (DSTORG_HORT_BIAS(0x8) |     /* .5 */
             DSTORG_VERT_BIAS(0x8) |     /* .5 */
             LOD_PRECLAMP_OGL | TEX_DEFAULT_COLOR_OGL);
-   if (color_region && color_region->cpp == 4) {
-      value |= DV_PF_8888;
-   }
-   else {
-      value |= (DITHER_FULL_ALWAYS | DV_PF_565);
+   if (irb != NULL) {
+      switch (irb->texformat->MesaFormat) {
+      case MESA_FORMAT_ARGB8888:
+	 value |= DV_PF_8888;
+	 break;
+      case MESA_FORMAT_RGB565:
+	 value |= DV_PF_565 | DITHER_FULL_ALWAYS;
+	 break;
+      case MESA_FORMAT_ARGB1555:
+	 value |= DV_PF_1555 | DITHER_FULL_ALWAYS;
+	 break;
+      case MESA_FORMAT_ARGB4444:
+	 value |= DV_PF_4444 | DITHER_FULL_ALWAYS;
+	 break;
+      default:
+	 _mesa_problem(ctx, "Bad renderbuffer format: %d\n",
+		       irb->texformat->MesaFormat);
+      }
    }
+
    if (depth_region && depth_region->cpp == 4) {
       value |= DEPTH_FRMT_24_FIXED_8_OTHER;
    }
diff --git a/i915/intel_tris.c b/i915/intel_tris.c
index a857803..1d39278 100644
--- a/i915/intel_tris.c
+++ b/i915/intel_tris.c
@@ -89,8 +89,8 @@ intel_flush_inline_primitive(struct intel_context *intel)
 
 static void intel_start_inline(struct intel_context *intel, uint32_t prim)
 {
-   BATCH_LOCALS;
    uint32_t batch_flags = LOOP_CLIPRECTS;
+   BATCH_LOCALS;
 
    intel->vtbl.emit_state(intel);
 
@@ -201,10 +201,10 @@ uint32_t *intel_get_prim_space(struct intel_context *intel, unsigned int count)
 /** Dispatches the accumulated primitive to the batchbuffer. */
 void intel_flush_prim(struct intel_context *intel)
 {
-   BATCH_LOCALS;
    dri_bo *aper_array[2];
    dri_bo *vb_bo;
    unsigned int offset, count;
+   BATCH_LOCALS;
 
    /* Must be called after an intel_start_prim. */
    assert(intel->prim.primitive != ~0);
diff --git a/i965/Makefile.am b/i965/Makefile.am
index 69abe6c..5a118af 100644
--- a/i965/Makefile.am
+++ b/i965/Makefile.am
@@ -15,7 +15,7 @@ i965_dri_la_SOURCES = \
 	../shared/intel_clear.c \
 	../shared/intel_context.c \
 	../shared/intel_decode.c \
-	../shared/intel_depthstencil.c \
+	../shared/intel_extensions.c \
 	../shared/intel_fbo.c \
 	../shared/intel_mipmap_tree.c \
 	../shared/intel_regions.c \
@@ -25,7 +25,7 @@ i965_dri_la_SOURCES = \
 	../shared/intel_pixel_bitmap.c \
 	../shared/intel_pixel_copy.c \
 	../shared/intel_pixel_draw.c \
-	intel_state.c \
+	../shared/intel_state.c \
 	../shared/intel_swapbuffers.c \
 	../shared/intel_tex.c \
 	../shared/intel_tex_copy.c \
diff --git a/i965/brw_cc.c b/i965/brw_cc.c
index 8237016..c724218 100644
--- a/i965/brw_cc.c
+++ b/i965/brw_cc.c
@@ -88,7 +88,7 @@ cc_unit_populate_key(struct brw_context *brw, struct brw_cc_unit_key *key)
 
    memset(key, 0, sizeof(*key));
 
-   key->stencil = ctx->Stencil.Enabled;
+   key->stencil = ctx->Stencil._Enabled;
    key->stencil_two_side = ctx->Stencil._TestTwoSide;
 
    if (key->stencil) {
diff --git a/i965/brw_context.c b/i965/brw_context.c
index 4357100..4dbe551 100644
--- a/i965/brw_context.c
+++ b/i965/brw_context.c
@@ -111,14 +111,15 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
                                      ctx->Const.MaxTextureImageUnits);
    ctx->Const.MaxVertexTextureImageUnits = 0; /* no vertex shader textures */
 
-   /* Advertise the full hardware capabilities.  The new memory
-    * manager should cope much better with overload situations:
+   /* Mesa limits textures to 4kx4k; it would be nice to fix that someday
     */
-   ctx->Const.MaxTextureLevels = 12;
+   ctx->Const.MaxTextureLevels = 13;
    ctx->Const.Max3DTextureLevels = 9;
    ctx->Const.MaxCubeTextureLevels = 12;
-   ctx->Const.MaxTextureRectSize = (1<<11);
+   ctx->Const.MaxTextureRectSize = (1<<12);
    
+   ctx->Const.MaxTextureMaxAnisotropy = 16.0;
+
    /* if conformance mode is set, swrast can handle any size AA point */
    ctx->Const.MaxPointSizeAA = 255.0;
 
diff --git a/i965/brw_context.h b/i965/brw_context.h
index df90c20..577497b 100644
--- a/i965/brw_context.h
+++ b/i965/brw_context.h
@@ -46,7 +46,7 @@
  *
  * CURBE - constant URB entry.  An urb region (entry) used to hold
  * constant values which the fixed function units can be instructed to
- * preload into the GRF when spawining a thread.
+ * preload into the GRF when spawning a thread.
  *
  * VUE - vertex URB entry.  An urb entry holding a vertex and usually
  * a vertex header.  The header contains control information and
@@ -63,7 +63,7 @@
  * special and may be overwritten.
  *
  * MRF - message register file.  Threads communicate (and terminate)
- * by sending messages.  Message parameters are placed in contigous
+ * by sending messages.  Message parameters are placed in contiguous
  * MRF registers.  All program output is via these messages.  URB
  * entries are populated by sending a message to the shared URB
  * function containing the new data, together with a control word,
@@ -141,7 +141,8 @@ struct brw_context;
 #define BRW_NEW_BATCH			0x10000
 /** brw->depth_region updated */
 #define BRW_NEW_DEPTH_BUFFER		0x20000
-#define BRW_NEW_NR_SURFACES		0x40000
+#define BRW_NEW_NR_WM_SURFACES		0x40000
+#define BRW_NEW_NR_VS_SURFACES		0x80000
 
 struct brw_state_flags {
    /** State update flags signalled by mesa internals */
@@ -154,19 +155,25 @@ struct brw_state_flags {
    GLuint cache;
 };
 
+
+/** Subclass of Mesa vertex program */
 struct brw_vertex_program {
    struct gl_vertex_program program;
    GLuint id;
+   dri_bo *const_buffer;    /**< Program constant buffer/surface */
+   GLboolean use_const_buffer;
 };
 
 
-
+/** Subclass of Mesa fragment program */
 struct brw_fragment_program {
    struct gl_fragment_program program;
-   GLuint id;
-};
-
+   GLuint id;  /**< serial no. to identify frag progs, never re-used */
+   GLboolean isGLSL;  /**< really, any IF/LOOP/CONT/BREAK instructions */
 
+   dri_bo *const_buffer;    /**< Program constant buffer/surface */
+   GLboolean use_const_buffer;
+};
 
 
 /* Data about a particular attempt to compile a program.  Note that
@@ -182,7 +189,7 @@ struct brw_wm_prog_data {
    GLuint total_grf;
    GLuint total_scratch;
 
-   GLuint nr_params;
+   GLuint nr_params;       /**< number of float params/constants */
    GLboolean error;
 
    /* Pointer to tracked values (only valid once
@@ -221,6 +228,7 @@ struct brw_vs_prog_data {
    GLuint urb_read_length;
    GLuint total_grf;
    GLuint outputs_written;
+   GLuint nr_params;       /**< number of float params/constants */
 
    GLuint inputs_read;
 
@@ -237,8 +245,35 @@ struct brw_vs_ouput_sizes {
 };
 
 
+/** Number of texture sampler units */
 #define BRW_MAX_TEX_UNIT 16
-#define BRW_WM_MAX_SURF BRW_MAX_TEX_UNIT + MAX_DRAW_BUFFERS
+
+/**
+ * Size of our surface binding table for the WM.
+ * This contains pointers to the drawing surfaces and current texture
+ * objects, plus one fragment shader constant buffer (+1).
+ */
+#define BRW_WM_MAX_SURF (MAX_DRAW_BUFFERS + BRW_MAX_TEX_UNIT + 1)
+
+/**
+ * Helpers to convert drawing buffers, textures and constant buffers
+ * to surface binding table indexes, for WM.
+ */
+#define SURF_INDEX_DRAW(d)           (d)
+#define SURF_INDEX_FRAG_CONST_BUFFER (MAX_DRAW_BUFFERS) 
+#define SURF_INDEX_TEXTURE(t)        (MAX_DRAW_BUFFERS + 1 + (t))
+
+/**
+ * Size of surface binding table for the VS.
+ * Only one constant buffer for now.
+ */
+#define BRW_VS_MAX_SURF 1
+
+/**
+ * Only a VS constant buffer
+ */
+#define SURF_INDEX_VERT_CONST_BUFFER 0
+
 
 enum brw_cache_id {
    BRW_CC_VP,
@@ -418,8 +453,8 @@ struct brw_context
       struct brw_tracked_state **atoms;
       GLuint nr_atoms;
 
-      GLuint nr_draw_regions;
-      struct intel_region *draw_regions[MAX_DRAW_BUFFERS];
+      GLuint nr_color_regions;
+      struct intel_region *color_regions[MAX_DRAW_BUFFERS];
       struct intel_region *depth_region;
 
       /**
@@ -512,8 +547,8 @@ struct brw_context
    /* BRW_NEW_CURBE_OFFSETS: 
     */
    struct {
-      GLuint wm_start;
-      GLuint wm_size;
+      GLuint wm_start;  /**< pos of first wm const in CURBE buffer */
+      GLuint wm_size;   /**< number of float[4] consts, multiple of 16 */
       GLuint clip_start;
       GLuint clip_size;
       GLuint vs_start;
@@ -545,6 +580,11 @@ struct brw_context
 
       dri_bo *prog_bo;
       dri_bo *state_bo;
+
+      /** Binding table of pointers to surf_bo entries */
+      dri_bo *bind_bo;
+      dri_bo *surf_bo[BRW_VS_MAX_SURF];
+      GLuint nr_surfaces;      
    } vs;
 
    struct {
@@ -576,9 +616,10 @@ struct brw_context
       struct brw_wm_prog_data *prog_data;
       struct brw_wm_compile *compile_data;
 
-      /* Input sizes, calculated from active vertex program:
+      /** Input sizes, calculated from active vertex program.
+       * One bit per fragment program input attribute.
        */
-      GLuint input_size_masks[4];
+      GLbitfield input_size_masks[4];
 
       /** Array of surface default colors (texture border color) */
       dri_bo *sdc_bo[BRW_MAX_TEX_UNIT];
@@ -587,7 +628,7 @@ struct brw_context
       GLuint nr_surfaces;      
 
       GLuint max_threads;
-      dri_bo *scratch_buffer;
+      dri_bo *scratch_bo;
 
       GLuint sampler_count;
       dri_bo *sampler_bo;
@@ -627,8 +668,6 @@ struct brw_context
  * brw_vtbl.c
  */
 void brwInitVtbl( struct brw_context *brw );
-void brw_do_flush( struct brw_context *brw, 
-		   GLuint flags );
 
 /*======================================================================
  * brw_context.c
@@ -670,7 +709,9 @@ void brwInitFragProgFuncs( struct dd_function_table *functions );
  */
 void brw_upload_urb_fence(struct brw_context *brw);
 
-void brw_upload_constant_buffer_state(struct brw_context *brw);
+/* brw_curbe.c
+ */
+void brw_upload_cs_urb_state(struct brw_context *brw);
 
 
 /*======================================================================
@@ -683,6 +724,32 @@ brw_context( GLcontext *ctx )
    return (struct brw_context *)ctx;
 }
 
+static INLINE struct brw_vertex_program *
+brw_vertex_program(struct gl_vertex_program *p)
+{
+   return (struct brw_vertex_program *) p; /* plain pointer cast, no checking */
+}
+
+static INLINE const struct brw_vertex_program *
+brw_vertex_program_const(const struct gl_vertex_program *p)
+{
+   return (const struct brw_vertex_program *) p; /* const-preserving pointer cast */
+}
+
+static INLINE struct brw_fragment_program *
+brw_fragment_program(struct gl_fragment_program *p)
+{
+   return (struct brw_fragment_program *) p; /* plain pointer cast, no checking */
+}
+
+static INLINE const struct brw_fragment_program *
+brw_fragment_program_const(const struct gl_fragment_program *p)
+{
+   return (const struct brw_fragment_program *) p; /* const-preserving pointer cast */
+}
+
+
+
 #define DO_SETUP_BITS ((1<<(FRAG_ATTRIB_MAX)) - 1)
 
 #endif
diff --git a/i965/brw_curbe.c b/i965/brw_curbe.c
index 4eaaa5f..9197fed 100644
--- a/i965/brw_curbe.c
+++ b/i965/brw_curbe.c
@@ -38,23 +38,28 @@
 #include "shader/prog_parameter.h"
 #include "shader/prog_statevars.h"
 #include "intel_batchbuffer.h"
+#include "intel_regions.h"
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_state.h"
 #include "brw_util.h"
 
 
-/* Partition the CURBE between the various users of constant values:
+/**
+ * Partition the CURBE between the various users of constant values:
+ * Note that vertex and fragment shaders can now fetch constants out
+ * of constant buffers.  We no longer allocate a block of the GRF for
+ * constants.  That greatly reduces the demand for space in the CURBE.
+ * Some of the comments within are dated...
  */
 static void calculate_curbe_offsets( struct brw_context *brw )
 {
    GLcontext *ctx = &brw->intel.ctx;
    /* CACHE_NEW_WM_PROG */
-   GLuint nr_fp_regs = (brw->wm.prog_data->nr_params + 15) / 16;
+   const GLuint nr_fp_regs = (brw->wm.prog_data->nr_params + 15) / 16;
    
    /* BRW_NEW_VERTEX_PROGRAM */
-   struct brw_vertex_program *vp = (struct brw_vertex_program *)brw->vertex_program;
-   GLuint nr_vp_regs = (vp->program.Base.Parameters->NumParameters * 4 + 15) / 16;
+   const GLuint nr_vp_regs = (brw->vs.prog_data->nr_params + 15) / 16;
    GLuint nr_clip_regs = 0;
    GLuint total_regs;
 
@@ -138,24 +143,24 @@ const struct brw_tracked_state brw_curbe_offsets = {
  * fixed-function hardware in a double-buffering scheme to avoid a
  * pipeline stall each time the contents of the curbe is changed.
  */
-void brw_upload_constant_buffer_state(struct brw_context *brw)
+void brw_upload_cs_urb_state(struct brw_context *brw)
 {
-   struct brw_constant_buffer_state cbs; 
-   memset(&cbs, 0, sizeof(cbs));
+   struct brw_cs_urb_state cs_urb;
+   memset(&cs_urb, 0, sizeof(cs_urb));
 
    /* It appears that this is the state packet for the CS unit, ie. the
     * urb entries detailed here are housed in the CS range from the
     * URB_FENCE command.
     */
-   cbs.header.opcode = CMD_CONST_BUFFER_STATE;
-   cbs.header.length = sizeof(cbs)/4 - 2;
+   cs_urb.header.opcode = CMD_CS_URB_STATE;
+   cs_urb.header.length = sizeof(cs_urb)/4 - 2;
 
    /* BRW_NEW_URB_FENCE */
-   cbs.bits0.nr_urb_entries = brw->urb.nr_cs_entries;
-   cbs.bits0.urb_entry_size = brw->urb.csize - 1;
+   cs_urb.bits0.nr_urb_entries = brw->urb.nr_cs_entries;
+   cs_urb.bits0.urb_entry_size = brw->urb.csize - 1;
 
    assert(brw->urb.nr_cs_entries);
-   BRW_CACHED_BATCH_STRUCT(brw, &cbs);
+   BRW_CACHED_BATCH_STRUCT(brw, &cs_urb);
 }
 
 static GLfloat fixed_plane[6][4] = {
@@ -174,10 +179,12 @@ static GLfloat fixed_plane[6][4] = {
 static void prepare_constant_buffer(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
-   struct brw_vertex_program *vp = (struct brw_vertex_program *)brw->vertex_program;
-   struct brw_fragment_program *fp = (struct brw_fragment_program *)brw->fragment_program;
-   GLuint sz = brw->curbe.total_size;
-   GLuint bufsz = sz * 16 * sizeof(GLfloat);
+   const struct brw_vertex_program *vp =
+      brw_vertex_program_const(brw->vertex_program);
+   const struct brw_fragment_program *fp =
+      brw_fragment_program_const(brw->fragment_program);
+   const GLuint sz = brw->curbe.total_size;
+   const GLuint bufsz = sz * 16 * sizeof(GLfloat);
    GLfloat *buf;
    GLuint i;
 
@@ -189,27 +196,25 @@ static void prepare_constant_buffer(struct brw_context *brw)
    brw->curbe.tracked_state.dirty.mesa |= fp->program.Base.Parameters->StateFlags;
 
    if (sz == 0) {
-
       if (brw->curbe.last_buf) {
 	 free(brw->curbe.last_buf);
 	 brw->curbe.last_buf = NULL;
 	 brw->curbe.last_bufsz  = 0;
       }
-
       return;
    }
 
-   buf = (GLfloat *)malloc(bufsz);
-
-   memset(buf, 0, bufsz);
+   buf = (GLfloat *) _mesa_calloc(bufsz);
 
+   /* fragment shader constants */
    if (brw->curbe.wm_size) {
       GLuint offset = brw->curbe.wm_start * 16;
 
       _mesa_load_state_parameters(ctx, fp->program.Base.Parameters); 
 
+      /* copy float constants */
       for (i = 0; i < brw->wm.prog_data->nr_params; i++) 
-	 buf[offset + i] = brw->wm.prog_data->param[i][0];
+	 buf[offset + i] = *brw->wm.prog_data->param[i];
    }
 
 
@@ -244,18 +249,20 @@ static void prepare_constant_buffer(struct brw_context *brw)
       }
    }
 
-
+   /* vertex shader constants */
    if (brw->curbe.vs_size) {
       GLuint offset = brw->curbe.vs_start * 16;
-      GLuint nr = vp->program.Base.Parameters->NumParameters;
+      GLuint nr = brw->vs.prog_data->nr_params / 4;
 
       _mesa_load_state_parameters(ctx, vp->program.Base.Parameters); 
 
+      /* XXX just use a memcpy here */
       for (i = 0; i < nr; i++) {
-	 buf[offset + i * 4 + 0] = vp->program.Base.Parameters->ParameterValues[i][0];
-	 buf[offset + i * 4 + 1] = vp->program.Base.Parameters->ParameterValues[i][1];
-	 buf[offset + i * 4 + 2] = vp->program.Base.Parameters->ParameterValues[i][2];
-	 buf[offset + i * 4 + 3] = vp->program.Base.Parameters->ParameterValues[i][3];
+         const GLfloat *value = vp->program.Base.Parameters->ParameterValues[i];
+	 buf[offset + i * 4 + 0] = value[0];
+	 buf[offset + i * 4 + 1] = value[1];
+	 buf[offset + i * 4 + 2] = value[2];
+	 buf[offset + i * 4 + 3] = value[3];
       }
    }
 
@@ -274,11 +281,14 @@ static void prepare_constant_buffer(struct brw_context *brw)
        brw->curbe.last_buf &&
        bufsz == brw->curbe.last_bufsz &&
        memcmp(buf, brw->curbe.last_buf, bufsz) == 0) {
-      free(buf);
+      /* constants have not changed */
+      _mesa_free(buf);
    } 
    else {
+      /* constants have changed */
       if (brw->curbe.last_buf)
-	 free(brw->curbe.last_buf);
+	 _mesa_free(brw->curbe.last_buf);
+
       brw->curbe.last_buf = buf;
       brw->curbe.last_bufsz = bufsz;
 
@@ -326,11 +336,77 @@ static void prepare_constant_buffer(struct brw_context *brw)
 }
 
 
+/**
+ * Copy every Mesa program parameter (a float[4] each) to the start of const_buffer.
+ */
+static void
+update_constant_buffer(struct brw_context *brw,
+                       const struct gl_program_parameter_list *params,
+                       dri_bo *const_buffer)
+{
+   struct intel_context *intel = &brw->intel;
+   const int size = params->NumParameters * 4 * sizeof(GLfloat);
+   /* size is in bytes: four GLfloats per parameter */
+   /* nothing is copied when const_buffer is NULL or there are no parameters */
+   if (const_buffer && size > 0) {
+      /* sanity checks; the first is already guaranteed by the test above */
+      assert(const_buffer);
+      assert(const_buffer->size >= size);
+      /* write through a GTT map if kernel_exec_fencing is set, else use dri_bo_subdata() */
+      if (intel->intelScreen->kernel_exec_fencing) {
+         drm_intel_gem_bo_map_gtt(const_buffer);
+         memcpy(const_buffer->virtual, params->ParameterValues, size);
+         drm_intel_gem_bo_unmap_gtt(const_buffer);
+      }
+      else {
+         dri_bo_subdata(const_buffer, 0, size, params->ParameterValues);
+      }
+      /* disabled debug dump of each parameter vector */
+      if (0) {
+         int i;
+         for (i = 0; i < params->NumParameters; i++) {
+            float *p = params->ParameterValues[i];
+            printf("%d: %f %f %f %f\n", i, p[0], p[1], p[2], p[3]);
+         }
+      }
+   }
+}
+
+
+/** Upload the current vertex program's parameters, but only if it uses a constant buffer */
+static void
+update_vertex_constant_buffer(struct brw_context *brw)
+{
+   const struct brw_vertex_program *vp =
+      brw_vertex_program_const(brw->vertex_program);
+   if (0) {  /* disabled debug output */
+      printf("update VS constants in buffer %p\n", (void *) vp->const_buffer);
+      printf("program %u\n", vp->program.Base.Id);
+   }
+   if (vp->use_const_buffer)
+      update_constant_buffer(brw, vp->program.Base.Parameters, vp->const_buffer);
+}
+
+
+/** Upload the current fragment program's parameters, but only if it uses a constant buffer */
+static void
+update_fragment_constant_buffer(struct brw_context *brw)
+{
+   const struct brw_fragment_program *fp =
+      brw_fragment_program_const(brw->fragment_program);
+   if (fp->use_const_buffer)
+      update_constant_buffer(brw, fp->program.Base.Parameters, fp->const_buffer);
+}
+
+
 static void emit_constant_buffer(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
    GLuint sz = brw->curbe.total_size;
 
+   update_vertex_constant_buffer(brw);
+   update_fragment_constant_buffer(brw);
+
    BEGIN_BATCH(2, IGNORE_CLIPRECTS);
    if (sz == 0) {
       OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2));
diff --git a/i965/brw_defines.h b/i965/brw_defines.h
index 39c3225..98fc909 100644
--- a/i965/brw_defines.h
+++ b/i965/brw_defines.h
@@ -225,6 +225,24 @@
 
 #define BRW_RASTRULE_UPPER_LEFT  0    
 #define BRW_RASTRULE_UPPER_RIGHT 1
+/* These are listed as "Reserved, but not seen as useful"
+ * in Intel documentation (page 212, "Point Rasterization Rule",
+ * section 7.4 "SF Pipeline State Summary", of document
+ * "Intel® 965 Express Chipset Family and Intel® G35 Express
+ * Chipset Graphics Controller Programmer's Reference Manual,
+ * Volume 2: 3D/Media", Revision 1.0b as of January 2008,
+ * available at 
+ *     http://intellinuxgraphics.org/documentation.html
+ * at the time of this writing).
+ *
+ * These appear to be supported on at least some
+ * i965-family devices, and the BRW_RASTRULE_LOWER_RIGHT
+ * is useful when using OpenGL to render to a FBO
+ * (which has the pixel coordinate Y orientation inverted
+ * with respect to the normal OpenGL pixel coordinate system).
+ */
+#define BRW_RASTRULE_LOWER_LEFT  2
+#define BRW_RASTRULE_LOWER_RIGHT 3
 
 #define BRW_RENDERTARGET_CLAMPRANGE_UNORM    0
 #define BRW_RENDERTARGET_CLAMPRANGE_SNORM    1
@@ -349,9 +367,10 @@
 #define BRW_SURFACEFORMAT_L8A8_UNORM                     0x114 
 #define BRW_SURFACEFORMAT_I16_FLOAT                      0x115
 #define BRW_SURFACEFORMAT_L16_FLOAT                      0x116
-#define BRW_SURFACEFORMAT_A16_FLOAT                      0x117 
-#define BRW_SURFACEFORMAT_R5G5_SNORM_B6_UNORM            0x119 
-#define BRW_SURFACEFORMAT_B5G5R5X1_UNORM                 0x11A 
+#define BRW_SURFACEFORMAT_A16_FLOAT                      0x117
+#define BRW_SURFACEFORMAT_L8A8_UNORM_SRGB                0x118
+#define BRW_SURFACEFORMAT_R5G5_SNORM_B6_UNORM            0x119
+#define BRW_SURFACEFORMAT_B5G5R5X1_UNORM                 0x11A
 #define BRW_SURFACEFORMAT_B5G5R5X1_UNORM_SRGB            0x11B
 #define BRW_SURFACEFORMAT_R8G8_SSCALED                   0x11C
 #define BRW_SURFACEFORMAT_R8G8_USCALED                   0x11D
@@ -368,6 +387,7 @@
 #define BRW_SURFACEFORMAT_A4P4_UNORM                     0x148
 #define BRW_SURFACEFORMAT_R8_SSCALED                     0x149
 #define BRW_SURFACEFORMAT_R8_USCALED                     0x14A
+#define BRW_SURFACEFORMAT_L8_UNORM_SRGB                  0x14C
 #define BRW_SURFACEFORMAT_R1_UINT                        0x181 
 #define BRW_SURFACEFORMAT_YCRCB_NORMAL                   0x182 
 #define BRW_SURFACEFORMAT_YCRCB_SWAPUVY                  0x183 
@@ -734,7 +754,7 @@
 
 
 #define CMD_URB_FENCE                 0x6000
-#define CMD_CONST_BUFFER_STATE        0x6001
+#define CMD_CS_URB_STATE              0x6001
 #define CMD_CONST_BUFFER              0x6002
 
 #define CMD_STATE_BASE_ADDRESS        0x6101
diff --git a/i965/brw_draw.c b/i965/brw_draw.c
index 0b64999..5342622 100644
--- a/i965/brw_draw.c
+++ b/i965/brw_draw.c
@@ -127,6 +127,7 @@ static void brw_emit_prim(struct brw_context *brw,
 			  uint32_t hw_prim)
 {
    struct brw_3d_primitive prim_packet;
+   struct intel_context *intel = &brw->intel;
 
    if (INTEL_DEBUG & DEBUG_PRIMS)
       _mesa_printf("PRIM: %s %d %d\n", _mesa_lookup_enum_by_nr(prim->mode), 
@@ -146,10 +147,27 @@ static void brw_emit_prim(struct brw_context *brw,
 
    /* Can't wrap here, since we rely on the validated state. */
    brw->no_batch_wrap = GL_TRUE;
+
+   /* If we're set to always flush, do it before and after the primitive emit.
+    * We want to catch both missed flushes that hurt instruction/state cache
+    * and missed flushes of the render cache as it heads to other parts of
+    * the besides the draw code.
+    */
+   if (intel->always_flush_cache) {
+      BEGIN_BATCH(1, IGNORE_CLIPRECTS);
+      OUT_BATCH(intel->vtbl.flush_cmd());
+      ADVANCE_BATCH();
+   }
    if (prim_packet.verts_per_instance) {
       intel_batchbuffer_data( brw->intel.batch, &prim_packet,
 			      sizeof(prim_packet), LOOP_CLIPRECTS);
    }
+   if (intel->always_flush_cache) {
+      BEGIN_BATCH(1, IGNORE_CLIPRECTS);
+      OUT_BATCH(intel->vtbl.flush_cmd());
+      ADVANCE_BATCH();
+   }
+
    brw->no_batch_wrap = GL_FALSE;
 }
 
@@ -393,6 +411,8 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
       retval = GL_TRUE;
    }
 
+   if (intel->always_flush_batch)
+      intel_batchbuffer_flush(intel->batch);
  out:
    UNLOCK_HARDWARE(intel);
 
diff --git a/i965/brw_draw_upload.c b/i965/brw_draw_upload.c
index 73d6dea..b91b20b 100644
--- a/i965/brw_draw_upload.c
+++ b/i965/brw_draw_upload.c
@@ -156,7 +156,13 @@ static GLuint byte_types_scale[5] = {
 };
 
 
-static GLuint get_surface_type( GLenum type, GLuint size, GLboolean normalized )
+/**
+ * Given vertex array type/size/format/normalized info, return
+ * the appropriate hardware surface type.
+ * Format will be GL_RGBA or possibly GL_BGRA for GLubyte[4] color arrays.
+ */
+static GLuint get_surface_type( GLenum type, GLuint size,
+                                GLenum format, GLboolean normalized )
 {
    if (INTEL_DEBUG & DEBUG_VERTS)
       _mesa_printf("type %s size %d normalized %d\n", 
@@ -171,11 +177,20 @@ static GLuint get_surface_type( GLenum type, GLuint size, GLboolean normalized )
       case GL_BYTE: return byte_types_norm[size];
       case GL_UNSIGNED_INT: return uint_types_norm[size];
       case GL_UNSIGNED_SHORT: return ushort_types_norm[size];
-      case GL_UNSIGNED_BYTE: return ubyte_types_norm[size];
+      case GL_UNSIGNED_BYTE:
+         if (format == GL_BGRA) {
+            /* See GL_EXT_vertex_array_bgra */
+            assert(size == 4);
+            return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+         }
+         else {
+            return ubyte_types_norm[size];
+         }
       default: assert(0); return 0;
       }      
    }
    else {
+      assert(format == GL_RGBA); /* sanity check */
       switch (type) {
       case GL_DOUBLE: return double_types[size];
       case GL_FLOAT: return float_types[size];
@@ -262,6 +277,7 @@ copy_array_to_vbo_array( struct brw_context *brw,
 			 struct brw_vertex_element *element,
 			 GLuint dst_stride)
 {
+   struct intel_context *intel = &brw->intel;
    GLuint size = element->count * dst_stride;
 
    get_space(brw, size, &element->bo, &element->offset);
@@ -274,29 +290,52 @@ copy_array_to_vbo_array( struct brw_context *brw,
    }
 
    if (dst_stride == element->glarray->StrideB) {
-      dri_bo_subdata(element->bo,
-		     element->offset,
-		     size,
-		     element->glarray->Ptr);
+      if (intel->intelScreen->kernel_exec_fencing) {
+	 drm_intel_gem_bo_map_gtt(element->bo);
+	 memcpy((char *)element->bo->virtual + element->offset,
+		element->glarray->Ptr, size);
+	 drm_intel_gem_bo_unmap_gtt(element->bo);
+      } else {
+	 dri_bo_subdata(element->bo,
+			element->offset,
+			size,
+			element->glarray->Ptr);
+      }
    } else {
-      void *data;
       char *dest;
-      const char *src = element->glarray->Ptr;
+      const unsigned char *src = element->glarray->Ptr;
       int i;
 
-      data = _mesa_malloc(dst_stride * element->count);
-      dest = data;
-      for (i = 0; i < element->count; i++) {
-	 memcpy(dest, src, dst_stride);
-	 src += element->glarray->StrideB;
-	 dest += dst_stride;
-      }
+      if (intel->intelScreen->kernel_exec_fencing) {
+	 drm_intel_gem_bo_map_gtt(element->bo);
+	 dest = element->bo->virtual;
+	 dest += element->offset;
 
-      dri_bo_subdata(element->bo,
-		     element->offset,
-		     size,
-		     data);
-      _mesa_free(data);
+	 for (i = 0; i < element->count; i++) {
+	    memcpy(dest, src, dst_stride);
+	    src += element->glarray->StrideB;
+	    dest += dst_stride;
+	 }
+
+	 drm_intel_gem_bo_unmap_gtt(element->bo);
+      } else {
+	 void *data;
+
+	 data = _mesa_malloc(dst_stride * element->count);
+	 dest = data;
+	 for (i = 0; i < element->count; i++) {
+	    memcpy(dest, src, dst_stride);
+	    src += element->glarray->StrideB;
+	    dest += dst_stride;
+	 }
+
+	 dri_bo_subdata(element->bo,
+			element->offset,
+			size,
+			data);
+
+	 _mesa_free(data);
+      }
    }
 }
 
@@ -484,6 +523,7 @@ static void brw_emit_vertices(struct brw_context *brw)
       struct brw_vertex_element *input = enabled[i];
       uint32_t format = get_surface_type(input->glarray->Type,
 					 input->glarray->Size,
+					 input->glarray->Format,
 					 input->glarray->Normalized);
       uint32_t comp0 = BRW_VE1_COMPONENT_STORE_SRC;
       uint32_t comp1 = BRW_VE1_COMPONENT_STORE_SRC;
@@ -547,9 +587,15 @@ static void brw_prepare_indices(struct brw_context *brw)
 
       /* Straight upload
        */
-      dri_bo_subdata(bo, offset, ib_size, index_buffer->ptr);
+      if (intel->intelScreen->kernel_exec_fencing) {
+	 drm_intel_gem_bo_map_gtt(bo);
+	 memcpy((char *)bo->virtual + offset, index_buffer->ptr, ib_size);
+	 drm_intel_gem_bo_unmap_gtt(bo);
+      } else {
+	 dri_bo_subdata(bo, offset, ib_size, index_buffer->ptr);
+      }
    } else {
-      offset = (GLuint)index_buffer->ptr;
+      offset = (GLuint) (unsigned long) index_buffer->ptr;
 
       /* If the index buffer isn't aligned to its element size, we have to
        * rebase it into a temporary.
diff --git a/i965/brw_eu.h b/i965/brw_eu.h
index b36a197..003332f 100644
--- a/i965/brw_eu.h
+++ b/i965/brw_eu.h
@@ -97,7 +97,7 @@ struct brw_glsl_call;
 
 
 #define BRW_EU_MAX_INSN_STACK 5
-#define BRW_EU_MAX_INSN 1200
+#define BRW_EU_MAX_INSN 4000
 
 struct brw_compile {
    struct brw_instruction store[BRW_EU_MAX_INSN];
@@ -170,6 +170,13 @@ static INLINE struct brw_reg brw_reg( GLuint file,
                                       GLuint writemask )
 {
    struct brw_reg reg;
+   if (type == BRW_GENERAL_REGISTER_FILE)
+      assert(nr < 128);
+   else if (type == BRW_MESSAGE_REGISTER_FILE)
+      assert(nr < 9);
+   else if (type == BRW_ARCHITECTURE_REGISTER_FILE)
+      assert(nr <= BRW_ARF_IP);
+   /* NOTE(review): 'type' is tested against *_REGISTER_FILE constants above; 'file' looks intended -- confirm the bounds before changing */
    reg.type = type;
    reg.file = file;
    reg.nr = nr;
@@ -723,6 +730,13 @@ static INLINE struct brw_indirect brw_indirect( GLuint addr_subnr, GLint offset
    return ptr;
 }
 
+/** Do two brw_regs name the same register?  Only file and nr are compared. */
+static INLINE GLboolean
+brw_same_reg(struct brw_reg r1, struct brw_reg r2)
+{
+   return r1.file == r2.file && r1.nr == r2.nr;
+}
+
 static INLINE struct brw_instruction *current_insn( struct brw_compile *p)
 {
    return &p->store[p->nr_insn];
@@ -841,12 +855,24 @@ void brw_math( struct brw_compile *p,
 
 void brw_dp_READ_16( struct brw_compile *p,
 		     struct brw_reg dest,
-		     GLuint msg_reg_nr,
 		     GLuint scratch_offset );
 
+void brw_dp_READ_4( struct brw_compile *p,
+                    struct brw_reg dest,
+                    GLboolean relAddr,
+                    GLuint location,
+                    GLuint bind_table_index );
+
+void brw_dp_READ_4_vs( struct brw_compile *p,
+                       struct brw_reg dest,
+                       GLuint oword,
+                       GLboolean relAddr,
+                       struct brw_reg addrReg,
+                       GLuint location,
+                       GLuint bind_table_index );
+
 void brw_dp_WRITE_16( struct brw_compile *p,
 		      struct brw_reg src,
-		      GLuint msg_reg_nr,
 		      GLuint scratch_offset );
 
 /* If/else/endif.  Works by manipulating the execution flags on each
diff --git a/i965/brw_eu_debug.c b/i965/brw_eu_debug.c
index 91dbbd5..29f3f6d 100644
--- a/i965/brw_eu_debug.c
+++ b/i965/brw_eu_debug.c
@@ -65,6 +65,7 @@ void brw_print_reg( struct brw_reg hwreg )
        hwreg.width == BRW_WIDTH_8 &&
        hwreg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
        hwreg.type == BRW_REGISTER_TYPE_F) {
+      /* vector register */
       _mesa_printf("vec%d", hwreg.nr);
    }
    else if (hwreg.file == BRW_GENERAL_REGISTER_FILE &&
@@ -72,8 +73,12 @@ void brw_print_reg( struct brw_reg hwreg )
 	    hwreg.width == BRW_WIDTH_1 &&
 	    hwreg.hstride == BRW_HORIZONTAL_STRIDE_0 &&
 	    hwreg.type == BRW_REGISTER_TYPE_F) {      
+      /* "scalar" register */
       _mesa_printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4);
    }
+   else if (hwreg.file == BRW_IMMEDIATE_VALUE) {
+      _mesa_printf("imm %f", hwreg.dw1.f);
+   }
    else {
       _mesa_printf("%s%d.%d<%d;%d,%d>:%s", 
 		   file[hwreg.file],
diff --git a/i965/brw_eu_emit.c b/i965/brw_eu_emit.c
index 4e099b5..2a147fb 100644
--- a/i965/brw_eu_emit.c
+++ b/i965/brw_eu_emit.c
@@ -55,6 +55,9 @@ static void guess_execution_size( struct brw_instruction *insn,
 static void brw_set_dest( struct brw_instruction *insn,
 			  struct brw_reg dest )
 {
+   /* BRW_ARCHITECTURE_REGISTER_FILE names a register file: test .file, not .type */
+   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
+      assert(dest.nr < 128);
    insn->bits1.da1.dest_reg_file = dest.file;
    insn->bits1.da1.dest_reg_type = dest.type;
    insn->bits1.da1.dest_address_mode = dest.address_mode;
@@ -96,10 +99,13 @@ static void brw_set_dest( struct brw_instruction *insn,
 }
 
 static void brw_set_src0( struct brw_instruction *insn,
-		      struct brw_reg reg )
+                          struct brw_reg reg )
 {
    assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
 
+   /* BRW_ARCHITECTURE_REGISTER_FILE names a register file: test .file, not .type */
+   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
+      assert(reg.nr < 128);
    insn->bits1.da1.src0_reg_file = reg.file;
    insn->bits1.da1.src0_reg_type = reg.type;
    insn->bits2.da1.src0_abs = reg.abs;
@@ -169,10 +175,12 @@ static void brw_set_src0( struct brw_instruction *insn,
 
 
 void brw_set_src1( struct brw_instruction *insn,
-			  struct brw_reg reg )
+                   struct brw_reg reg )
 {
    assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
 
+   assert(reg.nr < 128);
+
    insn->bits1.da1.src1_reg_file = reg.file;
    insn->bits1.da1.src1_reg_type = reg.type;
    insn->bits3.da1.src1_abs = reg.abs;
@@ -312,24 +320,25 @@ static void brw_set_dp_read_message( struct brw_instruction *insn,
 {
    brw_set_src1(insn, brw_imm_d(0));
 
-   insn->bits3.dp_read.binding_table_index = binding_table_index;
-   insn->bits3.dp_read.msg_control = msg_control;
-   insn->bits3.dp_read.msg_type = msg_type;
-   insn->bits3.dp_read.target_cache = target_cache;
-   insn->bits3.dp_read.response_length = response_length;
-   insn->bits3.dp_read.msg_length = msg_length;
-   insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ;
-   insn->bits3.dp_read.end_of_thread = end_of_thread;
+   insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
+   insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
+   insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
+   insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
+   insn->bits3.dp_read.response_length = response_length;  /*16:19*/
+   insn->bits3.dp_read.msg_length = msg_length;  /*20:23*/
+   insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
+   insn->bits3.dp_read.pad1 = 0;  /*28:30*/
+   insn->bits3.dp_read.end_of_thread = end_of_thread;  /*31*/
 }
 
 static void brw_set_sampler_message(struct brw_context *brw,
-                 struct brw_instruction *insn,
-				     GLuint binding_table_index,
-				     GLuint sampler,
-				     GLuint msg_type,
-				     GLuint response_length,
-				     GLuint msg_length,
-				     GLboolean eot)
+                                    struct brw_instruction *insn,
+                                    GLuint binding_table_index,
+                                    GLuint sampler,
+                                    GLuint msg_type,
+                                    GLuint response_length,
+                                    GLuint msg_length,
+                                    GLboolean eot)
 {
    brw_set_src1(insn, brw_imm_d(0));
 
@@ -407,7 +416,7 @@ static struct brw_instruction *brw_alu2(struct brw_compile *p,
  * Convenience routines.
  */
 #define ALU1(OP)					\
-struct brw_instruction *brw_##OP(struct brw_compile *p,			\
+struct brw_instruction *brw_##OP(struct brw_compile *p,	\
 	      struct brw_reg dest,			\
 	      struct brw_reg src0)   			\
 {							\
@@ -415,7 +424,7 @@ struct brw_instruction *brw_##OP(struct brw_compile *p,			\
 }
 
 #define ALU2(OP)					\
-struct brw_instruction *brw_##OP(struct brw_compile *p,			\
+struct brw_instruction *brw_##OP(struct brw_compile *p,	\
 	      struct brw_reg dest,			\
 	      struct brw_reg src0,			\
 	      struct brw_reg src1)   			\
@@ -469,9 +478,9 @@ void brw_NOP(struct brw_compile *p)
  */
 
 struct brw_instruction *brw_JMPI(struct brw_compile *p, 
-	      struct brw_reg dest,
-	      struct brw_reg src0,
-	      struct brw_reg src1)
+                                 struct brw_reg dest,
+                                 struct brw_reg src0,
+                                 struct brw_reg src1)
 {
    struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
 
@@ -674,7 +683,7 @@ struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
 
 
 struct brw_instruction *brw_WHILE(struct brw_compile *p, 
-	       struct brw_instruction *do_insn)
+                                  struct brw_instruction *do_insn)
 {
    struct brw_instruction *insn;
 
@@ -762,7 +771,7 @@ void brw_CMP(struct brw_compile *p,
  * Helpers for the various SEND message types:
  */
 
-/* Invert 8 values
+/** Extended math function, float[8].
  */
 void brw_math( struct brw_compile *p,
 	       struct brw_reg dest,
@@ -794,7 +803,9 @@ void brw_math( struct brw_compile *p,
 			data_type);
 }
 
-/* Use 2 send instructions to invert 16 elements
+/**
+ * Extended math function, float[16].
+ * Use 2 send instructions.
  */
 void brw_math_16( struct brw_compile *p,
 		  struct brw_reg dest,
@@ -847,22 +858,26 @@ void brw_math_16( struct brw_compile *p,
 }
 
 
-
-
+/**
+ * Write block of 16 dwords/floats to the data port Render Cache scratch buffer.
+ * Scratch offset should be a multiple of 64.
+ * Used for register spilling.
+ */
 void brw_dp_WRITE_16( struct brw_compile *p,
 		      struct brw_reg src,
-		      GLuint msg_reg_nr,
 		      GLuint scratch_offset )
 {
+   GLuint msg_reg_nr = 1;
    {
       brw_push_insn_state(p);
       brw_set_mask_control(p, BRW_MASK_DISABLE);
       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 
+      /* set message header global offset field (reg 0, element 2) */
       brw_MOV(p,
 	      retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D),
 	      brw_imm_d(scratch_offset));
-			   
+
       brw_pop_insn_state(p);
    }
 
@@ -879,7 +894,7 @@ void brw_dp_WRITE_16( struct brw_compile *p,
       brw_set_src0(insn, src);
 
       brw_set_dp_write_message(insn,
-			       255, /* bti */
+			       255, /* binding table index (255=stateless) */
 			       BRW_DATAPORT_OWORD_BLOCK_4_OWORDS, /* msg_control */
 			       BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */
 			       msg_length,
@@ -887,24 +902,29 @@ void brw_dp_WRITE_16( struct brw_compile *p,
 			       0, /* response_length */
 			       0); /* eot */
    }
-
 }
 
 
+/**
+ * Read block of 16 dwords/floats from the data port Render Cache scratch buffer.
+ * Scratch offset should be a multiple of 64.
+ * Used for register spilling.
+ */
 void brw_dp_READ_16( struct brw_compile *p,
 		      struct brw_reg dest,
-		      GLuint msg_reg_nr,
 		      GLuint scratch_offset )
 {
+   GLuint msg_reg_nr = 1;
    {
       brw_push_insn_state(p);
       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
       brw_set_mask_control(p, BRW_MASK_DISABLE);
 
+      /* set message header global offset field (reg 0, element 2) */
       brw_MOV(p,
 	      retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D),
 	      brw_imm_d(scratch_offset));
-			   
+
       brw_pop_insn_state(p);
    }
 
@@ -919,10 +939,10 @@ void brw_dp_READ_16( struct brw_compile *p,
       brw_set_src0(insn, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));
 
       brw_set_dp_read_message(insn,
-			      255, /* bti */
-			      3,  /* msg_control */
+			      255, /* binding table index (255=stateless) */
+			      3,  /* msg_control (3 means 4 Owords) */
 			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
-			      1, /* target cache */
+			      1, /* target cache (render/scratch) */
 			      1, /* msg_length */
 			      2, /* response_length */
 			      0); /* eot */
@@ -930,14 +950,143 @@ void brw_dp_READ_16( struct brw_compile *p,
 }
 
 
+/**
+ * Read a float[4] vector from the data port Data Cache (const buffer).
+ * Location (in buffer) should be a multiple of 16.
+ * Used for fetching shader constants.
+ * If relAddr is true, we'll do an indirect fetch using the address register.
+ */
+void brw_dp_READ_4( struct brw_compile *p,
+                    struct brw_reg dest,
+                    GLboolean relAddr,
+                    GLuint location,
+                    GLuint bind_table_index )
+{
+   /* XXX: relAddr not implemented */
+   GLuint msg_reg_nr = 1;
+   {
+      struct brw_reg b;
+      brw_push_insn_state(p);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+
+      /* Set up MRF[1] with location/offset into const buffer */
+      b = brw_message_reg(msg_reg_nr);
+      b = retype(b, BRW_REGISTER_TYPE_UD);
+      /* XXX I think we're setting all the dwords of MRF[1] to 'location',
+       * when the docs say only dword[2] should be set.  Hmmm.  But it works.
+       */
+      brw_MOV(p, b, brw_imm_ud(location));
+      brw_pop_insn_state(p);
+   }
+
+   {
+      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+   
+      insn->header.predicate_control = BRW_PREDICATE_NONE;
+      insn->header.compression_control = BRW_COMPRESSION_NONE; 
+      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.mask_control = BRW_MASK_DISABLE;
+  
+      /* cast dest to a uword[8] vector */
+      dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
+
+      brw_set_dest(insn, dest);
+      brw_set_src0(insn, brw_null_reg());
+
+      brw_set_dp_read_message(insn,
+			      bind_table_index,
+			      0,  /* msg_control (0 means 1 Oword) */
+			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
+			      0, /* source cache = data cache */
+			      1, /* msg_length */
+			      1, /* response_length (1 Oword) */
+			      0); /* eot */
+   }
+}
+
+
+/**
+ * Read float[4] constant(s) from VS constant buffer.
+ * For relative addressing, two float[4] constants will be read into 'dest'.
+ * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
+ */
+void brw_dp_READ_4_vs(struct brw_compile *p,
+                      struct brw_reg dest,
+                      GLuint oword,
+                      GLboolean relAddr,
+                      struct brw_reg addrReg,
+                      GLuint location,
+                      GLuint bind_table_index)
+{
+   GLuint msg_reg_nr = 1;
+
+   assert(oword < 2);
+   /*
+   printf("vs const read msg, location %u, msg_reg_nr %d\n",
+          location, msg_reg_nr);
+   */
+
+   /* Setup MRF[1] with location/offset into const buffer */
+   {
+      struct brw_reg b;
+
+      brw_push_insn_state(p);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      /*brw_set_access_mode(p, BRW_ALIGN_16);*/
+
+      /* XXX I think we're setting all the dwords of MRF[1] to 'location',
+       * when the docs say only dword[2] should be set.  Hmmm.  But it works.
+       */
+      b = brw_message_reg(msg_reg_nr);
+      b = retype(b, BRW_REGISTER_TYPE_UD);
+      /*b = get_element_ud(b, 2);*/
+      if (relAddr) {
+         brw_ADD(p, b, addrReg, brw_imm_ud(location));
+      }
+      else {
+         brw_MOV(p, b, brw_imm_ud(location));
+      }
+
+      brw_pop_insn_state(p);
+   }
+
+   {
+      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+   
+      insn->header.predicate_control = BRW_PREDICATE_NONE;
+      insn->header.compression_control = BRW_COMPRESSION_NONE; 
+      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.mask_control = BRW_MASK_DISABLE;
+      /*insn->header.access_mode = BRW_ALIGN_16;*/
+  
+      brw_set_dest(insn, dest);
+      brw_set_src0(insn, brw_null_reg());
+
+      brw_set_dp_read_message(insn,
+			      bind_table_index,
+			      oword,  /* 0 = lower Oword, 1 = upper Oword */
+			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
+			      0, /* source cache = data cache */
+			      1, /* msg_length */
+			      1, /* response_length (1 Oword) */
+			      0); /* eot */
+   }
+}
+
+
+
 void brw_fb_WRITE(struct brw_compile *p,
-		   struct brw_reg dest,
-		   GLuint msg_reg_nr,
-		   struct brw_reg src0,
-		   GLuint binding_table_index,
-		   GLuint msg_length,
-		   GLuint response_length,
-		   GLboolean eot)
+                  struct brw_reg dest,
+                  GLuint msg_reg_nr,
+                  struct brw_reg src0,
+                  GLuint binding_table_index,
+                  GLuint msg_length,
+                  GLuint response_length,
+                  GLboolean eot)
 {
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
    
@@ -958,7 +1107,11 @@ void brw_fb_WRITE(struct brw_compile *p,
 }
 
 
-
+/**
+ * Texture sample instruction.
+ * Note: the msg_type plus msg_length values determine exactly what kind
+ * of sampling operation is performed.  See volume 4, page 161 of docs.
+ */
 void brw_SAMPLE(struct brw_compile *p,
 		struct brw_reg dest,
 		GLuint msg_reg_nr,
@@ -973,8 +1126,8 @@ void brw_SAMPLE(struct brw_compile *p,
 {
    GLboolean need_stall = 0;
    
-   if(writemask == 0) {
-/*       _mesa_printf("%s: zero writemask??\n", __FUNCTION__); */
+   if (writemask == 0) {
+      /*_mesa_printf("%s: zero writemask??\n", __FUNCTION__); */
       return;
    }
    
@@ -1006,7 +1159,7 @@ void brw_SAMPLE(struct brw_compile *p,
 
       if (newmask != writemask) {
 	 need_stall = 1;
-/* 	 _mesa_printf("need stall %x %x\n", newmask , writemask); */
+         /* _mesa_printf("need stall %x %x\n", newmask , writemask); */
       }
       else {
 	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
@@ -1047,14 +1200,13 @@ void brw_SAMPLE(struct brw_compile *p,
 			      eot);
    }
 
-   if (need_stall)
-   {
+   if (need_stall) {
       struct brw_reg reg = vec8(offset(dest, response_length-1));
 
       /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
        */
       brw_push_insn_state(p);
-      brw_set_compression_control(p, GL_FALSE);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
       brw_MOV(p, reg, reg);	      
       brw_pop_insn_state(p);
    }
diff --git a/i965/brw_fallback.c b/i965/brw_fallback.c
index 5f4f2d5..2993574 100644
--- a/i965/brw_fallback.c
+++ b/i965/brw_fallback.c
@@ -75,7 +75,7 @@ static GLboolean do_check_fallback(struct brw_context *brw)
    
    /* _NEW_STENCIL 
     */
-   if (ctx->Stencil.Enabled &&
+   if (ctx->Stencil._Enabled &&
        (ctx->DrawBuffer->Name == 0 && !brw->intel.hw_stencil)) {
       DBG("FALLBACK: stencil\n");
       return GL_TRUE;
diff --git a/i965/brw_misc_state.c b/i965/brw_misc_state.c
index f311663..9bc5c35 100644
--- a/i965/brw_misc_state.c
+++ b/i965/brw_misc_state.c
@@ -101,6 +101,7 @@ const struct brw_tracked_state brw_drawing_rect = {
 
 static void prepare_binding_table_pointers(struct brw_context *brw)
 {
+   brw_add_validated_bo(brw, brw->vs.bind_bo);
    brw_add_validated_bo(brw, brw->wm.bind_bo);
 }
 
@@ -117,13 +118,11 @@ static void upload_binding_table_pointers(struct brw_context *brw)
 
    BEGIN_BATCH(6, IGNORE_CLIPRECTS);
    OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2));
-   OUT_BATCH(0); /* vs */
+   OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */
    OUT_BATCH(0); /* gs */
    OUT_BATCH(0); /* clip */
    OUT_BATCH(0); /* sf */
-   OUT_RELOC(brw->wm.bind_bo,
-	     I915_GEM_DOMAIN_SAMPLER, 0,
-	     0);
+   OUT_RELOC(brw->wm.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* wm/ps */
    ADVANCE_BATCH();
 }
 
@@ -178,7 +177,7 @@ static void upload_psp_urb_cbs(struct brw_context *brw )
 {
    upload_pipelined_state_pointers(brw);
    brw_upload_urb_fence(brw);
-   brw_upload_constant_buffer_state(brw);
+   brw_upload_cs_urb_state(brw);
 }
 
 const struct brw_tracked_state brw_psp_urb_cbs = {
diff --git a/i965/brw_program.c b/i965/brw_program.c
index 0c86911..bac6918 100644
--- a/i965/brw_program.c
+++ b/i965/brw_program.c
@@ -38,6 +38,7 @@
 
 #include "brw_context.h"
 #include "brw_util.h"
+#include "brw_wm.h"
 
 static void brwBindProgram( GLcontext *ctx,
 			    GLenum target, 
@@ -94,7 +95,12 @@ static struct gl_program *brwNewProgram( GLcontext *ctx,
 static void brwDeleteProgram( GLcontext *ctx,
 			      struct gl_program *prog )
 {
-   
+   if (prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
+      struct gl_fragment_program *fprog = (struct gl_fragment_program *) prog;
+      struct brw_fragment_program *brw_fprog = brw_fragment_program(fprog);
+      dri_bo_unreference(brw_fprog->const_buffer);
+   }
+
    _mesa_delete_program( ctx, prog );
 }
 
@@ -110,30 +116,36 @@ static void brwProgramStringNotify( GLcontext *ctx,
 				    GLenum target,
 				    struct gl_program *prog )
 {
+   struct brw_context *brw = brw_context(ctx);
+
    if (target == GL_FRAGMENT_PROGRAM_ARB) {
       struct gl_fragment_program *fprog = (struct gl_fragment_program *) prog;
-      struct brw_context *brw = brw_context(ctx);
-      struct brw_fragment_program *p = (struct brw_fragment_program *)prog;
-      struct brw_fragment_program *fp = (struct brw_fragment_program *)brw->fragment_program;
+      struct brw_fragment_program *newFP = brw_fragment_program(fprog);
+      const struct brw_fragment_program *curFP =
+         brw_fragment_program_const(brw->fragment_program);
+
       if (fprog->FogOption) {
          _mesa_append_fog_code(ctx, fprog);
          fprog->FogOption = GL_NONE;
       }
 
-      if (p == fp)
+      if (newFP == curFP)
 	 brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
-      p->id = brw->program_id++;      
+      newFP->id = brw->program_id++;      
+      newFP->isGLSL = brw_wm_is_glsl(fprog);
    }
    else if (target == GL_VERTEX_PROGRAM_ARB) {
-      struct brw_context *brw = brw_context(ctx);
-      struct brw_vertex_program *p = (struct brw_vertex_program *)prog;
-      struct brw_vertex_program *vp = (struct brw_vertex_program *)brw->vertex_program;
-      if (p == vp)
+      struct gl_vertex_program *vprog = (struct gl_vertex_program *) prog;
+      struct brw_vertex_program *newVP = brw_vertex_program(vprog);
+      const struct brw_vertex_program *curVP =
+         brw_vertex_program_const(brw->vertex_program);
+
+      if (newVP == curVP)
 	 brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
-      if (p->program.IsPositionInvariant) {
-	 _mesa_insert_mvp_code(ctx, &p->program);
+      if (newVP->program.IsPositionInvariant) {
+	 _mesa_insert_mvp_code(ctx, &newVP->program);
       }
-      p->id = brw->program_id++;      
+      newVP->id = brw->program_id++;      
 
       /* Also tell tnl about it:
        */
diff --git a/i965/brw_sf_state.c b/i965/brw_sf_state.c
index e22d080..c999187 100644
--- a/i965/brw_sf_state.c
+++ b/i965/brw_sf_state.c
@@ -43,10 +43,12 @@ static void upload_sf_vp(struct brw_context *brw)
    const GLfloat depth_scale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
    struct brw_sf_viewport sfv;
    GLfloat y_scale, y_bias;
+   const GLboolean render_to_fbo = (ctx->DrawBuffer->Name != 0);
+   const GLfloat *v = ctx->Viewport._WindowMap.m;
 
    memset(&sfv, 0, sizeof(sfv));
 
-   if (intel_rendering_to_texture(ctx)) {
+   if (render_to_fbo) {
       y_scale = 1.0;
       y_bias = 0;
    }
@@ -57,8 +59,6 @@ static void upload_sf_vp(struct brw_context *brw)
 
    /* _NEW_VIEWPORT */
 
-   const GLfloat *v = ctx->Viewport._WindowMap.m;
-
    sfv.viewport.m00 = v[MAT_SX];
    sfv.viewport.m11 = v[MAT_SY] * y_scale;
    sfv.viewport.m22 = v[MAT_SZ] * depth_scale;
@@ -66,7 +66,9 @@ static void upload_sf_vp(struct brw_context *brw)
    sfv.viewport.m31 = v[MAT_TY] * y_scale + y_bias;
    sfv.viewport.m32 = v[MAT_TZ] * depth_scale;
 
-   /* _NEW_SCISSOR */
+   /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT
+    * for DrawBuffer->_[XY]{min,max}
+    */
 
    /* The scissor only needs to handle the intersection of drawable and
     * scissor rect.  Clipping to the boundaries of static shared buffers
@@ -75,7 +77,7 @@ static void upload_sf_vp(struct brw_context *brw)
     * Note that the hardware's coordinates are inclusive, while Mesa's min is
     * inclusive but max is exclusive.
     */
-   if (intel_rendering_to_texture(ctx)) {
+   if (render_to_fbo) {
       /* texmemory: Y=0=bottom */
       sfv.scissor.xmin = ctx->DrawBuffer->_Xmin;
       sfv.scissor.xmax = ctx->DrawBuffer->_Xmax - 1;
@@ -97,7 +99,8 @@ static void upload_sf_vp(struct brw_context *brw)
 const struct brw_tracked_state brw_sf_vp = {
    .dirty = {
       .mesa  = (_NEW_VIEWPORT | 
-		_NEW_SCISSOR),
+		_NEW_SCISSOR |
+		_NEW_BUFFERS),
       .brw   = 0,
       .cache = 0
    },
@@ -111,10 +114,13 @@ struct brw_sf_unit_key {
    unsigned int nr_urb_entries, urb_size, sfsize;
 
    GLenum front_face, cull_face;
-   GLboolean scissor, line_smooth, point_sprite, point_attenuated;
+   unsigned scissor:1;
+   unsigned line_smooth:1;
+   unsigned point_sprite:1;
+   unsigned point_attenuated:1;
+   unsigned render_to_fbo:1;
    float line_width;
    float point_size;
-   GLboolean render_to_texture;
 };
 
 static void
@@ -144,10 +150,10 @@ sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
    key->line_smooth = ctx->Line.SmoothFlag;
 
    key->point_sprite = ctx->Point.PointSprite;
-   key->point_size = ctx->Point.Size;
+   key->point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
    key->point_attenuated = ctx->Point._Attenuated;
 
-   key->render_to_texture = intel_rendering_to_texture(&brw->intel.ctx);
+   key->render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
 }
 
 static dri_bo *
@@ -194,10 +200,10 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    else
       sf.sf5.front_winding = BRW_FRONTWINDING_CW;
 
-   /* The viewport is inverted for rendering to texture, and that inverts
+   /* The viewport is inverted for rendering to an FBO, and that inverts
     * polygon front/back orientation.
     */
-   sf.sf5.front_winding ^= key->render_to_texture;
+   sf.sf5.front_winding ^= key->render_to_fbo;
 
    switch (key->cull_face) {
    case GL_FRONT:
@@ -228,7 +234,33 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
        sf.sf6.line_width = 0;
 
    /* _NEW_POINT */
-   sf.sf6.point_rast_rule = BRW_RASTRULE_UPPER_RIGHT;	/* opengl conventions */
+   key->render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
+   if (!key->render_to_fbo) {
+      /* Rendering to an OpenGL window */
+      sf.sf6.point_rast_rule = BRW_RASTRULE_UPPER_RIGHT;
+   }
+   else {
+      /* If rendering to an FBO, the pixel coordinate system is
+       * inverted with respect to the normal OpenGL coordinate
+       * system, so BRW_RASTRULE_LOWER_RIGHT is correct.
+       * But this value is listed as "Reserved, but not seen as useful"
+       * in Intel documentation (page 212, "Point Rasterization Rule",
+       * section 7.4 "SF Pipeline State Summary", of document
+       * "Intel® 965 Express Chipset Family and Intel® G35 Express
+       * Chipset Graphics Controller Programmer's Reference Manual,
+       * Volume 2: 3D/Media", Revision 1.0b as of January 2008,
+       * available at 
+       *     http://intellinuxgraphics.org/documentation.html
+       * at the time of this writing).
+       *
+       * It does work on at least some devices, if not all;
+       * if devices that don't support it can be identified,
+       * the likely failure case is that points are rasterized
+       * incorrectly, which is no worse than occurs without
+       * the value, so we're using it here.
+       */
+      sf.sf6.point_rast_rule = BRW_RASTRULE_LOWER_RIGHT;
+   }
    /* XXX clamp max depends on AA vs. non-AA */
 
    sf.sf7.sprite_point = key->point_sprite;
diff --git a/i965/brw_state.h b/i965/brw_state.h
index df839c5..81b0a45 100644
--- a/i965/brw_state.h
+++ b/i965/brw_state.h
@@ -52,7 +52,6 @@ const struct brw_tracked_state brw_cc_vp;
 const struct brw_tracked_state brw_check_fallback;
 const struct brw_tracked_state brw_clip_prog;
 const struct brw_tracked_state brw_clip_unit;
-const struct brw_tracked_state brw_constant_buffer_state;
 const struct brw_tracked_state brw_constant_buffer;
 const struct brw_tracked_state brw_curbe_offsets;
 const struct brw_tracked_state brw_invarient_state;
diff --git a/i965/brw_state_batch.c b/i965/brw_state_batch.c
index dc87859..811940e 100644
--- a/i965/brw_state_batch.c
+++ b/i965/brw_state_batch.c
@@ -97,8 +97,6 @@ void brw_clear_batch_cache_flush( struct brw_context *brw )
 {
    clear_batch_cache(brw);
 
-/*    brw_do_flush(brw, BRW_FLUSH_STATE_CACHE|BRW_FLUSH_READ_CACHE); */
-   
    brw->state.dirty.mesa |= ~0;
    brw->state.dirty.brw |= ~0;
    brw->state.dirty.cache |= ~0;
diff --git a/i965/brw_state_dump.c b/i965/brw_state_dump.c
index b28c57c..a713262 100644
--- a/i965/brw_state_dump.c
+++ b/i965/brw_state_dump.c
@@ -84,6 +84,19 @@ get_965_surfacetype(unsigned int surfacetype)
     }
 }
 
+static const char *
+get_965_surface_format(unsigned int surface_format)
+{
+    switch (surface_format) {
+    case 0x000: return "r32g32b32a32_float";
+    case 0x0c1: return "b8g8r8a8_unorm";
+    case 0x100: return "b5g6r5_unorm";
+    case 0x102: return "b5g5r5a1_unorm";
+    case 0x104: return "b4g4r4a4_unorm";
+    default: return "unknown";
+    }
+}
+
 static void dump_wm_surface_state(struct brw_context *brw)
 {
    int i;
@@ -95,7 +108,7 @@ static void dump_wm_surface_state(struct brw_context *brw)
       char name[20];
 
       if (surf_bo == NULL) {
-	 fprintf(stderr, "WM SS%d: NULL\n", i);
+	 fprintf(stderr, "  WM SS%d: NULL\n", i);
 	 continue;
       }
       dri_bo_map(surf_bo, GL_FALSE);
@@ -103,8 +116,9 @@ static void dump_wm_surface_state(struct brw_context *brw)
       surf = (struct brw_surface_state *)(surf_bo->virtual);
 
       sprintf(name, "WM SS%d", i);
-      state_out(name, surf, surfoff, 0, "%s\n",
-		get_965_surfacetype(surf->ss0.surface_type));
+      state_out(name, surf, surfoff, 0, "%s %s\n",
+		get_965_surfacetype(surf->ss0.surface_type),
+		get_965_surface_format(surf->ss0.surface_format));
       state_out(name, surf, surfoff, 1, "offset\n");
       state_out(name, surf, surfoff, 2, "%dx%d size, %d mips\n",
 		surf->ss2.width + 1, surf->ss2.height + 1, surf->ss2.mip_count);
@@ -162,6 +176,14 @@ static void brw_debug_prog(const char *name, dri_bo *prog)
       fprintf(stderr, "%8s: 0x%08x: 0x%08x 0x%08x 0x%08x 0x%08x\n",
 	      name, (unsigned int)prog->offset + i * 4 * 4,
 	      data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3]);
+      /* Stop at the end of the program.  It'd be nice to keep track of the actual
+       * intended program size instead of guessing like this.
+       */
+      if (data[i * 4 + 0] == 0 &&
+	  data[i * 4 + 1] == 0 &&
+	  data[i * 4 + 2] == 0 &&
+	  data[i * 4 + 3] == 0)
+	 break;
    }
 
    dri_bo_unmap(prog);
diff --git a/i965/brw_structs.h b/i965/brw_structs.h
index 4e577d0..89e2981 100644
--- a/i965/brw_structs.h
+++ b/i965/brw_structs.h
@@ -439,7 +439,7 @@ struct brw_urb_fence
    } bits1;
 };
 
-struct brw_constant_buffer_state /* previously brw_command_streamer */
+struct brw_cs_urb_state
 {
    struct header header;
 
@@ -1031,10 +1031,10 @@ struct brw_surface_state
       GLuint writedisable_green:1; 
       GLuint writedisable_red:1; 
       GLuint writedisable_alpha:1; 
-      GLuint surface_format:9; 
+      GLuint surface_format:9;     /**< BRW_SURFACEFORMAT_x */
       GLuint data_return_format:1; 
       GLuint pad0:1;
-      GLuint surface_type:3; 
+      GLuint surface_type:3;       /**< BRW_SURFACE_1D/2D/3D/CUBE */
    } ss0;
    
    struct {
diff --git a/i965/brw_tex.c b/i965/brw_tex.c
index ef99e9c..71bff16 100644
--- a/i965/brw_tex.c
+++ b/i965/brw_tex.c
@@ -32,21 +32,12 @@
 
 #include "main/glheader.h"
 #include "main/mtypes.h"
-#include "main/imports.h"
-#include "main/simple_list.h"
-#include "main/enums.h"
-#include "main/image.h"
 #include "main/teximage.h"
-#include "main/texstore.h"
-#include "main/texformat.h"
-
-#include "texmem.h"
 
 #include "intel_context.h"
 #include "intel_regions.h"
 #include "intel_tex.h"
 #include "brw_context.h"
-#include "brw_defines.h"
 
 
 void brw_FrameBufferTexInit( struct brw_context *brw,
diff --git a/i965/brw_vs.h b/i965/brw_vs.h
index 99d0e93..1e4f660 100644
--- a/i965/brw_vs.h
+++ b/i965/brw_vs.h
@@ -75,6 +75,11 @@ struct brw_vs_compile {
 
    struct brw_reg userplane[6];
 
+   /** we may need up to 3 constants per instruction (if use_const_buffer) */
+   struct {
+      GLint index;
+      struct brw_reg reg;
+   } current_const[3];
 };
 
 void brw_vs_emit( struct brw_vs_compile *c );
diff --git a/i965/brw_vs_constval.c b/i965/brw_vs_constval.c
index 9977677..249a800 100644
--- a/i965/brw_vs_constval.c
+++ b/i965/brw_vs_constval.c
@@ -39,8 +39,8 @@
  */
 struct tracker {
    GLboolean twoside;
-   GLubyte active[PROGRAM_OUTPUT+1][128];
-   GLuint size_masks[4];
+   GLubyte active[PROGRAM_OUTPUT+1][MAX_PROGRAM_TEMPS];
+   GLbitfield size_masks[4];  /**< one bit per fragment program input attrib */
 };
 
 
@@ -53,8 +53,10 @@ static void set_active_component( struct tracker *t,
    case PROGRAM_TEMPORARY:
    case PROGRAM_INPUT:
    case PROGRAM_OUTPUT:
+      assert(file < PROGRAM_OUTPUT + 1);
+      assert(index < Elements(t->active[0]));
       t->active[file][index] |= active;
-
+      break;
    default:
       break;
    }
@@ -96,7 +98,7 @@ static GLubyte get_active( struct tracker *t,
 			   struct prog_src_register src )
 {
    GLuint i;
-   GLubyte active = src.NegateBase; /* NOTE! */
+   GLubyte active = src.Negate; /* NOTE! */
 
    if (src.RelAddr)
       return 0xf;
@@ -108,10 +110,15 @@ static GLubyte get_active( struct tracker *t,
    return active;
 }
 
+/**
+ * Return the size (1,2,3 or 4) of the output/result for VERT_RESULT_idx.
+ */
 static GLubyte get_output_size( struct tracker *t,
 				GLuint idx )
 {
-   GLubyte active = t->active[PROGRAM_OUTPUT][idx];
+   GLubyte active;
+   assert(idx < VERT_RESULT_MAX);
+   active = t->active[PROGRAM_OUTPUT][idx];
    if (active & (1<<3)) return 4;
    if (active & (1<<2)) return 3;
    if (active & (1<<1)) return 2;
@@ -123,7 +130,7 @@ static GLubyte get_output_size( struct tracker *t,
  */
 static void calc_sizes( struct tracker *t )
 {
-   GLuint i;
+   GLint vertRes;
 
    if (t->twoside) {
       t->active[PROGRAM_OUTPUT][VERT_RESULT_COL0] |= 
@@ -133,12 +140,27 @@ static void calc_sizes( struct tracker *t )
 	 t->active[PROGRAM_OUTPUT][VERT_RESULT_BFC1];
    }
 
-   for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
-      switch (get_output_size(t, i)) {
-      case 4: t->size_masks[4-1] |= 1<<i;
-      case 3: t->size_masks[3-1] |= 1<<i;
-      case 2: t->size_masks[2-1] |= 1<<i;
-      case 1: t->size_masks[1-1] |= 1<<i;
+   /* Examine vertex program output sizes to set the size_masks[] info
+    * which describes the fragment program input sizes.
+    */
+   for (vertRes = VERT_RESULT_TEX0; vertRes < VERT_RESULT_MAX; vertRes++) {
+      GLint fragAttrib;
+
+      /* map vertex program output index to fragment program input index */
+      if (vertRes <= VERT_RESULT_TEX7)
+         fragAttrib = FRAG_ATTRIB_TEX0 + vertRes - VERT_RESULT_TEX0;
+      else if (vertRes >= VERT_RESULT_VAR0)
+         fragAttrib = FRAG_ATTRIB_VAR0 + vertRes - VERT_RESULT_VAR0;
+      else
+         continue;
+      assert(fragAttrib >= FRAG_ATTRIB_TEX0);
+      assert(fragAttrib <= FRAG_ATTRIB_MAX);
+
+      switch (get_output_size(t, vertRes)) {
+      case 4: t->size_masks[4-1] |= 1 << fragAttrib;
+      case 3: t->size_masks[3-1] |= 1 << fragAttrib;
+      case 2: t->size_masks[2-1] |= 1 << fragAttrib;
+      case 1: t->size_masks[1-1] |= 1 << fragAttrib;
 	 break;
       }
    }
@@ -170,8 +192,8 @@ static void calc_wm_input_sizes( struct brw_context *brw )
 {
    GLcontext *ctx = &brw->intel.ctx;
    /* BRW_NEW_VERTEX_PROGRAM */
-   struct brw_vertex_program *vp = 
-      (struct brw_vertex_program *)brw->vertex_program;
+   const struct brw_vertex_program *vp =
+      brw_vertex_program_const(brw->vertex_program);
    /* BRW_NEW_INPUT_DIMENSIONS */
    struct tracker t;
    GLuint insn;
diff --git a/i965/brw_vs_emit.c b/i965/brw_vs_emit.c
index 235f826..b69616d 100644
--- a/i965/brw_vs_emit.c
+++ b/i965/brw_vs_emit.c
@@ -38,18 +38,49 @@
 #include "brw_vs.h"
 
 
+static struct brw_reg get_tmp( struct brw_vs_compile *c )
+{
+   struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
+
+   if (++c->last_tmp > c->prog_data.total_grf)
+      c->prog_data.total_grf = c->last_tmp;
+
+   return tmp;
+}
 
-/* Do things as simply as possible.  Allocate and populate all regs
+static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
+{
+   if (tmp.nr == c->last_tmp-1)
+      c->last_tmp--;
+}
+			       
+static void release_tmps( struct brw_vs_compile *c )
+{
+   c->last_tmp = c->first_tmp;
+}
+
+
+/**
+ * Preallocate GRF registers before code emit.
+ * Do things as simply as possible.  Allocate and populate all regs
  * ahead of time.
  */
 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 {
    GLuint i, reg = 0, mrf;
-   GLuint nr_params;
+
+#if 0
+   if (c->vp->program.Base.Parameters->NumParameters >= 6)
+      c->vp->use_const_buffer = 1;
+   else
+#endif
+      c->vp->use_const_buffer = GL_FALSE;
+   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
 
    /* r0 -- reserved as usual
     */
-   c->r0 = brw_vec8_grf(reg, 0); reg++;
+   c->r0 = brw_vec8_grf(reg, 0);
+   reg++;
 
    /* User clip planes from curbe: 
     */
@@ -60,24 +91,33 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 
       /* Deal with curbe alignment:
        */
-      reg += ((6+c->key.nr_userclip+3)/4)*2;
+      reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
    }
 
    /* Vertex program parameters from curbe:
     */
-   nr_params = c->vp->program.Base.Parameters->NumParameters;
-   for (i = 0; i < nr_params; i++) {
-      c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
-   }     
-   reg += (nr_params+1)/2;
+   if (c->vp->use_const_buffer) {
+      /* get constants from a real constant buffer */
+      c->prog_data.curb_read_length = 0;
+      c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
+   }
+   else {
+      /* use a section of the GRF for constants */
+      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
+      for (i = 0; i < nr_params; i++) {
+         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
+      }
+      reg += (nr_params + 1) / 2;
+      c->prog_data.curb_read_length = reg - 1;
 
-   c->prog_data.curb_read_length = reg - 1;
+      c->prog_data.nr_params = nr_params * 4;
+   }
 
    /* Allocate input regs:  
     */
    c->nr_inputs = 0;
    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
-      if (c->prog_data.inputs_read & (1<<i)) {
+      if (c->prog_data.inputs_read & (1 << i)) {
 	 c->nr_inputs++;
 	 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
 	 reg++;
@@ -91,7 +131,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    c->first_output = reg;
    mrf = 4;
    for (i = 0; i < VERT_RESULT_MAX; i++) {
-      if (c->prog_data.outputs_written & (1<<i)) {
+      if (c->prog_data.outputs_written & (1 << i)) {
 	 c->nr_outputs++;
 	 if (i == VERT_RESULT_HPOS) {
 	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
@@ -132,17 +172,24 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
       reg++;
    }
 
+   if (c->vp->use_const_buffer) {
+      for (i = 0; i < 3; i++) {
+         c->current_const[i].index = -1;
+         c->current_const[i].reg = brw_vec8_grf(reg, 0);
+         reg++;
+      }
+   }
+
    for (i = 0; i < 128; i++) {
-       if (c->output_regs[i].used_in_src) {
-            c->output_regs[i].reg = brw_vec8_grf(reg, 0);
-            reg++;
-        }
+      if (c->output_regs[i].used_in_src) {
+         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
+         reg++;
+      }
    }
 
    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
    reg += 2;
- 
-   
+
    /* Some opcodes need an internal temporary:
     */
    c->first_tmp = reg;
@@ -152,35 +199,23 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
     * urb_read_length is the number of registers read from *each*
     * vertex urb, so is half the amount:
     */
-   c->prog_data.urb_read_length = (c->nr_inputs+1)/2;
+   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
 
-   c->prog_data.urb_entry_size = (c->nr_outputs+2+3)/4;
+   c->prog_data.urb_entry_size = (c->nr_outputs + 2 + 3) / 4;
    c->prog_data.total_grf = reg;
-}
-
-
-static struct brw_reg get_tmp( struct brw_vs_compile *c )
-{
-   struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
 
-   if (++c->last_tmp > c->prog_data.total_grf)
-      c->prog_data.total_grf = c->last_tmp;
-
-   return tmp;
-}
-
-static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
-{
-   if (tmp.nr == c->last_tmp-1)
-      c->last_tmp--;
-}
-			       
-static void release_tmps( struct brw_vs_compile *c )
-{
-   c->last_tmp = c->first_tmp;
+   if (INTEL_DEBUG & DEBUG_VS) {
+      _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
+      _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
+      _mesa_printf("%s reg = %d\n", __FUNCTION__, reg);
+   }
 }
 
 
+/**
+ * If an instruction uses a temp reg both as a src and the dest, we
+ * sometimes need to allocate an intermediate temporary.
+ */
 static void unalias1( struct brw_vs_compile *c,
 		      struct brw_reg dst,
 		      struct brw_reg arg0,
@@ -200,6 +235,10 @@ static void unalias1( struct brw_vs_compile *c,
    }
 }
 
+/**
+ * \sa unalias1
+ * Checks if 2-operand instruction needs an intermediate temporary.
+ */
 static void unalias2( struct brw_vs_compile *c,
 		      struct brw_reg dst,
 		      struct brw_reg arg0,
@@ -222,6 +261,10 @@ static void unalias2( struct brw_vs_compile *c,
    }
 }
 
+/**
+ * \sa unalias2
+ * Checks if 3-operand instruction needs an intermediate temporary.
+ */
 static void unalias3( struct brw_vs_compile *c,
 		      struct brw_reg dst,
 		      struct brw_reg arg0,
@@ -615,6 +658,8 @@ static void emit_lit_noalias( struct brw_vs_compile *c,
    }
 
    brw_ENDIF(p, if_insn);
+
+   release_tmp(c, tmp);
 }
 
 static void emit_lrp_noalias(struct brw_vs_compile *c,
@@ -655,13 +700,83 @@ static void emit_nrm( struct brw_vs_compile *c,
 }
 
 
+static struct brw_reg
+get_constant(struct brw_vs_compile *c,
+             const struct prog_instruction *inst,
+             GLuint argIndex)
+{
+   const struct prog_src_register *src = &inst->SrcReg[argIndex];
+   struct brw_compile *p = &c->func;
+   struct brw_reg const_reg;
+   struct brw_reg const2_reg;
+
+   assert(argIndex < 3);
+
+   if (c->current_const[argIndex].index != src->Index || src->RelAddr) {
+      struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
+
+      c->current_const[argIndex].index = src->Index;
+
+#if 0
+      printf("  fetch const[%d] for arg %d into reg %d\n",
+             src->Index, argIndex, c->current_const[argIndex].reg.nr);
+#endif
+      /* need to fetch the constant now */
+      brw_dp_READ_4_vs(p,
+                       c->current_const[argIndex].reg,/* writeback dest */
+                       0,                             /* oword */
+                       src->RelAddr,                  /* relative indexing? */
+                       addrReg,                       /* address register */
+                       16 * src->Index,               /* byte offset */
+                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
+                       );
+
+      if (src->RelAddr) {
+         /* second read */
+         const2_reg = get_tmp(c);
+
+         /* use upper half of address reg for second read */
+         addrReg = stride(addrReg, 0, 4, 0);
+         addrReg.subnr = 16;
+
+         brw_dp_READ_4_vs(p,
+                          const2_reg,              /* writeback dest */
+                          1,                       /* oword */
+                          src->RelAddr,            /* relative indexing? */
+                          addrReg,                 /* address register */
+                          16 * src->Index,         /* byte offset */
+                          SURF_INDEX_VERT_CONST_BUFFER
+                          );
+      }
+   }
+
+   const_reg = c->current_const[argIndex].reg;
+
+   if (src->RelAddr) {
+      /* merge the two Owords into the constant register */
+      /* const_reg[7..4] = const2_reg[7..4] */
+      brw_MOV(p,
+              suboffset(stride(const_reg, 0, 4, 1), 4),
+              suboffset(stride(const2_reg, 0, 4, 1), 4));
+      release_tmp(c, const2_reg);
+   }
+   else {
+      /* replicate lower four floats into upper half (to get XYZWXYZW) */
+      const_reg = stride(const_reg, 0, 4, 0);
+      const_reg.subnr = 0;
+   }
+
+   return const_reg;
+}
+
+
+
 /* TODO: relative addressing!
  */
 static struct brw_reg get_reg( struct brw_vs_compile *c,
-			       GLuint file,
+			       gl_register_file file,
 			       GLuint index )
 {
-
    switch (file) {
    case PROGRAM_TEMPORARY:
    case PROGRAM_INPUT:
@@ -690,13 +805,17 @@ static struct brw_reg get_reg( struct brw_vs_compile *c,
 }
 
 
+/**
+ * Indirect addressing:  get reg[[arg] + offset].
+ */
 static struct brw_reg deref( struct brw_vs_compile *c,
 			     struct brw_reg arg,
 			     GLint offset)
 {
    struct brw_compile *p = &c->func;
    struct brw_reg tmp = vec4(get_tmp(c));
-   struct brw_reg vp_address = retype(vec1(get_reg(c, PROGRAM_ADDRESS, 0)), BRW_REGISTER_TYPE_UW);
+   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
+   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
    GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
    struct brw_reg indirect = brw_vec4_indirect(0,0);
 
@@ -717,10 +836,67 @@ static struct brw_reg deref( struct brw_vs_compile *c,
       brw_pop_insn_state(p);
    }
    
+   /* NOTE: tmp not released */
    return vec8(tmp);
 }
 
 
+/**
+ * Get brw reg corresponding to the instruction's [argIndex] src reg.
+ * TODO: relative addressing!
+ */
+static struct brw_reg
+get_src_reg( struct brw_vs_compile *c,
+             const struct prog_instruction *inst,
+             GLuint argIndex )
+{
+   const GLuint file = inst->SrcReg[argIndex].File;
+   const GLint index = inst->SrcReg[argIndex].Index;
+   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
+
+   switch (file) {
+   case PROGRAM_TEMPORARY:
+   case PROGRAM_INPUT:
+   case PROGRAM_OUTPUT:
+      if (relAddr) {
+         return deref(c, c->regs[file][0], index);
+      }
+      else {
+         assert(c->regs[file][index].nr != 0);
+         return c->regs[file][index];
+      }
+
+   case PROGRAM_STATE_VAR:
+   case PROGRAM_CONSTANT:
+   case PROGRAM_UNIFORM:
+      if (c->vp->use_const_buffer) {
+         return get_constant(c, inst, argIndex);
+      }
+      else if (relAddr) {
+         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
+      }
+      else {
+         assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
+         return c->regs[PROGRAM_STATE_VAR][index];
+      }
+   case PROGRAM_ADDRESS:
+      assert(index == 0);
+      return c->regs[file][index];
+
+   case PROGRAM_UNDEFINED:
+      /* this is a normal case since we loop over all three src args */
+      return brw_null_reg();
+
+   case PROGRAM_LOCAL_PARAM: 
+   case PROGRAM_ENV_PARAM: 
+   case PROGRAM_WRITE_ONLY:
+   default:
+      assert(0);
+      return brw_null_reg();
+   }
+}
+
+
 static void emit_arl( struct brw_vs_compile *c,
 		      struct brw_reg dst,
 		      struct brw_reg arg0 )
@@ -732,30 +908,31 @@ static void emit_arl( struct brw_vs_compile *c,
    if (need_tmp) 
       tmp = get_tmp(c);
 
-   brw_RNDD(p, tmp, arg0);
-   brw_MUL(p, dst, tmp, brw_imm_d(16));
+   brw_RNDD(p, tmp, arg0);               /* tmp = floor(arg0) */
+   brw_MUL(p, dst, tmp, brw_imm_d(16));  /* dst = tmp * 16 */
 
    if (need_tmp)
       release_tmp(c, tmp);
 }
 
 
-/* Will return mangled results for SWZ op.  The emit_swz() function
+/**
+ * Return the brw reg for the given instruction's src argument.
+ * Will return mangled results for SWZ op.  The emit_swz() function
  * ignores this result and recalculates taking extended swizzles into
  * account.
  */
 static struct brw_reg get_arg( struct brw_vs_compile *c,
-			       struct prog_src_register *src )
+                               const struct prog_instruction *inst,
+                               GLuint argIndex )
 {
+   const struct prog_src_register *src = &inst->SrcReg[argIndex];
    struct brw_reg reg;
 
    if (src->File == PROGRAM_UNDEFINED)
       return brw_null_reg();
 
-   if (src->RelAddr) 
-      reg = deref(c, c->regs[PROGRAM_STATE_VAR][0], src->Index);
-   else
-      reg = get_reg(c, src->File, src->Index);
+   reg = get_src_reg(c, inst, argIndex);
 
    /* Convert 3-bit swizzle to 2-bit.  
     */
@@ -766,16 +943,38 @@ static struct brw_reg get_arg( struct brw_vs_compile *c,
 
    /* Note this is ok for non-swizzle instructions: 
     */
-   reg.negate = src->NegateBase ? 1 : 0;   
+   reg.negate = src->Negate ? 1 : 0;   
 
    return reg;
 }
 
 
+/**
+ * Get brw register for the given program dest register.
+ */
 static struct brw_reg get_dst( struct brw_vs_compile *c,
 			       struct prog_dst_register dst )
 {
-   struct brw_reg reg = get_reg(c, dst.File, dst.Index);
+   struct brw_reg reg;
+
+   switch (dst.File) {
+   case PROGRAM_TEMPORARY:
+   case PROGRAM_OUTPUT:
+      assert(c->regs[dst.File][dst.Index].nr != 0);
+      reg = c->regs[dst.File][dst.Index];
+      break;
+   case PROGRAM_ADDRESS:
+      assert(dst.Index == 0);
+      reg = c->regs[dst.File][dst.Index];
+      break;
+   case PROGRAM_UNDEFINED:
+      /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
+      reg = brw_null_reg();
+      break;
+   default:
+      assert(0);
+      reg = brw_null_reg();
+   }
 
    reg.dw1.bits.writemask = dst.WriteMask;
 
@@ -785,14 +984,16 @@ static struct brw_reg get_dst( struct brw_vs_compile *c,
 
 static void emit_swz( struct brw_vs_compile *c, 
 		      struct brw_reg dst,
-		      struct prog_src_register src )
+                      const struct prog_instruction *inst)
 {
+   const GLuint argIndex = 0;
+   const struct prog_src_register src = inst->SrcReg[argIndex];
    struct brw_compile *p = &c->func;
    GLuint zeros_mask = 0;
    GLuint ones_mask = 0;
    GLuint src_mask = 0;
    GLubyte src_swz[4];
-   GLboolean need_tmp = (src.NegateBase &&
+   GLboolean need_tmp = (src.Negate &&
 			 dst.file != BRW_GENERAL_REGISTER_FILE);
    struct brw_reg tmp = dst;
    GLuint i;
@@ -826,10 +1027,7 @@ static void emit_swz( struct brw_vs_compile *c,
    if (src_mask) {
       struct brw_reg arg0;
 
-      if (src.RelAddr) 
-	 arg0 = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
-      else
-	 arg0 = get_reg(c, src.File, src.Index);
+      arg0 = get_src_reg(c, inst, argIndex);
 
       arg0 = brw_swizzle(arg0, 
 			 src_swz[0], src_swz[1], 
@@ -844,8 +1042,8 @@ static void emit_swz( struct brw_vs_compile *c,
    if (ones_mask) 
       brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
 
-   if (src.NegateBase)
-      brw_MOV(p, brw_writemask(tmp, src.NegateBase), negate(tmp));
+   if (src.Negate)
+      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
    
    if (need_tmp) {
       brw_MOV(p, dst, tmp);
@@ -975,7 +1173,7 @@ post_vs_emit( struct brw_vs_compile *c,
 }
 
 
-/* Emit the fragment program instructions here.
+/* Emit the vertex program instructions here.
  */
 void brw_vs_emit(struct brw_vs_compile *c )
 {
@@ -1025,6 +1223,11 @@ void brw_vs_emit(struct brw_vs_compile *c )
       struct brw_reg args[3], dst;
       GLuint i;
       
+#if 0
+      printf("%d: ", insn);
+      _mesa_print_instruction(inst);
+#endif
+
       /* Get argument regs.  SWZ is special and does this itself.
        */
       if (inst->Opcode != OPCODE_SWZ)
@@ -1032,10 +1235,10 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	      struct prog_src_register *src = &inst->SrcReg[i];
 	      index = src->Index;
 	      file = src->File;	
-	      if (file == PROGRAM_OUTPUT&&c->output_regs[index].used_in_src)
+	      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
 		  args[i] = c->output_regs[index].reg;
 	      else
-		  args[i] = get_arg(c, src);
+                  args[i] = get_arg(c, inst, i);
 	  }
 
       /* Get dest regs.  Note that it is possible for a reg to be both
@@ -1163,7 +1366,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 /* The args[0] value can't be used here as it won't have
 	  * correctly encoded the full swizzle:
 	  */
-	 emit_swz(c, dst, inst->SrcReg[0] );
+	 emit_swz(c, dst, inst);
 	 break;
       case OPCODE_TRUNC:
          /* round toward zero */
diff --git a/i965/brw_vs_state.c b/i965/brw_vs_state.c
index 1a63766..3d29538 100644
--- a/i965/brw_vs_state.c
+++ b/i965/brw_vs_state.c
@@ -44,6 +44,8 @@ struct brw_vs_unit_key {
    unsigned int curbe_offset;
 
    unsigned int nr_urb_entries, urb_size;
+
+   unsigned int nr_surfaces;
 };
 
 static void
@@ -62,6 +64,9 @@ vs_unit_populate_key(struct brw_context *brw, struct brw_vs_unit_key *key)
    key->nr_urb_entries = brw->urb.nr_vs_entries;
    key->urb_size = brw->urb.vsize;
 
+   /* BRW_NEW_NR_VS_SURFACES */
+   key->nr_surfaces = brw->vs.nr_surfaces;
+
    /* BRW_NEW_CURBE_OFFSETS, _NEW_TRANSFORM */
    if (ctx->Transform.ClipPlanesEnabled) {
       /* Note that we read in the userclip planes as well, hence
@@ -92,6 +97,8 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
     * brw_urb_WRITE() results.
     */
    vs.thread1.single_program_flow = 0;
+   vs.thread1.binding_table_entry_count = key->nr_surfaces;
+
    vs.thread3.urb_entry_read_length = key->urb_entry_read_length;
    vs.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
    vs.thread3.dispatch_grf_start_reg = 1;
@@ -158,6 +165,7 @@ const struct brw_tracked_state brw_vs_unit = {
    .dirty = {
       .mesa  = _NEW_TRANSFORM,
       .brw   = (BRW_NEW_CURBE_OFFSETS |
+                BRW_NEW_NR_VS_SURFACES |
 		BRW_NEW_URB_FENCE),
       .cache = CACHE_NEW_VS_PROG
    },
diff --git a/i965/brw_vtbl.c b/i965/brw_vtbl.c
index e69d4c5..ba03afd 100644
--- a/i965/brw_vtbl.c
+++ b/i965/brw_vtbl.c
@@ -23,14 +23,12 @@
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-            
-
+**********************************************************************/
 
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
 
 #include "main/glheader.h"
 #include "main/mtypes.h"
@@ -44,12 +42,11 @@
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_state.h"
-
 #include "brw_draw.h"
 #include "brw_state.h"
 #include "brw_fallback.h"
 #include "brw_vs.h"
-#include <stdarg.h>
+
 
 static void
 dri_bo_release(dri_bo **bo)
@@ -58,7 +55,9 @@ dri_bo_release(dri_bo **bo)
    *bo = NULL;
 }
 
-/* called from intelDestroyContext()
+
+/**
+ * called from intelDestroyContext()
  */
 static void brw_destroy_context( struct intel_context *intel )
 {
@@ -68,16 +67,19 @@ static void brw_destroy_context( struct intel_context *intel )
    brw_destroy_state(brw);
    brw_draw_destroy( brw );
 
+   _mesa_free(brw->wm.compile_data);
+
    brw_FrameBufferTexDestroy( brw );
 
-   for (i = 0; i < brw->state.nr_draw_regions; i++)
-       intel_region_release(&brw->state.draw_regions[i]);
-   brw->state.nr_draw_regions = 0;
+   for (i = 0; i < brw->state.nr_color_regions; i++)
+      intel_region_release(&brw->state.color_regions[i]);
+   brw->state.nr_color_regions = 0;
    intel_region_release(&brw->state.depth_region);
 
    dri_bo_release(&brw->curbe.curbe_bo);
    dri_bo_release(&brw->vs.prog_bo);
    dri_bo_release(&brw->vs.state_bo);
+   dri_bo_release(&brw->vs.bind_bo);
    dri_bo_release(&brw->gs.prog_bo);
    dri_bo_release(&brw->gs.state_bo);
    dri_bo_release(&brw->clip.prog_bo);
@@ -91,6 +93,7 @@ static void brw_destroy_context( struct intel_context *intel )
    dri_bo_release(&brw->wm.bind_bo);
    for (i = 0; i < BRW_WM_MAX_SURF; i++)
       dri_bo_release(&brw->wm.surf_bo[i]);
+   dri_bo_release(&brw->wm.sampler_bo);
    dri_bo_release(&brw->wm.prog_bo);
    dri_bo_release(&brw->wm.state_bo);
    dri_bo_release(&brw->cc.prog_bo);
@@ -98,37 +101,46 @@ static void brw_destroy_context( struct intel_context *intel )
    dri_bo_release(&brw->cc.vp_bo);
 }
 
-/* called from intelDrawBuffer()
+
+/**
+ * called from intelDrawBuffer()
  */
 static void brw_set_draw_region( struct intel_context *intel, 
-				  struct intel_region *draw_regions[],
-				  struct intel_region *depth_region,
-				GLuint num_regions)
+                                 struct intel_region *color_regions[],
+                                 struct intel_region *depth_region,
+                                 GLuint num_color_regions)
 {
    struct brw_context *brw = brw_context(&intel->ctx);
-   int i;
+   GLuint i;
+
+   /* release old color/depth regions */
    if (brw->state.depth_region != depth_region)
       brw->state.dirty.brw |= BRW_NEW_DEPTH_BUFFER;
-   for (i = 0; i < brw->state.nr_draw_regions; i++)
-       intel_region_release(&brw->state.draw_regions[i]);
+   for (i = 0; i < brw->state.nr_color_regions; i++)
+       intel_region_release(&brw->state.color_regions[i]);
    intel_region_release(&brw->state.depth_region);
-   for (i = 0; i < num_regions; i++)
-       intel_region_reference(&brw->state.draw_regions[i], draw_regions[i]);
+
+   /* reference new color/depth regions */
+   for (i = 0; i < num_color_regions; i++)
+       intel_region_reference(&brw->state.color_regions[i], color_regions[i]);
    intel_region_reference(&brw->state.depth_region, depth_region);
-   brw->state.nr_draw_regions = num_regions;
+   brw->state.nr_color_regions = num_color_regions;
 }
 
-/* called from intel_batchbuffer_flush and children before sending a
+
+/**
+ * called from intel_batchbuffer_flush and children before sending a
  * batchbuffer off.
  */
 static void brw_finish_batch(struct intel_context *intel)
 {
    struct brw_context *brw = brw_context(&intel->ctx);
-
    brw_emit_query_end(brw);
 }
 
-/* called from intelFlushBatchLocked
+
+/**
+ * called from intelFlushBatchLocked
  */
 static void brw_new_batch( struct intel_context *intel )
 {
@@ -159,39 +171,20 @@ static void brw_new_batch( struct intel_context *intel )
    }
 }
 
-static void brw_note_fence( struct intel_context *intel, 
-			    GLuint fence )
+
+static void brw_note_fence( struct intel_context *intel, GLuint fence )
 {
    brw_context(&intel->ctx)->state.dirty.brw |= BRW_NEW_FENCE;
 }
- 
+
+
 static void brw_note_unlock( struct intel_context *intel )
 {
    struct brw_context *brw = brw_context(&intel->ctx);
-
    brw_state_cache_check_size(brw);
 }
 
 
-void brw_do_flush( struct brw_context *brw, 
-		   GLuint flags )
-{
-   struct brw_mi_flush flush;
-   memset(&flush, 0, sizeof(flush));      
-   flush.opcode = CMD_MI_FLUSH;
-   flush.flags = flags;
-   BRW_BATCH_STRUCT(brw, &flush);
-}
-
-
-static void brw_emit_flush( struct intel_context *intel,
-			GLuint unused )
-{
-   brw_do_flush(brw_context(&intel->ctx),
-		BRW_FLUSH_STATE_CACHE|BRW_FLUSH_READ_CACHE);
-}
-
-
 /* called from intelWaitForIdle() and intelFlush()
  *
  * For now, just flush everything.  Could be smarter later.
@@ -205,6 +198,7 @@ static GLuint brw_flush_cmd( void )
    return *(GLuint *)&flush;
 }
 
+
 static void brw_invalidate_state( struct intel_context *intel, GLuint new_state )
 {
    /* nothing */
@@ -214,20 +208,18 @@ static void brw_invalidate_state( struct intel_context *intel, GLuint new_state
 void brwInitVtbl( struct brw_context *brw )
 {
    brw->intel.vtbl.check_vertex_size = 0;
-   brw->intel.vtbl.emit_state = 0; 
-   brw->intel.vtbl.reduced_primitive_state = 0;	
+   brw->intel.vtbl.emit_state = 0;
+   brw->intel.vtbl.reduced_primitive_state = 0;
    brw->intel.vtbl.render_start = 0;
-   brw->intel.vtbl.update_texture_state = 0; 
+   brw->intel.vtbl.update_texture_state = 0;
 
-   brw->intel.vtbl.invalidate_state = brw_invalidate_state; 
-   brw->intel.vtbl.note_fence = brw_note_fence; 
-   brw->intel.vtbl.note_unlock = brw_note_unlock; 
+   brw->intel.vtbl.invalidate_state = brw_invalidate_state;
+   brw->intel.vtbl.note_fence = brw_note_fence;
+   brw->intel.vtbl.note_unlock = brw_note_unlock;
    brw->intel.vtbl.new_batch = brw_new_batch;
    brw->intel.vtbl.finish_batch = brw_finish_batch;
    brw->intel.vtbl.destroy = brw_destroy_context;
    brw->intel.vtbl.set_draw_region = brw_set_draw_region;
    brw->intel.vtbl.flush_cmd = brw_flush_cmd;
-   brw->intel.vtbl.emit_flush = brw_emit_flush;
    brw->intel.vtbl.debug_batch = brw_debug_batch;
 }
-
diff --git a/i965/brw_wm.c b/i965/brw_wm.c
index c6791da..8a3b7df 100644
--- a/i965/brw_wm.c
+++ b/i965/brw_wm.c
@@ -82,6 +82,58 @@ GLuint brw_wm_is_scalar_result( GLuint opcode )
 }
 
 
+/**
+ * Do GPU code generation for a non-GLSL shader.  Non-GLSL shaders have
+ * no flow control instructions so we can more readily do SSA-style
+ * optimizations.
+ */
+static void
+brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
+{
+   /* Augment fragment program.  Add instructions for pre- and
+    * post-fragment-program tasks such as interpolation and fogging.
+    */
+   brw_wm_pass_fp(c);
+
+   /* Translate to intermediate representation.  Build register usage
+    * chains.
+    */
+   brw_wm_pass0(c);
+
+   /* Dead code removal.
+    */
+   brw_wm_pass1(c);
+
+   /* Register allocation.
+    * Divide by two because we operate on 16 pixels at a time and require
+    * two GRF entries for each logical shader register.
+    */
+   c->grf_limit = BRW_WM_MAX_GRF / 2;
+
+   brw_wm_pass2(c);
+
+   /* how many general-purpose registers are used */
+   c->prog_data.total_grf = c->max_wm_grf;
+
+   /* Scratch space is used for register spilling */
+   if (c->last_scratch) {
+      c->prog_data.total_scratch = c->last_scratch + 0x40;
+   }
+   else {
+      c->prog_data.total_scratch = 0;
+   }
+
+   /* Emit GEN4 code.
+    */
+   brw_wm_emit(c);
+}
+
+
+/**
+ * All Mesa program -> GPU code generation goes through this function.
+ * Depending on the instructions used (i.e. flow control instructions)
+ * we'll use one of two code generators.
+ */
 static void do_wm_prog( struct brw_context *brw,
 			struct brw_fragment_program *fp, 
 			struct brw_wm_prog_key *key)
@@ -92,52 +144,39 @@ static void do_wm_prog( struct brw_context *brw,
 
    c = brw->wm.compile_data;
    if (c == NULL) {
-     brw->wm.compile_data = calloc(1, sizeof(*brw->wm.compile_data));
-     c = brw->wm.compile_data;
+      brw->wm.compile_data = calloc(1, sizeof(*brw->wm.compile_data));
+      c = brw->wm.compile_data;
+      if (c == NULL) {
+         /* Ouch - big out of memory problem.  Can't continue
+          * without triggering a segfault, no way to signal,
+          * so just return.
+          */
+         return;
+      }
    } else {
-     memset(c, 0, sizeof(*brw->wm.compile_data));
+      memset(c, 0, sizeof(*brw->wm.compile_data));
    }
    memcpy(&c->key, key, sizeof(*key));
 
    c->fp = fp;
    c->env_param = brw->intel.ctx.FragmentProgram.Parameters;
 
-    brw_init_compile(brw, &c->func);
-   if (brw_wm_is_glsl(&c->fp->program)) {
-       brw_wm_glsl_emit(brw, c);
-   } else {
-       /* Augment fragment program.  Add instructions for pre- and
-	* post-fragment-program tasks such as interpolation and fogging.
-	*/
-       brw_wm_pass_fp(c);
-
-       /* Translate to intermediate representation.  Build register usage
-	* chains.
-	*/
-       brw_wm_pass0(c);
-
-       /* Dead code removal.
-	*/
-       brw_wm_pass1(c);
-
-       /* Register allocation.
-	*/
-       c->grf_limit = BRW_WM_MAX_GRF/2;
-
-       brw_wm_pass2(c);
-
-       c->prog_data.total_grf = c->max_wm_grf;
-       if (c->last_scratch) {
-	   c->prog_data.total_scratch =
-	       c->last_scratch + 0x40;
-       } else {
-	   c->prog_data.total_scratch = 0;
-       }
-
-       /* Emit GEN4 code.
-	*/
-       brw_wm_emit(c);
+   brw_init_compile(brw, &c->func);
+
+   /* temporary sanity check assertion */
+   ASSERT(fp->isGLSL == brw_wm_is_glsl(&c->fp->program));
+
+   /*
+    * Shaders which use GLSL features such as flow control are handled
+    * differently from "simple" shaders.
+    */
+   if (fp->isGLSL) {
+      brw_wm_glsl_emit(brw, c);
    }
+   else {
+      brw_wm_non_glsl_emit(brw, c);
+   }
+
    if (INTEL_DEBUG & DEBUG_WM)
       fprintf(stderr, "\n");
 
@@ -161,7 +200,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
 {
    GLcontext *ctx = &brw->intel.ctx;
    /* BRW_NEW_FRAGMENT_PROGRAM */
-   struct brw_fragment_program *fp = 
+   const struct brw_fragment_program *fp = 
       (struct brw_fragment_program *)brw->fragment_program;
    GLuint lookup = 0;
    GLuint line_aa;
@@ -176,7 +215,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
        ctx->Color.AlphaEnabled)
       lookup |= IZ_PS_KILL_ALPHATEST_BIT;
 
-   if (fp->program.Base.OutputsWritten & (1<<FRAG_RESULT_DEPR))
+   if (fp->program.Base.OutputsWritten & (1<<FRAG_RESULT_DEPTH))
       lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
 
    /* _NEW_DEPTH */
@@ -188,7 +227,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
       lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
 
    /* _NEW_STENCIL */
-   if (ctx->Stencil.Enabled) {
+   if (ctx->Stencil._Enabled) {
       lookup |= IZ_STENCIL_TEST_ENABLE_BIT;
 
       if (ctx->Stencil.WriteMask[0] ||
@@ -228,7 +267,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
 
 
    /* BRW_NEW_WM_INPUT_DIMENSIONS */
-   key->projtex_mask = brw->wm.input_size_masks[4-1] >> (FRAG_ATTRIB_TEX0 - FRAG_ATTRIB_WPOS); 
+   key->proj_attrib_mask = brw->wm.input_size_masks[4-1];
 
    /* _NEW_LIGHT */
    key->flat_shade = (ctx->Light.ShadeModel == GL_FLAT);
@@ -245,6 +284,11 @@ static void brw_wm_populate_key( struct brw_context *brw,
 	    if (img->TexFormat->MesaFormat == MESA_FORMAT_YCBCR)
 		key->yuvtex_swap_mask |= 1 << i;
 	 }
+
+         key->tex_swizzles[i] = t->_Swizzle;
+      }
+      else {
+         key->tex_swizzles[i] = SWIZZLE_NOOP;
       }
    }
 
@@ -275,10 +319,11 @@ static void brw_wm_populate_key( struct brw_context *brw,
       key->drawable_height = brw->intel.driDrawable->h;
    }
 
-   /* Extra info:
-    */
-   key->program_string_id = fp->id;
+   /* CACHE_NEW_VS_PROG */
+   key->vp_outputs_written = brw->vs.prog_data->outputs_written & DO_SETUP_BITS;
 
+   /* The unique fragment program ID */
+   key->program_string_id = fp->id;
 }
 
 
@@ -302,8 +347,6 @@ static void brw_prepare_wm_prog(struct brw_context *brw)
 }
 
 
-/* See brw_wm.c:
- */
 const struct brw_tracked_state brw_wm_prog = {
    .dirty = {
       .mesa  = (_NEW_COLOR |
@@ -317,7 +360,7 @@ const struct brw_tracked_state brw_wm_prog = {
       .brw   = (BRW_NEW_FRAGMENT_PROGRAM |
 		BRW_NEW_WM_INPUT_DIMENSIONS |
 		BRW_NEW_REDUCED_PRIMITIVE),
-      .cache = 0
+      .cache = CACHE_NEW_VS_PROG,
    },
    .prepare = brw_prepare_wm_prog
 };
diff --git a/i965/brw_wm.h b/i965/brw_wm.h
index 3cbdf81..295fed8 100644
--- a/i965/brw_wm.h
+++ b/i965/brw_wm.h
@@ -65,15 +65,17 @@ struct brw_wm_prog_key {
    GLuint flat_shade:1;
    GLuint runtime_check_aads_emit:1;
    
-   GLuint projtex_mask:16;
+   GLbitfield proj_attrib_mask; /**< one bit per fragment program attribute */
    GLuint shadowtex_mask:16;
    GLuint yuvtex_mask:16;
    GLuint yuvtex_swap_mask:16;	/* UV swaped */
-   //   GLuint pad1:16;
+
+   GLuint tex_swizzles[BRW_MAX_TEX_UNIT];
 
    GLuint program_string_id:32;
    GLuint origin_x, origin_y;
    GLuint drawable_height;
+   GLuint vp_outputs_written;
 };
 
 
@@ -142,13 +144,12 @@ struct brw_wm_instruction {
    GLuint writemask:4;
    GLuint tex_unit:4;   /* texture unit for TEX, TXD, TXP instructions */
    GLuint tex_idx:3;    /* TEXTURE_1D,2D,3D,CUBE,RECT_INDEX source target */
+   GLuint tex_shadow:1; /* do shadow comparison? */
    GLuint eot:1;    	/* End of thread indicator for FB_WRITE*/
    GLuint target:10;    /* target binding table index for FB_WRITE*/
 };
 
 
-#define PROGRAM_INTERNAL_PARAM 
-
 #define BRW_WM_MAX_INSN  (MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS*3 + FRAG_ATTRIB_MAX + 3)
 #define BRW_WM_MAX_GRF   128		/* hardware limit */
 #define BRW_WM_MAX_VREG  (BRW_WM_MAX_INSN * 4)
@@ -240,17 +241,25 @@ struct brw_wm_compile {
    GLuint max_wm_grf;
    GLuint last_scratch;
 
+   /** Mapping from Mesa registers to hardware registers */
    struct {
-	GLboolean inited;
-	struct brw_reg reg;
+      GLboolean inited;
+      struct brw_reg reg;
    } wm_regs[PROGRAM_PAYLOAD+1][256][4];
+
    struct brw_reg stack;
    struct brw_reg emit_mask_reg;
-   GLuint reg_index;
+   GLuint reg_index;  /**< Index of next free GRF register */
    GLuint tmp_regs[BRW_WM_MAX_GRF];
    GLuint tmp_index;
    GLuint tmp_max;
    GLuint subroutines[BRW_WM_MAX_SUBROUTINE];
+
+   /** we may need up to 3 constants per instruction (if use_const_buffer) */
+   struct {
+      GLint index;
+      struct brw_reg reg;
+   } current_const[3];
 };
 
 
diff --git a/i965/brw_wm_emit.c b/i965/brw_wm_emit.c
index bc8e8c9..a870e75 100644
--- a/i965/brw_wm_emit.c
+++ b/i965/brw_wm_emit.c
@@ -699,7 +699,6 @@ static void emit_tex( struct brw_wm_compile *c,
 {
    struct brw_compile *p = &c->func;
    GLuint msgLength, responseLength;
-   GLboolean shadow = (c->key.shadowtex_mask & (1<<inst->tex_unit)) ? 1 : 0;
    GLuint i, nr;
    GLuint emit;
 
@@ -721,7 +720,7 @@ static void emit_tex( struct brw_wm_compile *c,
       break;
    }
 
-   if (shadow) {
+   if (inst->tex_shadow) {
       nr = 4;
       emit |= WRITEMASK_W;
    }
@@ -743,10 +742,10 @@ static void emit_tex( struct brw_wm_compile *c,
 	      retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
 	      1,
 	      retype(c->payload.depth[0].hw_reg, BRW_REGISTER_TYPE_UW),
-	      inst->tex_unit + MAX_DRAW_BUFFERS, /* surface */
+              SURF_INDEX_TEXTURE(inst->tex_unit),
 	      inst->tex_unit,	  /* sampler */
 	      inst->writemask,
-	      (shadow ? 
+	      (inst->tex_shadow ? 
 	       BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE : 
 	       BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE),
 	      responseLength,
@@ -792,7 +791,7 @@ static void emit_txb( struct brw_wm_compile *c,
 	      retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
 	      1,
 	      retype(c->payload.depth[0].hw_reg, BRW_REGISTER_TYPE_UW),
-	      inst->tex_unit + MAX_DRAW_BUFFERS, /* surface */
+              SURF_INDEX_TEXTURE(inst->tex_unit),
 	      inst->tex_unit,	  /* sampler */
 	      inst->writemask,
 	      BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,
@@ -914,6 +913,9 @@ static void emit_aa( struct brw_wm_compile *c,
 
 /* Post-fragment-program processing.  Send the results to the
  * framebuffer.
+ * \param arg0  the fragment color
+ * \param arg1  the pass-through depth value
+ * \param arg2  the shader-computed depth value
  */
 static void emit_fb_write( struct brw_wm_compile *c,
 			   struct brw_reg *arg0,
@@ -1022,8 +1024,8 @@ static void emit_fb_write( struct brw_wm_compile *c,
 }
 
 
-/* Post-fragment-program processing.  Send the results to the
- * framebuffer.
+/**
+ * Move a GPR to scratch memory. 
  */
 static void emit_spill( struct brw_wm_compile *c,
 			struct brw_reg reg,
@@ -1042,11 +1044,13 @@ static void emit_spill( struct brw_wm_compile *c,
    */
    brw_dp_WRITE_16(p, 
 		   retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW),
-		   1, 
 		   slot);
 }
 
 
+/**
+ * Load a GPR from scratch memory. 
+ */
 static void emit_unspill( struct brw_wm_compile *c,
 			  struct brw_reg reg,
 			  GLuint slot )
@@ -1067,13 +1071,13 @@ static void emit_unspill( struct brw_wm_compile *c,
 
    brw_dp_READ_16(p,
 		  retype(vec16(reg), BRW_REGISTER_TYPE_UW),
-		  1, 
 		  slot);
 }
 
 
 /**
- * Retrieve upto 4 GEN4 register pairs for the given wm reg:
+ * Retrieve up to 4 GEN4 register pairs for the given wm reg:
+ * Args with unspill_reg != 0 will be loaded from scratch memory.
  */
 static void get_argument_regs( struct brw_wm_compile *c,
 			       struct brw_wm_ref *arg[],
@@ -1083,13 +1087,12 @@ static void get_argument_regs( struct brw_wm_compile *c,
 
    for (i = 0; i < 4; i++) {
       if (arg[i]) {
-
-	 if (arg[i]->unspill_reg) 
-	    emit_unspill(c, 
+	 if (arg[i]->unspill_reg)
+	    emit_unspill(c,
 			 brw_vec8_grf(arg[i]->unspill_reg, 0),
 			 arg[i]->value->spill_slot);
 
-	 regs[i] = arg[i]->hw_reg;	 
+	 regs[i] = arg[i]->hw_reg;
       }
       else {
 	 regs[i] = brw_null_reg();
@@ -1098,6 +1101,9 @@ static void get_argument_regs( struct brw_wm_compile *c,
 }
 
 
+/**
+ * For values that have a spill_slot!=0, write those regs to scratch memory.
+ */
 static void spill_values( struct brw_wm_compile *c,
 			  struct brw_wm_value *values,
 			  GLuint nr )
diff --git a/i965/brw_wm_fp.c b/i965/brw_wm_fp.c
index 7ebe5b9..49aad28 100644
--- a/i965/brw_wm_fp.c
+++ b/i965/brw_wm_fp.c
@@ -80,9 +80,8 @@ static struct prog_src_register src_reg(GLuint file, GLuint idx)
    reg.Index = idx;
    reg.Swizzle = SWIZZLE_NOOP;
    reg.RelAddr = 0;
-   reg.NegateBase = 0;
+   reg.Negate = NEGATE_NONE;
    reg.Abs = 0;
-   reg.NegateAbs = 0;
    return reg;
 }
 
@@ -112,6 +111,12 @@ static struct prog_src_register src_swizzle1( struct prog_src_register reg, int
    return src_swizzle(reg, x, x, x, x);
 }
 
+static struct prog_src_register src_swizzle4( struct prog_src_register reg, uint swizzle )
+{
+   reg.Swizzle = swizzle;
+   return reg;
+}
+
 
 /***********************************************************************
  * Dest regs
@@ -187,6 +192,7 @@ static struct prog_instruction * emit_tex_op(struct brw_wm_compile *c,
 				       GLuint saturate,
 				       GLuint tex_src_unit,
 				       GLuint tex_src_target,
+				       GLuint tex_shadow,
 				       struct prog_src_register src0,
 				       struct prog_src_register src1,
 				       struct prog_src_register src2 )
@@ -200,6 +206,7 @@ static struct prog_instruction * emit_tex_op(struct brw_wm_compile *c,
    inst->SaturateMode = saturate;   
    inst->TexSrcUnit = tex_src_unit;
    inst->TexSrcTarget = tex_src_target;
+   inst->TexShadow = tex_shadow;
    inst->SrcReg[0] = src0;
    inst->SrcReg[1] = src1;
    inst->SrcReg[2] = src2;
@@ -216,7 +223,7 @@ static struct prog_instruction * emit_op(struct brw_wm_compile *c,
 				       struct prog_src_register src2 )
 {
    return emit_tex_op(c, op, dest, saturate,
-                      0, 0,  /* tex unit, target */
+                      0, 0, 0,  /* tex unit, target, shadow */
                       src0, src1, src2);
 }
    
@@ -282,8 +289,7 @@ static struct prog_src_register get_pixel_w( struct brw_wm_compile *c )
       struct prog_dst_register pixel_w = get_temp(c);
       struct prog_src_register deltas = get_delta_xy(c);
       struct prog_src_register interp_wpos = src_reg(PROGRAM_PAYLOAD, FRAG_ATTRIB_WPOS);
-      
-      
+
       /* deltas.xyw = DELTAS2 deltas.xy, payload.interp_wpos.x
        */
       emit_op(c,
@@ -548,7 +554,6 @@ static void precalc_dst( struct brw_wm_compile *c,
 	      src_undef());
    }
 
-
    if (dst.WriteMask & WRITEMASK_XZ) {
       struct prog_instruction *swz;
       GLuint z = GET_SWZ(src0.Swizzle, Z);
@@ -563,7 +568,7 @@ static void precalc_dst( struct brw_wm_compile *c,
 		    src_undef(),
 		    src_undef());
       /* Avoid letting negation flag of src0 affect our 1 constant. */
-      swz->SrcReg[0].NegateBase &= ~NEGATE_X;
+      swz->SrcReg[0].Negate &= ~NEGATE_X;
    }
    if (dst.WriteMask & WRITEMASK_W) {
       /* dst.w = mov src1.w
@@ -598,10 +603,9 @@ static void precalc_lit( struct brw_wm_compile *c,
 		    src_undef(),
 		    src_undef());
       /* Avoid letting the negation flag of src0 affect our 1 constant. */
-      swz->SrcReg[0].NegateBase = 0;
+      swz->SrcReg[0].Negate = NEGATE_NONE;
    }
 
-
    if (dst.WriteMask & WRITEMASK_YZ) {
       emit_op(c,
 	      OPCODE_LIT,
@@ -646,7 +650,7 @@ static void precalc_tex( struct brw_wm_compile *c,
                      src0,
                      src_undef(),
                      src_undef());
-       out->SrcReg[0].NegateBase = 0;
+       out->SrcReg[0].Negate = NEGATE_NONE;
        out->SrcReg[0].Abs = 1;
 
        /* tmp0 = MAX(coord.X, coord.Y) */
@@ -745,6 +749,7 @@ static void precalc_tex( struct brw_wm_compile *c,
                   inst->SaturateMode,
                   unit,
                   inst->TexSrcTarget,
+                  inst->TexShadow,
                   coord,
                   src_undef(),
                   src_undef());
@@ -805,21 +810,40 @@ static void precalc_tex( struct brw_wm_compile *c,
                   inst->SaturateMode,
                   unit,
                   inst->TexSrcTarget,
+                  inst->TexShadow,
                   coord,
                   src_undef(),
                   src_undef());
    }
 
+   /* For GL_EXT_texture_swizzle: */
+   if (c->key.tex_swizzles[unit] != SWIZZLE_NOOP) {
+      /* swizzle the result of the TEX instruction */
+      struct prog_src_register tmpsrc = src_reg_from_dst(inst->DstReg);
+      emit_op(c, OPCODE_SWZ,
+              inst->DstReg,
+              SATURATE_OFF, /* saturate already done above */
+              src_swizzle4(tmpsrc, c->key.tex_swizzles[unit]),
+              src_undef(),
+              src_undef());
+   }
+
    if ((inst->TexSrcTarget == TEXTURE_RECT_INDEX) ||
        (inst->TexSrcTarget == TEXTURE_CUBE_INDEX))
       release_temp(c, tmpcoord);
 }
 
 
+/**
+ * Check if the given TXP instruction really needs the divide-by-W step.
+ */
 static GLboolean projtex( struct brw_wm_compile *c,
 			  const struct prog_instruction *inst )
 {
-   struct prog_src_register src = inst->SrcReg[0];
+   const struct prog_src_register src = inst->SrcReg[0];
+   GLboolean retVal;
+
+   assert(inst->Opcode == OPCODE_TXP);
 
    /* Only try to detect the simplest cases.  Could detect (later)
     * cases where we are trying to emit code like RCP {1.0}, MUL x,
@@ -829,16 +853,21 @@ static GLboolean projtex( struct brw_wm_compile *c,
     * user-provided fragment programs anyway:
     */
    if (inst->TexSrcTarget == TEXTURE_CUBE_INDEX)
-      return 0;  /* ut2004 gun rendering !?! */
+      retVal = GL_FALSE;  /* ut2004 gun rendering !?! */
    else if (src.File == PROGRAM_INPUT && 
 	    GET_SWZ(src.Swizzle, W) == W &&
-           (c->key.projtex_mask & (1<<(src.Index + FRAG_ATTRIB_WPOS - FRAG_ATTRIB_TEX0))) == 0)
-      return 0;
+            (c->key.proj_attrib_mask & (1 << src.Index)) == 0)
+      retVal = GL_FALSE;
    else
-      return 1;
+      retVal = GL_TRUE;
+
+   return retVal;
 }
 
 
+/**
+ * Emit code for TXP.
+ */
 static void precalc_txp( struct brw_wm_compile *c,
 			       const struct prog_instruction *inst )
 {
@@ -889,42 +918,41 @@ static void precalc_txp( struct brw_wm_compile *c,
 static void emit_fb_write( struct brw_wm_compile *c )
 {
    struct prog_src_register payload_r0_depth = src_reg(PROGRAM_PAYLOAD, PAYLOAD_DEPTH);
-   struct prog_src_register outdepth = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DEPR);
+   struct prog_src_register outdepth = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DEPTH);
    struct prog_src_register outcolor;
    GLuint i;
 
    struct prog_instruction *inst, *last_inst;
    struct brw_context *brw = c->func.brw;
 
-   /* inst->Sampler is not used by backend, 
-      use it for fb write target and eot */
-
-   if (brw->state.nr_draw_regions > 1) {
-       for (i = 0 ; i < brw->state.nr_draw_regions; i++) {
-	   outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DATA0 + i);
-	   last_inst = inst = emit_op(c,
-		   WM_FB_WRITE, dst_mask(dst_undef(),0), 0,
-		   outcolor, payload_r0_depth, outdepth);
-	   inst->Sampler = (i<<1);
-	   if (c->fp_fragcolor_emitted) {
-	       outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLR);
-	       last_inst = inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(),0),
-		       0, outcolor, payload_r0_depth, outdepth);
-	       inst->Sampler = (i<<1);
-	   }
-       }
-       last_inst->Sampler |= 1; //eot
+   /* The inst->Aux field is used for FB write target and the EOT marker */
+
+   if (brw->state.nr_color_regions > 1) {
+      for (i = 0 ; i < brw->state.nr_color_regions; i++) {
+         outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DATA0 + i);
+         last_inst = inst = emit_op(c,
+                                    WM_FB_WRITE, dst_mask(dst_undef(),0), 0,
+                                    outcolor, payload_r0_depth, outdepth);
+         inst->Aux = (i<<1);
+         if (c->fp_fragcolor_emitted) {
+            outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLOR);
+            last_inst = inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(),0),
+                                       0, outcolor, payload_r0_depth, outdepth);
+            inst->Aux = (i<<1);
+         }
+      }
+      last_inst->Aux |= 1; //eot
    }
    else {
       /* if gl_FragData[0] is written, use it, else use gl_FragColor */
       if (c->fp->program.Base.OutputsWritten & (1 << FRAG_RESULT_DATA0))
          outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DATA0);
       else 
-         outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLR);
+         outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLOR);
 
-       inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(),0),
-	       0, outcolor, payload_r0_depth, outdepth);
-       inst->Sampler = 1|(0<<1);
+      inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(),0),
+                     0, outcolor, payload_r0_depth, outdepth);
+      inst->Aux = 1|(0<<1);
    }
 }
 
@@ -955,9 +983,9 @@ static void validate_dst_regs( struct brw_wm_compile *c,
 			       const struct prog_instruction *inst )
 {
    if (inst->DstReg.File == PROGRAM_OUTPUT) {
-       GLuint idx = inst->DstReg.Index;
-       if (idx == FRAG_RESULT_COLR)
-	   c->fp_fragcolor_emitted = 1;
+      GLuint idx = inst->DstReg.Index;
+      if (idx == FRAG_RESULT_COLOR)
+         c->fp_fragcolor_emitted = 1;
    }
 }
 
@@ -981,6 +1009,11 @@ static void print_insns( const struct prog_instruction *insn,
    }
 }
 
+
+/**
+ * Initial pass for fragment program code generation.
+ * This function is used by both the GLSL and non-GLSL paths.
+ */
 void brw_wm_pass_fp( struct brw_wm_compile *c )
 {
    struct brw_fragment_program *fp = c->fp;
@@ -997,15 +1030,19 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
    c->pixel_w = src_undef();
    c->nr_fp_insns = 0;
 
-   /* Emit preamble instructions:
+   /* Emit preamble instructions.  This is where special instructions such as
+    * WM_CINTERP, WM_LINTERP, WM_PINTERP and WM_WPOSXY are emitted to
+    * compute shader inputs from varying vars.
     */
-
-
    for (insn = 0; insn < fp->program.Base.NumInstructions; insn++) {
       const struct prog_instruction *inst = &fp->program.Base.Instructions[insn];
       validate_src_regs(c, inst);
       validate_dst_regs(c, inst);
    }
+
+   /* Loop over all instructions doing assorted simplifications and
+    * transformations.
+    */
    for (insn = 0; insn < fp->program.Base.NumInstructions; insn++) {
       const struct prog_instruction *inst = &fp->program.Base.Instructions[insn];
       struct prog_instruction *out;
@@ -1014,7 +1051,6 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
        * necessary:
        */
 
-
       switch (inst->Opcode) {
       case OPCODE_SWZ: 
 	 out = emit_insn(c, inst);
@@ -1024,14 +1060,14 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
       case OPCODE_ABS:
 	 out = emit_insn(c, inst);
 	 out->Opcode = OPCODE_MOV;
-	 out->SrcReg[0].NegateBase = 0;
+	 out->SrcReg[0].Negate = NEGATE_NONE;
 	 out->SrcReg[0].Abs = 1;
 	 break;
 
       case OPCODE_SUB: 
 	 out = emit_insn(c, inst);
 	 out->Opcode = OPCODE_ADD;
-	 out->SrcReg[1].NegateBase ^= 0xf;
+	 out->SrcReg[1].Negate ^= NEGATE_XYZW;
 	 break;
 
       case OPCODE_SCS: 
@@ -1094,9 +1130,9 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
    }
 
    if (INTEL_DEBUG & DEBUG_WM) {
-	   _mesa_printf("pass_fp:\n");
-	   print_insns( c->prog_instructions, c->nr_fp_insns );
-	   _mesa_printf("\n");
+      _mesa_printf("pass_fp:\n");
+      print_insns( c->prog_instructions, c->nr_fp_insns );
+      _mesa_printf("\n");
    }
 }
 
diff --git a/i965/brw_wm_glsl.c b/i965/brw_wm_glsl.c
index 5a5497e..a907e1b 100644
--- a/i965/brw_wm_glsl.c
+++ b/i965/brw_wm_glsl.c
@@ -8,12 +8,17 @@ enum _subroutine {
     SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
 };
 
-/* Only guess, need a flag in gl_fragment_program later */
+
+/**
+ * Determine if the given fragment program uses GLSL features such
+ * as flow conditionals, loops, subroutines.
+ * Some GLSL shaders may use these features, others might not.
+ */
 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
 {
     int i;
     for (i = 0; i < fp->Base.NumInstructions; i++) {
-	struct prog_instruction *inst = &fp->Base.Instructions[i];
+	const struct prog_instruction *inst = &fp->Base.Instructions[i];
 	switch (inst->Opcode) {
 	    case OPCODE_IF:
 	    case OPCODE_TRUNC:
@@ -36,6 +41,10 @@ GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
     return GL_FALSE; 
 }
 
+
+/**
+ * Record the mapping of a Mesa register to a hardware register.
+ */
 static void set_reg(struct brw_wm_compile *c, int file, int index, 
 	int component, struct brw_reg reg)
 {
@@ -43,7 +52,11 @@ static void set_reg(struct brw_wm_compile *c, int file, int index,
     c->wm_regs[file][index][component].inited = GL_TRUE;
 }
 
-static int get_scalar_dst_index(struct prog_instruction *inst)
+/**
+ * Examine instruction's write mask to find index of first component
+ * enabled for writing.
+ */
+static int get_scalar_dst_index(const struct prog_instruction *inst)
 {
     int i;
     for (i = 0; i < 4; i++)
@@ -62,6 +75,10 @@ static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
     return reg;
 }
 
+/**
+ * Save current temp register info.
+ * There must be a matching call to release_tmps().
+ */
 static int mark_tmps(struct brw_wm_compile *c)
 {
     return c->tmp_index;
@@ -77,8 +94,22 @@ static void release_tmps(struct brw_wm_compile *c, int mark)
     c->tmp_index = mark;
 }
 
+/**
+ * Convert Mesa src register to brw register.
+ *
+ * Since we're running in SOA mode each Mesa register corresponds to four
+ * hardware registers.  We allocate the hardware registers as needed here.
+ *
+ * \param file  register file, one of PROGRAM_x
+ * \param index  register number
+ * \param component  src component (X=0, Y=1, Z=2, W=3)
+ * \param nr  not used?!?
+ * \param neg  negate value?
+ * \param abs  take absolute value?
+ */
 static struct brw_reg 
-get_reg(struct brw_wm_compile *c, int file, int index, int component, int nr, GLuint neg, GLuint abs)
+get_reg(struct brw_wm_compile *c, int file, int index, int component,
+        int nr, GLuint neg, GLuint abs)
 {
     struct brw_reg reg;
     switch (file) {
@@ -99,12 +130,18 @@ get_reg(struct brw_wm_compile *c, int file, int index, int component, int nr, GL
 	    return brw_null_reg();
     }
 
-    if(c->wm_regs[file][index][component].inited)
+    /* see if we've already allocated a HW register for this Mesa register */
+    if (c->wm_regs[file][index][component].inited) {
+	/* yes, re-use */
 	reg = c->wm_regs[file][index][component].reg;
-    else 
+    }
+    else {
+	/* no, allocate new register */
 	reg = brw_vec8_grf(c->reg_index, 0);
+    }
 
-    if(!c->wm_regs[file][index][component].inited) {
+    /* if this is a new register allocation, record it in the table */
+    if (!c->wm_regs[file][index][component].inited) {
 	set_reg(c, file, index, component, reg);
 	c->reg_index++;
     }
@@ -113,7 +150,7 @@ get_reg(struct brw_wm_compile *c, int file, int index, int component, int nr, GL
 	/* ran out of temporary registers! */
 #if 1
         /* This is a big hack for now.
-         * Return bad register index, but don't just crash hange the GPU.
+         * Return bad register index, just don't hang the GPU.
          */
         _mesa_fprintf(stderr, "out of regs %d\n", c->reg_index);
         c->reg_index = BRW_WM_MAX_GRF - 13;
@@ -130,78 +167,273 @@ get_reg(struct brw_wm_compile *c, int file, int index, int component, int nr, GL
     return reg;
 }
 
+
+/**
+ * Preallocate registers.  This sets up the Mesa to hardware register
+ * mapping for certain registers, such as constants (uniforms/state vars)
+ * and shader inputs.
+ */
 static void prealloc_reg(struct brw_wm_compile *c)
 {
     int i, j;
     struct brw_reg reg;
-    int nr_interp_regs = 0;
+    int urb_read_length = 0;
     GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
 
     for (i = 0; i < 4; i++) {
-	reg = (i < c->key.nr_depth_regs) 
-	    ? brw_vec8_grf(i*2, 0) : brw_vec8_grf(0, 0);
+        if (i < c->key.nr_depth_regs) 
+            reg = brw_vec8_grf(i * 2, 0);
+        else
+            reg = brw_vec8_grf(0, 0);
 	set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
     }
-    c->reg_index += 2*c->key.nr_depth_regs;
+    c->reg_index += 2 * c->key.nr_depth_regs;
+
+    /* constants */
     {
-	int nr_params = c->fp->program.Base.Parameters->NumParameters;
-	struct gl_program_parameter_list *plist = 
-	    c->fp->program.Base.Parameters;
-	int index = 0;
-	c->prog_data.nr_params = 4*nr_params;
-	for (i = 0; i < nr_params; i++) {
-	    for (j = 0; j < 4; j++, index++) {
-		reg = brw_vec1_grf(c->reg_index + index/8, 
-			index%8);
-		c->prog_data.param[index] = 
-		    &plist->ParameterValues[i][j];
-		set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
-	    }
-	}
-	c->nr_creg = 2*((4*nr_params+15)/16);
-	c->reg_index += c->nr_creg;
+        const int nr_params = c->fp->program.Base.Parameters->NumParameters;
+
+        /* use a real constant buffer, or just use a section of the GRF? */
+        c->fp->use_const_buffer = GL_FALSE; /* (nr_params > 8);*/
+
+        if (c->fp->use_const_buffer) {
+           /* We'll use a real constant buffer and fetch constants from
+            * it with a dataport read message.
+            */
+
+           /* number of float constants in CURBE */
+           c->prog_data.nr_params = 0;
+        }
+        else {
+           const struct gl_program_parameter_list *plist = 
+              c->fp->program.Base.Parameters;
+           int index = 0;
+
+           /* number of float constants in CURBE */
+           c->prog_data.nr_params = 4 * nr_params;
+
+           /* loop over program constants (float[4]) */
+           for (i = 0; i < nr_params; i++) {
+              /* loop over XYZW channels */
+              for (j = 0; j < 4; j++, index++) {
+                 reg = brw_vec1_grf(c->reg_index + index / 8, index % 8);
+                 /* Save pointer to parameter/constant value.
+                  * Constants will be copied in prepare_constant_buffer()
+                  */
+                 c->prog_data.param[index] = &plist->ParameterValues[i][j];
+                 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
+              }
+           }
+           /* number of constant regs used (each reg is float[8]) */
+           c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
+           c->reg_index += c->nr_creg;
+        }
     }
-    for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
-	if (inputs & (1<<i)) {
-	    nr_interp_regs++;
-	    reg = brw_vec8_grf(c->reg_index, 0);
-	    for (j = 0; j < 4; j++)
-		set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
-	    c->reg_index += 2;
 
-	}
+    /* fragment shader inputs */
+    for (i = 0; i < VERT_RESULT_MAX; i++) {
+       int fp_input;
+
+       if (i >= VERT_RESULT_VAR0)
+	  fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
+       else if (i <= VERT_RESULT_TEX7)
+	  fp_input = i;
+       else
+	  fp_input = -1;
+
+       if (fp_input >= 0 && inputs & (1 << fp_input)) {
+	  urb_read_length = c->reg_index;
+	  reg = brw_vec8_grf(c->reg_index, 0);
+	  for (j = 0; j < 4; j++)
+	     set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
+       }
+       if (c->key.vp_outputs_written & (1 << i)) {
+	  c->reg_index += 2;
+       }
     }
+
     c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
-    c->prog_data.urb_read_length = nr_interp_regs * 2;
+    c->prog_data.urb_read_length = urb_read_length;
     c->prog_data.curb_read_length = c->nr_creg;
     c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
     c->reg_index++;
     c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
     c->reg_index += 2;
+
+    /* An instruction may reference up to three constants.
+     * They'll be found in these registers.
+     * XXX alloc these on demand!
+     */
+    if (c->fp->use_const_buffer) {
+       for (i = 0; i < 3; i++) {
+          c->current_const[i].index = -1;
+          c->current_const[i].reg = alloc_tmp(c);
+       }
+    }
+#if 0
+    printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
+    printf("AFTER PRE_ALLOC, reg_index = %d\n", c->reg_index);
+#endif
 }
 
+
+/**
+ * Check if any of the instruction's src registers are constants, uniforms,
+ * or statevars.  If so, fetch each one from the constant buffer into the
+ * GRF slot reserved for that src operand (no caching of prior fetches).
+ */
+static void fetch_constants(struct brw_wm_compile *c,
+                            const struct prog_instruction *inst)
+{
+   struct brw_compile *p = &c->func;
+   GLuint i;
+
+   /* loop over instruction src regs */
+   for (i = 0; i < 3; i++) {
+      const struct prog_src_register *src = &inst->SrcReg[i];
+      if (src->File == PROGRAM_STATE_VAR ||
+          src->File == PROGRAM_CONSTANT ||
+          src->File == PROGRAM_UNIFORM) {
+	 c->current_const[i].index = src->Index;
+
+#if 0
+	 printf("  fetch const[%d] for arg %d into reg %d\n",
+		src->Index, i, c->current_const[i].reg.nr);
+#endif
+
+	 /* need to fetch the constant now */
+	 brw_dp_READ_4(p,
+		       c->current_const[i].reg,  /* writeback dest */
+		       src->RelAddr,             /* relative indexing? */
+		       16 * src->Index,          /* byte offset */
+		       SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
+		       );
+      }
+   }
+}
+
+
+/**
+ * Convert Mesa dst register to brw register.
+ */
 static struct brw_reg get_dst_reg(struct brw_wm_compile *c, 
-	struct prog_instruction *inst, int component, int nr)
+                                  const struct prog_instruction *inst,
+                                  GLuint component)
 {
+    const int nr = 1;
     return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
 	    0, 0);
 }
 
+
+static struct brw_reg
+get_src_reg_const(struct brw_wm_compile *c,
+                  const struct prog_instruction *inst,
+                  GLuint srcRegIndex, GLuint component)
+{
+   /* We should have already fetched the constant from the constant
+    * buffer in fetch_constants().  Now we just have to return a
+    * register description that extracts the needed component and
+    * smears it across all eight vector components.
+    */
+   const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
+   struct brw_reg const_reg;
+
+   assert(component < 4);
+   assert(srcRegIndex < 3);
+   assert(c->current_const[srcRegIndex].index != -1);
+   const_reg = c->current_const[srcRegIndex].reg;
+
+   /* extract desired float from the const_reg, and smear */
+   const_reg = stride(const_reg, 0, 1, 0);
+   const_reg.subnr = component * 4;
+
+   if (src->Negate & (1 << component))
+      const_reg = negate(const_reg);
+   if (src->Abs)
+      const_reg = brw_abs(const_reg);
+
+#if 0
+   printf("  form const[%d].%d for arg %d, reg %d\n",
+          c->current_const[srcRegIndex].index,
+          component,
+          srcRegIndex,
+          const_reg.nr);
+#endif
+
+   return const_reg;
+}
+
+
+/**
+ * Convert Mesa src register to brw register.
+ */
 static struct brw_reg get_src_reg(struct brw_wm_compile *c, 
-	struct prog_src_register *src, int index, int nr)
-{
-    int component = GET_SWZ(src->Swizzle, index);
-    return get_reg(c, src->File, src->Index, component, nr, 
-	    src->NegateBase, src->Abs);
+                                  const struct prog_instruction *inst,
+                                  GLuint srcRegIndex, GLuint channel)
+{
+    const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
+    const GLuint nr = 1;
+    const GLuint component = GET_SWZ(src->Swizzle, channel);
+
+    if (c->fp->use_const_buffer &&
+        (src->File == PROGRAM_STATE_VAR ||
+         src->File == PROGRAM_CONSTANT ||
+         src->File == PROGRAM_UNIFORM)) {
+       return get_src_reg_const(c, inst, srcRegIndex, component);
+    }
+    else {
+       /* other type of source register */
+       return get_reg(c, src->File, src->Index, component, nr, 
+                      src->Negate, src->Abs);
+    }
 }
 
-/* Subroutines are minimal support for resusable instruction sequences.
-   They are implemented as simply as possible to minimise overhead: there
-   is no explicit support for communication between the caller and callee
-   other than saving the return address in a temporary register, nor is
-   there any automatic local storage.  This implies that great care is
-   required before attempting reentrancy or any kind of nested
-   subroutine invocations. */
+
+/**
+ * Same as \sa get_src_reg() but if the register is a literal
+ * (PROGRAM_CONSTANT), return a brw_reg encoding it as a float immediate.
+ * Note that a brw instruction only allows one src operand to be a literal.
+ * For instructions with more than one operand, only the second can be a
+ * literal.  This means that we treat some literals as constants/uniforms
+ * (which is why PROGRAM_CONSTANT is checked in fetch_constants()).
+ * Abs is applied before Negate: Mesa's Negate is a post-Abs negation.
+ */
+static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c, 
+                                      const struct prog_instruction *inst,
+                                      GLuint srcRegIndex, GLuint channel)
+{
+    const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
+    if (src->File == PROGRAM_CONSTANT) {
+       /* a literal */
+       const int component = GET_SWZ(src->Swizzle, channel);
+       const GLfloat *param =
+          c->fp->program.Base.Parameters->ParameterValues[src->Index];
+       GLfloat value = param[component];
+       if (src->Abs)
+          value = FABSF(value);
+       if (src->Negate & (1 << channel))
+          value = -value;
+#if 0
+       printf("  form immed value %f for chan %d\n", value, channel);
+#endif
+       return brw_imm_f(value);
+    }
+    else {
+       return get_src_reg(c, inst, srcRegIndex, channel);
+    }
+}
+
+
+/**
+ * Subroutines are minimal support for reusable instruction sequences.
+ * They are implemented as simply as possible to minimise overhead: there
+ * is no explicit support for communication between the caller and callee
+ * other than saving the return address in a temporary register, nor is
+ * there any automatic local storage.  This implies that great care is
+ * required before attempting reentrancy or any kind of nested
+ * subroutine invocations.
+ */
 static void invoke_subroutine( struct brw_wm_compile *c,
 			       enum _subroutine subroutine,
 			       void (*emit)( struct brw_wm_compile * ) )
@@ -258,7 +490,7 @@ static void invoke_subroutine( struct brw_wm_compile *c,
 }
 
 static void emit_abs( struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                      const struct prog_instruction *inst)
 {
     int i;
     struct brw_compile *p = &c->func;
@@ -266,8 +498,8 @@ static void emit_abs( struct brw_wm_compile *c,
     for (i = 0; i < 4; i++) {
 	if (inst->DstReg.WriteMask & (1<<i)) {
 	    struct brw_reg src, dst;
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src = get_src_reg(c, &inst->SrcReg[0], i, 1);
+	    dst = get_dst_reg(c, inst, i);
+	    src = get_src_reg(c, inst, 0, i);
 	    brw_MOV(p, dst, brw_abs(src));
 	}
     }
@@ -275,7 +507,7 @@ static void emit_abs( struct brw_wm_compile *c,
 }
 
 static void emit_trunc( struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                        const struct prog_instruction *inst)
 {
     int i;
     struct brw_compile *p = &c->func;
@@ -284,8 +516,8 @@ static void emit_trunc( struct brw_wm_compile *c,
     for (i = 0; i < 4; i++) {
 	if (mask & (1<<i)) {
 	    struct brw_reg src, dst;
-	    dst = get_dst_reg(c, inst, i, 1) ;
-	    src = get_src_reg(c, &inst->SrcReg[0], i, 1);
+	    dst = get_dst_reg(c, inst, i);
+	    src = get_src_reg(c, inst, 0, i);
 	    brw_RNDZ(p, dst, src);
 	}
     }
@@ -293,7 +525,7 @@ static void emit_trunc( struct brw_wm_compile *c,
 }
 
 static void emit_mov( struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                      const struct prog_instruction *inst)
 {
     int i;
     struct brw_compile *p = &c->func;
@@ -302,8 +534,10 @@ static void emit_mov( struct brw_wm_compile *c,
     for (i = 0; i < 4; i++) {
 	if (mask & (1<<i)) {
 	    struct brw_reg src, dst;
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src = get_src_reg(c, &inst->SrcReg[0], i, 1);
+	    dst = get_dst_reg(c, inst, i);
+            /* XXX some moves from immediate value don't work reliably!!! */
+            /*src = get_src_reg_imm(c, inst, 0, i);*/
+            src = get_src_reg(c, inst, 0, i);
 	    brw_MOV(p, dst, src);
 	}
     }
@@ -311,7 +545,7 @@ static void emit_mov( struct brw_wm_compile *c,
 }
 
 static void emit_pixel_xy(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                          const struct prog_instruction *inst)
 {
     struct brw_reg r1 = brw_vec1_grf(1, 0);
     struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
@@ -320,8 +554,8 @@ static void emit_pixel_xy(struct brw_wm_compile *c,
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
 
-    dst0 = get_dst_reg(c, inst, 0, 1);
-    dst1 = get_dst_reg(c, inst, 1, 1);
+    dst0 = get_dst_reg(c, inst, 0);
+    dst1 = get_dst_reg(c, inst, 1);
     /* Calculate pixel centers by adding 1 or 0 to each of the
      * micro-tile coordinates passed in r1.
      */
@@ -338,21 +572,20 @@ static void emit_pixel_xy(struct brw_wm_compile *c,
 		stride(suboffset(r1_uw, 5), 2, 4, 0),
 		brw_imm_v(0x11001100));
     }
-
 }
 
 static void emit_delta_xy(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                          const struct prog_instruction *inst)
 {
     struct brw_reg r1 = brw_vec1_grf(1, 0);
     struct brw_reg dst0, dst1, src0, src1;
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
 
-    dst0 = get_dst_reg(c, inst, 0, 1);
-    dst1 = get_dst_reg(c, inst, 1, 1);
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    src1 = get_src_reg(c, &inst->SrcReg[0], 1, 1);
+    dst0 = get_dst_reg(c, inst, 0);
+    dst1 = get_dst_reg(c, inst, 1);
+    src0 = get_src_reg(c, inst, 0, 0);
+    src1 = get_src_reg(c, inst, 0, 1);
     /* Calc delta X,Y by subtracting origin in r1 from the pixel
      * centers.
      */
@@ -370,10 +603,8 @@ static void emit_delta_xy(struct brw_wm_compile *c,
 		negate(suboffset(r1,1)));
 
     }
-
 }
 
-
 static void fire_fb_write( struct brw_wm_compile *c,
                            GLuint base_reg,
                            GLuint nr,
@@ -404,7 +635,7 @@ static void fire_fb_write( struct brw_wm_compile *c,
 }
 
 static void emit_fb_write(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                          const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     int nr = 2;
@@ -416,38 +647,64 @@ static void emit_fb_write(struct brw_wm_compile *c,
      */
     if (c->key.aa_dest_stencil_reg)
 	nr += 1;
-    {
-	brw_push_insn_state(p);
-	for (channel = 0; channel < 4; channel++) {
-	    src0 = get_src_reg(c,  &inst->SrcReg[0], channel, 1);
-	    /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
-	    /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
-	    brw_MOV(p, brw_message_reg(nr + channel), src0);
-	}
-	/* skip over the regs populated above: */
-	nr += 8;
-	brw_pop_insn_state(p);
+
+    brw_push_insn_state(p);
+    for (channel = 0; channel < 4; channel++) {
+        src0 = get_src_reg(c,  inst, 0, channel);
+        /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
+        /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
+        brw_MOV(p, brw_message_reg(nr + channel), src0);
     }
+    /* skip over the regs populated above: */
+    nr += 8;
+    brw_pop_insn_state(p);
 
-   if (c->key.source_depth_to_render_target)
-   {
-      if (c->key.computes_depth) {
-         src0 = get_src_reg(c, &inst->SrcReg[2], 2, 1);
-         brw_MOV(p, brw_message_reg(nr), src0);
-      } else {
-         src0 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
-         brw_MOV(p, brw_message_reg(nr), src0);
-      }
+    if (c->key.source_depth_to_render_target) {
+       if (c->key.computes_depth) {
+          src0 = get_src_reg(c, inst, 2, 2);
+          brw_MOV(p, brw_message_reg(nr), src0);
+       }
+       else {
+          src0 = get_src_reg(c, inst, 1, 1);
+          brw_MOV(p, brw_message_reg(nr), src0);
+       }
+
+       nr += 2;
+    }
 
-      nr += 2;
+    if (c->key.dest_depth_reg) {
+        GLuint comp = c->key.dest_depth_reg / 2;
+        GLuint off = c->key.dest_depth_reg % 2;
+
+        assert(comp == 1);
+        assert(off == 0);
+#if 0
+        /* XXX do we need this code?   comp always 1, off always 0, it seems */
+        if (off != 0) {
+            brw_push_insn_state(p);
+            brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+
+            brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
+            /* 2nd half? */
+            brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
+            brw_pop_insn_state(p);
+        }
+        else
+#endif
+        {
+           struct brw_reg src =  get_src_reg(c, inst, 1, 1);
+           brw_MOV(p, brw_message_reg(nr), src);
+        }
+        nr += 2;
    }
-    target = inst->Sampler >> 1;
-    eot = inst->Sampler & 1;
+
+    target = inst->Aux >> 1;
+    eot = inst->Aux & 1;
     fire_fb_write(c, 0, nr, target, eot);
 }
 
 static void emit_pixel_w( struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                          const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -455,10 +712,10 @@ static void emit_pixel_w( struct brw_wm_compile *c,
 	struct brw_reg dst, src0, delta0, delta1;
 	struct brw_reg interp3;
 
-	dst = get_dst_reg(c, inst, 3, 1);
-	src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-	delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
-	delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
+	dst = get_dst_reg(c, inst, 3);
+	src0 = get_src_reg(c, inst, 0, 0);
+	delta0 = get_src_reg(c, inst, 1, 0);
+	delta1 = get_src_reg(c, inst, 1, 1);
 
 	interp3 = brw_vec1_grf(src0.nr+1, 4);
 	/* Calc 1/w - just linterp wpos[3] optimized by putting the
@@ -477,19 +734,19 @@ static void emit_pixel_w( struct brw_wm_compile *c,
 }
 
 static void emit_linterp(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                         const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
     struct brw_reg interp[4];
     struct brw_reg dst, delta0, delta1;
     struct brw_reg src0;
+    GLuint nr, i;
 
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
-    delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
-    GLuint nr = src0.nr;
-    int i;
+    src0 = get_src_reg(c, inst, 0, 0);
+    delta0 = get_src_reg(c, inst, 1, 0);
+    delta1 = get_src_reg(c, inst, 1, 1);
+    nr = src0.nr;
 
     interp[0] = brw_vec1_grf(nr, 0);
     interp[1] = brw_vec1_grf(nr, 4);
@@ -498,7 +755,7 @@ static void emit_linterp(struct brw_wm_compile *c,
 
     for(i = 0; i < 4; i++ ) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
+	    dst = get_dst_reg(c, inst, i);
 	    brw_LINE(p, brw_null_reg(), interp[i], delta0);
 	    brw_MAC(p, dst, suboffset(interp[i],1), delta1);
 	}
@@ -506,17 +763,17 @@ static void emit_linterp(struct brw_wm_compile *c,
 }
 
 static void emit_cinterp(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                         const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
 
     struct brw_reg interp[4];
     struct brw_reg dst, src0;
+    GLuint nr, i;
 
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    GLuint nr = src0.nr;
-    int i;
+    src0 = get_src_reg(c, inst, 0, 0);
+    nr = src0.nr;
 
     interp[0] = brw_vec1_grf(nr, 0);
     interp[1] = brw_vec1_grf(nr, 4);
@@ -525,14 +782,14 @@ static void emit_cinterp(struct brw_wm_compile *c,
 
     for(i = 0; i < 4; i++ ) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
+	    dst = get_dst_reg(c, inst, i);
 	    brw_MOV(p, dst, suboffset(interp[i],3));
 	}
     }
 }
 
 static void emit_pinterp(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                         const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -540,13 +797,13 @@ static void emit_pinterp(struct brw_wm_compile *c,
     struct brw_reg interp[4];
     struct brw_reg dst, delta0, delta1;
     struct brw_reg src0, w;
+    GLuint nr, i;
 
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
-    delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
-    w = get_src_reg(c, &inst->SrcReg[2], 3, 1);
-    GLuint nr = src0.nr;
-    int i;
+    src0 = get_src_reg(c, inst, 0, 0);
+    delta0 = get_src_reg(c, inst, 1, 0);
+    delta1 = get_src_reg(c, inst, 1, 1);
+    w = get_src_reg(c, inst, 2, 3);
+    nr = src0.nr;
 
     interp[0] = brw_vec1_grf(nr, 0);
     interp[1] = brw_vec1_grf(nr, 4);
@@ -555,7 +812,7 @@ static void emit_pinterp(struct brw_wm_compile *c,
 
     for(i = 0; i < 4; i++ ) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
+	    dst = get_dst_reg(c, inst, i);
 	    brw_LINE(p, brw_null_reg(), interp[i], delta0);
 	    brw_MAC(p, dst, suboffset(interp[i],1), 
 		    delta1);
@@ -566,7 +823,7 @@ static void emit_pinterp(struct brw_wm_compile *c,
 
 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
 static void emit_frontfacing(struct brw_wm_compile *c,
-			     struct prog_instruction *inst)
+			     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
@@ -576,7 +833,7 @@ static void emit_frontfacing(struct brw_wm_compile *c,
 
     for (i = 0; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
+	    dst = get_dst_reg(c, inst, i);
 	    brw_MOV(p, dst, brw_imm_f(0.0));
 	}
     }
@@ -587,7 +844,7 @@ static void emit_frontfacing(struct brw_wm_compile *c,
     brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
     for (i = 0; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
+	    dst = get_dst_reg(c, inst, i);
 	    brw_MOV(p, dst, brw_imm_f(1.0));
 	}
     }
@@ -595,7 +852,7 @@ static void emit_frontfacing(struct brw_wm_compile *c,
 }
 
 static void emit_xpd(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     int i;
     struct brw_compile *p = &c->func;
@@ -605,12 +862,12 @@ static void emit_xpd(struct brw_wm_compile *c,
 	GLuint i1 = (i+1)%3;
 	if (mask & (1<<i)) {
 	    struct brw_reg src0, src1, dst;
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = negate(get_src_reg(c, &inst->SrcReg[0], i2, 1));
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i1, 1);
+	    dst = get_dst_reg(c, inst, i);
+	    src0 = negate(get_src_reg(c, inst, 0, i2));
+	    src1 = get_src_reg_imm(c, inst, 1, i1);
 	    brw_MUL(p, brw_null_reg(), src0, src1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i1, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i2, 1);
+	    src0 = get_src_reg(c, inst, 0, i1);
+	    src1 = get_src_reg_imm(c, inst, 1, i2);
 	    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 	    brw_MAC(p, dst, src0, src1);
 	    brw_set_saturate(p, 0);
@@ -620,17 +877,17 @@ static void emit_xpd(struct brw_wm_compile *c,
 }
 
 static void emit_dp3(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_reg src0[3], src1[3], dst;
     int i;
     struct brw_compile *p = &c->func;
     for (i = 0; i < 3; i++) {
-	src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
+	src0[i] = get_src_reg(c, inst, 0, i);
+	src1[i] = get_src_reg_imm(c, inst, 1, i);
     }
 
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
+    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
@@ -639,16 +896,16 @@ static void emit_dp3(struct brw_wm_compile *c,
 }
 
 static void emit_dp4(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_reg src0[4], src1[4], dst;
     int i;
     struct brw_compile *p = &c->func;
     for (i = 0; i < 4; i++) {
-	src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
+	src0[i] = get_src_reg(c, inst, 0, i);
+	src1[i] = get_src_reg_imm(c, inst, 1, i);
     }
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
+    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
     brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
@@ -658,16 +915,16 @@ static void emit_dp4(struct brw_wm_compile *c,
 }
 
 static void emit_dph(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_reg src0[4], src1[4], dst;
     int i;
     struct brw_compile *p = &c->func;
     for (i = 0; i < 4; i++) {
-	src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
+	src0[i] = get_src_reg(c, inst, 0, i);
+	src1[i] = get_src_reg_imm(c, inst, 1, i);
     }
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
+    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
     brw_MAC(p, dst, src0[2], src1[2]);
@@ -682,7 +939,7 @@ static void emit_dph(struct brw_wm_compile *c,
  * register's X, Y, Z and W channels (subject to writemasking of course).
  */
 static void emit_math1(struct brw_wm_compile *c,
-		struct prog_instruction *inst, GLuint func)
+                       const struct prog_instruction *inst, GLuint func)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, dst, tmp;
@@ -692,7 +949,7 @@ static void emit_math1(struct brw_wm_compile *c,
     tmp = alloc_tmp(c);
 
     /* Get first component of source register */
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+    src0 = get_src_reg(c, inst, 0, 0);
 
     /* tmp = func(src0) */
     brw_MOV(p, brw_message_reg(2), src0);
@@ -710,7 +967,7 @@ static void emit_math1(struct brw_wm_compile *c,
     /* replicate tmp value across enabled dest channels */
     for (i = 0; i < 4; i++) {
        if (inst->DstReg.WriteMask & (1 << i)) {
-          dst = get_dst_reg(c, inst, i, 1);    
+          dst = get_dst_reg(c, inst, i);
           brw_MOV(p, dst, tmp);
        }
     }
@@ -719,43 +976,43 @@ static void emit_math1(struct brw_wm_compile *c,
 }
 
 static void emit_rcp(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
 }
 
 static void emit_rsq(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
 }
 
 static void emit_sin(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
 }
 
 static void emit_cos(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
 }
 
 static void emit_ex2(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
 }
 
 static void emit_lg2(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
 }
 
 static void emit_add(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, src1, dst;
@@ -764,17 +1021,30 @@ static void emit_add(struct brw_wm_compile *c,
     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
     for (i = 0 ; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
+	    dst = get_dst_reg(c, inst, i);
+	    src0 = get_src_reg(c, inst, 0, i);
+	    src1 = get_src_reg_imm(c, inst, 1, i);
 	    brw_ADD(p, dst, src0, src1);
 	}
     }
     brw_set_saturate(p, 0);
 }
 
+static void emit_arl(struct brw_wm_compile *c,
+                     const struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    struct brw_reg src0, addr_reg;
+    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); /* saturate the MOV below if the instruction asks for it */
+    addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE, 
+                           BRW_ARF_ADDRESS, 0); /* the address register, in the architecture register file */
+    src0 = get_src_reg(c, inst, 0, 0); /* source operand 0, channel 0 */
+    brw_MOV(p, addr_reg, src0); /* address register <- src0 */
+    brw_set_saturate(p, 0); /* leave saturate off for whatever is emitted next */
+}
+
 static void emit_sub(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, src1, dst;
@@ -783,9 +1053,9 @@ static void emit_sub(struct brw_wm_compile *c,
     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
     for (i = 0 ; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
+	    dst = get_dst_reg(c, inst, i);
+	    src0 = get_src_reg(c, inst, 0, i);
+	    src1 = get_src_reg_imm(c, inst, 1, i);
 	    brw_ADD(p, dst, src0, negate(src1));
 	}
     }
@@ -793,7 +1063,7 @@ static void emit_sub(struct brw_wm_compile *c,
 }
 
 static void emit_mul(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, src1, dst;
@@ -802,9 +1072,9 @@ static void emit_mul(struct brw_wm_compile *c,
     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
     for (i = 0 ; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
+	    dst = get_dst_reg(c, inst, i);
+	    src0 = get_src_reg(c, inst, 0, i);
+	    src1 = get_src_reg_imm(c, inst, 1, i);
 	    brw_MUL(p, dst, src0, src1);
 	}
     }
@@ -812,7 +1082,7 @@ static void emit_mul(struct brw_wm_compile *c,
 }
 
 static void emit_frc(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, dst;
@@ -821,8 +1091,8 @@ static void emit_frc(struct brw_wm_compile *c,
     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
     for (i = 0 ; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
+	    dst = get_dst_reg(c, inst, i);
+	    src0 = get_src_reg_imm(c, inst, 0, i);
 	    brw_FRC(p, dst, src0);
 	}
     }
@@ -831,7 +1101,7 @@ static void emit_frc(struct brw_wm_compile *c,
 }
 
 static void emit_flr(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, dst;
@@ -840,78 +1110,71 @@ static void emit_flr(struct brw_wm_compile *c,
     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
     for (i = 0 ; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
+	    dst = get_dst_reg(c, inst, i);
+	    src0 = get_src_reg_imm(c, inst, 0, i);
 	    brw_RNDD(p, dst, src0);
 	}
     }
     brw_set_saturate(p, 0);
 }
 
-static void emit_max(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    struct brw_reg src0, src1, dst;
-    int i;
-    brw_push_insn_state(p);
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
-	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-	    brw_MOV(p, dst, src0);
-	    brw_set_saturate(p, 0);
-
-	    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src0, src1);
-	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-	    brw_MOV(p, dst, src1);
-	    brw_set_saturate(p, 0);
-	    brw_set_predicate_control_flag_value(p, 0xff);
-	}
-    }
-    brw_pop_insn_state(p);
-}
 
-static void emit_min(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+static void emit_min_max(struct brw_wm_compile *c,
+                         const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    struct brw_reg src0, src1, dst;
+    const GLuint mask = inst->DstReg.WriteMask;
+    const int mark = mark_tmps(c);
     int i;
     brw_push_insn_state(p);
     for (i = 0; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
+            struct brw_reg real_dst = get_dst_reg(c, inst, i);
+	    struct brw_reg src0 = get_src_reg(c, inst, 0, i);
+	    struct brw_reg src1 = get_src_reg(c, inst, 1, i);
+            struct brw_reg dst;
+            /* if real_dst aliases src0 or src1, build the result in a temp (dst is not assigned yet, so test real_dst) */
+            GLboolean use_temp = brw_same_reg(real_dst, src0) ||
+                                 brw_same_reg(real_dst, src1);
+            if (use_temp)
+               dst = alloc_tmp(c);
+            else
+               dst = real_dst;
+
+            /*
+            printf("  Min/max: dst %d  src0 %d  src1 %d\n",
+                   dst.nr, src0.nr, src1.nr);
+            */
 	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
 	    brw_MOV(p, dst, src0);
 	    brw_set_saturate(p, 0);
 
-	    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
+            if (inst->Opcode == OPCODE_MIN)
+               brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
+            else
+               brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
+
 	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
 	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 	    brw_MOV(p, dst, src1);
 	    brw_set_saturate(p, 0);
 	    brw_set_predicate_control_flag_value(p, 0xff);
+            if (use_temp)
+               brw_MOV(p, real_dst, dst);
 	}
     }
     brw_pop_insn_state(p);
+    release_tmps(c, mark);
 }
 
 static void emit_pow(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg dst, src0, src1;
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    src1 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
+    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
+    src0 = get_src_reg_imm(c, inst, 0, 0);
+    src1 = get_src_reg_imm(c, inst, 1, 0);
 
     brw_MOV(p, brw_message_reg(2), src0);
     brw_MOV(p, brw_message_reg(3), src1);
@@ -927,7 +1190,7 @@ static void emit_pow(struct brw_wm_compile *c,
 }
 
 static void emit_lrp(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -936,10 +1199,10 @@ static void emit_lrp(struct brw_wm_compile *c,
     int mark = mark_tmps(c);
     for (i = 0; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
+	    dst = get_dst_reg(c, inst, i);
+	    src0 = get_src_reg(c, inst, 0, i);
 
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
+	    src1 = get_src_reg_imm(c, inst, 1, i);
 
 	    if (src1.nr == dst.nr) {
 		tmp1 = alloc_tmp(c);
@@ -947,7 +1210,7 @@ static void emit_lrp(struct brw_wm_compile *c,
 	    } else
 		tmp1 = src1;
 
-	    src2 = get_src_reg(c, &inst->SrcReg[2], i, 1);
+	    src2 = get_src_reg(c, inst, 2, i);
 	    if (src2.nr == dst.nr) {
 		tmp2 = alloc_tmp(c);
 		brw_MOV(p, tmp2, src2);
@@ -980,7 +1243,7 @@ static void emit_kil(struct brw_wm_compile *c)
 }
 
 static void emit_mad(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -989,10 +1252,10 @@ static void emit_mad(struct brw_wm_compile *c,
 
     for (i = 0; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
-	    src2 = get_src_reg(c, &inst->SrcReg[2], i, 1);
+	    dst = get_dst_reg(c, inst, i);
+	    src0 = get_src_reg(c, inst, 0, i);
+	    src1 = get_src_reg_imm(c, inst, 1, i);
+	    src2 = get_src_reg_imm(c, inst, 2, i);
 	    brw_MUL(p, dst, src0, src1);
 
 	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
@@ -1003,7 +1266,7 @@ static void emit_mad(struct brw_wm_compile *c,
 }
 
 static void emit_sop(struct brw_wm_compile *c,
-		struct prog_instruction *inst, GLuint cond)
+                     const struct prog_instruction *inst, GLuint cond)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -1012,9 +1275,9 @@ static void emit_sop(struct brw_wm_compile *c,
 
     for (i = 0; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
+	    dst = get_dst_reg(c, inst, i);
+	    src0 = get_src_reg(c, inst, 0, i);
+	    src1 = get_src_reg_imm(c, inst, 1, i);
 	    brw_push_insn_state(p);
 	    brw_CMP(p, brw_null_reg(), cond, src0, src1);
 	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
@@ -1027,43 +1290,43 @@ static void emit_sop(struct brw_wm_compile *c,
 }
 
 static void emit_slt(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_L);
 }
 
 static void emit_sle(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_LE);
 }
 
 static void emit_sgt(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_G);
 }
 
 static void emit_sge(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_GE);
 }
 
 static void emit_seq(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_EQ);
 }
 
 static void emit_sne(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
 }
 
 static void emit_ddx(struct brw_wm_compile *c,
-                struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -1071,8 +1334,8 @@ static void emit_ddx(struct brw_wm_compile *c,
     struct brw_reg dst;
     struct brw_reg src0, w;
     GLuint nr, i;
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
+    src0 = get_src_reg(c, inst, 0, 0);
+    w = get_src_reg(c, inst, 1, 3);
     nr = src0.nr;
     interp[0] = brw_vec1_grf(nr, 0);
     interp[1] = brw_vec1_grf(nr, 4);
@@ -1081,7 +1344,7 @@ static void emit_ddx(struct brw_wm_compile *c,
     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
     for(i = 0; i < 4; i++ ) {
         if (mask & (1<<i)) {
-            dst = get_dst_reg(c, inst, i, 1);
+            dst = get_dst_reg(c, inst, i);
             brw_MOV(p, dst, interp[i]);
             brw_MUL(p, dst, dst, w);
         }
@@ -1090,7 +1353,7 @@ static void emit_ddx(struct brw_wm_compile *c,
 }
 
 static void emit_ddy(struct brw_wm_compile *c,
-                struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -1099,9 +1362,9 @@ static void emit_ddy(struct brw_wm_compile *c,
     struct brw_reg src0, w;
     GLuint nr, i;
 
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+    src0 = get_src_reg(c, inst, 0, 0);
     nr = src0.nr;
-    w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
+    w = get_src_reg(c, inst, 1, 3);
     interp[0] = brw_vec1_grf(nr, 0);
     interp[1] = brw_vec1_grf(nr, 4);
     interp[2] = brw_vec1_grf(nr+1, 0);
@@ -1109,7 +1372,7 @@ static void emit_ddy(struct brw_wm_compile *c,
     brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
     for(i = 0; i < 4; i++ ) {
         if (mask & (1<<i)) {
-            dst = get_dst_reg(c, inst, i, 1);
+            dst = get_dst_reg(c, inst, i);
             brw_MOV(p, dst, suboffset(interp[i], 1));
             brw_MUL(p, dst, dst, w);
         }
@@ -1117,23 +1380,23 @@ static void emit_ddy(struct brw_wm_compile *c,
     brw_set_saturate(p, 0);
 }
 
-static __inline struct brw_reg high_words( struct brw_reg reg )
+static INLINE struct brw_reg high_words( struct brw_reg reg )
 {
     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
 		   0, 8, 2 );
 }
 
-static __inline struct brw_reg low_words( struct brw_reg reg )
+static INLINE struct brw_reg low_words( struct brw_reg reg )
 {
     return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
 }
 
-static __inline struct brw_reg even_bytes( struct brw_reg reg )
+static INLINE struct brw_reg even_bytes( struct brw_reg reg )
 {
     return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
 }
 
-static __inline struct brw_reg odd_bytes( struct brw_reg reg )
+static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
 {
     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
 		   0, 16, 2 );
@@ -1233,7 +1496,7 @@ static void noise1_sub( struct brw_wm_compile *c ) {
 }
 
 static void emit_noise1( struct brw_wm_compile *c,
-			 struct prog_instruction *inst )
+			 const struct prog_instruction *inst )
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src, param, dst;
@@ -1243,7 +1506,7 @@ static void emit_noise1( struct brw_wm_compile *c,
 
     assert( mark == 0 );
     
-    src = get_src_reg( c, inst->SrcReg, 0, 1 );
+    src = get_src_reg( c, inst, 0, 0 );
 
     param = alloc_tmp( c );
 
@@ -1255,7 +1518,7 @@ static void emit_noise1( struct brw_wm_compile *c,
     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
     for (i = 0 ; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
+	    dst = get_dst_reg(c, inst, i);
 	    brw_MOV( p, dst, param );
 	}
     }
@@ -1403,7 +1666,7 @@ static void noise2_sub( struct brw_wm_compile *c ) {
 }
 
 static void emit_noise2( struct brw_wm_compile *c,
-			 struct prog_instruction *inst )
+			 const struct prog_instruction *inst )
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, src1, param0, param1, dst;
@@ -1413,8 +1676,8 @@ static void emit_noise2( struct brw_wm_compile *c,
 
     assert( mark == 0 );
     
-    src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
-    src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
+    src0 = get_src_reg( c, inst, 0, 0 );
+    src1 = get_src_reg( c, inst, 0, 1 );
 
     param0 = alloc_tmp( c );
     param1 = alloc_tmp( c );
@@ -1428,7 +1691,7 @@ static void emit_noise2( struct brw_wm_compile *c,
     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
     for (i = 0 ; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
+	    dst = get_dst_reg(c, inst, i);
 	    brw_MOV( p, dst, param0 );
 	}
     }
@@ -1438,9 +1701,11 @@ static void emit_noise2( struct brw_wm_compile *c,
     release_tmps( c, mark );
 }
 
-/* The three-dimensional case is much like the one- and two- versions above,
-   but since the number of corners is rapidly growing we now pack 16 16-bit
-   hashes into each register to extract more parallelism from the EUs. */
+/**
+ * The three-dimensional case is much like the one- and two- versions above,
+ * but since the number of corners is rapidly growing we now pack 16 16-bit
+ * hashes into each register to extract more parallelism from the EUs.
+ */
 static void noise3_sub( struct brw_wm_compile *c ) {
 
     struct brw_compile *p = &c->func;
@@ -1704,7 +1969,7 @@ static void noise3_sub( struct brw_wm_compile *c ) {
 }
 
 static void emit_noise3( struct brw_wm_compile *c,
-			 struct prog_instruction *inst )
+			 const struct prog_instruction *inst )
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, src1, src2, param0, param1, param2, dst;
@@ -1714,9 +1979,9 @@ static void emit_noise3( struct brw_wm_compile *c,
 
     assert( mark == 0 );
     
-    src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
-    src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
-    src2 = get_src_reg( c, inst->SrcReg, 2, 1 );
+    src0 = get_src_reg( c, inst, 0, 0 );
+    src1 = get_src_reg( c, inst, 0, 1 );
+    src2 = get_src_reg( c, inst, 0, 2 );
 
     param0 = alloc_tmp( c );
     param1 = alloc_tmp( c );
@@ -1732,7 +1997,7 @@ static void emit_noise3( struct brw_wm_compile *c,
     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
     for (i = 0 ; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
+	    dst = get_dst_reg(c, inst, i);
 	    brw_MOV( p, dst, param0 );
 	}
     }
@@ -1742,13 +2007,15 @@ static void emit_noise3( struct brw_wm_compile *c,
     release_tmps( c, mark );
 }
     
-/* For the four-dimensional case, the little micro-optimisation benefits
-   we obtain by unrolling all the loops aren't worth the massive bloat it
-   now causes.  Instead, we loop twice around performing a similar operation
-   to noise3, once for the w=0 cube and once for the w=1, with a bit more
-   code to glue it all together. */
-static void noise4_sub( struct brw_wm_compile *c ) {
-
+/**
+ * For the four-dimensional case, the little micro-optimisation benefits
+ * we obtain by unrolling all the loops aren't worth the massive bloat it
+ * now causes.  Instead, we loop twice around performing a similar operation
+ * to noise3, once for the w=0 cube and once for the w=1, with a bit more
+ * code to glue it all together.
+ */
+static void noise4_sub( struct brw_wm_compile *c )
+{
     struct brw_compile *p = &c->func;
     struct brw_reg param[ 4 ],
 	x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
@@ -2125,7 +2392,7 @@ static void noise4_sub( struct brw_wm_compile *c ) {
 }
 
 static void emit_noise4( struct brw_wm_compile *c,
-			 struct prog_instruction *inst )
+			 const struct prog_instruction *inst )
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
@@ -2135,10 +2402,10 @@ static void emit_noise4( struct brw_wm_compile *c,
 
     assert( mark == 0 );
     
-    src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
-    src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
-    src2 = get_src_reg( c, inst->SrcReg, 2, 1 );
-    src3 = get_src_reg( c, inst->SrcReg, 3, 1 );
+    src0 = get_src_reg( c, inst, 0, 0 );
+    src1 = get_src_reg( c, inst, 0, 1 );
+    src2 = get_src_reg( c, inst, 0, 2 );
+    src3 = get_src_reg( c, inst, 0, 3 );
 
     param0 = alloc_tmp( c );
     param1 = alloc_tmp( c );
@@ -2156,7 +2423,7 @@ static void emit_noise4( struct brw_wm_compile *c,
     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
     for (i = 0 ; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
+	    dst = get_dst_reg(c, inst, i);
 	    brw_MOV( p, dst, param0 );
 	}
     }
@@ -2167,17 +2434,17 @@ static void emit_noise4( struct brw_wm_compile *c,
 }
     
 static void emit_wpos_xy(struct brw_wm_compile *c,
-                struct prog_instruction *inst)
+                         const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
     struct brw_reg src0[2], dst[2];
 
-    dst[0] = get_dst_reg(c, inst, 0, 1);
-    dst[1] = get_dst_reg(c, inst, 1, 1);
+    dst[0] = get_dst_reg(c, inst, 0);
+    dst[1] = get_dst_reg(c, inst, 1);
 
-    src0[0] = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    src0[1] = get_src_reg(c, &inst->SrcReg[0], 1, 1);
+    src0[0] = get_src_reg(c, inst, 0, 0);
+    src0[1] = get_src_reg(c, inst, 0, 1);
 
     /* Calculate the pixel offset from window bottom left into destination
      * X and Y channels.
@@ -2200,27 +2467,28 @@ static void emit_wpos_xy(struct brw_wm_compile *c,
 }
 
 /* TODO
-   BIAS on SIMD8 not workind yet...
+   BIAS on SIMD8 not working yet...
  */	
 static void emit_txb(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg dst[4], src[4], payload_reg;
     GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
-
     GLuint i;
+
     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
+
     for (i = 0; i < 4; i++) 
-	dst[i] = get_dst_reg(c, inst, i, 1);
+	dst[i] = get_dst_reg(c, inst, i);
     for (i = 0; i < 4; i++)
-	src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
+	src[i] = get_src_reg(c, inst, 0, i);
 
     switch (inst->TexSrcTarget) {
 	case TEXTURE_1D_INDEX:
-	    brw_MOV(p, brw_message_reg(2), src[0]);
-	    brw_MOV(p, brw_message_reg(3), brw_imm_f(0));
-	    brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
+	    brw_MOV(p, brw_message_reg(2), src[0]);         /* s coord */
+	    brw_MOV(p, brw_message_reg(3), brw_imm_f(0));   /* t coord */
+	    brw_MOV(p, brw_message_reg(4), brw_imm_f(0));   /* r coord */
 	    break;
 	case TEXTURE_2D_INDEX:
 	case TEXTURE_RECT_INDEX:
@@ -2234,28 +2502,28 @@ static void emit_txb(struct brw_wm_compile *c,
 	    brw_MOV(p, brw_message_reg(4), src[2]);
 	    break;
     }
-    brw_MOV(p, brw_message_reg(5), src[3]);
-    brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
+    brw_MOV(p, brw_message_reg(5), src[3]);          /* bias */
+    brw_MOV(p, brw_message_reg(6), brw_imm_f(0));    /* ref (unused?) */
     brw_SAMPLE(p,
-	    retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
-	    1,
-	    retype(payload_reg, BRW_REGISTER_TYPE_UW),
-	    unit + MAX_DRAW_BUFFERS, /* surface */
-	    unit,     /* sampler */
-	    inst->DstReg.WriteMask,
-	    BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,
-	    4,
-	    4,
-	    0);
+               retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),  /* dest */
+               1,                                           /* msg_reg_nr */
+               retype(payload_reg, BRW_REGISTER_TYPE_UW),   /* src0 */
+               SURF_INDEX_TEXTURE(unit),
+               unit,                                        /* sampler */
+               inst->DstReg.WriteMask,                      /* writemask */
+               BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,      /* msg_type */
+               4,                                           /* response_length */
+               4,                                           /* msg_length */
+               0);                                          /* eot */
 }
 
+
 static void emit_tex(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg dst[4], src[4], payload_reg;
     GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
-
     GLuint msg_len;
     GLuint i, nr;
     GLuint emit;
@@ -2264,10 +2532,9 @@ static void emit_tex(struct brw_wm_compile *c,
     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
 
     for (i = 0; i < 4; i++) 
-	dst[i] = get_dst_reg(c, inst, i, 1);
+	dst[i] = get_dst_reg(c, inst, i);
     for (i = 0; i < 4; i++)
-	src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
-
+	src[i] = get_src_reg(c, inst, 0, i);
 
     switch (inst->TexSrcTarget) {
 	case TEXTURE_1D_INDEX:
@@ -2286,6 +2553,7 @@ static void emit_tex(struct brw_wm_compile *c,
     }
     msg_len = 1;
 
+    /* move/load S, T, R coords */
     for (i = 0; i < nr; i++) {
 	static const GLuint swz[4] = {0,1,2,2};
 	if (emit & (1<<i))
@@ -2296,26 +2564,27 @@ static void emit_tex(struct brw_wm_compile *c,
     }
 
     if (shadow) {
-	brw_MOV(p, brw_message_reg(5), brw_imm_f(0));
-	brw_MOV(p, brw_message_reg(6), src[2]);
+       brw_MOV(p, brw_message_reg(5), brw_imm_f(0));  /* lod / bias */
+       brw_MOV(p, brw_message_reg(6), src[2]);        /* ref value / R coord */
     }
 
     brw_SAMPLE(p,
-	    retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
-	    1,
-	    retype(payload_reg, BRW_REGISTER_TYPE_UW),
-	    unit + MAX_DRAW_BUFFERS, /* surface */
-	    unit,     /* sampler */
-	    inst->DstReg.WriteMask,
-	    BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,
-	    4,
-	    shadow ? 6 : 4,
-	    0);
+               retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
+               1,                                          /* msg_reg_nr */
+               retype(payload_reg, BRW_REGISTER_TYPE_UW),  /* src0 */
+               SURF_INDEX_TEXTURE(unit),
+               unit,                                       /* sampler */
+               inst->DstReg.WriteMask,                     /* writemask */
+               BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,           /* msg_type */
+               4,                                          /* response_length */
+               shadow ? 6 : 4,                             /* msg_length */
+               0);                                         /* eot */
 
     if (shadow)
 	brw_MOV(p, dst[3], brw_imm_f(1.0));
 }
 
+
 /**
  * Resolve subroutine calls after code emit is done.
  */
@@ -2340,7 +2609,16 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
     brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
 
     for (i = 0; i < c->nr_fp_insns; i++) {
-	struct prog_instruction *inst = &c->prog_instructions[i];
+        const struct prog_instruction *inst = &c->prog_instructions[i];
+
+#if 0
+        _mesa_printf("Inst %d: ", i);
+        _mesa_print_instruction(inst);
+#endif
+
+        /* fetch any constants that this instruction needs */
+        if (c->fp->use_const_buffer)
+           fetch_constants(c, inst);
 
 	if (inst->CondUpdate)
 	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
@@ -2381,6 +2659,9 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 	    case OPCODE_ADD:
 		emit_add(c, inst);
 		break;
+	    case OPCODE_ARL:
+		emit_arl(c, inst);
+		break;
 	    case OPCODE_SUB:
 		emit_sub(c, inst);
 		break;
@@ -2397,6 +2678,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 		emit_trunc(c, inst);
 		break;
 	    case OPCODE_MOV:
+	    case OPCODE_SWZ:
 		emit_mov(c, inst);
 		break;
 	    case OPCODE_DP3:
@@ -2429,11 +2711,9 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 	    case OPCODE_LG2:
 		emit_lg2(c, inst);
 		break;
-	    case OPCODE_MAX:	
-		emit_max(c, inst);
-		break;
 	    case OPCODE_MIN:	
-		emit_min(c, inst);
+	    case OPCODE_MAX:	
+		emit_min_max(c, inst);
 		break;
 	    case OPCODE_DDX:
 		emit_ddx(c, inst);
@@ -2531,6 +2811,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 
 		break;
 	    case OPCODE_BGNLOOP:
+                /* XXX may need to invalidate the current_constant regs */
 		loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
 		break;
 	    case OPCODE_BRK:
@@ -2574,10 +2855,27 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
     }
 }
 
+
+/**
+ * Do GPU code generation for shaders that use GLSL features such as
+ * flow control.  Other shaders will be compiled with the non-GLSL path (brw_wm_pass0/1/2).
+ */
 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
 {
+    if (INTEL_DEBUG & DEBUG_WM) {
+        _mesa_printf("brw_wm_glsl_emit:\n");
+    }
+
+    /* initial instruction translation/simplification */
     brw_wm_pass_fp(c);
+
+    /* actual code generation */
     brw_wm_emit_glsl(brw, c);
+
+    if (INTEL_DEBUG & DEBUG_WM) {
+        brw_wm_print_program(c, "brw_wm_glsl_emit done");
+    }
+
     c->prog_data.total_grf = c->reg_index;
     c->prog_data.total_scratch = 0;
 }
diff --git a/i965/brw_wm_pass0.c b/i965/brw_wm_pass0.c
index fca7b7a..9214276 100644
--- a/i965/brw_wm_pass0.c
+++ b/i965/brw_wm_pass0.c
@@ -51,6 +51,7 @@ static struct brw_wm_value *get_value( struct brw_wm_compile *c)
    return &c->vreg[c->nr_vreg++];
 }
 
+/** return pointer to a newly allocated instruction */
 static struct brw_wm_instruction *get_instruction( struct brw_wm_compile *c )
 {
    assert(c->nr_insns < BRW_WM_MAX_INSN);
@@ -60,6 +61,7 @@ static struct brw_wm_instruction *get_instruction( struct brw_wm_compile *c )
 /***********************************************************************
  */
 
+/** Init the "undef" register */
 static void pass0_init_undef( struct brw_wm_compile *c)
 {
    struct brw_wm_ref *ref = &c->undef_ref;
@@ -69,6 +71,7 @@ static void pass0_init_undef( struct brw_wm_compile *c)
    ref->prevuse = NULL;
 }
 
+/** Set a FP register to a value */
 static void pass0_set_fpreg_value( struct brw_wm_compile *c,
 				   GLuint file,
 				   GLuint idx,
@@ -83,6 +86,7 @@ static void pass0_set_fpreg_value( struct brw_wm_compile *c,
    c->pass0_fp_reg[file][idx][component] = ref;
 }
 
+/** Set a FP register to a ref */
 static void pass0_set_fpreg_ref( struct brw_wm_compile *c,
 				 GLuint file,
 				 GLuint idx,
@@ -115,12 +119,13 @@ static const struct brw_wm_ref *get_param_ref( struct brw_wm_compile *c,
       ref->value = &c->creg[i/16];
       ref->insn = 0;
       ref->prevuse = NULL;
-      
+
       return ref;
    }
 }
 
 
+/** Return a ref to a constant/literal value */
 static const struct brw_wm_ref *get_const_ref( struct brw_wm_compile *c,
 					       const GLfloat *constval )
 {
@@ -142,7 +147,7 @@ static const struct brw_wm_ref *get_const_ref( struct brw_wm_compile *c,
        */
       c->constref[i].constval = *constval;
       c->constref[i].ref = get_param_ref(c, constval);
-   
+
       return c->constref[i].ref;
    }
    else {
@@ -187,7 +192,7 @@ static const struct brw_wm_ref *pass0_get_reg( struct brw_wm_compile *c,
 	 
 	 /* There's something really hokey about parameters parsed in
 	  * arb programs - they all end up in here, whether they be
-	  * state values, paramters or constants.  This duplicates the
+	  * state values, parameters or constants.  This duplicates the
 	  * structure above & also seems to subvert the limits set for
 	  * each type of constant/param.
 	  */ 
@@ -198,7 +203,7 @@ static const struct brw_wm_ref *pass0_get_reg( struct brw_wm_compile *c,
 	     */
 	    ref = get_const_ref(c, &plist->ParameterValues[idx][component]);
 	    break;
-	    
+
 	 case PROGRAM_STATE_VAR:
 	 case PROGRAM_UNIFORM:
 	    /* These may change from run to run:
@@ -229,14 +234,13 @@ static const struct brw_wm_ref *pass0_get_reg( struct brw_wm_compile *c,
 
 
 
-
 /***********************************************************************
  * Straight translation to internal instruction format
  */
 
 static void pass0_set_dst( struct brw_wm_compile *c,
-			   struct brw_wm_instruction *out,		     
-			   const struct prog_instruction *inst,		     
+			   struct brw_wm_instruction *out,
+			   const struct prog_instruction *inst,
 			   GLuint writemask )
 {
    const struct prog_dst_register *dst = &inst->DstReg;
@@ -245,18 +249,17 @@ static void pass0_set_dst( struct brw_wm_compile *c,
    for (i = 0; i < 4; i++) {
       if (writemask & (1<<i)) {
 	 out->dst[i] = get_value(c);
-
 	 pass0_set_fpreg_value(c, dst->File, dst->Index, i, out->dst[i]);
       }
    }
-   
+
    out->writemask = writemask;
 }
 
 
 static void pass0_set_dst_scalar( struct brw_wm_compile *c,
-				  struct brw_wm_instruction *out,		     
-				  const struct prog_instruction *inst,		     
+				  struct brw_wm_instruction *out,
+                                  const struct prog_instruction *inst,
 				  GLuint writemask )
 {
    if (writemask) {
@@ -282,7 +285,6 @@ static void pass0_set_dst_scalar( struct brw_wm_compile *c,
 }
 
 
-
 static const struct brw_wm_ref *get_fp_src_reg_ref( struct brw_wm_compile *c,
 						    struct prog_src_register src,
 						    GLuint i )
@@ -292,14 +294,13 @@ static const struct brw_wm_ref *get_fp_src_reg_ref( struct brw_wm_compile *c,
    static const GLfloat const_zero = 0.0;
    static const GLfloat const_one = 1.0;
 
-	 
    if (component == SWIZZLE_ZERO) 
       src_ref = get_const_ref(c, &const_zero);
    else if (component == SWIZZLE_ONE) 
       src_ref = get_const_ref(c, &const_one);
    else 
       src_ref = pass0_get_reg(c, src.File, src.Index, component);
-	 
+
    return src_ref;
 }
 
@@ -311,19 +312,19 @@ static struct brw_wm_ref *get_new_ref( struct brw_wm_compile *c,
 {
    const struct brw_wm_ref *ref = get_fp_src_reg_ref(c, src, i);
    struct brw_wm_ref *newref = get_ref(c);
-      
+
    newref->value = ref->value;
    newref->hw_reg = ref->hw_reg;
 
-   if (insn) { 
+   if (insn) {
       newref->insn = insn - c->instruction;
       newref->prevuse = newref->value->lastuse;
       newref->value->lastuse = newref;
    }
 
-   if (src.NegateBase & (1<<i)) 
+   if (src.Negate & (1 << i))
       newref->hw_reg.negate ^= 1;
-	    
+
    if (src.Abs) {
       newref->hw_reg.negate = 0;
       newref->hw_reg.abs = 1;
@@ -333,9 +334,9 @@ static struct brw_wm_ref *get_new_ref( struct brw_wm_compile *c,
 }
 
 
-
-static struct brw_wm_instruction *translate_insn( struct brw_wm_compile *c,
-						  const struct prog_instruction *inst )
+static void
+translate_insn(struct brw_wm_compile *c,
+               const struct prog_instruction *inst)
 {
    struct brw_wm_instruction *out = get_instruction(c);
    GLuint writemask = inst->DstReg.WriteMask;
@@ -348,8 +349,9 @@ static struct brw_wm_instruction *translate_insn( struct brw_wm_compile *c,
    out->saturate = (inst->SaturateMode != SATURATE_OFF);
    out->tex_unit = inst->TexSrcUnit;
    out->tex_idx = inst->TexSrcTarget;
-   out->eot = inst->Sampler & 1;
-   out->target = inst->Sampler>>1;
+   out->tex_shadow = inst->TexShadow;
+   out->eot = inst->Aux & 1;
+   out->target = inst->Aux >> 1;
 
    /* Args:
     */
@@ -365,8 +367,6 @@ static struct brw_wm_instruction *translate_insn( struct brw_wm_compile *c,
       pass0_set_dst_scalar(c, out, inst, writemask);
    else 
       pass0_set_dst(c, out, inst, writemask);
-
-   return out;
 }
 
 
@@ -426,6 +426,7 @@ static void pass0_init_payload( struct brw_wm_compile *c )
 			     &c->payload.input_interp[i] );      
 }
 
+
 /***********************************************************************
  * PASS 0
  *
@@ -448,7 +449,6 @@ void brw_wm_pass0( struct brw_wm_compile *c )
    for (insn = 0; insn < c->nr_fp_insns; insn++) {
       const struct prog_instruction *inst = &c->prog_instructions[insn];
 
-
       /* Optimize away moves, otherwise emit translated instruction:
        */      
       switch (inst->Opcode) {
@@ -461,8 +461,6 @@ void brw_wm_pass0( struct brw_wm_compile *c )
 	    translate_insn(c, inst);
 	 }
 	 break;
-	 
-
       default:
 	 translate_insn(c, inst);
 	 break;
@@ -473,4 +471,3 @@ void brw_wm_pass0( struct brw_wm_compile *c )
       brw_wm_print_program(c, "pass0");
    }
 }
-
diff --git a/i965/brw_wm_pass1.c b/i965/brw_wm_pass1.c
index a1fea6f..ab9aa2f 100644
--- a/i965/brw_wm_pass1.c
+++ b/i965/brw_wm_pass1.c
@@ -58,7 +58,8 @@ static void unlink_ref(struct brw_wm_ref *ref)
 
    if (ref == value->lastuse) {
       value->lastuse = ref->prevuse;
-   } else {
+   }
+   else {
       struct brw_wm_ref *i = value->lastuse;
       while (i->prevuse != ref) i = i->prevuse;
       i->prevuse = ref->prevuse;
@@ -75,8 +76,9 @@ static void track_arg(struct brw_wm_compile *c,
    for (i = 0; i < 4; i++) {
       struct brw_wm_ref *ref = inst->src[arg][i];
       if (ref) {
-	 if (readmask & (1<<i)) 
+	 if (readmask & (1<<i)) {
 	    ref->value->contributes_to_output = 1;
+         }
 	 else {
 	    unlink_ref(ref);
 	    inst->src[arg][i] = NULL;
@@ -88,15 +90,21 @@ static void track_arg(struct brw_wm_compile *c,
 static GLuint get_texcoord_mask( GLuint tex_idx )
 {
    switch (tex_idx) {
-   case TEXTURE_1D_INDEX: return WRITEMASK_X;
-   case TEXTURE_2D_INDEX: return WRITEMASK_XY;
-   case TEXTURE_3D_INDEX: return WRITEMASK_XYZ;
-   case TEXTURE_CUBE_INDEX: return WRITEMASK_XYZ;
-   case TEXTURE_RECT_INDEX: return WRITEMASK_XY;
+   case TEXTURE_1D_INDEX:
+      return WRITEMASK_X;
+   case TEXTURE_2D_INDEX:
+      return WRITEMASK_XY;
+   case TEXTURE_3D_INDEX:
+      return WRITEMASK_XYZ;
+   case TEXTURE_CUBE_INDEX:
+      return WRITEMASK_XYZ;
+   case TEXTURE_RECT_INDEX:
+      return WRITEMASK_XY;
    default: return 0;
    }
 }
 
+
 /* Step two: Basically this is dead code elimination.  
  *
  * Iterate backwards over instructions, noting which values
@@ -202,9 +210,10 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 	 break;
 
       case OPCODE_TEX:
+      case OPCODE_TXP:
 	 read0 = get_texcoord_mask(inst->tex_idx);
 
-	 if (c->key.shadowtex_mask & (1<<inst->tex_unit))
+         if (inst->tex_shadow)
 	    read0 |= WRITEMASK_Z;
 	 break;
 
@@ -259,7 +268,6 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 	 break;
 
       case OPCODE_DST:
-      case OPCODE_TXP:
       case WM_FRONTFACING:
       default:
 	 break;
@@ -274,6 +282,3 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       brw_wm_print_program(c, "pass1");
    }
 }
-
-
-
diff --git a/i965/brw_wm_pass2.c b/i965/brw_wm_pass2.c
index 6fca9ad..6faea01 100644
--- a/i965/brw_wm_pass2.c
+++ b/i965/brw_wm_pass2.c
@@ -69,8 +69,6 @@ static void prealloc_reg(struct brw_wm_compile *c,
  */
 static void init_registers( struct brw_wm_compile *c )
 {
-   struct brw_context *brw = c->func.brw;
-   GLuint inputs = (brw->vs.prog_data->outputs_written & DO_SETUP_BITS);
    GLuint nr_interp_regs = 0;
    GLuint i = 0;
    GLuint j;
@@ -84,18 +82,22 @@ static void init_registers( struct brw_wm_compile *c )
    for (j = 0; j < c->nr_creg; j++) 
       prealloc_reg(c, &c->creg[j], i++);
 
-   for (j = 0; j < FRAG_ATTRIB_MAX; j++) 
-      if (inputs & (1<<j)) {
-	 /* index for vs output and ps input are not the same 
-	    in shader varying */
-	 GLuint index;
-	 if (j > FRAG_ATTRIB_VAR0)
-	     index = j - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
+   for (j = 0; j < FRAG_ATTRIB_MAX; j++) {
+      if (c->key.vp_outputs_written & (1<<j)) {
+	 int fp_index;
+
+	 if (j >= VERT_RESULT_VAR0)
+	    fp_index = j - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
+	 else if (j <= VERT_RESULT_TEX7)
+	    fp_index = j;
 	 else
-	     index = j;
+	    fp_index = -1;
+
 	 nr_interp_regs++;
-	 prealloc_reg(c, &c->payload.input_interp[index], i++);
+	 if (fp_index >= 0)
+	    prealloc_reg(c, &c->payload.input_interp[fp_index], i++);
       }
+   }
 
    assert(nr_interp_regs >= 1);
 
@@ -120,7 +122,7 @@ static void update_register_usage(struct brw_wm_compile *c,
       /* Only search those which can change:
        */
       if (grf->nextuse < thisinsn) {
-	 struct brw_wm_ref *ref = grf->value->lastuse;
+	 const struct brw_wm_ref *ref = grf->value->lastuse;
 
 	 /* Has last use of value been passed?
 	  */
@@ -148,7 +150,7 @@ static void spill_value(struct brw_wm_compile *c,
    /* Allocate a spill slot.  Note that allocations start from 0x40 -
     * the first slot is reserved to mean "undef" in brw_wm_emit.c
     */
-   if (!value->spill_slot) {  
+   if (!value->spill_slot) {
       c->last_scratch += 0x40;	
       value->spill_slot = c->last_scratch;
    }
@@ -189,7 +191,7 @@ static GLuint search_contiguous_regs(struct brw_wm_compile *c,
 	 if (grf[i+j].nextuse < group_nextuse)
 	    group_nextuse = grf[i+j].nextuse;
       }
-	 
+
       if (group_nextuse > furthest) {
 	 furthest = group_nextuse;
 	 reg = i;
@@ -197,7 +199,7 @@ static GLuint search_contiguous_regs(struct brw_wm_compile *c,
    }
 
    assert(furthest != thisinsn);
-   
+
    /* Any non-empty regs will need to be spilled:
     */
    for (j = 0; j < nr; j++) 
@@ -243,7 +245,7 @@ static void alloc_contiguous_dest(struct brw_wm_compile *c,
 
 static void load_args(struct brw_wm_compile *c, 
 		      struct brw_wm_instruction *inst)
-{   
+{
    GLuint thisinsn = inst - c->instruction;
    GLuint i,j;
 
@@ -258,17 +260,17 @@ static void load_args(struct brw_wm_compile *c,
 		* register allocation and mark the ref as requiring a fill.
 		*/
 	       GLuint reg = search_contiguous_regs(c, 1, thisinsn);
-            
+
 	       c->pass2_grf[reg].value = ref->value;
 	       c->pass2_grf[reg].nextuse = thisinsn;
-	    
+
 	       ref->value->resident = &c->pass2_grf[reg];
 
 	       /* Note that a fill is required:
 		*/
 	       ref->unspill_reg = reg*2;
 	    }
-	    
+
 	    /* Adjust the hw_reg to point at the value's current location:
 	     */
 	    assert(ref->value == ref->value->resident->value);
@@ -294,7 +296,7 @@ void brw_wm_pass2( struct brw_wm_compile *c )
 
    for (insn = 0; insn < c->nr_insns; insn++) {
       struct brw_wm_instruction *inst = &c->instruction[insn];
-      
+
       /* Update registers' nextuse values:
        */
       update_register_usage(c, insn);
@@ -322,11 +324,11 @@ void brw_wm_pass2( struct brw_wm_compile *c )
 	 break;
       }
 
-      if (TEST_DST_SPILLS && inst->opcode != WM_PIXELXY)
+      if (TEST_DST_SPILLS && inst->opcode != WM_PIXELXY) {
 	 for (i = 0; i < 4; i++)	
 	    if (inst->dst[i])
 	       spill_value(c, inst->dst[i]);
-
+      }
    }
 
    if (INTEL_DEBUG & DEBUG_WM) {
@@ -339,6 +341,3 @@ void brw_wm_pass2( struct brw_wm_compile *c )
        brw_wm_print_program(c, "pass2/done");
    }
 }
-
-
-
diff --git a/i965/brw_wm_sampler_state.c b/i965/brw_wm_sampler_state.c
index 68a9296..3fc18ff 100644
--- a/i965/brw_wm_sampler_state.c
+++ b/i965/brw_wm_sampler_state.c
@@ -152,7 +152,7 @@ static void brw_update_sampler_state(struct wm_sampler_entry *key,
       sampler->ss0.mag_filter = BRW_MAPFILTER_ANISOTROPIC;
 
       if (key->max_aniso > 2.0) {
-	 sampler->ss3.max_aniso = MAX2((key->max_aniso - 2) / 2,
+	 sampler->ss3.max_aniso = MIN2((key->max_aniso - 2) / 2,
 				       BRW_ANISORATIO_16);
       }
    }
@@ -178,6 +178,16 @@ static void brw_update_sampler_state(struct wm_sampler_entry *key,
       sampler->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CUBE;
       sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CUBE;
    }
+   else if (key->tex_target == GL_TEXTURE_1D) {
+      /* There's a bug in 1D texture sampling - it actually pays
+       * attention to the wrap_t value, though it should not.
+       * Override the wrap_t value here to GL_REPEAT to keep
+       * any nonexistent border pixels from floating in.
+       */
+      sampler->ss1.r_wrap_mode = translate_wrap_mode(key->wrap_r);
+      sampler->ss1.s_wrap_mode = translate_wrap_mode(key->wrap_s);
+      sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_WRAP;
+   }
    else {
       sampler->ss1.r_wrap_mode = translate_wrap_mode(key->wrap_r);
       sampler->ss1.s_wrap_mode = translate_wrap_mode(key->wrap_s);
@@ -217,6 +227,7 @@ static void brw_update_sampler_state(struct wm_sampler_entry *key,
    sampler->ss2.default_color_pointer = sdc_bo->offset >> 5; /* reloc */
 }
 
+
 /** Sets up the cache key for sampler state for all texture units */
 static void
 brw_wm_sampler_populate_key(struct brw_context *brw,
diff --git a/i965/brw_wm_state.c b/i965/brw_wm_state.c
index 1844eba..67b4117 100644
--- a/i965/brw_wm_state.c
+++ b/i965/brw_wm_state.c
@@ -62,6 +62,7 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 {
    GLcontext *ctx = &brw->intel.ctx;
    const struct gl_fragment_program *fp = brw->fragment_program;
+   const struct brw_fragment_program *bfp = (struct brw_fragment_program *) fp;
    struct intel_context *intel = &brw->intel;
 
    memset(key, 0, sizeof(*key));
@@ -103,11 +104,14 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 
    /* as far as we can tell */
    key->computes_depth =
-      (fp->Base.OutputsWritten & (1 << FRAG_RESULT_DEPR)) != 0;
+      (fp->Base.OutputsWritten & (1 << FRAG_RESULT_DEPTH)) != 0;
 
    /* _NEW_COLOR */
    key->uses_kill = fp->UsesKill || ctx->Color.AlphaEnabled;
-   key->is_glsl = brw_wm_is_glsl(fp);
+   key->is_glsl = bfp->isGLSL;
+
+   /* temporary sanity check assertion */
+   ASSERT(bfp->isGLSL == brw_wm_is_glsl(fp));
 
    /* _NEW_DEPTH */
    key->stats_wm = intel->stats_wm;
@@ -121,6 +125,9 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
    key->offset_factor = ctx->Polygon.OffsetFactor;
 }
 
+/**
+ * Set up WM hardware state.  See page 225 of Volume 2
+ */
 static dri_bo *
 wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
 			dri_bo **reloc_bufs)
@@ -138,7 +145,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
 
    if (key->total_scratch != 0) {
       wm.thread2.scratch_space_base_pointer =
-	 brw->wm.scratch_buffer->offset >> 10; /* reloc */
+	 brw->wm.scratch_bo->offset >> 10; /* reloc */
       wm.thread2.per_thread_scratch_space = key->total_scratch / 1024 - 1;
    } else {
       wm.thread2.scratch_space_base_pointer = 0;
@@ -147,9 +154,9 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
 
    wm.thread3.dispatch_grf_start_reg = key->dispatch_grf_start_reg;
    wm.thread3.urb_entry_read_length = key->urb_entry_read_length;
+   wm.thread3.urb_entry_read_offset = 0;
    wm.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
    wm.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
-   wm.thread3.urb_entry_read_offset = 0;
 
    wm.wm4.sampler_count = (key->sampler_count + 1) / 4;
    if (brw->wm.sampler_bo != NULL) {
@@ -216,7 +223,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
 			0, 0,
 			wm.thread2.per_thread_scratch_space,
 			offsetof(struct brw_wm_unit_state, thread2),
-			brw->wm.scratch_buffer);
+			brw->wm.scratch_bo);
    }
 
    /* Emit sampler state relocation */
@@ -247,20 +254,20 @@ static void upload_wm_unit( struct brw_context *brw )
    if (key.total_scratch) {
       GLuint total = key.total_scratch * key.max_threads;
 
-      if (brw->wm.scratch_buffer && total > brw->wm.scratch_buffer->size) {
-	 dri_bo_unreference(brw->wm.scratch_buffer);
-	 brw->wm.scratch_buffer = NULL;
+      if (brw->wm.scratch_bo && total > brw->wm.scratch_bo->size) {
+	 dri_bo_unreference(brw->wm.scratch_bo);
+	 brw->wm.scratch_bo = NULL;
       }
-      if (brw->wm.scratch_buffer == NULL) {
-	 brw->wm.scratch_buffer = dri_bo_alloc(intel->bufmgr,
-					       "wm scratch",
-					       total,
-					       4096);
+      if (brw->wm.scratch_bo == NULL) {
+	 brw->wm.scratch_bo = dri_bo_alloc(intel->bufmgr,
+                                           "wm scratch",
+                                           total,
+                                           4096);
       }
    }
 
    reloc_bufs[0] = brw->wm.prog_bo;
-   reloc_bufs[1] = brw->wm.scratch_buffer;
+   reloc_bufs[1] = brw->wm.scratch_bo;
    reloc_bufs[2] = brw->wm.sampler_bo;
 
    dri_bo_unreference(brw->wm.state_bo);
@@ -283,7 +290,7 @@ const struct brw_tracked_state brw_wm_unit = {
 
       .brw = (BRW_NEW_FRAGMENT_PROGRAM | 
 	      BRW_NEW_CURBE_OFFSETS |
-	      BRW_NEW_NR_SURFACES),
+	      BRW_NEW_NR_WM_SURFACES),
 
       .cache = (CACHE_NEW_WM_PROG |
 		CACHE_NEW_SAMPLER)
diff --git a/i965/brw_wm_surface_state.c b/i965/brw_wm_surface_state.c
index 3487b85..805df8a 100644
--- a/i965/brw_wm_surface_state.c
+++ b/i965/brw_wm_surface_state.c
@@ -33,11 +33,12 @@
 #include "main/mtypes.h"
 #include "main/texformat.h"
 #include "main/texstore.h"
+#include "shader/prog_parameter.h"
 
 #include "intel_mipmap_tree.h"
 #include "intel_batchbuffer.h"
 #include "intel_tex.h"
-
+#include "intel_fbo.h"
 
 #include "brw_context.h"
 #include "brw_state.h"
@@ -69,7 +70,8 @@ static GLuint translate_tex_target( GLenum target )
 }
 
 
-static GLuint translate_tex_format( GLuint mesa_format, GLenum depth_mode )
+static GLuint translate_tex_format( GLuint mesa_format, GLenum internal_format,
+				    GLenum depth_mode )
 {
    switch( mesa_format ) {
    case MESA_FORMAT_L8:
@@ -89,10 +91,16 @@ static GLuint translate_tex_format( GLuint mesa_format, GLenum depth_mode )
       return BRW_SURFACEFORMAT_R8G8B8_UNORM;      
 
    case MESA_FORMAT_ARGB8888:
-      return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+      if (internal_format == GL_RGB)
+	 return BRW_SURFACEFORMAT_B8G8R8X8_UNORM;
+      else
+	 return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
 
    case MESA_FORMAT_RGBA8888_REV:
-      return BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
+      if (internal_format == GL_RGB)
+	 return BRW_SURFACEFORMAT_R8G8B8X8_UNORM;
+      else
+	 return BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
 
    case MESA_FORMAT_RGB565:
       return BRW_SURFACEFORMAT_B5G6R5_UNORM;
@@ -133,13 +141,34 @@ static GLuint translate_tex_format( GLuint mesa_format, GLenum depth_mode )
    case MESA_FORMAT_RGBA_DXT5:
        return BRW_SURFACEFORMAT_BC3_UNORM;
 
-   case MESA_FORMAT_SRGBA8:
-      return BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB;
+   case MESA_FORMAT_SARGB8:
+      return BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB;
+
+   case MESA_FORMAT_SLA8:
+      return BRW_SURFACEFORMAT_L8A8_UNORM_SRGB;
+
+   case MESA_FORMAT_SL8:
+      return BRW_SURFACEFORMAT_L8_UNORM_SRGB;
+
    case MESA_FORMAT_SRGB_DXT1:
       return BRW_SURFACEFORMAT_BC1_UNORM_SRGB;
 
    case MESA_FORMAT_S8_Z24:
-      return BRW_SURFACEFORMAT_I24X8_UNORM;
+      /* XXX: these different surface formats don't seem to
+       * make any difference for shadow sampler/compares.
+       */
+      if (depth_mode == GL_INTENSITY) 
+         return BRW_SURFACEFORMAT_I24X8_UNORM;
+      else if (depth_mode == GL_ALPHA)
+         return BRW_SURFACEFORMAT_A24X8_UNORM;
+      else
+         return BRW_SURFACEFORMAT_L24X8_UNORM;
+
+   case MESA_FORMAT_DUDV8:
+      return BRW_SURFACEFORMAT_R8G8_SNORM;
+
+   case MESA_FORMAT_SIGNED_RGBA8888_REV:
+      return BRW_SURFACEFORMAT_R8G8B8A8_SNORM;
 
    default:
       assert(0);
@@ -147,10 +176,14 @@ static GLuint translate_tex_format( GLuint mesa_format, GLenum depth_mode )
    }
 }
 
-struct brw_wm_surface_key {
+
+/**
+ * Use same key for WM and VS surfaces.
+ */
+struct brw_surface_key {
    GLenum target, depthmode;
    dri_bo *bo;
-   GLint format;
+   GLint format, internal_format;
    GLint first_level, last_level;
    GLint width, height, depth;
    GLint pitch, cpp;
@@ -158,6 +191,7 @@ struct brw_wm_surface_key {
    GLuint offset;
 };
 
+
 static void
 brw_set_surface_tiling(struct brw_surface_state *surf, uint32_t tiling)
 {
@@ -179,7 +213,7 @@ brw_set_surface_tiling(struct brw_surface_state *surf, uint32_t tiling)
 
 static dri_bo *
 brw_create_texture_surface( struct brw_context *brw,
-			    struct brw_wm_surface_key *key )
+			    struct brw_surface_key *key )
 {
    struct brw_surface_state surf;
    dri_bo *bo;
@@ -188,9 +222,11 @@ brw_create_texture_surface( struct brw_context *brw,
 
    surf.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
    surf.ss0.surface_type = translate_tex_target(key->target);
-
-   if (key->bo) 
-      surf.ss0.surface_format = translate_tex_format(key->format, key->depthmode);
+   if (key->bo) {
+      surf.ss0.surface_format = translate_tex_format(key->format,
+						     key->internal_format,
+						     key->depthmode);
+   }
    else {
       switch (key->depth) {
       case 32:
@@ -256,7 +292,8 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit )
    struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
    struct intel_texture_object *intelObj = intel_texture_object(tObj);
    struct gl_texture_image *firstImage = tObj->Image[0][intelObj->firstLevel];
-   struct brw_wm_surface_key key;
+   struct brw_surface_key key;
+   const GLuint surf = SURF_INDEX_TEXTURE(unit);
 
    memset(&key, 0, sizeof(key));
 
@@ -267,6 +304,7 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit )
       key.offset = intelObj->textureOffset;
    } else {
       key.format = firstImage->TexFormat->MesaFormat;
+      key.internal_format = firstImage->InternalFormat;
       key.pitch = intelObj->mt->pitch;
       key.depth = firstImage->Depth;
       key.bo = intelObj->mt->region->buffer;
@@ -282,34 +320,207 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit )
    key.cpp = intelObj->mt->cpp;
    key.tiling = intelObj->mt->region->tiling;
 
-   dri_bo_unreference(brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS]);
-   brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
-							       &key, sizeof(key),
-							       &key.bo, key.bo ? 1 : 0,
-							       NULL);
-   if (brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS] == NULL) {
-      brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS] = brw_create_texture_surface(brw, &key);
+   dri_bo_unreference(brw->wm.surf_bo[surf]);
+   brw->wm.surf_bo[surf] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
+                                         &key, sizeof(key),
+                                         &key.bo, key.bo ? 1 : 0,
+                                         NULL);
+   if (brw->wm.surf_bo[surf] == NULL) {
+      brw->wm.surf_bo[surf] = brw_create_texture_surface(brw, &key);
    }
 }
 
+
+
+/**
+ * Create the constant buffer surface.  Vertex/fragment shader constants will be
+ * read from this buffer with Data Port Read instructions/messages.  Returns the
+ * BO produced by brw_upload_cache() for the surface state built from key. */
+static dri_bo *
+brw_create_constant_surface( struct brw_context *brw,
+                             struct brw_surface_key *key )
+{
+   const GLint w = key->width - 1; /* width minus one; split across ss2/ss3 below */
+   struct brw_surface_state surf;
+   dri_bo *bo;
+
+   memset(&surf, 0, sizeof(surf));
+
+   surf.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
+   surf.ss0.surface_type = BRW_SURFACE_BUFFER;
+   surf.ss0.surface_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
+
+   assert(key->bo); /* the else-branch below only runs when asserts are compiled out */
+   if (key->bo)
+      surf.ss1.base_addr = key->bo->offset; /* reloc */
+   else
+      surf.ss1.base_addr = key->offset;
+
+   surf.ss2.width = w & 0x7f;            /* bits 6:0 of size or width */
+   surf.ss2.height = (w >> 7) & 0x1fff;  /* bits 19:7 of size or width */
+   surf.ss3.depth = (w >> 20) & 0x7f;    /* bits 26:20 of size or width */
+   surf.ss3.pitch = (key->pitch * key->cpp) - 1; /* ignored?? */
+   brw_set_surface_tiling(&surf, key->tiling); /* tiling now allowed */
+ 
+   bo = brw_upload_cache(&brw->cache, BRW_SS_SURFACE,
+			 key, sizeof(*key),
+			 &key->bo, key->bo ? 1 : 0,
+			 &surf, sizeof(surf),
+			 NULL, NULL);
+
+   if (key->bo) {
+      /* Emit relocation to surface contents */
+      dri_bo_emit_reloc(bo,
+			I915_GEM_DOMAIN_SAMPLER, 0,
+			0,
+			offsetof(struct brw_surface_state, ss1), /* ss1 holds base_addr */
+			key->bo);
+   }
+
+   return bo;
+}
+
+
+/**
+ * Update the surface state in brw->wm.surf_bo[surf] for a WM constant buffer.
+ * The constant buffer will be (re)allocated here if needed.  Returns the buffer
+ * to use from now on (const_buffer itself, or its replacement). */
+static dri_bo *
+brw_update_wm_constant_surface( GLcontext *ctx,
+                                GLuint surf,
+                                dri_bo *const_buffer,
+                                const struct gl_program_parameter_list *params)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct brw_surface_key key;
+   struct intel_context *intel = &brw->intel;
+   const int size = params->NumParameters * 4 * sizeof(GLfloat); /* 4 floats per parameter */
+
+   /* free old const buffer if too small (drops our reference to it) */
+   if (const_buffer && const_buffer->size < size) {
+      dri_bo_unreference(const_buffer);
+      const_buffer = NULL;
+   }
+
+   /* alloc new buffer if needed; NOTE(review): the allocation result is not checked */
+   if (!const_buffer) {
+      const_buffer =
+         drm_intel_bo_alloc(intel->bufmgr, "fp_const_buffer", size, 64);
+   }
+
+   memset(&key, 0, sizeof(key));
+
+   key.format = MESA_FORMAT_RGBA_FLOAT32;
+   key.internal_format = GL_RGBA;
+   key.bo = const_buffer;
+   key.depthmode = GL_NONE;
+   key.pitch = params->NumParameters;
+   key.width = params->NumParameters; /* one entry per parameter */
+   key.height = 1;
+   key.depth = 1;
+   key.cpp = 16; /* presumably bytes per parameter (4 floats) -- TODO confirm */
+
+   /*
+   printf("%s:\n", __FUNCTION__);
+   printf("  width %d  height %d  depth %d  cpp %d  pitch %d\n",
+          key.width, key.height, key.depth, key.cpp, key.pitch);
+   */
+
+   dri_bo_unreference(brw->wm.surf_bo[surf]); /* drop the previous surface state for this slot */
+   brw->wm.surf_bo[surf] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
+                                            &key, sizeof(key),
+                                            &key.bo, key.bo ? 1 : 0,
+                                            NULL);
+   if (brw->wm.surf_bo[surf] == NULL) { /* nothing cached for this key: build it */
+      brw->wm.surf_bo[surf] = brw_create_constant_surface(brw, &key);
+   }
+
+   return const_buffer;
+}
+
+
+/**
+ * Update the surface state in brw->vs.surf_bo[surf] for a VS constant buffer.
+ * The constant buffer will be (re)allocated here if needed.  Returns the buffer
+ * to use from now on (const_buffer itself, or its replacement). */
+static dri_bo *
+brw_update_vs_constant_surface( GLcontext *ctx,
+                                GLuint surf,
+                                dri_bo *const_buffer,
+                                const struct gl_program_parameter_list *params)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct brw_surface_key key;
+   struct intel_context *intel = &brw->intel;
+   const int size = params->NumParameters * 4 * sizeof(GLfloat); /* 4 floats per parameter */
+
+   assert(surf == 0); /* only slot 0 is accepted for the VS */
+
+   /* free old const buffer if too small (drops our reference to it) */
+   if (const_buffer && const_buffer->size < size) {
+      dri_bo_unreference(const_buffer);
+      const_buffer = NULL;
+   }
+
+   /* alloc new buffer if needed; NOTE(review): the allocation result is not checked */
+   if (!const_buffer) {
+      const_buffer =
+         drm_intel_bo_alloc(intel->bufmgr, "vp_const_buffer", size, 64);
+   }
+
+   memset(&key, 0, sizeof(key));
+
+   key.format = MESA_FORMAT_RGBA_FLOAT32;
+   key.internal_format = GL_RGBA;
+   key.bo = const_buffer;
+   key.depthmode = GL_NONE;
+   key.pitch = params->NumParameters;
+   key.width = params->NumParameters; /* one entry per parameter */
+   key.height = 1;
+   key.depth = 1;
+   key.cpp = 16; /* presumably bytes per parameter (4 floats) -- TODO confirm */
+
+   /*
+   printf("%s:\n", __FUNCTION__);
+   printf("  width %d  height %d  depth %d  cpp %d  pitch %d\n",
+          key.width, key.height, key.depth, key.cpp, key.pitch);
+   */
+
+   dri_bo_unreference(brw->vs.surf_bo[surf]); /* drop the previous surface state for this slot */
+   brw->vs.surf_bo[surf] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
+                                            &key, sizeof(key),
+                                            &key.bo, key.bo ? 1 : 0,
+                                            NULL);
+   if (brw->vs.surf_bo[surf] == NULL) { /* nothing cached for this key: build it */
+      brw->vs.surf_bo[surf] = brw_create_constant_surface(brw, &key);
+   }
+
+   return const_buffer;
+}
+
+
 /**
  * Sets up a surface state structure to point at the given region.
  * While it is only used for the front/back buffer currently, it should be
  * usable for further buffers when doing ARB_draw_buffer support.
  */
 static void
-brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
-			  unsigned int unit, GLboolean cached)
+brw_update_renderbuffer_surface(struct brw_context *brw,
+				struct gl_renderbuffer *rb,
+				unsigned int unit, GLboolean cached)
 {
    GLcontext *ctx = &brw->intel.ctx;
    dri_bo *region_bo = NULL;
+   struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+   struct intel_region *region = irb ? irb->region : NULL;
    struct {
       unsigned int surface_type;
       unsigned int surface_format;
-      unsigned int width, height, cpp;
+      unsigned int width, height, pitch, cpp;
       GLubyte color_mask[4];
       GLboolean color_blend;
       uint32_t tiling;
+      uint32_t draw_offset;
    } key;
 
    memset(&key, 0, sizeof(key));
@@ -318,14 +529,29 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
       region_bo = region->buffer;
 
       key.surface_type = BRW_SURFACE_2D;
-      if (region->cpp == 4)
+      switch (irb->texformat->MesaFormat) {
+      case MESA_FORMAT_ARGB8888:
 	 key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
-      else
+	 break;
+      case MESA_FORMAT_RGB565:
 	 key.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
+	 break;
+      case MESA_FORMAT_ARGB1555:
+	 key.surface_format = BRW_SURFACEFORMAT_B5G5R5A1_UNORM;
+	 break;
+      case MESA_FORMAT_ARGB4444:
+	 key.surface_format = BRW_SURFACEFORMAT_B4G4R4A4_UNORM;
+	 break;
+      default:
+	 _mesa_problem(ctx, "Bad renderbuffer format: %d\n",
+		       irb->texformat->MesaFormat);
+      }
       key.tiling = region->tiling;
-      key.width = region->pitch; /* XXX: not really! */
+      key.width = region->width;
       key.height = region->height;
+      key.pitch = region->pitch;
       key.cpp = region->cpp;
+      key.draw_offset = region->draw_offset; /* cur 3d or cube face offset */
    } else {
       key.surface_type = BRW_SURFACE_NULL;
       key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
@@ -333,6 +559,7 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
       key.width = 1;
       key.height = 1;
       key.cpp = 4;
+      key.draw_offset = 0;
    }
    memcpy(key.color_mask, ctx->Color.ColorMask,
 	  sizeof(key.color_mask));
@@ -354,13 +581,14 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
 
       surf.ss0.surface_format = key.surface_format;
       surf.ss0.surface_type = key.surface_type;
+      surf.ss1.base_addr =  key.draw_offset;
       if (region_bo != NULL)
-	 surf.ss1.base_addr = region_bo->offset; /* reloc */
+	 surf.ss1.base_addr += region_bo->offset; /* reloc */
 
       surf.ss2.width = key.width - 1;
       surf.ss2.height = key.height - 1;
       brw_set_surface_tiling(&surf, key.tiling);
-      surf.ss3.pitch = (key.width * key.cpp) - 1;
+      surf.ss3.pitch = (key.pitch * key.cpp) - 1;
 
       /* _NEW_COLOR */
       surf.ss0.color_blend = key.color_blend;
@@ -371,7 +599,7 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
 
       /* Key size will never match key size for textures, so we're safe. */
       brw->wm.surf_bo[unit] = brw_upload_cache(&brw->cache, BRW_SS_SURFACE,
-					      &key, sizeof(key),
+                                               &key, sizeof(key),
 					       &region_bo, 1,
 					       &surf, sizeof(surf),
 					       NULL, NULL);
@@ -380,12 +608,12 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
 	  * them both.  We might be able to figure out from other state
 	  * a more restrictive relocation to emit.
 	  */
-	 dri_bo_emit_reloc(brw->wm.surf_bo[unit],
-			   I915_GEM_DOMAIN_RENDER,
-			   I915_GEM_DOMAIN_RENDER,
-			   0,
-			   offsetof(struct brw_surface_state, ss1),
-			   region_bo);
+	 drm_intel_bo_emit_reloc(brw->wm.surf_bo[unit],
+				 offsetof(struct brw_surface_state, ss1),
+				 region_bo,
+				 key.draw_offset,
+				 I915_GEM_DOMAIN_RENDER,
+				 I915_GEM_DOMAIN_RENDER);
       }
    }
 }
@@ -400,6 +628,8 @@ brw_wm_get_binding_table(struct brw_context *brw)
 {
    dri_bo *bind_bo;
 
+   assert(brw->wm.nr_surfaces <= BRW_WM_MAX_SURF);
+
    bind_bo = brw_search_cache(&brw->cache, BRW_SS_SURF_BIND,
 			      NULL, 0,
 			      brw->wm.surf_bo, brw->wm.nr_surfaces,
@@ -446,54 +676,159 @@ static void prepare_wm_surfaces(struct brw_context *brw )
    GLuint i;
    int old_nr_surfaces;
 
-   if (brw->state.nr_draw_regions  > 1) {
-      for (i = 0; i < brw->state.nr_draw_regions; i++) {
-         brw_update_region_surface(brw, brw->state.draw_regions[i], i,
-				   GL_FALSE);
+   /* _NEW_BUFFERS */
+   /* Update surfaces for drawing buffers */
+   if (ctx->DrawBuffer->_NumColorDrawBuffers >= 1) {
+      for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
+         brw_update_renderbuffer_surface(brw,
+					 ctx->DrawBuffer->_ColorDrawBuffers[i],
+					 i,
+					 GL_FALSE);
       }
-   }else {
-      brw_update_region_surface(brw, brw->state.draw_regions[0], 0, GL_TRUE);
+   } else {
+      brw_update_renderbuffer_surface(brw, NULL, 0, GL_TRUE);
    }
 
    old_nr_surfaces = brw->wm.nr_surfaces;
    brw->wm.nr_surfaces = MAX_DRAW_BUFFERS;
 
+   /* Update surface / buffer for fragment shader constant buffer */
+   {
+      const GLuint surf = SURF_INDEX_FRAG_CONST_BUFFER;
+      struct brw_fragment_program *fp =
+         (struct brw_fragment_program *) brw->fragment_program;
+      fp->const_buffer =
+         brw_update_wm_constant_surface(ctx, surf, fp->const_buffer,
+                                     fp->program.Base.Parameters);
+
+      brw->wm.nr_surfaces = surf + 1;
+   }
+
+   /* Update surfaces for textures */
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
-      struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i];
+      const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i];
+      const GLuint surf = SURF_INDEX_TEXTURE(i);
 
       /* _NEW_TEXTURE, BRW_NEW_TEXDATA */
-      if(texUnit->_ReallyEnabled) {
+      if (texUnit->_ReallyEnabled) {
          if (texUnit->_Current == intel->frame_buffer_texobj) {
-            dri_bo_unreference(brw->wm.surf_bo[i+MAX_DRAW_BUFFERS]);
-            brw->wm.surf_bo[i+MAX_DRAW_BUFFERS] = brw->wm.surf_bo[0];
-            dri_bo_reference(brw->wm.surf_bo[i+MAX_DRAW_BUFFERS]);
-            brw->wm.nr_surfaces = i + MAX_DRAW_BUFFERS + 1;
+            /* render to texture */
+            dri_bo_unreference(brw->wm.surf_bo[surf]);
+            brw->wm.surf_bo[surf] = brw->wm.surf_bo[0];
+            dri_bo_reference(brw->wm.surf_bo[surf]);
+            brw->wm.nr_surfaces = surf + 1;
          } else {
+            /* regular texture */
             brw_update_texture_surface(ctx, i);
-            brw->wm.nr_surfaces = i + MAX_DRAW_BUFFERS + 1;
+            brw->wm.nr_surfaces = surf + 1;
          }
       } else {
-         dri_bo_unreference(brw->wm.surf_bo[i+MAX_DRAW_BUFFERS]);
-         brw->wm.surf_bo[i+MAX_DRAW_BUFFERS] = NULL;
+         dri_bo_unreference(brw->wm.surf_bo[surf]);
+         brw->wm.surf_bo[surf] = NULL;
       }
-
    }
 
    dri_bo_unreference(brw->wm.bind_bo);
    brw->wm.bind_bo = brw_wm_get_binding_table(brw);
 
    if (brw->wm.nr_surfaces != old_nr_surfaces)
-      brw->state.dirty.brw |= BRW_NEW_NR_SURFACES;
+      brw->state.dirty.brw |= BRW_NEW_NR_WM_SURFACES;
+}
+
+
+/**
+ * Constructs the binding table for the VS surface state.
+ *
+ * Returns the table brw_search_cache() finds for the current
+ * brw->vs.surf_bo[] contents; otherwise uploads a new table with one 32-bit
+ * entry per slot below nr_surfaces (the surface BO's offset, or 0 when the
+ * slot is empty) and emits a relocation for each non-empty slot.
+ */
+static dri_bo *
+brw_vs_get_binding_table(struct brw_context *brw)
+{
+   dri_bo *bind_bo;
+
+   assert(brw->vs.nr_surfaces <= BRW_VS_MAX_SURF);
+
+   bind_bo = brw_search_cache(&brw->cache, BRW_SS_SURF_BIND,
+			      NULL, 0,
+			      brw->vs.surf_bo, brw->vs.nr_surfaces,
+			      NULL);
+
+   if (bind_bo == NULL) {
+      GLuint data_size = brw->vs.nr_surfaces * sizeof(GLuint);
+      uint32_t *data = malloc(data_size); /* NOTE(review): result is not checked */
+      int i;
+
+      for (i = 0; i < brw->vs.nr_surfaces; i++)
+         data[i] = brw->vs.surf_bo[i] ? brw->vs.surf_bo[i]->offset : 0;
+
+      bind_bo = brw_upload_cache( &brw->cache, BRW_SS_SURF_BIND,
+				  NULL, 0,
+				  brw->vs.surf_bo, brw->vs.nr_surfaces,
+				  data, data_size,
+				  NULL, NULL);
+
+      /* Emit binding table relocations to surface state; the table only
+       * holds nr_surfaces entries, so never relocate a slot beyond that. */
+      for (i = 0; i < brw->vs.nr_surfaces; i++) {
+	 if (brw->vs.surf_bo[i] != NULL) {
+	    dri_bo_emit_reloc(bind_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0,
+			      i * sizeof(GLuint), brw->vs.surf_bo[i]);
+	 }
+      }
+
+      free(data);
+   }
+
+   return bind_bo;
+}
+
+
+/**
+ * Vertex shader surfaces.  Just constant buffer for now.  Could add vertex
+ * shader textures in the future.  Refreshes the current vertex program's
+ * constant-buffer surface, rebuilds the VS binding table, flags it dirty. */
+static void prepare_vs_surfaces(struct brw_context *brw )
+{
+   GLcontext *ctx = &brw->intel.ctx;
+
+   /* Update surface / buffer for vertex shader constant buffer */
+   {
+      const GLuint surf = SURF_INDEX_VERT_CONST_BUFFER;
+      struct brw_vertex_program *vp =
+         (struct brw_vertex_program *) brw->vertex_program;
+      vp->const_buffer =
+         brw_update_vs_constant_surface(ctx, surf, vp->const_buffer,
+                                        vp->program.Base.Parameters);
+
+      brw->vs.nr_surfaces = 1; /* just the constant buffer */
+   }
+
+   dri_bo_unreference(brw->vs.bind_bo); /* release the previous binding table */
+   brw->vs.bind_bo = brw_vs_get_binding_table(brw);
+
+   if (1) /* unconditional: the flag is raised on every call */
+      brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES;
+}
+
+
+static void
+prepare_surfaces(struct brw_context *brw) /* .prepare hook of brw_wm_surfaces */
+{
+   prepare_wm_surfaces(brw); /* draw buffers, fragment constant buffer, textures */
+   prepare_vs_surfaces(brw); /* vertex constant buffer */
+}
 
 
 const struct brw_tracked_state brw_wm_surfaces = {
    .dirty = {
-      .mesa = _NEW_COLOR | _NEW_TEXTURE | _NEW_BUFFERS,
+      .mesa = _NEW_COLOR | _NEW_TEXTURE | _NEW_BUFFERS | _NEW_PROGRAM,
       .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
-   .prepare = prepare_wm_surfaces,
+   .prepare = prepare_surfaces, /* prepares both the WM and the VS surfaces */
 };
 
 
diff --git a/i965/intel_state.c b/i965/intel_state.c
deleted file mode 100644
index 0c9c670..0000000
--- a/i965/intel_state.c
+++ /dev/null
@@ -1,233 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-#include "main/glheader.h"
-#include "main/context.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "main/colormac.h"
-#include "main/dd.h"
-
-#include "intel_screen.h"
-#include "intel_context.h"
-#include "intel_regions.h"
-#include "swrast/swrast.h"
-
-int
-intel_translate_shadow_compare_func( GLenum func )
-{
-   switch(func) {
-   case GL_NEVER: 
-       return COMPAREFUNC_ALWAYS; 
-   case GL_LESS: 
-       return COMPAREFUNC_LEQUAL; 
-   case GL_LEQUAL: 
-       return COMPAREFUNC_LESS;
-   case GL_GREATER: 
-       return COMPAREFUNC_GEQUAL; 
-   case GL_GEQUAL: 
-      return COMPAREFUNC_GREATER; 
-   case GL_NOTEQUAL: 
-      return COMPAREFUNC_EQUAL; 
-   case GL_EQUAL: 
-      return COMPAREFUNC_NOTEQUAL; 
-   case GL_ALWAYS: 
-       return COMPAREFUNC_NEVER; 
-   }
-
-   fprintf(stderr, "Unknown value in %s: %x\n", __FUNCTION__, func);
-   return COMPAREFUNC_NEVER; 
-}
-
-int
-intel_translate_compare_func( GLenum func )
-{
-   switch(func) {
-   case GL_NEVER: 
-      return COMPAREFUNC_NEVER; 
-   case GL_LESS: 
-      return COMPAREFUNC_LESS; 
-   case GL_LEQUAL: 
-      return COMPAREFUNC_LEQUAL; 
-   case GL_GREATER: 
-      return COMPAREFUNC_GREATER; 
-   case GL_GEQUAL: 
-      return COMPAREFUNC_GEQUAL; 
-   case GL_NOTEQUAL: 
-      return COMPAREFUNC_NOTEQUAL; 
-   case GL_EQUAL: 
-      return COMPAREFUNC_EQUAL; 
-   case GL_ALWAYS: 
-      return COMPAREFUNC_ALWAYS; 
-   }
-
-   fprintf(stderr, "Unknown value in %s: %x\n", __FUNCTION__, func);
-   return COMPAREFUNC_ALWAYS; 
-}
-
-int
-intel_translate_stencil_op( GLenum op )
-{
-   switch(op) {
-   case GL_KEEP: 
-      return STENCILOP_KEEP; 
-   case GL_ZERO: 
-      return STENCILOP_ZERO; 
-   case GL_REPLACE: 
-      return STENCILOP_REPLACE; 
-   case GL_INCR: 
-      return STENCILOP_INCRSAT;
-   case GL_DECR: 
-      return STENCILOP_DECRSAT;
-   case GL_INCR_WRAP:
-      return STENCILOP_INCR; 
-   case GL_DECR_WRAP:
-      return STENCILOP_DECR; 
-   case GL_INVERT: 
-      return STENCILOP_INVERT; 
-   default: 
-      return STENCILOP_ZERO;
-   }
-}
-
-int
-intel_translate_blend_factor( GLenum factor )
-{
-   switch(factor) {
-   case GL_ZERO: 
-      return BLENDFACT_ZERO; 
-   case GL_SRC_ALPHA: 
-      return BLENDFACT_SRC_ALPHA; 
-   case GL_ONE: 
-      return BLENDFACT_ONE; 
-   case GL_SRC_COLOR: 
-      return BLENDFACT_SRC_COLR; 
-   case GL_ONE_MINUS_SRC_COLOR: 
-      return BLENDFACT_INV_SRC_COLR; 
-   case GL_DST_COLOR: 
-      return BLENDFACT_DST_COLR; 
-   case GL_ONE_MINUS_DST_COLOR: 
-      return BLENDFACT_INV_DST_COLR; 
-   case GL_ONE_MINUS_SRC_ALPHA:
-      return BLENDFACT_INV_SRC_ALPHA; 
-   case GL_DST_ALPHA: 
-      return BLENDFACT_DST_ALPHA; 
-   case GL_ONE_MINUS_DST_ALPHA:
-      return BLENDFACT_INV_DST_ALPHA; 
-   case GL_SRC_ALPHA_SATURATE: 
-      return BLENDFACT_SRC_ALPHA_SATURATE;
-   case GL_CONSTANT_COLOR:
-      return BLENDFACT_CONST_COLOR; 
-   case GL_ONE_MINUS_CONSTANT_COLOR:
-      return BLENDFACT_INV_CONST_COLOR;
-   case GL_CONSTANT_ALPHA:
-      return BLENDFACT_CONST_ALPHA; 
-   case GL_ONE_MINUS_CONSTANT_ALPHA:
-      return BLENDFACT_INV_CONST_ALPHA;
-   }
-   
-   fprintf(stderr, "Unknown value in %s: %x\n", __FUNCTION__, factor);
-   return BLENDFACT_ZERO;
-}
-
-int
-intel_translate_logic_op( GLenum opcode )
-{
-   switch(opcode) {
-   case GL_CLEAR: 
-      return LOGICOP_CLEAR; 
-   case GL_AND: 
-      return LOGICOP_AND; 
-   case GL_AND_REVERSE: 
-      return LOGICOP_AND_RVRSE; 
-   case GL_COPY: 
-      return LOGICOP_COPY; 
-   case GL_COPY_INVERTED: 
-      return LOGICOP_COPY_INV; 
-   case GL_AND_INVERTED: 
-      return LOGICOP_AND_INV; 
-   case GL_NOOP: 
-      return LOGICOP_NOOP; 
-   case GL_XOR: 
-      return LOGICOP_XOR; 
-   case GL_OR: 
-      return LOGICOP_OR; 
-   case GL_OR_INVERTED: 
-      return LOGICOP_OR_INV; 
-   case GL_NOR: 
-      return LOGICOP_NOR; 
-   case GL_EQUIV: 
-      return LOGICOP_EQUIV; 
-   case GL_INVERT: 
-      return LOGICOP_INV; 
-   case GL_OR_REVERSE: 
-      return LOGICOP_OR_RVRSE; 
-   case GL_NAND: 
-      return LOGICOP_NAND; 
-   case GL_SET: 
-      return LOGICOP_SET; 
-   default:
-      return LOGICOP_SET;
-   }
-}
-
-
-static void
-intelClearColor(GLcontext *ctx, const GLfloat color[4])
-{
-   struct intel_context *intel = intel_context(ctx);
-   GLubyte clear[4];
-
-   CLAMPED_FLOAT_TO_UBYTE(clear[0], color[0]);
-   CLAMPED_FLOAT_TO_UBYTE(clear[1], color[1]);
-   CLAMPED_FLOAT_TO_UBYTE(clear[2], color[2]);
-   CLAMPED_FLOAT_TO_UBYTE(clear[3], color[3]);
-
-   /* compute both 32 and 16-bit clear values */
-   intel->ClearColor8888 = INTEL_PACKCOLOR8888(clear[0], clear[1],
-                                               clear[2], clear[3]);
-   intel->ClearColor565 = INTEL_PACKCOLOR565(clear[0], clear[1], clear[2]);
-}
-
-
-/* Fallback to swrast for select and feedback.
- */
-static void
-intelRenderMode( GLcontext *ctx, GLenum mode )
-{
-   struct intel_context *intel = intel_context(ctx);
-   FALLBACK( intel, INTEL_FALLBACK_RENDERMODE, (mode != GL_RENDER) );
-}
-
-
-void
-intelInitStateFuncs( struct dd_function_table *functions )
-{
-   functions->RenderMode = intelRenderMode;
-   functions->ClearColor = intelClearColor;
-}
diff --git a/shared/intel_batchbuffer.c b/shared/intel_batchbuffer.c
index 9d99372..29dc05c 100644
--- a/shared/intel_batchbuffer.c
+++ b/shared/intel_batchbuffer.c
@@ -207,7 +207,7 @@ _intel_batchbuffer_flush(struct intel_batchbuffer *batch, const char *file,
 	      used);
 
    /* Emit a flush if the bufmgr doesn't do it for us. */
-   if (!intel->ttm) {
+   if (intel->always_flush_cache || !intel->ttm) {
       *(GLuint *) (batch->ptr) = intel->vtbl.flush_cmd();
       batch->ptr += 4;
       used = batch->ptr - batch->map;
diff --git a/shared/intel_blit.c b/shared/intel_blit.c
index e160957..4919828 100644
--- a/shared/intel_blit.c
+++ b/shared/intel_blit.c
@@ -32,6 +32,8 @@
 #include "main/mtypes.h"
 #include "main/context.h"
 #include "main/enums.h"
+#include "main/texformat.h"
+#include "main/colormac.h"
 
 #include "intel_blit.h"
 #include "intel_buffers.h"
@@ -98,11 +100,11 @@ intelCopyBuffer(const __DRIdrawablePrivate * dPriv,
       ASSERT(src->cpp == dst->cpp);
 
       if (cpp == 2) {
-	 BR13 = (0xCC << 16) | (1 << 24);
+	 BR13 = (0xCC << 16) | BR13_565;
 	 CMD = XY_SRC_COPY_BLT_CMD;
       }
       else {
-	 BR13 = (0xCC << 16) | (1 << 24) | (1 << 25);
+	 BR13 = (0xCC << 16) | BR13_8888;
 	 CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
       }
 
@@ -194,13 +196,15 @@ intelEmitFillBlit(struct intel_context *intel,
 
    switch (cpp) {
    case 1:
+      BR13 = (0xF0 << 16);
+      CMD = XY_COLOR_BLT_CMD;
+      break;
    case 2:
-   case 3:
-      BR13 = (0xF0 << 16) | (1 << 24);
+      BR13 = (0xF0 << 16) | BR13_565;
       CMD = XY_COLOR_BLT_CMD;
       break;
    case 4:
-      BR13 = (0xF0 << 16) | (1 << 24) | (1 << 25);
+      BR13 = (0xF0 << 16) | BR13_8888;
       CMD = XY_COLOR_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
       break;
    default:
@@ -335,12 +339,11 @@ intelEmitCopyBlit(struct intel_context *intel,
       CMD = XY_SRC_COPY_BLT_CMD;
       break;
    case 2:
-   case 3:
-      BR13 |= (1 << 24);
+      BR13 |= BR13_565;
       CMD = XY_SRC_COPY_BLT_CMD;
       break;
    case 4:
-      BR13 |= (1 << 24) | (1 << 25);
+      BR13 |= BR13_8888;
       CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
       break;
    default:
@@ -483,10 +486,9 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
             const GLbitfield bufBit = 1 << buf;
             if ((clearMask & bufBit) && !(bufBit & skipBuffers)) {
                /* OK, clear this renderbuffer */
-               struct intel_region *irb_region =
-		  intel_get_rb_region(fb, buf);
+	       struct intel_renderbuffer *irb = intel_get_renderbuffer(fb, buf);
                dri_bo *write_buffer =
-                  intel_region_buffer(intel, irb_region,
+                  intel_region_buffer(intel, irb->region,
                                       all ? INTEL_WRITE_FULL :
                                       INTEL_WRITE_PART);
 
@@ -494,15 +496,13 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
                GLint pitch, cpp;
                GLuint BR13, CMD;
 
-               ASSERT(irb_region);
-
-               pitch = irb_region->pitch;
-               cpp = irb_region->cpp;
+               pitch = irb->region->pitch;
+               cpp = irb->region->cpp;
 
                DBG("%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
                    __FUNCTION__,
-                   irb_region->buffer, (pitch * cpp),
-                   irb_region->draw_offset,
+                   irb->region->buffer, (pitch * cpp),
+                   irb->region->draw_offset,
                    b.x1, b.y1, b.x2 - b.x1, b.y2 - b.y1);
 
 	       BR13 = 0xf0 << 16;
@@ -510,7 +510,7 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
 
                /* Setup the blit command */
                if (cpp == 4) {
-                  BR13 |= (1 << 24) | (1 << 25);
+                  BR13 |= BR13_8888;
                   if (buf == BUFFER_DEPTH || buf == BUFFER_STENCIL) {
                      if (clearMask & BUFFER_BIT_DEPTH)
                         CMD |= XY_BLT_WRITE_RGB;
@@ -523,12 +523,12 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
                   }
                }
                else {
-                  ASSERT(cpp == 2 || cpp == 0);
-                  BR13 |= (1 << 24);
+                  ASSERT(cpp == 2);
+                  BR13 |= BR13_565;
                }
 
 #ifndef I915
-	       if (irb_region->tiling != I915_TILING_NONE) {
+	       if (irb->region->tiling != I915_TILING_NONE) {
 		  CMD |= XY_DST_TILED;
 		  pitch /= 4;
 	       }
@@ -539,9 +539,36 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
                   clearVal = clear_depth;
                }
                else {
-                  clearVal = (cpp == 4)
-                     ? intel->ClearColor8888 : intel->ClearColor565;
-               }
+		  uint8_t clear[4];
+		  GLclampf *color = ctx->Color.ClearColor;
+
+		  CLAMPED_FLOAT_TO_UBYTE(clear[0], color[0]);
+		  CLAMPED_FLOAT_TO_UBYTE(clear[1], color[1]);
+		  CLAMPED_FLOAT_TO_UBYTE(clear[2], color[2]);
+		  CLAMPED_FLOAT_TO_UBYTE(clear[3], color[3]);
+
+		  switch (irb->texformat->MesaFormat) {
+		  case MESA_FORMAT_ARGB8888:
+		     clearVal = intel->ClearColor8888;
+		     break;
+		  case MESA_FORMAT_RGB565:
+		     clearVal = intel->ClearColor565;
+		     break;
+		  case MESA_FORMAT_ARGB4444:
+		     clearVal = PACK_COLOR_4444(clear[3], clear[0],
+						clear[1], clear[2]);
+		     break;
+		  case MESA_FORMAT_ARGB1555:
+		     clearVal = PACK_COLOR_1555(clear[3], clear[0],
+						clear[1], clear[2]);
+		     break;
+		  default:
+		     _mesa_problem(ctx, "Unexpected renderbuffer format: %d\n",
+				   irb->texformat->MesaFormat);
+		     clearVal = 0;
+		  }
+	       }
+
                /*
                   _mesa_debug(ctx, "hardware blit clear buf %d rb id %d\n",
                   buf, irb->Base.Name);
@@ -557,14 +584,13 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
                OUT_BATCH((b.y2 << 16) | b.x2);
                OUT_RELOC(write_buffer,
 			 I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                         irb_region->draw_offset);
+                         irb->region->draw_offset);
                OUT_BATCH(clearVal);
                ADVANCE_BATCH();
                clearMask &= ~bufBit;    /* turn off bit, for faster loop exit */
             }
          }
       }
-      intel_batchbuffer_emit_mi_flush(intel->batch);
    }
 
    UNLOCK_HARDWARE(intel);
diff --git a/shared/intel_buffer_objects.c b/shared/intel_buffer_objects.c
index 60d7bb3..2e6b778 100644
--- a/shared/intel_buffer_objects.c
+++ b/shared/intel_buffer_objects.c
@@ -35,9 +35,6 @@
 #include "intel_batchbuffer.h"
 #include "intel_regions.h"
 
-static GLboolean intel_bufferobj_unmap(GLcontext * ctx,
-				       GLenum target,
-				       struct gl_buffer_object *obj);
 
 /** Allocates a new dri_bo to store the data for the buffer object. */
 static void
@@ -103,13 +100,9 @@ intel_bufferobj_free(GLcontext * ctx, struct gl_buffer_object *obj)
    struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
 
    assert(intel_obj);
+   assert(!obj->Pointer); /* Mesa should have unmapped it */
 
-   /* Buffer objects are automatically unmapped when deleting according
-    * to the spec.
-    */
-   if (obj->Pointer)
-      intel_bufferobj_unmap(ctx, 0, obj);
-
+   _mesa_free(intel_obj->sys_buffer);
    if (intel_obj->region) {
       intel_bufferobj_release_region(intel, intel_obj);
    }
@@ -141,11 +134,7 @@ intel_bufferobj_data(GLcontext * ctx,
    intel_obj->Base.Size = size;
    intel_obj->Base.Usage = usage;
 
-   /* Buffer objects are automatically unmapped when creating new data buffers
-    * according to the spec.
-    */
-   if (obj->Pointer)
-      intel_bufferobj_unmap(ctx, 0, obj);
+   assert(!obj->Pointer); /* Mesa should have unmapped it */
 
    if (intel_obj->region)
       intel_bufferobj_release_region(intel, intel_obj);
@@ -154,7 +143,23 @@ intel_bufferobj_data(GLcontext * ctx,
       dri_bo_unreference(intel_obj->buffer);
       intel_obj->buffer = NULL;
    }
+   _mesa_free(intel_obj->sys_buffer);
+   intel_obj->sys_buffer = NULL;
+
    if (size != 0) {
+#ifdef I915
+      /* On pre-965, stick VBOs in system memory, as we're always doing swtnl
+       * with their contents anyway.
+       */
+      if (target == GL_ARRAY_BUFFER || target == GL_ELEMENT_ARRAY_BUFFER) {
+	 intel_obj->sys_buffer = _mesa_malloc(size);
+	 if (intel_obj->sys_buffer != NULL) {
+	    if (data != NULL)
+	       memcpy(intel_obj->sys_buffer, data, size);
+	    return;
+	 }
+      }
+#endif
       intel_bufferobj_alloc_buffer(intel, intel_obj);
 
       if (data != NULL)
@@ -184,7 +189,10 @@ intel_bufferobj_subdata(GLcontext * ctx,
    if (intel_obj->region)
       intel_bufferobj_cow(intel, intel_obj);
 
-   dri_bo_subdata(intel_obj->buffer, offset, size, data);
+   if (intel_obj->sys_buffer)
+      memcpy((char *)intel_obj->sys_buffer + offset, data, size);
+   else
+      dri_bo_subdata(intel_obj->buffer, offset, size, data);
 }
 
 
@@ -216,11 +224,16 @@ intel_bufferobj_map(GLcontext * ctx,
 {
    struct intel_context *intel = intel_context(ctx);
    struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
+   GLboolean read_only = (access == GL_READ_ONLY_ARB);
+   GLboolean write_only = (access == GL_WRITE_ONLY_ARB);
 
-   /* XXX: Translate access to flags arg below:
-    */
    assert(intel_obj);
 
+   if (intel_obj->sys_buffer) {
+      obj->Pointer = intel_obj->sys_buffer;
+      return obj->Pointer;
+   }
+
    if (intel_obj->region)
       intel_bufferobj_cow(intel, intel_obj);
 
@@ -229,7 +242,14 @@ intel_bufferobj_map(GLcontext * ctx,
       return NULL;
    }
 
-   dri_bo_map(intel_obj->buffer, GL_TRUE);
+   if (write_only && intel->intelScreen->kernel_exec_fencing) {
+      drm_intel_gem_bo_map_gtt(intel_obj->buffer);
+      intel_obj->mapped_gtt = GL_TRUE;
+   } else {
+      drm_intel_bo_map(intel_obj->buffer, !read_only);
+      intel_obj->mapped_gtt = GL_FALSE;
+   }
+
    obj->Pointer = intel_obj->buffer->virtual;
    return obj->Pointer;
 }
@@ -245,9 +265,16 @@ intel_bufferobj_unmap(GLcontext * ctx,
    struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
 
    assert(intel_obj);
-   if (intel_obj->buffer != NULL) {
+   if (intel_obj->sys_buffer != NULL) {
       assert(obj->Pointer);
-      dri_bo_unmap(intel_obj->buffer);
+      obj->Pointer = NULL;
+   } else if (intel_obj->buffer != NULL) {
+      assert(obj->Pointer);
+      if (intel_obj->mapped_gtt) {
+	 drm_intel_gem_bo_unmap_gtt(intel_obj->buffer);
+      } else {
+	 drm_intel_bo_unmap(intel_obj->buffer);
+      }
       obj->Pointer = NULL;
    }
    return GL_TRUE;
@@ -266,6 +293,18 @@ intel_bufferobj_buffer(struct intel_context *intel,
       }
    }
 
+   if (intel_obj->buffer == NULL) {
+      intel_bufferobj_alloc_buffer(intel, intel_obj);
+      intel_bufferobj_subdata(&intel->ctx,
+			      GL_ARRAY_BUFFER_ARB,
+			      0,
+			      intel_obj->Base.Size,
+			      intel_obj->sys_buffer,
+			      &intel_obj->Base);
+      _mesa_free(intel_obj->sys_buffer);
+      intel_obj->sys_buffer = NULL;
+   }
+
    return intel_obj->buffer;
 }
 
diff --git a/shared/intel_buffer_objects.h b/shared/intel_buffer_objects.h
index bf6dbd5..0431015 100644
--- a/shared/intel_buffer_objects.h
+++ b/shared/intel_buffer_objects.h
@@ -42,10 +42,13 @@ struct intel_buffer_object
 {
    struct gl_buffer_object Base;
    dri_bo *buffer;     /* the low-level buffer manager's buffer handle */
+   /** System memory buffer data, if not using a BO to store the data. */
+   void *sys_buffer;
 
    struct intel_region *region; /* Is there a zero-copy texture
                                    associated with this (pixel)
                                    buffer object? */
+   GLboolean mapped_gtt;
 };
 
 
diff --git a/shared/intel_buffers.c b/shared/intel_buffers.c
index f1908cb..d2fad9e 100644
--- a/shared/intel_buffers.c
+++ b/shared/intel_buffers.c
@@ -154,7 +154,7 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
       return;
    }
 
-   /* Do this here, note core Mesa, since this function is called from
+   /* Do this here, not core Mesa, since this function is called from
     * many places within the driver.
     */
    if (ctx->NewState & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
@@ -172,9 +172,6 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
       return;
    }
 
-   if (fb->Name)
-      intel_validate_paired_depth_stencil(ctx, fb);
-
    /*
     * How many color buffers are we drawing into?
     */
@@ -182,7 +179,8 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
       /* writing to 0  */
       colorRegions[0] = NULL;
       intel->constant_cliprect = GL_TRUE;
-   } else if (fb->_NumColorDrawBuffers > 1) {
+   }
+   else if (fb->_NumColorDrawBuffers > 1) {
        int i;
        struct intel_renderbuffer *irb;
 
@@ -204,6 +202,8 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
 	       intel_batchbuffer_flush(intel->batch);
 	    intel->front_cliprects = GL_TRUE;
 	    colorRegions[0] = intel_get_rb_region(fb, BUFFER_FRONT_LEFT);
+
+	    intel->front_buffer_dirty = GL_TRUE;
 	 }
 	 else {
 	    if (!intel->constant_cliprect && intel->front_cliprects)
@@ -221,14 +221,6 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
       }
    }
 
-   /* Update culling direction which changes depending on the
-    * orientation of the buffer:
-    */
-   if (ctx->Driver.FrontFace)
-      ctx->Driver.FrontFace(ctx, ctx->Polygon.FrontFace);
-   else
-      ctx->NewState |= _NEW_POLYGON;
-
    if (!colorRegions[0]) {
       FALLBACK(intel, INTEL_FALLBACK_DRAW_BUFFER, GL_TRUE);
    }
@@ -260,50 +252,43 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
    /***
     *** Stencil buffer
     *** This can only be hardware accelerated if we're using a
-    *** combined DEPTH_STENCIL buffer (for now anyway).
+    *** combined DEPTH_STENCIL buffer.
     ***/
    if (fb->_StencilBuffer && fb->_StencilBuffer->Wrapped) {
       irbStencil = intel_renderbuffer(fb->_StencilBuffer->Wrapped);
       if (irbStencil && irbStencil->region) {
          ASSERT(irbStencil->Base._ActualFormat == GL_DEPTH24_STENCIL8_EXT);
          FALLBACK(intel, INTEL_FALLBACK_STENCIL_BUFFER, GL_FALSE);
-         /* need to re-compute stencil hw state */
-	 if (ctx->Driver.Enable != NULL)
-	    ctx->Driver.Enable(ctx, GL_STENCIL_TEST, ctx->Stencil.Enabled);
-	 else
-	    ctx->NewState |= _NEW_STENCIL;
-         if (!depthRegion)
-            depthRegion = irbStencil->region;
       }
       else {
          FALLBACK(intel, INTEL_FALLBACK_STENCIL_BUFFER, GL_TRUE);
       }
    }
    else {
-      /* XXX FBO: instead of FALSE, pass ctx->Stencil.Enabled ??? */
+      /* XXX FBO: instead of FALSE, pass ctx->Stencil._Enabled ??? */
       FALLBACK(intel, INTEL_FALLBACK_STENCIL_BUFFER, GL_FALSE);
-      /* need to re-compute stencil hw state */
-      if (ctx->Driver.Enable != NULL)
-	 ctx->Driver.Enable(ctx, GL_STENCIL_TEST, ctx->Stencil.Enabled);
-      else
-	 ctx->NewState |= _NEW_STENCIL;
    }
 
    /*
-    * Update depth test state
+    * Update depth and stencil test state
     */
    if (ctx->Driver.Enable) {
-      if (ctx->Depth.Test && fb->Visual.depthBits > 0) {
-	 ctx->Driver.Enable(ctx, GL_DEPTH_TEST, GL_TRUE);
-      } else {
-	 ctx->Driver.Enable(ctx, GL_DEPTH_TEST, GL_FALSE);
-      }
-   } else {
-      ctx->NewState |= _NEW_DEPTH;
+      ctx->Driver.Enable(ctx, GL_DEPTH_TEST,
+                         (ctx->Depth.Test && fb->Visual.depthBits > 0));
+      ctx->Driver.Enable(ctx, GL_STENCIL_TEST,
+                         (ctx->Stencil.Enabled && fb->Visual.stencilBits > 0));
+   }
+   else {
+      /* Mesa's Stencil._Enabled field is updated when
+       * _NEW_BUFFERS | _NEW_STENCIL, but i965 code assumes that the value
+       * only changes with _NEW_STENCIL (which seems sensible).  So flag it
+       * here since this is the _NEW_BUFFERS path.
+       */
+      ctx->NewState |= (_NEW_DEPTH | _NEW_STENCIL);
    }
 
    intel->vtbl.set_draw_region(intel, colorRegions, depthRegion, 
-	fb->_NumColorDrawBuffers);
+                               fb->_NumColorDrawBuffers);
 
    /* update viewport since it depends on window size */
 #ifdef I915
@@ -322,12 +307,37 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
       ctx->Driver.DepthRange(ctx,
 			     ctx->Viewport.Near,
 			     ctx->Viewport.Far);
+
+   /* Update culling direction which changes depending on the
+    * orientation of the buffer:
+    */
+   if (ctx->Driver.FrontFace)
+      ctx->Driver.FrontFace(ctx, ctx->Polygon.FrontFace);
+   else
+      ctx->NewState |= _NEW_POLYGON;
 }
 
 
 static void
 intelDrawBuffer(GLcontext * ctx, GLenum mode)
 {
+   if ((ctx->DrawBuffer != NULL) && (ctx->DrawBuffer->Name == 0)) {
+      struct intel_context *const intel = intel_context(ctx);
+      const GLboolean was_front_buffer_rendering =
+	intel->is_front_buffer_rendering;
+
+      intel->is_front_buffer_rendering = (mode == GL_FRONT_LEFT)
+	|| (mode == GL_FRONT);
+
+      /* If we weren't front-buffer rendering before but we are now, make sure
+       * that the front-buffer has actually been allocated.
+       */
+      if (!was_front_buffer_rendering && intel->is_front_buffer_rendering) {
+	 intel_update_renderbuffers(intel->driContext,
+				    intel->driContext->driDrawablePriv);
+      }
+   }
+
    intel_draw_buffer(ctx, ctx->DrawBuffer);
 }
 
diff --git a/shared/intel_clear.c b/shared/intel_clear.c
index c3ba50f..19f4763 100644
--- a/shared/intel_clear.c
+++ b/shared/intel_clear.c
@@ -30,6 +30,7 @@
 #include "main/enums.h"
 #include "main/image.h"
 #include "main/mtypes.h"
+#include "main/arrayobj.h"
 #include "main/attrib.h"
 #include "main/blend.h"
 #include "main/bufferobj.h"
@@ -38,6 +39,7 @@
 #include "main/enable.h"
 #include "main/macros.h"
 #include "main/matrix.h"
+#include "main/polygon.h"
 #include "main/texstate.h"
 #include "main/shaders.h"
 #include "main/stencil.h"
@@ -65,6 +67,45 @@
 			      BUFFER_BIT_COLOR6 |			\
 			      BUFFER_BIT_COLOR7)
 
+
+/**
+ * Per-context one-time init of things for intel_clear_tris().
+ * Basically set up a private array object for vertex/color arrays.
+ */
+static void
+init_clear(GLcontext *ctx)
+{
+   struct intel_context *intel = intel_context(ctx);
+   struct gl_array_object *arraySave = NULL;
+   const GLuint arrayBuffer = ctx->Array.ArrayBufferObj->Name;
+   const GLuint elementBuffer = ctx->Array.ElementArrayBufferObj->Name;
+
+   /* create new array object */
+   intel->clear.arrayObj = _mesa_new_array_object(ctx, ~0);
+
+   /* save current array object, bind new one */
+   _mesa_reference_array_object(ctx, &arraySave, ctx->Array.ArrayObj);
+   _mesa_reference_array_object(ctx, &ctx->Array.ArrayObj, intel->clear.arrayObj);
+
+   /* one-time setup of vertex arrays (pos, color) */
+   _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
+   _mesa_BindBufferARB(GL_ELEMENT_ARRAY_BUFFER_ARB, 0);
+   _mesa_ColorPointer(4, GL_FLOAT, 4 * sizeof(GLfloat), intel->clear.color);
+   _mesa_VertexPointer(3, GL_FLOAT, 3 * sizeof(GLfloat), intel->clear.vertices);
+   _mesa_Enable(GL_COLOR_ARRAY);
+   _mesa_Enable(GL_VERTEX_ARRAY);
+
+   /* restore original array object */
+   _mesa_reference_array_object(ctx, &ctx->Array.ArrayObj, arraySave);
+   _mesa_reference_array_object(ctx, &arraySave, NULL);
+
+   /* restore original buffer objects */
+   _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, arrayBuffer);
+   _mesa_BindBufferARB(GL_ELEMENT_ARRAY_BUFFER_ARB, elementBuffer);
+}
+
+
+
 /**
  * Perform glClear where mask contains only color, depth, and/or stencil.
  *
@@ -77,14 +118,16 @@ void
 intel_clear_tris(GLcontext *ctx, GLbitfield mask)
 {
    struct intel_context *intel = intel_context(ctx);
-   GLfloat vertices[4][3];
-   GLfloat color[4][4];
    GLfloat dst_z;
    struct gl_framebuffer *fb = ctx->DrawBuffer;
    int i;
    GLboolean saved_fp_enable = GL_FALSE, saved_vp_enable = GL_FALSE;
    GLuint saved_shader_program = 0;
    unsigned int saved_active_texture;
+   struct gl_array_object *arraySave = NULL;
+
+   if (!intel->clear.arrayObj)
+      init_clear(ctx);
 
    assert((mask & ~(TRI_CLEAR_COLOR_BITS | BUFFER_BIT_DEPTH |
 		    BUFFER_BIT_STENCIL)) == 0);
@@ -93,10 +136,10 @@ intel_clear_tris(GLcontext *ctx, GLbitfield mask)
 		    GL_CURRENT_BIT |
 		    GL_DEPTH_BUFFER_BIT |
 		    GL_ENABLE_BIT |
+		    GL_POLYGON_BIT |
 		    GL_STENCIL_BUFFER_BIT |
 		    GL_TRANSFORM_BIT |
 		    GL_CURRENT_BIT);
-   _mesa_PushClientAttrib(GL_CLIENT_VERTEX_ARRAY_BIT);
    saved_active_texture = ctx->Texture.CurrentUnit;
 
    /* Disable existing GL state we don't want to apply to a clear. */
@@ -114,6 +157,7 @@ intel_clear_tris(GLcontext *ctx, GLbitfield mask)
    _mesa_Disable(GL_CLIP_PLANE3);
    _mesa_Disable(GL_CLIP_PLANE4);
    _mesa_Disable(GL_CLIP_PLANE5);
+   _mesa_PolygonMode(GL_FRONT_AND_BACK, GL_FILL);
    if (ctx->Extensions.ARB_fragment_program && ctx->FragmentProgram.Enabled) {
       saved_fp_enable = GL_TRUE;
       _mesa_Disable(GL_FRAGMENT_PROGRAM_ARB);
@@ -146,13 +190,14 @@ intel_clear_tris(GLcontext *ctx, GLbitfield mask)
       }
    }
 
+   /* save current array object, bind our private one */
+   _mesa_reference_array_object(ctx, &arraySave, ctx->Array.ArrayObj);
+   _mesa_reference_array_object(ctx, &ctx->Array.ArrayObj, intel->clear.arrayObj);
+
    intel_meta_set_passthrough_transform(intel);
 
    for (i = 0; i < 4; i++) {
-      color[i][0] = ctx->Color.ClearColor[0];
-      color[i][1] = ctx->Color.ClearColor[1];
-      color[i][2] = ctx->Color.ClearColor[2];
-      color[i][3] = ctx->Color.ClearColor[3];
+      COPY_4FV(intel->clear.color[i], ctx->Color.ClearColor);
    }
 
    /* convert clear Z from [0,1] to NDC coord in [-1,1] */
@@ -161,23 +206,18 @@ intel_clear_tris(GLcontext *ctx, GLbitfield mask)
    /* Prepare the vertices, which are the same regardless of which buffer we're
     * drawing to.
     */
-   vertices[0][0] = fb->_Xmin;
-   vertices[0][1] = fb->_Ymin;
-   vertices[0][2] = dst_z;
-   vertices[1][0] = fb->_Xmax;
-   vertices[1][1] = fb->_Ymin;
-   vertices[1][2] = dst_z;
-   vertices[2][0] = fb->_Xmax;
-   vertices[2][1] = fb->_Ymax;
-   vertices[2][2] = dst_z;
-   vertices[3][0] = fb->_Xmin;
-   vertices[3][1] = fb->_Ymax;
-   vertices[3][2] = dst_z;
-
-   _mesa_ColorPointer(4, GL_FLOAT, 4 * sizeof(GLfloat), &color);
-   _mesa_VertexPointer(3, GL_FLOAT, 3 * sizeof(GLfloat), &vertices);
-   _mesa_Enable(GL_COLOR_ARRAY);
-   _mesa_Enable(GL_VERTEX_ARRAY);
+   intel->clear.vertices[0][0] = fb->_Xmin;
+   intel->clear.vertices[0][1] = fb->_Ymin;
+   intel->clear.vertices[0][2] = dst_z;
+   intel->clear.vertices[1][0] = fb->_Xmax;
+   intel->clear.vertices[1][1] = fb->_Ymin;
+   intel->clear.vertices[1][2] = dst_z;
+   intel->clear.vertices[2][0] = fb->_Xmax;
+   intel->clear.vertices[2][1] = fb->_Ymax;
+   intel->clear.vertices[2][2] = dst_z;
+   intel->clear.vertices[3][0] = fb->_Xmin;
+   intel->clear.vertices[3][1] = fb->_Ymax;
+   intel->clear.vertices[3][2] = dst_z;
 
    while (mask != 0) {
       GLuint this_mask = 0;
@@ -215,14 +255,16 @@ intel_clear_tris(GLcontext *ctx, GLbitfield mask)
       /* Control writing of the stencil clear value to stencil. */
       if (this_mask & BUFFER_BIT_STENCIL) {
 	 _mesa_Enable(GL_STENCIL_TEST);
-	 _mesa_StencilOp(GL_REPLACE, GL_REPLACE, GL_REPLACE);
-	 _mesa_StencilFuncSeparate(GL_FRONT, GL_ALWAYS, ctx->Stencil.Clear,
+	 _mesa_StencilOpSeparate(GL_FRONT_AND_BACK,
+				 GL_REPLACE, GL_REPLACE, GL_REPLACE);
+	 _mesa_StencilFuncSeparate(GL_FRONT_AND_BACK, GL_ALWAYS,
+				   ctx->Stencil.Clear,
 				   ctx->Stencil.WriteMask[0]);
       } else {
 	 _mesa_Disable(GL_STENCIL_TEST);
       }
 
-      CALL_DrawArrays(ctx->Exec, (GL_TRIANGLE_FAN, 0, 4));
+      _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
 
       mask &= ~this_mask;
    }
@@ -238,8 +280,11 @@ intel_clear_tris(GLcontext *ctx, GLbitfield mask)
    if (saved_shader_program)
       _mesa_UseProgramObjectARB(saved_shader_program);
 
-   _mesa_PopClientAttrib();
    _mesa_PopAttrib();
+
+   /* restore current array object */
+   _mesa_reference_array_object(ctx, &ctx->Array.ArrayObj, arraySave);
+   _mesa_reference_array_object(ctx, &arraySave, NULL);
 }
 
 static const char *buffer_names[] = {
@@ -247,13 +292,10 @@ static const char *buffer_names[] = {
    [BUFFER_BACK_LEFT] = "back",
    [BUFFER_FRONT_RIGHT] = "front right",
    [BUFFER_BACK_RIGHT] = "back right",
-   [BUFFER_AUX0] = "aux0",
-   [BUFFER_AUX1] = "aux1",
-   [BUFFER_AUX2] = "aux2",
-   [BUFFER_AUX3] = "aux3",
    [BUFFER_DEPTH] = "depth",
    [BUFFER_STENCIL] = "stencil",
    [BUFFER_ACCUM] = "accum",
+   [BUFFER_AUX0] = "aux0",
    [BUFFER_COLOR0] = "color0",
    [BUFFER_COLOR1] = "color1",
    [BUFFER_COLOR2] = "color2",
diff --git a/shared/intel_context.c b/shared/intel_context.c
index 2e76e93..cfd983d 100644
--- a/shared/intel_context.c
+++ b/shared/intel_context.c
@@ -28,8 +28,7 @@
 
 #include "main/glheader.h"
 #include "main/context.h"
-#include "main/matrix.h"
-#include "main/simple_list.h"
+#include "main/arrayobj.h"
 #include "main/extensions.h"
 #include "main/framebuffer.h"
 #include "main/imports.h"
@@ -38,68 +37,42 @@
 #include "swrast/swrast.h"
 #include "swrast_setup/swrast_setup.h"
 #include "tnl/tnl.h"
-
-#include "tnl/t_pipeline.h"
-#include "tnl/t_vertex.h"
-
 #include "drivers/common/driverfuncs.h"
 
-#include "intel_screen.h"
-
 #include "i830_dri.h"
 
 #include "intel_chipset.h"
 #include "intel_buffers.h"
 #include "intel_tex.h"
 #include "intel_batchbuffer.h"
-#include "intel_blit.h"
 #include "intel_clear.h"
+#include "intel_extensions.h"
 #include "intel_pixel.h"
 #include "intel_regions.h"
 #include "intel_buffer_objects.h"
 #include "intel_fbo.h"
 #include "intel_decode.h"
 #include "intel_bufmgr.h"
+#include "intel_screen.h"
 #include "intel_swapbuffers.h"
 
 #include "drirenderbuffer.h"
 #include "vblank.h"
 #include "utils.h"
 #include "xmlpool.h"            /* for symbolic values of enum-type options */
+
+
 #ifndef INTEL_DEBUG
 int INTEL_DEBUG = (0);
 #endif
 
-#define need_GL_ARB_multisample
-#define need_GL_ARB_occlusion_query
-#define need_GL_ARB_point_parameters
-#define need_GL_ARB_shader_objects
-#define need_GL_ARB_texture_compression
-#define need_GL_ARB_vertex_buffer_object
-#define need_GL_ARB_vertex_program
-#define need_GL_ARB_vertex_shader
-#define need_GL_ARB_window_pos
-#define need_GL_EXT_blend_color
-#define need_GL_EXT_blend_equation_separate
-#define need_GL_EXT_blend_func_separate
-#define need_GL_EXT_blend_minmax
-#define need_GL_EXT_cull_vertex
-#define need_GL_EXT_fog_coord
-#define need_GL_EXT_framebuffer_object
-#define need_GL_EXT_multi_draw_arrays
-#define need_GL_EXT_point_parameters
-#define need_GL_EXT_secondary_color
-#define need_GL_ATI_separate_stencil
-#define need_GL_NV_point_sprite
-#define need_GL_NV_vertex_program
-#define need_GL_VERSION_2_0
-#define need_GL_VERSION_2_1
-
-#include "extension_helper.h"
-
-#define DRIVER_DATE                     "20090326 2009Q1 RC2"
+
+#define DRIVER_DATE                     "20090712 2009Q2 RC3"
 #define DRIVER_DATE_GEM                 "GEM " DRIVER_DATE
 
+
+static void intel_flush(GLcontext *ctx, GLboolean needs_mi_flush);
+
 static const GLubyte *
 intelGetString(GLcontext * ctx, GLenum name)
 {
@@ -203,6 +176,24 @@ intelGetString(GLcontext * ctx, GLenum name)
    }
 }
 
+static unsigned
+intel_bits_per_pixel(const struct intel_renderbuffer *rb)
+{
+   switch (rb->Base._ActualFormat) {
+   case GL_RGB5:
+   case GL_DEPTH_COMPONENT16:
+      return 16;
+   case GL_RGB8:
+   case GL_RGBA8:
+   case GL_DEPTH_COMPONENT24:
+   case GL_DEPTH24_STENCIL8_EXT:
+   case GL_STENCIL_INDEX8_EXT:
+      return 32;
+   default:
+      return 0;
+   }
+}
+
 void
 intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 {
@@ -210,7 +201,7 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
    struct intel_renderbuffer *rb;
    struct intel_region *region, *depth_region;
    struct intel_context *intel = context->driverPrivate;
-   __DRIbuffer *buffers;
+   __DRIbuffer *buffers = NULL;
    __DRIscreen *screen;
    int i, count;
    unsigned int attachments[10];
@@ -222,22 +213,63 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 
    screen = intel->intelScreen->driScrnPriv;
 
-   i = 0;
-   if (intel_fb->color_rb[0])
-      attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
-   if (intel_fb->color_rb[1])
-      attachments[i++] = __DRI_BUFFER_BACK_LEFT;
-   if (intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH))
-      attachments[i++] = __DRI_BUFFER_DEPTH;
-   if (intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL))
-      attachments[i++] = __DRI_BUFFER_STENCIL;
-
-   buffers = (*screen->dri2.loader->getBuffers)(drawable,
-						&drawable->w,
-						&drawable->h,
-						attachments, i,
-						&count,
-						drawable->loaderPrivate);
+   if (screen->dri2.loader
+       && (screen->dri2.loader->base.version > 2)
+       && (screen->dri2.loader->getBuffersWithFormat != NULL)) {
+      struct intel_renderbuffer *depth_rb;
+      struct intel_renderbuffer *stencil_rb;
+
+      i = 0;
+      if ((intel->is_front_buffer_rendering || !intel_fb->color_rb[1])
+	   && intel_fb->color_rb[0]) {
+	 attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
+	 attachments[i++] = intel_bits_per_pixel(intel_fb->color_rb[0]);
+      }
+
+      if (intel_fb->color_rb[1]) {
+	 attachments[i++] = __DRI_BUFFER_BACK_LEFT;
+	 attachments[i++] = intel_bits_per_pixel(intel_fb->color_rb[1]);
+      }
+
+      depth_rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
+      stencil_rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL);
+
+      if ((depth_rb != NULL) && (stencil_rb != NULL)) {
+	 attachments[i++] = __DRI_BUFFER_DEPTH_STENCIL;
+	 attachments[i++] = intel_bits_per_pixel(depth_rb);
+      } else if (depth_rb != NULL) {
+	 attachments[i++] = __DRI_BUFFER_DEPTH;
+	 attachments[i++] = intel_bits_per_pixel(depth_rb);
+      } else if (stencil_rb != NULL) {
+	 attachments[i++] = __DRI_BUFFER_STENCIL;
+	 attachments[i++] = intel_bits_per_pixel(stencil_rb);
+      }
+
+      buffers =
+	 (*screen->dri2.loader->getBuffersWithFormat)(drawable,
+						      &drawable->w,
+						      &drawable->h,
+						      attachments, i / 2,
+						      &count,
+						      drawable->loaderPrivate);
+   } else if (screen->dri2.loader) {
+      i = 0;
+      if (intel_fb->color_rb[0])
+	 attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
+      if (intel_fb->color_rb[1])
+	 attachments[i++] = __DRI_BUFFER_BACK_LEFT;
+      if (intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH))
+	 attachments[i++] = __DRI_BUFFER_DEPTH;
+      if (intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL))
+	 attachments[i++] = __DRI_BUFFER_STENCIL;
+
+      buffers = (*screen->dri2.loader->getBuffers)(drawable,
+						   &drawable->w,
+						   &drawable->h,
+						   attachments, i,
+						   &count,
+						   drawable->loaderPrivate);
+   }
 
    if (buffers == NULL)
       return;
@@ -265,6 +297,11 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 	   region_name = "dri2 front buffer";
 	   break;
 
+       case __DRI_BUFFER_FAKE_FRONT_LEFT:
+	   rb = intel_fb->color_rb[0];
+	   region_name = "dri2 fake front buffer";
+	   break;
+
        case __DRI_BUFFER_BACK_LEFT:
 	   rb = intel_fb->color_rb[1];
 	   region_name = "dri2 back buffer";
@@ -275,6 +312,11 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 	   region_name = "dri2 depth buffer";
 	   break;
 
+       case __DRI_BUFFER_DEPTH_STENCIL:
+	   rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
+	   region_name = "dri2 depth / stencil buffer";
+	   break;
+
        case __DRI_BUFFER_STENCIL:
 	   rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL);
 	   region_name = "dri2 stencil buffer";
@@ -321,6 +363,23 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 
        intel_renderbuffer_set_region(rb, region);
        intel_region_release(&region);
+
+       if (buffers[i].attachment == __DRI_BUFFER_DEPTH_STENCIL) {
+	  rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL);
+	  if (rb != NULL) {
+	     struct intel_region *stencil_region = NULL;
+
+	     if (rb->region) {
+		dri_bo_flink(rb->region->buffer, &name);
+		if (name == buffers[i].name)
+		   continue;
+	     }
+
+	     intel_region_reference(&stencil_region, region);
+	     intel_renderbuffer_set_region(rb, stencil_region);
+	     intel_region_release(&stencil_region);
+	  }
+       }
    }
 
    driUpdateFramebufferSize(&intel->ctx, drawable);
@@ -337,9 +396,20 @@ intel_viewport(GLcontext *ctx, GLint x, GLint y, GLsizei w, GLsizei h)
     if (!driContext->driScreenPriv->dri2.enabled)
 	return;
 
-    intel_update_renderbuffers(driContext, driContext->driDrawablePriv);
-    if (driContext->driDrawablePriv != driContext->driReadablePriv)
-	intel_update_renderbuffers(driContext, driContext->driReadablePriv);
+    if (!intel->internal_viewport_call && ctx->DrawBuffer->Name == 0) {
+       /* If we're rendering to the fake front buffer, make sure all the pending
+	* drawing has landed on the real front buffer.  Otherwise when we
+	* eventually get to DRI2GetBuffersWithFormat the stale real front
+	* buffer contents will get copied to the new fake front buffer.
+	*/
+       if (intel->is_front_buffer_rendering) {
+	  intel_flush(ctx, GL_FALSE);
+       }
+
+       intel_update_renderbuffers(driContext, driContext->driDrawablePriv);
+       if (driContext->driDrawablePriv != driContext->driReadablePriv)
+	  intel_update_renderbuffers(driContext, driContext->driReadablePriv);
+    }
 
     old_viewport = ctx->Driver.Viewport;
     ctx->Driver.Viewport = NULL;
@@ -349,112 +419,6 @@ intel_viewport(GLcontext *ctx, GLint x, GLint y, GLsizei w, GLsizei h)
     ctx->Driver.Viewport = old_viewport;
 }
 
-/**
- * Extension strings exported by the intel driver.
- *
- * Extensions supported by all chips supported by i830_dri, i915_dri, or
- * i965_dri.
- */
-static const struct dri_extension card_extensions[] = {
-   { "GL_ARB_multisample",                GL_ARB_multisample_functions },
-   { "GL_ARB_multitexture",               NULL },
-   { "GL_ARB_point_parameters",           GL_ARB_point_parameters_functions },
-   { "GL_ARB_texture_border_clamp",       NULL },
-   { "GL_ARB_texture_compression",        GL_ARB_texture_compression_functions },
-   { "GL_ARB_texture_cube_map",           NULL },
-   { "GL_ARB_texture_env_add",            NULL },
-   { "GL_ARB_texture_env_combine",        NULL },
-   { "GL_ARB_texture_env_crossbar",       NULL },
-   { "GL_ARB_texture_env_dot3",           NULL },
-   { "GL_ARB_texture_mirrored_repeat",    NULL },
-   { "GL_ARB_texture_rectangle",          NULL },
-   { "GL_ARB_vertex_buffer_object",       GL_ARB_vertex_buffer_object_functions },
-   { "GL_ARB_vertex_program",             GL_ARB_vertex_program_functions },
-   { "GL_ARB_window_pos",                 GL_ARB_window_pos_functions },
-   { "GL_EXT_blend_color",                GL_EXT_blend_color_functions },
-   { "GL_EXT_blend_equation_separate",    GL_EXT_blend_equation_separate_functions },
-   { "GL_EXT_blend_func_separate",        GL_EXT_blend_func_separate_functions },
-   { "GL_EXT_blend_minmax",               GL_EXT_blend_minmax_functions },
-   { "GL_EXT_blend_logic_op",             NULL },
-   { "GL_EXT_blend_subtract",             NULL },
-   { "GL_EXT_cull_vertex",                GL_EXT_cull_vertex_functions },
-   { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
-   { "GL_EXT_multi_draw_arrays",          GL_EXT_multi_draw_arrays_functions },
-   { "GL_EXT_packed_depth_stencil",       NULL },
-   { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
-   { "GL_EXT_stencil_wrap",               NULL },
-   { "GL_EXT_texture_edge_clamp",         NULL },
-   { "GL_EXT_texture_env_combine",        NULL },
-   { "GL_EXT_texture_env_dot3",           NULL },
-   { "GL_EXT_texture_filter_anisotropic", NULL },
-   { "GL_EXT_texture_lod_bias",           NULL },
-   { "GL_3DFX_texture_compression_FXT1",  NULL },
-   { "GL_APPLE_client_storage",           NULL },
-   { "GL_MESA_pack_invert",               NULL },
-   { "GL_MESA_ycbcr_texture",             NULL },
-   { "GL_NV_blend_square",                NULL },
-   { "GL_NV_point_sprite",                GL_NV_point_sprite_functions },
-   { "GL_NV_vertex_program",              GL_NV_vertex_program_functions },
-   { "GL_NV_vertex_program1_1",           NULL },
-   { "GL_SGIS_generate_mipmap",           NULL },
-   { NULL, NULL }
-};
-
-static const struct dri_extension brw_extensions[] = {
-   { "GL_ARB_depth_texture",              NULL },
-   { "GL_ARB_draw_buffers",               NULL },
-   { "GL_ARB_fragment_program",           NULL },
-   { "GL_ARB_fragment_program_shadow",    NULL },
-   { "GL_ARB_fragment_shader",            NULL },
-   { "GL_ARB_occlusion_query",            GL_ARB_occlusion_query_functions },
-   { "GL_ARB_point_sprite", 		  NULL },
-   { "GL_ARB_shader_objects",             GL_ARB_shader_objects_functions },
-   { "GL_ARB_shading_language_100",       GL_VERSION_2_0_functions },
-#if 0
-   /* Support for GLSL 1.20 is currently broken in core Mesa.
-    */
-   { "GL_ARB_shading_language_120",       GL_VERSION_2_1_functions },
-#endif
-   { "GL_ARB_shadow",                     NULL },
-   { "GL_ARB_texture_non_power_of_two",   NULL },
-   { "GL_ARB_vertex_shader",              GL_ARB_vertex_shader_functions },
-   { "GL_EXT_shadow_funcs",               NULL },
-   { "GL_EXT_texture_sRGB",		  NULL },
-   { "GL_ATI_separate_stencil",           GL_ATI_separate_stencil_functions },
-   { "GL_ATI_texture_env_combine3",       NULL },
-   { NULL,                                NULL }
-};
-
-static const struct dri_extension arb_oq_extensions[] = {
-   { NULL, NULL }
-};
-
-static const struct dri_extension ttm_extensions[] = {
-   { "GL_ARB_pixel_buffer_object",        NULL },
-   { "GL_EXT_framebuffer_object",         GL_EXT_framebuffer_object_functions },
-   { NULL, NULL }
-};
-
-/**
- * Initializes potential list of extensions if ctx == NULL, or actually enables
- * extensions for a context.
- */
-void intelInitExtensions(GLcontext *ctx, GLboolean enable_imaging)
-{
-   struct intel_context *intel = ctx?intel_context(ctx):NULL;
-
-   /* Disable imaging extension until convolution is working in teximage paths.
-    */
-   enable_imaging = GL_FALSE;
-
-   driInitExtensions(ctx, card_extensions, enable_imaging);
-
-   if (intel == NULL || intel->ttm)
-      driInitExtensions(ctx, ttm_extensions, GL_FALSE);
-
-   if (intel == NULL || IS_965(intel->intelScreen->deviceID))
-      driInitExtensions(ctx, brw_extensions, GL_FALSE);
-}
 
 static const struct dri_debug_control debug_control[] = {
    { "tex",   DEBUG_TEXTURE},
@@ -525,6 +489,27 @@ intel_flush(GLcontext *ctx, GLboolean needs_mi_flush)
 
    if (intel->batch->map != intel->batch->ptr)
       intel_batchbuffer_flush(intel->batch);
+
+   if ((ctx->DrawBuffer->Name == 0) && intel->front_buffer_dirty) {
+      __DRIscreen *const screen = intel->intelScreen->driScrnPriv;
+
+      if (screen->dri2.loader &&
+          (screen->dri2.loader->base.version >= 2)
+	  && (screen->dri2.loader->flushFrontBuffer != NULL)) {
+	 (*screen->dri2.loader->flushFrontBuffer)(intel->driDrawable,
+						  intel->driDrawable->loaderPrivate);
+
+	 /* Only clear the dirty bit if front-buffer rendering is no longer
+	  * enabled.  This is done so that the dirty bit can only be set in
+	  * glDrawBuffer.  Otherwise the dirty bit would have to be set at
+	  * each of N places that do rendering.  This has worse performance,
+	  * but it is much easier to get correct.
+	  */
+	 if (intel->is_front_buffer_rendering) {
+	    intel->front_buffer_dirty = GL_FALSE;
+	 }
+      }
+   }
 }
 
 void
@@ -533,7 +518,7 @@ intelFlush(GLcontext * ctx)
    intel_flush(ctx, GL_FALSE);
 }
 
-void
+static void
 intel_glFlush(GLcontext *ctx)
 {
    intel_flush(ctx, GL_TRUE);
@@ -552,7 +537,7 @@ intelFinish(GLcontext * ctx)
 
        irb = intel_renderbuffer(fb->_ColorDrawBuffers[i]);
 
-       if (irb->region)
+       if (irb && irb->region)
 	  dri_bo_wait_rendering(irb->region->buffer);
    }
    if (fb->_DepthBuffer) {
@@ -636,8 +621,6 @@ intelInitContext(struct intel_context *intel,
       }
    }
 
-   ctx->Const.MaxTextureMaxAnisotropy = 2.0;
-
    /* This doesn't yet catch all non-conformant rendering, but it's a
     * start.
     */
@@ -719,8 +702,6 @@ intelInitContext(struct intel_context *intel,
 
    intel->do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
 
-   _math_matrix_ctr(&intel->ViewportMatrix);
-
    if (IS_965(intelScreen->deviceID) && !intel->intelScreen->irq_active) {
       _mesa_printf("IRQs not active.  Exiting\n");
       exit(1);
@@ -756,6 +737,16 @@ intelInitContext(struct intel_context *intel,
       intel->no_rast = 1;
    }
 
+   if (driQueryOptionb(&intel->optionCache, "always_flush_batch")) {
+      fprintf(stderr, "flushing batchbuffer before/after each draw call\n");
+      intel->always_flush_batch = 1;
+   }
+
+   if (driQueryOptionb(&intel->optionCache, "always_flush_cache")) {
+      fprintf(stderr, "flushing GPU caches before/after each draw call\n");
+      intel->always_flush_cache = 1;
+   }
+
    /* Disable all hardware rendering (skip emitting batches and fences/waits
     * to the kernel)
     */
@@ -776,6 +767,9 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
 
       INTEL_FIREVERTICES(intel);
 
+      if (intel->clear.arrayObj)
+         _mesa_delete_array_object(&intel->ctx, intel->clear.arrayObj);
+
       intel->vtbl.destroy(intel);
 
       release_texture_heaps = (intel->ctx.Shared->RefCount == 1);
diff --git a/shared/intel_context.h b/shared/intel_context.h
index 8a8e59f..b5cf7e6 100644
--- a/shared/intel_context.h
+++ b/shared/intel_context.h
@@ -48,6 +48,8 @@
 #define DV_PF_555  (1<<8)
 #define DV_PF_565  (2<<8)
 #define DV_PF_8888 (3<<8)
+#define DV_PF_4444 (8<<8)
+#define DV_PF_1555 (9<<8)
 
 struct intel_region;
 struct intel_context;
@@ -100,7 +102,6 @@ struct intel_context
 			       GLuint num_regions);
 
       GLuint (*flush_cmd) (void);
-      void (*emit_flush) (struct intel_context *intel, GLuint unused);
 
       void (*reduced_primitive_state) (struct intel_context * intel,
                                        GLenum rprim);
@@ -181,6 +182,7 @@ struct intel_context
    struct intel_region *front_region;
    struct intel_region *back_region;
    struct intel_region *depth_region;
+   GLboolean internal_viewport_call;
 
    /**
     * This value indicates that the kernel memory manager is being used
@@ -213,6 +215,14 @@ struct intel_context
    GLuint ClearColor565;
    GLuint ClearColor8888;
 
+   /* info for intel_clear_tris() */
+   struct
+   {
+      struct gl_array_object *arrayObj;
+      GLfloat vertices[4][3];
+      GLfloat color[4][4];
+   } clear;
+
    /* Offsets of fields within the current vertex:
     */
    GLuint coloroffset;
@@ -229,6 +239,8 @@ struct intel_context
    GLboolean hw_stipple;
    GLboolean depth_buffer_is_float;
    GLboolean no_rast;
+   GLboolean always_flush_batch;
+   GLboolean always_flush_cache;
 
    /* 0 - nonconformant, best performance;
     * 1 - fallback to sw for known conformance bugs
@@ -260,11 +272,29 @@ struct intel_context
     * flush time while the lock is held.
     */
    GLboolean constant_cliprect;
+
    /**
     * In !constant_cliprect mode, set to true if the front cliprects should be
     * used instead of back.
     */
    GLboolean front_cliprects;
+
+   /**
+    * Set if rendering has occurred to the drawable's front buffer.
+    *
+    * This is used in the DRI2 case to detect that glFlush should also copy
+    * the contents of the fake front buffer to the real front buffer.
+    */
+   GLboolean front_buffer_dirty;
+
+   /**
+    * Track whether front-buffer rendering is currently enabled
+    *
+    * A separate flag is used to track this in order to support MRT more
+    * easily.
+    */
+   GLboolean is_front_buffer_rendering;
+
    drm_clip_rect_t fboRect;     /**< cliprect for FBO rendering */
 
    int perf_boxes;
@@ -317,6 +347,7 @@ extern char *__progname;
 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
 #define ALIGN(value, alignment)  ((value + alignment - 1) & ~(alignment - 1))
+#define IS_POWER_OF_TWO(val) (((val) & ((val) - 1)) == 0) /* NB: true for 0 */
 
 #define INTEL_FIREVERTICES(intel)		\
 do {						\
@@ -440,10 +471,8 @@ extern void intelGetLock(struct intel_context *intel, GLuint flags);
 
 extern void intelFinish(GLcontext * ctx);
 extern void intelFlush(GLcontext * ctx);
-extern void intel_glFlush(GLcontext *ctx);
 
 extern void intelInitDriverFunctions(struct dd_function_table *functions);
-extern void intelInitExtensions(GLcontext *ctx, GLboolean enable_imaging);
 
 
 /* ================================================================
@@ -530,4 +559,10 @@ intel_context(GLcontext * ctx)
    return (struct intel_context *) ctx;
 }
 
+static INLINE GLboolean
+is_power_of_two(uint32_t value)
+{
+   return (value & (value - 1)) == 0; /* at most one bit set; 0 passes too */
+}
+
 #endif
diff --git a/shared/intel_decode.c b/shared/intel_decode.c
index 136221c..a9dfe28 100644
--- a/shared/intel_decode.c
+++ b/shared/intel_decode.c
@@ -800,6 +800,7 @@ static int
 decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i830)
 {
     unsigned int len, i, c, opcode, word, map, sampler, instr;
+    char *format;
 
     struct {
 	uint32_t opcode;
@@ -1001,6 +1002,35 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i
 	    (*failures)++;
 	}
 	return len;
+    case 0x85:
+	len = (data[0] & 0x0000000f) + 2;
+
+	if (len != 2)
+	    fprintf(out, "Bad count in 3DSTATE_DEST_BUFFER_VARIABLES\n");
+	if (count < 2)
+	    BUFFER_FAIL(count, len, "3DSTATE_DEST_BUFFER_VARIABLES");
+
+	instr_out(data, hw_offset, 0,
+		  "3DSTATE_DEST_BUFFER_VARIABLES\n");
+
+	switch ((data[1] >> 8) & 0xf) {
+	case 0x0: format = "g8"; break;
+	case 0x1: format = "x1r5g5b5"; break;
+	case 0x2: format = "r5g6b5"; break;
+	case 0x3: format = "a8r8g8b8"; break;
+	case 0x4: format = "ycrcb_swapy"; break;
+	case 0x5: format = "ycrcb_normal"; break;
+	case 0x6: format = "ycrcb_swapuv"; break;
+	case 0x7: format = "ycrcb_swapuvy"; break;
+	case 0x8: format = "a4r4g4b4"; break;
+	case 0x9: format = "a1r5g5b5"; break;
+	case 0xa: format = "a2r10g10b10"; break;
+	default: format = "BAD"; break;
+	}
+	instr_out(data, hw_offset, 1, "%s format, early Z %sabled\n",
+		  format,
+		  (data[1] & (1 << 31)) ? "en" : "dis");
+	return len;
     }
 
     for (opcode = 0; opcode < sizeof(opcodes_3d_1d) / sizeof(opcodes_3d_1d[0]);
@@ -1513,7 +1543,7 @@ decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 
 	for (i = 1; i < len;) {
 	    instr_out(data, hw_offset, i, "buffer %d: %svalid, type 0x%04x, "
-		      "src offset 0x%04xd bytes\n",
+		      "src offset 0x%04x bytes\n",
 		      data[i] >> 27,
 		      data[i] & (1 << 26) ? "" : "in",
 		      (data[i] >> 16) & 0x1ff,
@@ -1595,7 +1625,7 @@ decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 		  "3DPRIMITIVE: %s %s\n",
 		  get_965_prim_type(data[0]),
 		  (data[0] & (1 << 15)) ? "random" : "sequential");
-	instr_out(data, hw_offset, 1, "primitive count\n");
+	instr_out(data, hw_offset, 1, "vertex count\n");
 	instr_out(data, hw_offset, 2, "start vertex\n");
 	instr_out(data, hw_offset, 3, "instance count\n");
 	instr_out(data, hw_offset, 4, "start instance\n");
diff --git a/shared/intel_depthstencil.c b/shared/intel_depthstencil.c
deleted file mode 100644
index 354b3bf..0000000
--- a/shared/intel_depthstencil.c
+++ /dev/null
@@ -1,261 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/context.h"
-#include "main/depthstencil.h"
-#include "main/fbobject.h"
-#include "main/framebuffer.h"
-#include "main/hash.h"
-#include "main/mtypes.h"
-#include "main/renderbuffer.h"
-
-#include "intel_context.h"
-#include "intel_fbo.h"
-#include "intel_depthstencil.h"
-#include "intel_regions.h"
-#include "intel_span.h"
-
-/**
- * The GL_EXT_framebuffer_object allows the user to create their own
- * framebuffer objects consisting of color renderbuffers (0 or more),
- * depth renderbuffers (0 or 1) and stencil renderbuffers (0 or 1).
- *
- * The spec considers depth and stencil renderbuffers to be totally independent
- * buffers.  In reality, most graphics hardware today uses a combined
- * depth+stencil buffer (one 32-bit pixel = 24 bits of Z + 8 bits of stencil).
- *
- * This causes difficulty because the user may create some number of depth
- * renderbuffers and some number of stencil renderbuffers and bind them
- * together in framebuffers in any combination.
- *
- * This code manages all that.
- *
- * 1. Depth renderbuffers are always allocated in hardware as 32bpp
- *    GL_DEPTH24_STENCIL8 buffers.
- *
- * 2. Stencil renderbuffers are initially allocated in software as 8bpp
- *    GL_STENCIL_INDEX8 buffers.
- *
- * 3. Depth and Stencil renderbuffers use the PairedStencil and PairedDepth
- *    fields (respectively) to indicate if the buffer's currently paired
- *    with another stencil or depth buffer (respectively).
- *
- * 4. When a depth and stencil buffer are initially both attached to the
- *    current framebuffer, we merge the stencil buffer values into the
- *    depth buffer (really a depth+stencil buffer).  The then hardware uses
- *    the combined buffer.
- *
- * 5. Whenever a depth or stencil buffer is reallocated (with
- *    glRenderbufferStorage) we undo the pairing and copy the stencil values
- *    from the combined depth/stencil buffer back to the stencil-only buffer.
- *
- * 6. We also undo the pairing when we find a change in buffer bindings.
- *
- * 7. If a framebuffer is only using a depth renderbuffer (no stencil), we
- *    just use the combined depth/stencil buffer and ignore the stencil values.
- *
- * 8. If a framebuffer is only using a stencil renderbuffer (no depth) we have
- *    to promote the 8bpp software stencil buffer to a 32bpp hardware
- *    depth+stencil buffer.
- *
- */
-
-/**
- * Undo the pairing/interleaving between depth and stencil buffers.
- * irb should be a depth/stencil or stencil renderbuffer.
- */
-void
-intel_unpair_depth_stencil(GLcontext *ctx, struct intel_renderbuffer *irb)
-{
-   struct intel_context *intel = intel_context(ctx);
-   struct gl_renderbuffer *rb = &irb->Base;
-
-   if (irb->PairedStencil) {
-      /* irb is a depth/stencil buffer */
-      struct gl_renderbuffer *stencilRb;
-      struct intel_renderbuffer *stencilIrb;
-
-      ASSERT(rb->_ActualFormat == GL_DEPTH24_STENCIL8_EXT);
-
-      stencilRb = _mesa_lookup_renderbuffer(ctx, irb->PairedStencil);
-      stencilIrb = intel_renderbuffer(stencilRb);
-      if (stencilIrb) {
-         /* need to extract stencil values from the depth buffer */
-	 ASSERT(stencilIrb->PairedDepth == rb->Name);
-	 intel_renderbuffer_map(intel, rb);
-	 intel_renderbuffer_map(intel, stencilRb);
-#if 0
-         /* disable for now */
-	 _mesa_extract_stencil(ctx, rb, stencilRb);
-#endif
-	 intel_renderbuffer_unmap(intel, stencilRb);
-	 intel_renderbuffer_unmap(intel, rb);
-         stencilIrb->PairedDepth = 0;
-      }
-      irb->PairedStencil = 0;
-   }
-   else if (irb->PairedDepth) {
-      /* irb is a stencil buffer */
-      struct gl_renderbuffer *depthRb;
-      struct intel_renderbuffer *depthIrb;
-
-      ASSERT(rb->_ActualFormat == GL_STENCIL_INDEX8_EXT ||
-             rb->_ActualFormat == GL_DEPTH24_STENCIL8_EXT);
-
-      depthRb = _mesa_lookup_renderbuffer(ctx, irb->PairedDepth);
-      depthIrb = intel_renderbuffer(depthRb);
-      if (depthIrb) {
-         /* need to extract stencil values from the depth buffer */
-	 ASSERT(depthIrb->PairedStencil == rb->Name);
-	 intel_renderbuffer_map(intel, rb);
-	 intel_renderbuffer_map(intel, depthRb);
-#if 0
-         /* disable for now */
-	 _mesa_extract_stencil(ctx, depthRb, rb);
-#endif
-	 intel_renderbuffer_unmap(intel, depthRb);
-	 intel_renderbuffer_unmap(intel, rb);
-         depthIrb->PairedStencil = 0;
-      }
-      irb->PairedDepth = 0;
-   }
-   else {
-      _mesa_problem(ctx, "Problem in undo_depth_stencil_pairing");
-   }
-
-   ASSERT(irb->PairedStencil == 0);
-   ASSERT(irb->PairedDepth == 0);
-}
-
-
-/**
- * Examine the depth and stencil renderbuffers which are attached to the
- * framebuffer.  If both depth and stencil are attached, make sure that the
- * renderbuffers are 'paired' (combined).  If only depth or only stencil is
- * attached, undo any previous pairing.
- *
- * Must be called if NewState & _NEW_BUFFER (when renderbuffer attachments
- * change, for example).
- */
-void
-intel_validate_paired_depth_stencil(GLcontext * ctx,
-                                    struct gl_framebuffer *fb)
-{
-   struct intel_context *intel = intel_context(ctx);
-   struct intel_renderbuffer *depthRb, *stencilRb;
-
-   depthRb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
-   stencilRb = intel_get_renderbuffer(fb, BUFFER_STENCIL);
-
-   if (depthRb && stencilRb) {
-      if (depthRb == stencilRb) {
-         /* Using a user-created combined depth/stencil buffer.
-          * Nothing to do.
-          */
-         ASSERT(depthRb->Base._BaseFormat == GL_DEPTH_STENCIL_EXT);
-         ASSERT(depthRb->Base._ActualFormat == GL_DEPTH24_STENCIL8_EXT);
-      }
-      else {
-         /* Separate depth/stencil buffers, need to interleave now */
-         ASSERT(depthRb->Base._BaseFormat == GL_DEPTH_COMPONENT ||
-                depthRb->Base._BaseFormat == GL_DEPTH_STENCIL);
-         ASSERT(stencilRb->Base._BaseFormat == GL_STENCIL_INDEX ||
-                stencilRb->Base._BaseFormat == GL_DEPTH_STENCIL);
-
-         /* may need to interleave depth/stencil now */
-         if (depthRb->PairedStencil == stencilRb->Base.Name) {
-            /* OK, the depth and stencil buffers are already interleaved */
-            ASSERT(stencilRb->PairedDepth == depthRb->Base.Name);
-         }
-         else {
-            /* need to setup new pairing/interleaving */
-            if (depthRb->PairedStencil) {
-               intel_unpair_depth_stencil(ctx, depthRb);
-            }
-            if (stencilRb->PairedDepth) {
-               intel_unpair_depth_stencil(ctx, stencilRb);
-            }
-
-            ASSERT(depthRb->Base._ActualFormat == GL_DEPTH24_STENCIL8_EXT);
-            ASSERT(stencilRb->Base._ActualFormat == GL_STENCIL_INDEX8_EXT ||
-                   stencilRb->Base._ActualFormat == GL_DEPTH24_STENCIL8_EXT);
-
-            /* establish new pairing: interleave stencil into depth buffer */
-	    intel_renderbuffer_map(intel, &depthRb->Base);
-	    intel_renderbuffer_map(intel, &stencilRb->Base);
-            _mesa_insert_stencil(ctx, &depthRb->Base, &stencilRb->Base);
-	    intel_renderbuffer_unmap(intel, &stencilRb->Base);
-	    intel_renderbuffer_unmap(intel, &depthRb->Base);
-            depthRb->PairedStencil = stencilRb->Base.Name;
-            stencilRb->PairedDepth = depthRb->Base.Name;
-         }
-
-      }
-   }
-   else if (depthRb) {
-      /* Depth buffer but no stencil buffer.
-       * We'll use a GL_DEPTH24_STENCIL8 buffer and ignore the stencil bits.
-       */
-      /* can't assert this until storage is allocated:
-         ASSERT(depthRb->Base._ActualFormat == GL_DEPTH24_STENCIL8_EXT);
-       */
-      /* intel_undo any previous pairing */
-      if (depthRb->PairedStencil) {
-         intel_unpair_depth_stencil(ctx, depthRb);
-      }
-   }
-   else if (stencilRb) {
-      /* Stencil buffer but no depth buffer.
-       * Since h/w doesn't typically support just 8bpp stencil w/out Z,
-       * we'll use a GL_DEPTH24_STENCIL8 buffer and ignore the depth bits.
-       */
-      /* undo any previous pairing */
-      if (stencilRb->PairedDepth) {
-         intel_unpair_depth_stencil(ctx, stencilRb);
-      }
-      if (stencilRb->Base._ActualFormat == GL_STENCIL_INDEX8_EXT) {
-         /* promote buffer to GL_DEPTH24_STENCIL8 for hw rendering */
-         _mesa_promote_stencil(ctx, &stencilRb->Base);
-         ASSERT(stencilRb->Base._ActualFormat == GL_DEPTH24_STENCIL8_EXT);
-      }
-   }
-
-   /* Finally, update the fb->_DepthBuffer and fb->_StencilBuffer fields */
-   _mesa_update_depth_buffer(ctx, fb, BUFFER_DEPTH);
-   if (depthRb && depthRb->PairedStencil)
-      _mesa_update_stencil_buffer(ctx, fb, BUFFER_DEPTH);
-   else
-      _mesa_update_stencil_buffer(ctx, fb, BUFFER_STENCIL);
-
-
-   /* The hardware should use fb->Attachment[BUFFER_DEPTH].Renderbuffer
-    * first, if present, then fb->Attachment[BUFFER_STENCIL].Renderbuffer
-    * if present.
-    */
-}
diff --git a/shared/intel_depthstencil.h b/shared/intel_depthstencil.h
deleted file mode 100644
index 740eb0d..0000000
--- a/shared/intel_depthstencil.h
+++ /dev/null
@@ -1,15 +0,0 @@
-
-#ifndef INTEL_DEPTH_STENCIL_H
-#define INTEL_DEPTH_STENCIL_H
-
-#include "intel_fbo.h"
-
-extern void
-intel_unpair_depth_stencil(GLcontext * ctx, struct intel_renderbuffer *irb);
-
-extern void
-intel_validate_paired_depth_stencil(GLcontext * ctx,
-                                    struct gl_framebuffer *fb);
-
-
-#endif /* INTEL_DEPTH_STENCIL_H */
diff --git a/shared/intel_depthtmp.h b/shared/intel_depthtmp.h
new file mode 100644
index 0000000..16d7708
--- /dev/null
+++ b/shared/intel_depthtmp.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright © 2009 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+/**
+ * Wrapper around the depthtmp.h macrofest to generate spans code for
+ * all the tiling styles.
+ */
+
+#define VALUE_TYPE INTEL_VALUE_TYPE
+#define WRITE_DEPTH(_x, _y, d) INTEL_WRITE_DEPTH(NO_TILE(_x, _y), d)
+#define READ_DEPTH(d, _x, _y) d = INTEL_READ_DEPTH(NO_TILE(_x, _y))
+#define TAG(x) INTEL_TAG(intel##x)
+#include "depthtmp.h"
+
+#define VALUE_TYPE INTEL_VALUE_TYPE
+#define WRITE_DEPTH(_x, _y, d) INTEL_WRITE_DEPTH(X_TILE(_x, _y), d)
+#define READ_DEPTH(d, _x, _y) d = INTEL_READ_DEPTH(X_TILE(_x, _y))
+#define TAG(x) INTEL_TAG(intel_XTile_##x)
+#include "depthtmp.h"
+
+#define VALUE_TYPE INTEL_VALUE_TYPE
+#define WRITE_DEPTH(_x, _y, d) INTEL_WRITE_DEPTH(Y_TILE(_x, _y), d)
+#define READ_DEPTH(d, _x, _y) d = INTEL_READ_DEPTH(Y_TILE(_x, _y))
+#define TAG(x) INTEL_TAG(intel_YTile_##x)
+#include "depthtmp.h"
+
+#undef INTEL_VALUE_TYPE
+#undef INTEL_WRITE_DEPTH
+#undef INTEL_READ_DEPTH
+#undef INTEL_TAG
diff --git a/shared/intel_extensions.c b/shared/intel_extensions.c
new file mode 100644
index 0000000..9ec1b4e
--- /dev/null
+++ b/shared/intel_extensions.c
@@ -0,0 +1,188 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "intel_chipset.h"
+#include "intel_context.h"
+#include "intel_extensions.h"
+
+
+#define need_GL_ARB_framebuffer_object
+#define need_GL_ARB_occlusion_query
+#define need_GL_ARB_point_parameters
+#define need_GL_ARB_shader_objects
+#define need_GL_ARB_vertex_program
+#define need_GL_ARB_vertex_shader
+#define need_GL_ARB_window_pos
+#define need_GL_EXT_blend_color
+#define need_GL_EXT_blend_equation_separate
+#define need_GL_EXT_blend_func_separate
+#define need_GL_EXT_blend_minmax
+#define need_GL_EXT_cull_vertex
+#define need_GL_EXT_fog_coord
+#define need_GL_EXT_framebuffer_object
+#define need_GL_EXT_framebuffer_blit
+#define need_GL_EXT_point_parameters
+#define need_GL_EXT_secondary_color
+#define need_GL_EXT_stencil_two_side
+#define need_GL_ATI_separate_stencil
+#define need_GL_ATI_envmap_bumpmap
+#define need_GL_NV_point_sprite
+#define need_GL_NV_vertex_program
+#define need_GL_VERSION_2_0
+#define need_GL_VERSION_2_1
+
+#include "extension_helper.h"
+
+
+/**
+ * Extension strings exported by the intel driver.
+ *
+ * Extensions supported by all chips supported by i830_dri, i915_dri, or
+ * i965_dri.
+ */
+static const struct dri_extension card_extensions[] = {
+   { "GL_ARB_multitexture",               NULL },
+   { "GL_ARB_point_parameters",           GL_ARB_point_parameters_functions },
+   { "GL_ARB_texture_border_clamp",       NULL },
+   { "GL_ARB_texture_cube_map",           NULL },
+   { "GL_ARB_texture_env_add",            NULL },
+   { "GL_ARB_texture_env_combine",        NULL },
+   { "GL_ARB_texture_env_crossbar",       NULL },
+   { "GL_ARB_texture_env_dot3",           NULL },
+   { "GL_ARB_texture_mirrored_repeat",    NULL },
+   { "GL_ARB_texture_rectangle",          NULL },
+   { "GL_ARB_vertex_program",             GL_ARB_vertex_program_functions },
+   { "GL_ARB_window_pos",                 GL_ARB_window_pos_functions },
+   { "GL_EXT_blend_color",                GL_EXT_blend_color_functions },
+   { "GL_EXT_blend_equation_separate",    GL_EXT_blend_equation_separate_functions },
+   { "GL_EXT_blend_func_separate",        GL_EXT_blend_func_separate_functions },
+   { "GL_EXT_blend_minmax",               GL_EXT_blend_minmax_functions },
+   { "GL_EXT_blend_logic_op",             NULL },
+   { "GL_EXT_blend_subtract",             NULL },
+   { "GL_EXT_cull_vertex",                GL_EXT_cull_vertex_functions },
+   { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
+   { "GL_EXT_packed_depth_stencil",       NULL },
+   { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
+   { "GL_EXT_stencil_wrap",               NULL },
+   { "GL_EXT_texture_edge_clamp",         NULL },
+   { "GL_EXT_texture_env_combine",        NULL },
+   { "GL_EXT_texture_env_dot3",           NULL },
+   { "GL_EXT_texture_filter_anisotropic", NULL },
+   { "GL_EXT_texture_lod_bias",           NULL },
+   { "GL_3DFX_texture_compression_FXT1",  NULL },
+   { "GL_APPLE_client_storage",           NULL },
+   { "GL_MESA_pack_invert",               NULL },
+   { "GL_MESA_ycbcr_texture",             NULL },
+   { "GL_NV_blend_square",                NULL },
+   { "GL_NV_point_sprite",                GL_NV_point_sprite_functions },
+   { "GL_NV_vertex_program",              GL_NV_vertex_program_functions },
+   { "GL_NV_vertex_program1_1",           NULL },
+   { "GL_SGIS_generate_mipmap",           NULL },
+   { NULL, NULL }
+};
+
+
+/** i915 / i945-only extensions */
+static const struct dri_extension i915_extensions[] = {
+   { "GL_ARB_depth_texture",              NULL },
+   { "GL_ARB_fragment_program",           NULL },
+   { "GL_ARB_shadow",                     NULL },
+   { "GL_ARB_texture_non_power_of_two",   NULL },
+   { "GL_ATI_texture_env_combine3",       NULL },
+   { "GL_EXT_shadow_funcs",               NULL },
+   { "GL_NV_texture_env_combine4",        NULL },
+   { NULL,                                NULL }
+};
+
+
+/** i965-only extensions */
+static const struct dri_extension brw_extensions[] = {
+   { "GL_ARB_depth_texture",              NULL },
+   { "GL_ARB_fragment_program",           NULL },
+   { "GL_ARB_fragment_program_shadow",    NULL },
+   { "GL_ARB_fragment_shader",            NULL },
+   { "GL_ARB_framebuffer_object",         GL_ARB_framebuffer_object_functions},
+   { "GL_ARB_occlusion_query",            GL_ARB_occlusion_query_functions },
+   { "GL_ARB_point_sprite", 		  NULL },
+   { "GL_ARB_shader_objects",             GL_ARB_shader_objects_functions },
+   { "GL_ARB_shading_language_100",       GL_VERSION_2_0_functions },
+   { "GL_ARB_shading_language_120",       GL_VERSION_2_1_functions },
+   { "GL_ARB_shadow",                     NULL },
+   { "GL_MESA_texture_signed_rgba",       NULL },
+   { "GL_ARB_texture_non_power_of_two",   NULL },
+   { "GL_ARB_vertex_shader",              GL_ARB_vertex_shader_functions },
+   { "GL_EXT_shadow_funcs",               NULL },
+   { "GL_EXT_stencil_two_side",           GL_EXT_stencil_two_side_functions },
+   { "GL_EXT_texture_sRGB",		  NULL },
+   { "GL_EXT_texture_swizzle",		  NULL },
+   { "GL_EXT_vertex_array_bgra",	  NULL },
+   { "GL_ATI_envmap_bumpmap",             GL_ATI_envmap_bumpmap_functions },
+   { "GL_ATI_separate_stencil",           GL_ATI_separate_stencil_functions },
+   { "GL_ATI_texture_env_combine3",       NULL },
+   { "GL_NV_texture_env_combine4",        NULL },
+   { NULL,                                NULL }
+};
+
+
+static const struct dri_extension arb_oq_extensions[] = {
+   { NULL, NULL }
+};
+
+
+static const struct dri_extension ttm_extensions[] = {
+   { "GL_ARB_pixel_buffer_object",      NULL },
+   { "GL_EXT_framebuffer_blit",         GL_EXT_framebuffer_blit_functions },
+   { "GL_EXT_framebuffer_object",       GL_EXT_framebuffer_object_functions },
+   { NULL, NULL }
+};
+
+
+/**
+ * Initializes potential list of extensions if ctx == NULL (all tables are
+ * used), or enables those matching the context's ttm flag and chipset.
+ */
+void
+intelInitExtensions(GLcontext *ctx, GLboolean enable_imaging)
+{
+   struct intel_context *intel = ctx?intel_context(ctx):NULL;
+
+   /* The caller's enable_imaging is ignored: imaging stays disabled until
+    * convolution is working in teximage paths. */
+   enable_imaging = GL_FALSE;
+
+   driInitExtensions(ctx, card_extensions, enable_imaging);
+
+   if (intel == NULL || intel->ttm)
+      driInitExtensions(ctx, ttm_extensions, GL_FALSE);
+
+   if (intel == NULL || IS_965(intel->intelScreen->deviceID))
+      driInitExtensions(ctx, brw_extensions, GL_FALSE);
+
+   if (intel == NULL || IS_915(intel->intelScreen->deviceID)
+       || IS_945(intel->intelScreen->deviceID))
+      driInitExtensions(ctx, i915_extensions, GL_FALSE);
+}
diff --git a/i915/i915_tex.c b/shared/intel_extensions.h
index e38d8fe..97147ec 100644
--- a/i915/i915_tex.c
+++ b/shared/intel_extensions.h
@@ -25,54 +25,12 @@
  * 
  **************************************************************************/
 
-#include "main/glheader.h"
-#include "main/mtypes.h"
-#include "main/imports.h"
-#include "main/simple_list.h"
-#include "main/enums.h"
-#include "main/image.h"
-#include "main/mm.h"
-#include "main/texstore.h"
-#include "main/texformat.h"
-#include "swrast/swrast.h"
+#ifndef INTEL_EXTENSIONS_H
+#define INTEL_EXTENSIONS_H
 
-#include "texmem.h"
 
-#include "i915_context.h"
-#include "i915_reg.h"
+extern void
+intelInitExtensions(GLcontext *ctx, GLboolean enable_imaging);
 
 
-
-static void
-i915TexEnv(GLcontext * ctx, GLenum target,
-           GLenum pname, const GLfloat * param)
-{
-   struct i915_context *i915 = I915_CONTEXT(ctx);
-
-   switch (pname) {
-   case GL_TEXTURE_LOD_BIAS:{
-         GLuint unit = ctx->Texture.CurrentUnit;
-         GLint b = (int) ((*param) * 16.0);
-         if (b > 255)
-            b = 255;
-         if (b < -256)
-            b = -256;
-         I915_STATECHANGE(i915, I915_UPLOAD_TEX(unit));
-         i915->lodbias_ss2[unit] =
-            ((b << SS2_LOD_BIAS_SHIFT) & SS2_LOD_BIAS_MASK);
-         break;
-      }
-
-   default:
-      break;
-   }
-}
-
-
-void
-i915InitTextureFuncs(struct dd_function_table *functions)
-{
-/*
-   functions->TexEnv = i915TexEnv;
-*/
-}
+#endif
diff --git a/shared/intel_fbo.c b/shared/intel_fbo.c
index 54f6038..30f58b1 100644
--- a/shared/intel_fbo.c
+++ b/shared/intel_fbo.c
@@ -27,6 +27,7 @@
 
 
 #include "main/imports.h"
+#include "main/macros.h"
 #include "main/mtypes.h"
 #include "main/fbobject.h"
 #include "main/framebuffer.h"
@@ -37,58 +38,13 @@
 
 #include "intel_context.h"
 #include "intel_buffers.h"
-#include "intel_depthstencil.h"
 #include "intel_fbo.h"
 #include "intel_mipmap_tree.h"
 #include "intel_regions.h"
-#include "intel_span.h"
 
 
 #define FILE_DEBUG_FLAG DEBUG_FBO
 
-#define INTEL_RB_CLASS 0x12345678
-
-
-/* XXX FBO: move this to intel_context.h (inlined) */
-/**
- * Return a gl_renderbuffer ptr casted to intel_renderbuffer.
- * NULL will be returned if the rb isn't really an intel_renderbuffer.
- * This is determiend by checking the ClassID.
- */
-struct intel_renderbuffer *
-intel_renderbuffer(struct gl_renderbuffer *rb)
-{
-   struct intel_renderbuffer *irb = (struct intel_renderbuffer *) rb;
-   if (irb && irb->Base.ClassID == INTEL_RB_CLASS) {
-      /*_mesa_warning(NULL, "Returning non-intel Rb\n");*/
-      return irb;
-   }
-   else
-      return NULL;
-}
-
-
-struct intel_renderbuffer *
-intel_get_renderbuffer(struct gl_framebuffer *fb, int attIndex)
-{
-   if (attIndex >= 0)
-      return intel_renderbuffer(fb->Attachment[attIndex].Renderbuffer);
-   else
-      return NULL;
-}
-
-struct intel_region *
-intel_get_rb_region(struct gl_framebuffer *fb, GLuint attIndex)
-{
-   struct intel_renderbuffer *irb = intel_get_renderbuffer(fb, attIndex);
-
-   if (irb)
-      return irb->region;
-   else
-      return NULL;
-}
-
-
 
 /**
  * Create a new framebuffer object.
@@ -103,6 +59,7 @@ intel_new_framebuffer(GLcontext * ctx, GLuint name)
 }
 
 
+/** Called by gl_renderbuffer::Delete() */
 static void
 intel_delete_renderbuffer(struct gl_renderbuffer *rb)
 {
@@ -112,10 +69,6 @@ intel_delete_renderbuffer(struct gl_renderbuffer *rb)
 
    ASSERT(irb);
 
-   if (irb->PairedStencil || irb->PairedDepth) {
-      intel_unpair_depth_stencil(ctx, irb);
-   }
-
    if (irb->span_cache != NULL)
       _mesa_free(irb->span_cache);
 
@@ -127,7 +80,6 @@ intel_delete_renderbuffer(struct gl_renderbuffer *rb)
 }
 
 
-
 /**
  * Return a pointer to a specific pixel in a renderbuffer.
  */
@@ -142,7 +94,6 @@ intel_get_pointer(GLcontext * ctx, struct gl_renderbuffer *rb,
 }
 
 
-
 /**
  * Called via glRenderbufferStorageEXT() to set the format and allocate
  * storage for a user-created renderbuffer.
@@ -168,6 +119,7 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       rb->RedBits = 5;
       rb->GreenBits = 6;
       rb->BlueBits = 5;
+      irb->texformat = &_mesa_texformat_rgb565;
       cpp = 2;
       break;
    case GL_RGB:
@@ -181,6 +133,7 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       rb->GreenBits = 8;
       rb->BlueBits = 8;
       rb->AlphaBits = 0;
+      irb->texformat = &_mesa_texformat_argb8888; /* XXX: Need xrgb8888 */
       cpp = 4;
       break;
    case GL_RGBA:
@@ -197,6 +150,7 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       rb->GreenBits = 8;
       rb->BlueBits = 8;
       rb->AlphaBits = 8;
+      irb->texformat = &_mesa_texformat_argb8888;
       cpp = 4;
       break;
    case GL_STENCIL_INDEX:
@@ -209,20 +163,15 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       rb->DataType = GL_UNSIGNED_INT_24_8_EXT;
       rb->StencilBits = 8;
       cpp = 4;
+      irb->texformat = &_mesa_texformat_s8_z24;
       break;
    case GL_DEPTH_COMPONENT16:
-#if 0
       rb->_ActualFormat = GL_DEPTH_COMPONENT16;
       rb->DataType = GL_UNSIGNED_SHORT;
       rb->DepthBits = 16;
       cpp = 2;
+      irb->texformat = &_mesa_texformat_z16;
       break;
-#else
-      /* fall-through.
-       * 16bpp depth renderbuffer can't be paired with a stencil buffer so
-       * always used combined depth/stencil format.
-       */
-#endif
    case GL_DEPTH_COMPONENT:
    case GL_DEPTH_COMPONENT24:
    case GL_DEPTH_COMPONENT32:
@@ -230,6 +179,7 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       rb->DataType = GL_UNSIGNED_INT_24_8_EXT;
       rb->DepthBits = 24;
       cpp = 4;
+      irb->texformat = &_mesa_texformat_s8_z24;
       break;
    case GL_DEPTH_STENCIL_EXT:
    case GL_DEPTH24_STENCIL8_EXT:
@@ -238,6 +188,7 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       rb->DepthBits = 24;
       rb->StencilBits = 8;
       cpp = 4;
+      irb->texformat = &_mesa_texformat_s8_z24;
       break;
    default:
       _mesa_problem(ctx,
@@ -281,7 +232,6 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
 }
 
 
-
 /**
  * Called for each hardware renderbuffer when a _window_ is resized.
  * Just update fields.
@@ -299,6 +249,7 @@ intel_alloc_window_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
    return GL_TRUE;
 }
 
+
 static void
 intel_resize_buffers(GLcontext *ctx, struct gl_framebuffer *fb,
 		     GLuint width, GLuint height)
@@ -325,6 +276,8 @@ intel_resize_buffers(GLcontext *ctx, struct gl_framebuffer *fb,
    }
 }
 
+
+/** Dummy function for gl_renderbuffer::AllocStorage() */
 static GLboolean
 intel_nop_alloc_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
                         GLenum internalFormat, GLuint width, GLuint height)
@@ -344,10 +297,9 @@ intel_renderbuffer_set_region(struct intel_renderbuffer *rb,
    rb->region = NULL;
    intel_region_reference(&rb->region, region);
    intel_region_release(&old);
-
-   rb->pfPitch = region->pitch;
 }
 
+
 /**
  * Create a new intel_renderbuffer which corresponds to an on-screen window,
  * not a user-created renderbuffer.
@@ -377,6 +329,17 @@ intel_create_renderbuffer(GLenum intFormat)
       irb->Base.GreenBits = 6;
       irb->Base.BlueBits = 5;
       irb->Base.DataType = GL_UNSIGNED_BYTE;
+      irb->texformat = &_mesa_texformat_rgb565;
+      break;
+   case GL_RGB8:
+      irb->Base._ActualFormat = GL_RGB8;
+      irb->Base._BaseFormat = GL_RGB;
+      irb->Base.RedBits = 8;
+      irb->Base.GreenBits = 8;
+      irb->Base.BlueBits = 8;
+      irb->Base.AlphaBits = 0;
+      irb->Base.DataType = GL_UNSIGNED_BYTE;
+      irb->texformat = &_mesa_texformat_argb8888; /* XXX: Need xrgb8888 */
       break;
    case GL_RGBA8:
       irb->Base._ActualFormat = GL_RGBA8;
@@ -386,24 +349,28 @@ intel_create_renderbuffer(GLenum intFormat)
       irb->Base.BlueBits = 8;
       irb->Base.AlphaBits = 8;
       irb->Base.DataType = GL_UNSIGNED_BYTE;
+      irb->texformat = &_mesa_texformat_argb8888;
       break;
    case GL_STENCIL_INDEX8_EXT:
       irb->Base._ActualFormat = GL_STENCIL_INDEX8_EXT;
       irb->Base._BaseFormat = GL_STENCIL_INDEX;
       irb->Base.StencilBits = 8;
       irb->Base.DataType = GL_UNSIGNED_BYTE;
+      irb->texformat = &_mesa_texformat_s8_z24;
       break;
    case GL_DEPTH_COMPONENT16:
       irb->Base._ActualFormat = GL_DEPTH_COMPONENT16;
       irb->Base._BaseFormat = GL_DEPTH_COMPONENT;
       irb->Base.DepthBits = 16;
       irb->Base.DataType = GL_UNSIGNED_SHORT;
+      irb->texformat = &_mesa_texformat_z16;
       break;
    case GL_DEPTH_COMPONENT24:
       irb->Base._ActualFormat = GL_DEPTH24_STENCIL8_EXT;
       irb->Base._BaseFormat = GL_DEPTH_COMPONENT;
       irb->Base.DepthBits = 24;
       irb->Base.DataType = GL_UNSIGNED_INT;
+      irb->texformat = &_mesa_texformat_s8_z24;
       break;
    case GL_DEPTH24_STENCIL8_EXT:
       irb->Base._ActualFormat = GL_DEPTH24_STENCIL8_EXT;
@@ -411,6 +378,7 @@ intel_create_renderbuffer(GLenum intFormat)
       irb->Base.DepthBits = 24;
       irb->Base.StencilBits = 8;
       irb->Base.DataType = GL_UNSIGNED_INT_24_8_EXT;
+      irb->texformat = &_mesa_texformat_s8_z24;
       break;
    default:
       _mesa_problem(NULL,
@@ -467,9 +435,6 @@ intel_bind_framebuffer(GLcontext * ctx, GLenum target,
 {
    if (target == GL_FRAMEBUFFER_EXT || target == GL_DRAW_FRAMEBUFFER_EXT) {
       intel_draw_buffer(ctx, fb);
-      /* Integer depth range depends on depth buffer bits */
-      if (ctx->Driver.DepthRange != NULL)
-	 ctx->Driver.DepthRange(ctx, ctx->Viewport.Near, ctx->Viewport.Far);
    }
    else {
       /* don't need to do anything if target == GL_READ_FRAMEBUFFER_EXT */
@@ -493,10 +458,13 @@ intel_framebuffer_renderbuffer(GLcontext * ctx,
    intel_draw_buffer(ctx, fb);
 }
 
+
 static GLboolean
 intel_update_wrapper(GLcontext *ctx, struct intel_renderbuffer *irb, 
 		     struct gl_texture_image *texImage)
 {
+   irb->texformat = texImage->TexFormat;
+
    if (texImage->TexFormat == &_mesa_texformat_argb8888) {
       irb->Base._ActualFormat = GL_RGBA8;
       irb->Base._BaseFormat = GL_RGBA;
@@ -506,9 +474,21 @@ intel_update_wrapper(GLcontext *ctx, struct intel_renderbuffer *irb,
    else if (texImage->TexFormat == &_mesa_texformat_rgb565) {
       irb->Base._ActualFormat = GL_RGB5;
       irb->Base._BaseFormat = GL_RGB;
-      irb->Base.DataType = GL_UNSIGNED_SHORT;
+      irb->Base.DataType = GL_UNSIGNED_BYTE;
       DBG("Render to RGB5 texture OK\n");
    }
+   else if (texImage->TexFormat == &_mesa_texformat_argb1555) {
+      irb->Base._ActualFormat = GL_RGB5_A1;
+      irb->Base._BaseFormat = GL_RGBA;
+      irb->Base.DataType = GL_UNSIGNED_BYTE;
+      DBG("Render to ARGB1555 texture OK\n");
+   }
+   else if (texImage->TexFormat == &_mesa_texformat_argb4444) {
+      irb->Base._ActualFormat = GL_RGBA4;
+      irb->Base._BaseFormat = GL_RGBA;
+      irb->Base.DataType = GL_UNSIGNED_BYTE;
+      DBG("Render to ARGB4444 texture OK\n");
+   }
    else if (texImage->TexFormat == &_mesa_texformat_z16) {
       irb->Base._ActualFormat = GL_DEPTH_COMPONENT16;
       irb->Base._BaseFormat = GL_DEPTH_COMPONENT;
@@ -535,15 +515,15 @@ intel_update_wrapper(GLcontext *ctx, struct intel_renderbuffer *irb,
    irb->Base.BlueBits = texImage->TexFormat->BlueBits;
    irb->Base.AlphaBits = texImage->TexFormat->AlphaBits;
    irb->Base.DepthBits = texImage->TexFormat->DepthBits;
+   irb->Base.StencilBits = texImage->TexFormat->StencilBits;
 
    irb->Base.Delete = intel_delete_renderbuffer;
    irb->Base.AllocStorage = intel_nop_alloc_storage;
 
-   irb->RenderToTexture = GL_TRUE;
-
    return GL_TRUE;
 }
 
+
 /**
  * When glFramebufferTexture[123]D is called this function sets up the
  * gl_renderbuffer wrapper around the texture image.
@@ -552,7 +532,7 @@ intel_update_wrapper(GLcontext *ctx, struct intel_renderbuffer *irb,
 static struct intel_renderbuffer *
 intel_wrap_texture(GLcontext * ctx, struct gl_texture_image *texImage)
 {
-   const GLuint name = ~0;      /* not significant, but distinct for debugging */
+   const GLuint name = ~0;   /* not significant, but distinct for debugging */
    struct intel_renderbuffer *irb;
 
    /* make an intel_renderbuffer to wrap the texture image */
@@ -599,10 +579,11 @@ intel_render_texture(GLcontext * ctx,
       /* Fallback on drawing to a texture with a border, which won't have a
        * miptree.
        */
-       _mesa_reference_renderbuffer(&att->Renderbuffer, NULL);
-       _mesa_render_texture(ctx, fb, att);
-       return;
-   } else if (!irb) {
+      _mesa_reference_renderbuffer(&att->Renderbuffer, NULL);
+      _mesa_render_texture(ctx, fb, att);
+      return;
+   }
+   else if (!irb) {
       irb = intel_wrap_texture(ctx, newImage);
       if (irb) {
          /* bind the wrapper to the attachment point */
@@ -613,7 +594,9 @@ intel_render_texture(GLcontext * ctx,
          _mesa_render_texture(ctx, fb, att);
          return;
       }
-   } if (!intel_update_wrapper(ctx, irb, newImage)) {
+   }
+
+   if (!intel_update_wrapper(ctx, irb, newImage)) {
        _mesa_reference_renderbuffer(&att->Renderbuffer, NULL);
        _mesa_render_texture(ctx, fb, att);
        return;
@@ -658,19 +641,123 @@ static void
 intel_finish_render_texture(GLcontext * ctx,
                             struct gl_renderbuffer_attachment *att)
 {
-   struct intel_renderbuffer *irb = intel_renderbuffer(att->Renderbuffer);
+   /* no-op
+    * Previously we released the renderbuffer's intel_region but
+    * that's not necessary and actually caused problems when trying
+    * to do a glRead/CopyPixels from the renderbuffer later.
+    * The region will be released later if the texture is replaced
+    * or the renderbuffer deleted.
+    *
+    * The intention of this driver hook is more of a "done rendering
+    * to texture, please re-twiddle/etc if necessary".
+    */
+}
 
-   DBG("End render texture (tid %x) tex %u\n", _glthread_GetID(), att->Texture->Name);
 
-   if (irb) {
-      /* just release the region */
-      intel_region_release(&irb->region);
+/**
+ * Do additional "completeness" testing of a framebuffer object.
+ */
+static void
+intel_validate_framebuffer(GLcontext *ctx, struct gl_framebuffer *fb)
+{
+   const struct intel_renderbuffer *depthRb =
+      intel_get_renderbuffer(fb, BUFFER_DEPTH);
+   const struct intel_renderbuffer *stencilRb =
+      intel_get_renderbuffer(fb, BUFFER_STENCIL);
+   int i;
+
+   if (stencilRb && stencilRb != depthRb) {
+      /* we only support combined depth/stencil buffers, not separate
+       * stencil buffers.
+       */
+      fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
+   }
+
+   for (i = 0; i < ctx->Const.MaxDrawBuffers; i++) {
+      struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+      struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+
+      if (rb == NULL)
+	 continue;
+
+      /* intel_renderbuffer() returns NULL for a non-intel renderbuffer;
+       * report that (or an unknown format) as unsupported, don't crash.
+       */
+      if (irb == NULL || irb->texformat == NULL ||
+	  (irb->texformat->MesaFormat != MESA_FORMAT_ARGB8888 &&
+	   irb->texformat->MesaFormat != MESA_FORMAT_RGB565 &&
+	   irb->texformat->MesaFormat != MESA_FORMAT_ARGB1555 &&
+	   irb->texformat->MesaFormat != MESA_FORMAT_ARGB4444))
+	 fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
+   }
+}
+
+
+/**
+ * Called from glBlitFramebuffer().
+ * For now, we're doing an approximation with glCopyPixels().
+ * XXX we need to bypass all the per-fragment operations, except scissor.
+ */
+static void
+intel_blit_framebuffer(GLcontext *ctx,
+                       GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                       GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                       GLbitfield mask, GLenum filter)
+{
+   const GLfloat xZoomSave = ctx->Pixel.ZoomX;
+   const GLfloat yZoomSave = ctx->Pixel.ZoomY;
+   GLsizei width, height;
+   GLfloat xFlip = 1.0F, yFlip = 1.0F;
+
+   if (srcX1 < srcX0) {
+      GLint tmp = srcX1;
+      srcX1 = srcX0;
+      srcX0 = tmp;
+      xFlip = -1.0F;
+   }
+
+   if (srcY1 < srcY0) {
+      GLint tmp = srcY1;
+      srcY1 = srcY0;
+      srcY0 = tmp;
+      yFlip = -1.0F;
+   }
+
+   width = srcX1 - srcX0;
+   height = srcY1 - srcY0;
+   /* Zoom = flip * dst extent / src extent, each axis using its own extent. */
+   ctx->Pixel.ZoomX = xFlip * (dstX1 - dstX0) / (srcX1 - srcX0);
+   ctx->Pixel.ZoomY = yFlip * (dstY1 - dstY0) / (srcY1 - srcY0);
+
+   if (ctx->Pixel.ZoomX < 0.0F) {
+      dstX0 = MAX2(dstX0, dstX1);
+   }
+   else {
+      dstX0 = MIN2(dstX0, dstX1);
+   }
+
+   if (ctx->Pixel.ZoomY < 0.0F) {
+      dstY0 = MAX2(dstY0, dstY1);
+   }
+   else {
+      dstY0 = MIN2(dstY0, dstY1);
+   }
+
+   if (mask & GL_COLOR_BUFFER_BIT) {
+      ctx->Driver.CopyPixels(ctx, srcX0, srcY0, width, height,
+                             dstX0, dstY0, GL_COLOR);
+   }
+   if (mask & GL_DEPTH_BUFFER_BIT) {
+      ctx->Driver.CopyPixels(ctx, srcX0, srcY0, width, height,
+                             dstX0, dstY0, GL_DEPTH);
    }
-   else if (att->Renderbuffer) {
-      /* software fallback */
-      _mesa_finish_render_texture(ctx, att);
-      /* XXX FBO: Need to unmap the buffer (or in intelSpanRenderStart???) */
+   if (mask & GL_STENCIL_BUFFER_BIT) {
+      ctx->Driver.CopyPixels(ctx, srcX0, srcY0, width, height,
+                             dstX0, dstY0, GL_STENCIL);
    }
+      
+   ctx->Pixel.ZoomX = xZoomSave;
+   ctx->Pixel.ZoomY = yZoomSave;
 }
 
 
@@ -688,4 +775,6 @@ intel_fbo_init(struct intel_context *intel)
    intel->ctx.Driver.RenderTexture = intel_render_texture;
    intel->ctx.Driver.FinishRenderTexture = intel_finish_render_texture;
    intel->ctx.Driver.ResizeBuffers = intel_resize_buffers;
+   intel->ctx.Driver.ValidateFramebuffer = intel_validate_framebuffer;
+   intel->ctx.Driver.BlitFramebuffer = intel_blit_framebuffer;
 }
diff --git a/shared/intel_fbo.h b/shared/intel_fbo.h
index b7e9280..f0665af 100644
--- a/shared/intel_fbo.h
+++ b/shared/intel_fbo.h
@@ -55,19 +55,13 @@ struct intel_framebuffer
 
 /**
  * Intel renderbuffer, derived from gl_renderbuffer.
- * Note: The PairedDepth and PairedStencil fields use renderbuffer IDs,
- * not pointers because in some circumstances a deleted renderbuffer could
- * result in a dangling pointer here.
  */
 struct intel_renderbuffer
 {
    struct gl_renderbuffer Base;
    struct intel_region *region;
-   GLuint pfPitch;              /* possibly paged flipped pitch */
-   GLboolean RenderToTexture;   /* RTT? */
 
-   GLuint PairedDepth;   /**< only used if this is a depth renderbuffer */
-   GLuint PairedStencil; /**< only used if this is a stencil renderbuffer */
+   const struct gl_texture_format *texformat;
 
    GLuint vbl_pending;   /**< vblank sequence number of pending flip */
 
@@ -75,48 +69,70 @@ struct intel_renderbuffer
    unsigned long span_cache_offset;
 };
 
-extern struct intel_renderbuffer *intel_renderbuffer(struct gl_renderbuffer
-                                                     *rb);
+
+/**
+ * gl_renderbuffer is a base class which we subclass.  The ClassID field
+ * is used for simple run-time type checking.
+ */
+#define INTEL_RB_CLASS 0x12345678
+
+
+/**
+ * Return a gl_renderbuffer ptr cast to intel_renderbuffer.
+ * NULL will be returned if the rb isn't really an intel_renderbuffer.
+ * This is determined by checking the ClassID.
+ */
+static INLINE struct intel_renderbuffer *
+intel_renderbuffer(struct gl_renderbuffer *rb)
+{
+   struct intel_renderbuffer *irb = (struct intel_renderbuffer *) rb;
+   if (irb && irb->Base.ClassID == INTEL_RB_CLASS) {
+      /*_mesa_warning(NULL, "Returning non-intel Rb\n");*/
+      return irb;
+   }
+   else
+      return NULL;
+}
+
+
+/**
+ * Return a framebuffer's renderbuffer, named by a BUFFER_x index.
+ */
+static INLINE struct intel_renderbuffer *
+intel_get_renderbuffer(struct gl_framebuffer *fb, int attIndex)
+{
+   if (attIndex >= 0)
+      return intel_renderbuffer(fb->Attachment[attIndex].Renderbuffer);
+   else
+      return NULL;
+}
+
 
 extern void
 intel_renderbuffer_set_region(struct intel_renderbuffer *irb,
 			      struct intel_region *region);
 
+
 extern struct intel_renderbuffer *
 intel_create_renderbuffer(GLenum intFormat);
 
-extern void intel_fbo_init(struct intel_context *intel);
-
-
-/* XXX make inline or macro */
-extern struct intel_renderbuffer *intel_get_renderbuffer(struct gl_framebuffer
-                                                         *fb,
-                                                         int attIndex);
-
-extern void intel_flip_renderbuffers(struct intel_framebuffer *intel_fb);
 
+extern void
+intel_fbo_init(struct intel_context *intel);
 
-/* XXX make inline or macro */
-extern struct intel_region *intel_get_rb_region(struct gl_framebuffer *fb,
-                                                GLuint attIndex);
 
+extern void
+intel_flip_renderbuffers(struct intel_framebuffer *intel_fb);
 
 
-/**
- * Are we currently rendering into a texture?
- */
-static INLINE GLboolean
-intel_rendering_to_texture(const GLcontext *ctx)
+static INLINE struct intel_region *
+intel_get_rb_region(struct gl_framebuffer *fb, GLuint attIndex)
 {
-   if (ctx->DrawBuffer->Name) {
-      /* User-created FBO */
-      const struct intel_renderbuffer *irb =
-         intel_renderbuffer(ctx->DrawBuffer->_ColorDrawBuffers[0]);
-      return irb && irb->RenderToTexture;
-   }
-   else {
-      return GL_FALSE;
-   }
+   struct intel_renderbuffer *irb = intel_get_renderbuffer(fb, attIndex);
+   if (irb)
+      return irb->region;
+   else
+      return NULL;
 }
 
 
diff --git a/shared/intel_pixel.c b/shared/intel_pixel.c
index f440a77..fc0ac0b 100644
--- a/shared/intel_pixel.c
+++ b/shared/intel_pixel.c
@@ -30,6 +30,7 @@
 #include "main/context.h"
 #include "main/enable.h"
 #include "main/matrix.h"
+#include "main/viewport.h"
 #include "swrast/swrast.h"
 #include "shader/arbprogram.h"
 #include "shader/program.h"
@@ -112,7 +113,7 @@ intel_check_blit_fragment_ops(GLcontext * ctx, GLboolean src_alpha_is_one)
       return GL_FALSE;
    }
 
-   if (ctx->Stencil.Enabled) {
+   if (ctx->Stencil._Enabled) {
       DBG("fallback due to image stencil\n");
       return GL_FALSE;
    }
@@ -183,7 +184,9 @@ intel_meta_set_passthrough_transform(struct intel_context *intel)
    intel->meta.saved_vp_height = ctx->Viewport.Height;
    intel->meta.saved_matrix_mode = ctx->Transform.MatrixMode;
 
-   /*   _mesa_Viewport(0, 0, ctx->DrawBuffer->Width, ctx->DrawBuffer->Height);*/
+   intel->internal_viewport_call = GL_TRUE;
+   _mesa_Viewport(0, 0, ctx->DrawBuffer->Width, ctx->DrawBuffer->Height);
+   intel->internal_viewport_call = GL_FALSE;
 
    _mesa_MatrixMode(GL_PROJECTION);
    _mesa_PushMatrix();
@@ -205,8 +208,10 @@ intel_meta_restore_transform(struct intel_context *intel)
 
    _mesa_MatrixMode(intel->meta.saved_matrix_mode);
 
-   /*   _mesa_Viewport(intel->meta.saved_vp_x, intel->meta.saved_vp_y,
-	intel->meta.saved_vp_width, intel->meta.saved_vp_height);*/
+   intel->internal_viewport_call = GL_TRUE;
+   _mesa_Viewport(intel->meta.saved_vp_x, intel->meta.saved_vp_y,
+		  intel->meta.saved_vp_width, intel->meta.saved_vp_height);
+   intel->internal_viewport_call = GL_FALSE;
 }
 
 /**
diff --git a/shared/intel_pixel_bitmap.c b/shared/intel_pixel_bitmap.c
index 1db7f55..a2ccae1 100644
--- a/shared/intel_pixel_bitmap.c
+++ b/shared/intel_pixel_bitmap.c
@@ -401,6 +401,14 @@ intel_texture_bitmap(GLcontext * ctx,
       return GL_FALSE;
    }
 
+   if (!ctx->Extensions.ARB_texture_non_power_of_two &&
+       (!is_power_of_two(width) || !is_power_of_two(height))) {
+      if (INTEL_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr,
+		 "glBitmap() fallback: NPOT texture\n");
+      return GL_FALSE;
+   }
+
    /* Check that we can load in a texture this big. */
    if (width > (1 << (ctx->Const.MaxTextureLevels - 1)) ||
        height > (1 << (ctx->Const.MaxTextureLevels - 1))) {
@@ -499,7 +507,7 @@ intel_texture_bitmap(GLcontext * ctx,
    _mesa_TexCoordPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &texcoords);
    _mesa_Enable(GL_VERTEX_ARRAY);
    _mesa_Enable(GL_TEXTURE_COORD_ARRAY);
-   CALL_DrawArrays(ctx->Exec, (GL_TRIANGLE_FAN, 0, 4));
+   _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
 
    intel_meta_restore_transform(intel);
    intel_meta_restore_fragment_program(intel);
diff --git a/shared/intel_pixel_copy.c b/shared/intel_pixel_copy.c
index 7c7aa60..d50dd68 100644
--- a/shared/intel_pixel_copy.c
+++ b/shared/intel_pixel_copy.c
@@ -87,7 +87,7 @@ intel_check_copypixel_blit_fragment_ops(GLcontext * ctx)
             ctx->Color.AlphaEnabled ||
             ctx->Depth.Test ||
             ctx->Fog.Enabled ||
-            ctx->Stencil.Enabled ||
+            ctx->Stencil._Enabled ||
             !ctx->Color.ColorMask[0] ||
             !ctx->Color.ColorMask[1] ||
             !ctx->Color.ColorMask[2] ||
diff --git a/shared/intel_pixel_draw.c b/shared/intel_pixel_draw.c
index 7be7ea8..d80069d 100644
--- a/shared/intel_pixel_draw.c
+++ b/shared/intel_pixel_draw.c
@@ -97,7 +97,7 @@ intel_texture_drawpixels(GLcontext * ctx,
    /* We don't have a way to generate fragments with stencil values which
     * will set the resulting stencil value.
     */
-   if (format == GL_STENCIL_INDEX)
+   if (format == GL_STENCIL_INDEX || format == GL_DEPTH_STENCIL)
       return GL_FALSE;
 
    /* Check that we can load in a texture this big. */
@@ -120,6 +120,14 @@ intel_texture_drawpixels(GLcontext * ctx,
       return GL_FALSE;
    }
 
+   if (!ctx->Extensions.ARB_texture_non_power_of_two &&
+       (!is_power_of_two(width) || !is_power_of_two(height))) {
+      if (INTEL_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr,
+		 "glDrawPixels() fallback: NPOT texture\n");
+      return GL_FALSE;
+   }
+
    _mesa_PushAttrib(GL_ENABLE_BIT | GL_TEXTURE_BIT |
 		    GL_CURRENT_BIT);
    _mesa_PushClientAttrib(GL_CLIENT_VERTEX_ARRAY_BIT);
@@ -183,7 +191,7 @@ intel_texture_drawpixels(GLcontext * ctx,
    _mesa_TexCoordPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &texcoords);
    _mesa_Enable(GL_VERTEX_ARRAY);
    _mesa_Enable(GL_TEXTURE_COORD_ARRAY);
-   CALL_DrawArrays(ctx->Exec, (GL_TRIANGLE_FAN, 0, 4));
+   _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
 
    intel_meta_restore_transform(intel);
 
@@ -233,7 +241,7 @@ intel_stencil_drawpixels(GLcontext * ctx,
    }
 
    /* We don't support stencil testing/ops here */
-   if (ctx->Stencil.Enabled)
+   if (ctx->Stencil._Enabled)
       return GL_FALSE;
 
    /* We use FBOs for our wrapping of the depthbuffer into a color
@@ -273,6 +281,14 @@ intel_stencil_drawpixels(GLcontext * ctx,
       return GL_FALSE;
    }
 
+   if (!ctx->Extensions.ARB_texture_non_power_of_two &&
+       (!is_power_of_two(width) || !is_power_of_two(height))) {
+      if (INTEL_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr,
+		 "glDrawPixels(GL_STENCIL_INDEX) fallback: NPOT texture\n");
+      return GL_FALSE;
+   }
+
    _mesa_PushAttrib(GL_ENABLE_BIT | GL_TEXTURE_BIT |
 		    GL_CURRENT_BIT | GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    _mesa_PushClientAttrib(GL_CLIENT_VERTEX_ARRAY_BIT);
@@ -367,7 +383,7 @@ intel_stencil_drawpixels(GLcontext * ctx,
    _mesa_TexCoordPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &texcoords);
    _mesa_Enable(GL_VERTEX_ARRAY);
    _mesa_Enable(GL_TEXTURE_COORD_ARRAY);
-   CALL_DrawArrays(ctx->Exec, (GL_TRIANGLE_FAN, 0, 4));
+   _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
 
    intel_meta_restore_transform(intel);
 
diff --git a/shared/intel_regions.c b/shared/intel_regions.c
index 7de1e2b..0aa5b8c 100644
--- a/shared/intel_regions.c
+++ b/shared/intel_regions.c
@@ -158,7 +158,7 @@ void
 intel_region_reference(struct intel_region **dst, struct intel_region *src)
 {
    if (src)
-      DBG("%s %d\n", __FUNCTION__, src->refcount);
+      DBG("%s %p %d\n", __FUNCTION__, src, src->refcount);
 
    assert(*dst == NULL);
    if (src) {
@@ -175,7 +175,7 @@ intel_region_release(struct intel_region **region_handle)
    if (region == NULL)
       return;
 
-   DBG("%s %d\n", __FUNCTION__, region->refcount - 1);
+   DBG("%s %p %d\n", __FUNCTION__, region, region->refcount - 1);
 
    ASSERT(region->refcount > 0);
    region->refcount--;
@@ -466,7 +466,8 @@ intel_recreate_static(struct intel_context *intel,
    else
       region->cpp = intel->ctx.Visual.rgbBits / 8;
    region->pitch = intelScreen->pitch;
-   region->height = intelScreen->height;     /* needed? */
+   region->width = intelScreen->width;
+   region->height = intelScreen->height;
 
    if (region->buffer != NULL) {
       dri_bo_unreference(region->buffer);
diff --git a/shared/intel_screen.c b/shared/intel_screen.c
index 4bd11dd..0f278b3 100644
--- a/shared/intel_screen.c
+++ b/shared/intel_screen.c
@@ -28,28 +28,27 @@
 #include "main/glheader.h"
 #include "main/context.h"
 #include "main/framebuffer.h"
-#include "main/matrix.h"
 #include "main/renderbuffer.h"
-#include "main/simple_list.h"
+
 #include "utils.h"
 #include "vblank.h"
 #include "xmlpool.h"
 
-
-#include "intel_screen.h"
-
+#include "intel_batchbuffer.h"
 #include "intel_buffers.h"
-#include "intel_tex.h"
-#include "intel_span.h"
-#include "intel_fbo.h"
+#include "intel_bufmgr.h"
 #include "intel_chipset.h"
+#include "intel_extensions.h"
+#include "intel_fbo.h"
+#include "intel_regions.h"
 #include "intel_swapbuffers.h"
+#include "intel_screen.h"
+#include "intel_span.h"
+#include "intel_tex.h"
 
 #include "i915_drm.h"
 #include "i830_dri.h"
-#include "intel_regions.h"
-#include "intel_batchbuffer.h"
-#include "intel_bufmgr.h"
+
 
 PUBLIC const char __driConfigOptions[] =
    DRI_CONF_BEGIN
@@ -72,10 +71,12 @@ PUBLIC const char __driConfigOptions[] =
    DRI_CONF_SECTION_END
    DRI_CONF_SECTION_DEBUG
      DRI_CONF_NO_RAST(false)
+     DRI_CONF_ALWAYS_FLUSH_BATCH(false)
+     DRI_CONF_ALWAYS_FLUSH_CACHE(false)
    DRI_CONF_SECTION_END
 DRI_CONF_END;
 
-const GLuint __driNConfigOptions = 6;
+const GLuint __driNConfigOptions = 8;
 
 #ifdef USE_NEW_INTERFACE
 static PFNGLXCREATECONTEXTMODES create_context_modes = NULL;
@@ -210,6 +211,7 @@ static const __DRItexOffsetExtension intelTexOffsetExtension = {
 static const __DRItexBufferExtension intelTexBufferExtension = {
     { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION },
    intelSetTexBuffer,
+   intelSetTexBuffer2,
 };
 
 static const __DRIextension *intelScreenExtensions[] = {
@@ -234,7 +236,7 @@ intel_get_param(__DRIscreenPrivate *psp, int param, int *value)
 
    ret = drmCommandWriteRead(psp->fd, DRM_I915_GETPARAM, &gp, sizeof(gp));
    if (ret) {
-      fprintf(stderr, "drm_i915_getparam: %d\n", ret);
+      _mesa_warning(NULL, "drm_i915_getparam: %d", ret);
       return GL_FALSE;
    }
 
@@ -303,6 +305,7 @@ intelDestroyScreen(__DRIscreenPrivate * sPriv)
 
    dri_bufmgr_destroy(intelScreen->bufmgr);
    intelUnmapScreenRegions(intelScreen);
+   driDestroyOptionCache(&intelScreen->optionCache);
 
    FREE(intelScreen);
    sPriv->private = NULL;
@@ -323,7 +326,7 @@ intelCreateBuffer(__DRIscreenPrivate * driScrnPriv,
    else {
       GLboolean swStencil = (mesaVis->stencilBits > 0 &&
                              mesaVis->depthBits != 24);
-      GLenum rgbFormat = (mesaVis->redBits == 5 ? GL_RGB5 : GL_RGBA8);
+      GLenum rgbFormat;
 
       struct intel_framebuffer *intel_fb = CALLOC_STRUCT(intel_framebuffer);
 
@@ -332,6 +335,13 @@ intelCreateBuffer(__DRIscreenPrivate * driScrnPriv,
 
       _mesa_initialize_framebuffer(&intel_fb->Base, mesaVis);
 
+      if (mesaVis->redBits == 5)
+	 rgbFormat = GL_RGB5;
+      else if (mesaVis->alphaBits == 0)
+	 rgbFormat = GL_RGB8;
+      else
+	 rgbFormat = GL_RGBA8;
+
       /* setup the hardware-based renderbuffers */
       intel_fb->color_rb[0] = intel_create_renderbuffer(rgbFormat);
       _mesa_add_renderbuffer(&intel_fb->Base, BUFFER_FRONT_LEFT,
@@ -385,7 +395,31 @@ intelCreateBuffer(__DRIscreenPrivate * driScrnPriv,
 static void
 intelDestroyBuffer(__DRIdrawablePrivate * driDrawPriv)
 {
-   _mesa_unreference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)));
+   struct intel_framebuffer *intel_fb = driDrawPriv->driverPrivate;
+   struct intel_renderbuffer *depth_rb;
+   struct intel_renderbuffer *stencil_rb;
+
+   if (intel_fb) {
+      if (intel_fb->color_rb[0]) {
+         intel_renderbuffer_set_region(intel_fb->color_rb[0], NULL);
+      }
+
+      if (intel_fb->color_rb[1]) {
+         intel_renderbuffer_set_region(intel_fb->color_rb[1], NULL);
+      }
+
+      depth_rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
+      if (depth_rb) {
+         intel_renderbuffer_set_region(depth_rb, NULL);
+      }
+
+      stencil_rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL);
+      if (stencil_rb) {
+         intel_renderbuffer_set_region(stencil_rb, NULL);
+      }
+   }
+
+   _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
 
@@ -466,8 +500,6 @@ intelFillInModes(__DRIscreenPrivate *psp,
    __GLcontextModes *m;
    unsigned depth_buffer_factor;
    unsigned back_buffer_factor;
-   GLenum fb_format;
-   GLenum fb_type;
    int i;
 
    /* GLX_SWAP_COPY_OML is only supported because the Intel driver doesn't
@@ -479,6 +511,7 @@ intelFillInModes(__DRIscreenPrivate *psp,
 
    uint8_t depth_bits_array[3];
    uint8_t stencil_bits_array[3];
+   uint8_t msaa_samples_array[1];
 
    depth_bits_array[0] = 0;
    depth_bits_array[1] = depth_bits;
@@ -495,22 +528,39 @@ intelFillInModes(__DRIscreenPrivate *psp,
 
    stencil_bits_array[2] = (stencil_bits == 0) ? 8 : stencil_bits;
 
+   msaa_samples_array[0] = 0;
+
    depth_buffer_factor = ((depth_bits != 0) || (stencil_bits != 0)) ? 3 : 1;
    back_buffer_factor = (have_back_buffer) ? 3 : 1;
 
    if (pixel_bits == 16) {
-      fb_format = GL_RGB;
-      fb_type = GL_UNSIGNED_SHORT_5_6_5;
+      configs = driCreateConfigs(GL_RGB, GL_UNSIGNED_SHORT_5_6_5,
+				 depth_bits_array, stencil_bits_array,
+				 depth_buffer_factor, back_buffer_modes,
+				 back_buffer_factor,
+				 msaa_samples_array, 1);
    }
    else {
-      fb_format = GL_BGRA;
-      fb_type = GL_UNSIGNED_INT_8_8_8_8_REV;
+      __DRIconfig **configs_a8r8g8b8;
+      __DRIconfig **configs_x8r8g8b8;
+
+      configs_a8r8g8b8 = driCreateConfigs(GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV,
+					  depth_bits_array,
+					  stencil_bits_array,
+					  depth_buffer_factor,
+					  back_buffer_modes,
+					  back_buffer_factor,
+					  msaa_samples_array, 1);
+      configs_x8r8g8b8 = driCreateConfigs(GL_BGR, GL_UNSIGNED_INT_8_8_8_8_REV,
+					  depth_bits_array,
+					  stencil_bits_array,
+					  depth_buffer_factor,
+					  back_buffer_modes,
+					  back_buffer_factor,
+					  msaa_samples_array, 1);
+      configs = driConcatConfigs(configs_a8r8g8b8, configs_x8r8g8b8);
    }
 
-   configs = driCreateConfigs(fb_format, fb_type,
-			      depth_bits_array, stencil_bits_array,
-			      depth_buffer_factor, back_buffer_modes,
-			      back_buffer_factor);
    if (configs == NULL) {
     fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
               __LINE__);
@@ -537,6 +587,7 @@ intel_init_bufmgr(intelScreenPrivate *intelScreen)
    GLboolean gem_supported;
    struct drm_i915_getparam gp;
    __DRIscreenPrivate *spriv = intelScreen->driScrnPriv;
+   int num_fences;
 
    intelScreen->no_hw = getenv("INTEL_NO_HW") != NULL;
 
@@ -587,8 +638,10 @@ intel_init_bufmgr(intelScreenPrivate *intelScreen)
 				&intelScreen->sarea->last_dispatch);
    }
 
-   /* XXX bufmgr should be per-screen, not per-context */
-   intelScreen->ttm = intelScreen->ttm;
+   if (intel_get_param(spriv, I915_PARAM_NUM_FENCES_AVAIL, &num_fences))
+      intelScreen->kernel_exec_fencing = !!num_fences;
+   else
+      intelScreen->kernel_exec_fencing = GL_FALSE;
 
    return GL_TRUE;
 }
@@ -672,6 +725,17 @@ static const
 __DRIconfig **intelInitScreen2(__DRIscreenPrivate *psp)
 {
    intelScreenPrivate *intelScreen;
+   GLenum fb_format[3];
+   GLenum fb_type[3];
+   /* GLX_SWAP_COPY_OML is only supported because the Intel driver doesn't
+    * support pageflipping at all.
+    */
+   static const GLenum back_buffer_modes[] = {
+      GLX_NONE, GLX_SWAP_UNDEFINED_OML, GLX_SWAP_COPY_OML
+   };
+   uint8_t depth_bits[4], stencil_bits[4], msaa_samples_array[1];
+   int color;
+   __DRIconfig **configs = NULL;
 
    /* Calling driInitExtensions here, with a NULL context pointer,
     * does not actually enable the extensions.  It just makes sure
@@ -711,8 +775,71 @@ __DRIconfig **intelInitScreen2(__DRIscreenPrivate *psp)
    intelScreen->irq_active = 1;
    psp->extensions = intelScreenExtensions;
 
-   return driConcatConfigs(intelFillInModes(psp, 16, 16, 0, 1),
-			   intelFillInModes(psp, 32, 24, 8, 1));
+   depth_bits[0] = 0;
+   stencil_bits[0] = 0;
+   depth_bits[1] = 16;
+   stencil_bits[1] = 0;
+   depth_bits[2] = 24;
+   stencil_bits[2] = 0;
+   depth_bits[3] = 24;
+   stencil_bits[3] = 8;
+
+   msaa_samples_array[0] = 0;
+
+   fb_format[0] = GL_RGB;
+   fb_type[0] = GL_UNSIGNED_SHORT_5_6_5;
+
+   fb_format[1] = GL_BGR;
+   fb_type[1] = GL_UNSIGNED_INT_8_8_8_8_REV;
+
+   fb_format[2] = GL_BGRA;
+   fb_type[2] = GL_UNSIGNED_INT_8_8_8_8_REV;
+
+   depth_bits[0] = 0;
+   stencil_bits[0] = 0;
+
+   for (color = 0; color < ARRAY_SIZE(fb_format); color++) {
+      __DRIconfig **new_configs;
+      int depth_factor;
+
+      /* With DRI2 right now, GetBuffers always returns a depth/stencil buffer
+       * with the same cpp as the drawable.  So we can't support depth cpp !=
+       * color cpp currently.
+       */
+      if (fb_type[color] == GL_UNSIGNED_SHORT_5_6_5) {
+	 depth_bits[1] = 16;
+	 stencil_bits[1] = 0;
+
+	 depth_factor = 2;
+      } else {
+	 depth_bits[1] = 24;
+	 stencil_bits[1] = 0;
+	 depth_bits[2] = 24;
+	 stencil_bits[2] = 8;
+
+	 depth_factor = 3;
+      }
+      new_configs = driCreateConfigs(fb_format[color], fb_type[color],
+				     depth_bits,
+				     stencil_bits,
+				     depth_factor,
+				     back_buffer_modes,
+				     ARRAY_SIZE(back_buffer_modes),
+				     msaa_samples_array,
+				     ARRAY_SIZE(msaa_samples_array));
+      if (configs == NULL)
+	 configs = new_configs;
+      else
+	 configs = driConcatConfigs(configs, new_configs);
+   }
+
+   if (configs == NULL) {
+      fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
+              __LINE__);
+      return NULL;
+   }
+
+   return (const __DRIconfig **)configs;
 }
 
 const struct __DriverAPIRec driDriverAPI = {
diff --git a/shared/intel_screen.h b/shared/intel_screen.h
index e1036de..a9b9e10 100644
--- a/shared/intel_screen.h
+++ b/shared/intel_screen.h
@@ -79,6 +79,7 @@ typedef struct
    GLboolean no_vbo;
    int ttm;
    dri_bufmgr *bufmgr;
+   GLboolean kernel_exec_fencing;
 
    /**
    * Configuration cache with default values for all contexts
diff --git a/shared/intel_span.c b/shared/intel_span.c
index d931504..34b78eb 100644
--- a/shared/intel_span.c
+++ b/shared/intel_span.c
@@ -29,6 +29,7 @@
 #include "main/macros.h"
 #include "main/mtypes.h"
 #include "main/colormac.h"
+#include "main/texformat.h"
 
 #include "intel_buffers.h"
 #include "intel_fbo.h"
@@ -131,6 +132,18 @@ pwrite_8(struct intel_renderbuffer *irb, uint32_t offset, uint8_t val)
    dri_bo_subdata(irb->region->buffer, offset, 1, &val);
 }
 
+static uint32_t
+z24s8_to_s8z24(uint32_t val)
+{
+   return (val << 24) | (val >> 8); /* ZZZS -> SZZZ: rotate right 8 bits */
+}
+
+static uint32_t
+s8z24_to_z24s8(uint32_t val)
+{
+   return (val >> 24) | (val << 8); /* SZZZ -> ZZZS: rotate left 8 bits */
+}
+
 static uint32_t no_tile_swizzle(struct intel_renderbuffer *irb,
 				int x, int y)
 {
@@ -150,7 +163,7 @@ static uint32_t x_tile_swizzle(struct intel_renderbuffer *irb,
 	int	x_tile_number, y_tile_number;
 	int	tile_off, tile_base;
 	
-	tile_stride = (irb->pfPitch * irb->region->cpp) << 3;
+	tile_stride = (irb->region->pitch * irb->region->cpp) << 3;
 
 	xbyte = x * irb->region->cpp;
 
@@ -190,7 +203,7 @@ static uint32_t x_tile_swizzle(struct intel_renderbuffer *irb,
 	printf("(%d,%d) -> %d + %d = %d (pitch = %d, tstride = %d)\n",
 	       x, y, tile_off, tile_base,
 	       tile_off + tile_base,
-	       irb->pfPitch, tile_stride);
+	       irb->region->pitch, tile_stride);
 #endif
 
 	return tile_base + tile_off;
@@ -205,7 +218,7 @@ static uint32_t y_tile_swizzle(struct intel_renderbuffer *irb,
 	int	x_tile_number, y_tile_number;
 	int	tile_off, tile_base;
 	
-	tile_stride = (irb->pfPitch * irb->region->cpp) << 5;
+	tile_stride = (irb->region->pitch * irb->region->cpp) << 5;
 
 	xbyte = x * irb->region->cpp;
 
@@ -255,8 +268,8 @@ static uint32_t y_tile_swizzle(struct intel_renderbuffer *irb,
 #define LOCAL_VARS							\
    struct intel_context *intel = intel_context(ctx);			\
    struct intel_renderbuffer *irb = intel_renderbuffer(rb);		\
-   const GLint yScale = irb->RenderToTexture ? 1 : -1;			\
-   const GLint yBias = irb->RenderToTexture ? 0 : irb->Base.Height - 1;	\
+   const GLint yScale = ctx->DrawBuffer->Name ? 1 : -1;			\
+   const GLint yBias = ctx->DrawBuffer->Name ? 0 : irb->Base.Height - 1;\
    unsigned int num_cliprects;						\
    struct drm_clip_rect *cliprects;					\
    int x_off, y_off;							\
@@ -293,107 +306,51 @@ static uint32_t y_tile_swizzle(struct intel_renderbuffer *irb,
 #define X_TILE(_X, _Y) x_tile_swizzle(irb, (_X) + x_off, (_Y) + y_off)
 #define Y_TILE(_X, _Y) y_tile_swizzle(irb, (_X) + x_off, (_Y) + y_off)
 
-/* 16 bit, RGB565 color spanline and pixel functions
- */
-#define SPANTMP_PIXEL_FMT GL_RGB
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
-
-#define TAG(x)    intel##x##_RGB565
-#define TAG2(x,y) intel##x##_RGB565##y
-#define GET_VALUE(X, Y) pread_16(irb, NO_TILE(X, Y))
-#define PUT_VALUE(X, Y, V) pwrite_16(irb, NO_TILE(X, Y), V)
-#include "spantmp2.h"
-
-/* 32 bit, ARGB8888 color spanline and pixel functions
- */
-#define SPANTMP_PIXEL_FMT GL_BGRA
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
-
-#define TAG(x)    intel##x##_ARGB8888
-#define TAG2(x,y) intel##x##_ARGB8888##y
-#define GET_VALUE(X, Y) pread_32(irb, NO_TILE(X, Y))
-#define PUT_VALUE(X, Y, V) pwrite_32(irb, NO_TILE(X, Y), V)
-#include "spantmp2.h"
-
-/* 32 bit, xRGB8888 color spanline and pixel functions
- */
-#define SPANTMP_PIXEL_FMT GL_BGRA
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
-
-#define TAG(x)    intel##x##_xRGB8888
-#define TAG2(x,y) intel##x##_xRGB8888##y
-#define GET_VALUE(X, Y) pread_xrgb8888(irb, NO_TILE(X, Y))
-#define PUT_VALUE(X, Y, V) pwrite_xrgb8888(irb, NO_TILE(X, Y), V)
-#include "spantmp2.h"
-
-/* 16 bit RGB565 color tile spanline and pixel functions
- */
-
-#define SPANTMP_PIXEL_FMT GL_RGB
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
-
-#define TAG(x)    intel_XTile_##x##_RGB565
-#define TAG2(x,y) intel_XTile_##x##_RGB565##y
-#define GET_VALUE(X, Y) pread_16(irb, X_TILE(X, Y))
-#define PUT_VALUE(X, Y, V) pwrite_16(irb, X_TILE(X, Y), V)
-#include "spantmp2.h"
-
-#define SPANTMP_PIXEL_FMT GL_RGB
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
-
-#define TAG(x)    intel_YTile_##x##_RGB565
-#define TAG2(x,y) intel_YTile_##x##_RGB565##y
-#define GET_VALUE(X, Y) pread_16(irb, Y_TILE(X, Y))
-#define PUT_VALUE(X, Y, V) pwrite_16(irb, Y_TILE(X, Y), V)
-#include "spantmp2.h"
-
-/* 32 bit ARGB888 color tile spanline and pixel functions
- */
-
-#define SPANTMP_PIXEL_FMT GL_BGRA
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
-
-#define TAG(x)    intel_XTile_##x##_ARGB8888
-#define TAG2(x,y) intel_XTile_##x##_ARGB8888##y
-#define GET_VALUE(X, Y) pread_32(irb, X_TILE(X, Y))
-#define PUT_VALUE(X, Y, V) pwrite_32(irb, X_TILE(X, Y), V)
-#include "spantmp2.h"
-
-#define SPANTMP_PIXEL_FMT GL_BGRA
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
-
-#define TAG(x)    intel_YTile_##x##_ARGB8888
-#define TAG2(x,y) intel_YTile_##x##_ARGB8888##y
-#define GET_VALUE(X, Y) pread_32(irb, Y_TILE(X, Y))
-#define PUT_VALUE(X, Y, V) pwrite_32(irb, Y_TILE(X, Y), V)
-#include "spantmp2.h"
-
-/* 32 bit xRGB888 color tile spanline and pixel functions
- */
-
-#define SPANTMP_PIXEL_FMT GL_BGRA
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
-
-#define TAG(x)    intel_XTile_##x##_xRGB8888
-#define TAG2(x,y) intel_XTile_##x##_xRGB8888##y
-#define GET_VALUE(X, Y) pread_xrgb8888(irb, X_TILE(X, Y))
-#define PUT_VALUE(X, Y, V) pwrite_xrgb8888(irb, X_TILE(X, Y), V)
-#include "spantmp2.h"
-
-#define SPANTMP_PIXEL_FMT GL_BGRA
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
-
-#define TAG(x)    intel_YTile_##x##_xRGB8888
-#define TAG2(x,y) intel_YTile_##x##_xRGB8888##y
-#define GET_VALUE(X, Y) pread_xrgb8888(irb, Y_TILE(X, Y))
-#define PUT_VALUE(X, Y, V) pwrite_xrgb8888(irb, Y_TILE(X, Y), V)
-#include "spantmp2.h"
+/* r5g6b5 color span and pixel functions */
+#define INTEL_PIXEL_FMT GL_RGB
+#define INTEL_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
+#define INTEL_READ_VALUE(offset) pread_16(irb, offset)
+#define INTEL_WRITE_VALUE(offset, v) pwrite_16(irb, offset, v)
+#define INTEL_TAG(x) x##_RGB565
+#include "intel_spantmp.h"
+
+/* a4r4g4b4 color span and pixel functions */
+#define INTEL_PIXEL_FMT GL_BGRA
+#define INTEL_PIXEL_TYPE GL_UNSIGNED_SHORT_4_4_4_4_REV
+#define INTEL_READ_VALUE(offset) pread_16(irb, offset)
+#define INTEL_WRITE_VALUE(offset, v) pwrite_16(irb, offset, v)
+#define INTEL_TAG(x) x##_ARGB4444
+#include "intel_spantmp.h"
+
+/* a1r5g5b5 color span and pixel functions */
+#define INTEL_PIXEL_FMT GL_BGRA
+#define INTEL_PIXEL_TYPE GL_UNSIGNED_SHORT_1_5_5_5_REV
+#define INTEL_READ_VALUE(offset) pread_16(irb, offset)
+#define INTEL_WRITE_VALUE(offset, v) pwrite_16(irb, offset, v)
+#define INTEL_TAG(x) x##_ARGB1555
+#include "intel_spantmp.h"
+
+/* a8r8g8b8 color span and pixel functions */
+#define INTEL_PIXEL_FMT GL_BGRA
+#define INTEL_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
+#define INTEL_READ_VALUE(offset) pread_32(irb, offset)
+#define INTEL_WRITE_VALUE(offset, v) pwrite_32(irb, offset, v)
+#define INTEL_TAG(x) x##_ARGB8888
+#include "intel_spantmp.h"
+
+/* x8r8g8b8 color span and pixel functions */
+#define INTEL_PIXEL_FMT GL_BGRA
+#define INTEL_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
+#define INTEL_READ_VALUE(offset) pread_xrgb8888(irb, offset)
+#define INTEL_WRITE_VALUE(offset, v) pwrite_xrgb8888(irb, offset, v)
+#define INTEL_TAG(x) x##_xRGB8888
+#include "intel_spantmp.h"
 
 #define LOCAL_DEPTH_VARS						\
    struct intel_context *intel = intel_context(ctx);			\
    struct intel_renderbuffer *irb = intel_renderbuffer(rb);		\
-   const GLint yScale = irb->RenderToTexture ? 1 : -1;			\
-   const GLint yBias = irb->RenderToTexture ? 0 : irb->Base.Height - 1; \
+   const GLint yScale = ctx->DrawBuffer->Name ? 1 : -1;			\
+   const GLint yBias = ctx->DrawBuffer->Name ? 0 : irb->Base.Height - 1;\
    unsigned int num_cliprects;						\
    struct drm_clip_rect *cliprects;					\
    int x_off, y_off;							\
@@ -402,98 +359,26 @@ static uint32_t y_tile_swizzle(struct intel_renderbuffer *irb,
 
 #define LOCAL_STENCIL_VARS LOCAL_DEPTH_VARS
 
-/**
- ** 16-bit depthbuffer functions.
- **/
-#define VALUE_TYPE GLushort
-#define WRITE_DEPTH(_x, _y, d) pwrite_16(irb, NO_TILE(_x, _y), d)
-#define READ_DEPTH(d, _x, _y) d = pread_16(irb, NO_TILE(_x, _y))
-#define TAG(x) intel##x##_z16
-#include "depthtmp.h"
-
-
-/**
- ** 16-bit x tile depthbuffer functions.
- **/
-#define VALUE_TYPE GLushort
-#define WRITE_DEPTH(_x, _y, d) pwrite_16(irb, X_TILE(_x, _y), d)
-#define READ_DEPTH(d, _x, _y) d = pread_16(irb, X_TILE(_x, _y))
-#define TAG(x) intel_XTile_##x##_z16
-#include "depthtmp.h"
-
-/**
- ** 16-bit y tile depthbuffer functions.
- **/
-#define VALUE_TYPE GLushort
-#define WRITE_DEPTH(_x, _y, d) pwrite_16(irb, Y_TILE(_x, _y), d)
-#define READ_DEPTH(d, _x, _y) d = pread_16(irb, Y_TILE(_x, _y))
-#define TAG(x) intel_YTile_##x##_z16
-#include "depthtmp.h"
-
-
-/**
- ** 24/8-bit interleaved depth/stencil functions
- ** Note: we're actually reading back combined depth+stencil values.
- ** The wrappers in main/depthstencil.c are used to extract the depth
- ** and stencil values.
- **/
-#define VALUE_TYPE GLuint
-
-/* Change ZZZS -> SZZZ */
-#define WRITE_DEPTH(_x, _y, d)					\
-   pwrite_32(irb, NO_TILE(_x, _y), ((d) >> 8) | ((d) << 24))
-
-/* Change SZZZ -> ZZZS */
-#define READ_DEPTH( d, _x, _y ) {				\
-   GLuint tmp = pread_32(irb, NO_TILE(_x, _y));			\
-   d = (tmp << 8) | (tmp >> 24);				\
-}
-
-#define TAG(x) intel##x##_z24_s8
-#include "depthtmp.h"
-
-
-/**
- ** 24/8-bit x-tile interleaved depth/stencil functions
- ** Note: we're actually reading back combined depth+stencil values.
- ** The wrappers in main/depthstencil.c are used to extract the depth
- ** and stencil values.
- **/
-#define VALUE_TYPE GLuint
+/* z16 depthbuffer functions. */
+#define INTEL_VALUE_TYPE GLushort
+#define INTEL_WRITE_DEPTH(offset, d) pwrite_16(irb, offset, d)
+#define INTEL_READ_DEPTH(offset) pread_16(irb, offset)
+#define INTEL_TAG(name) name##_z16
+#include "intel_depthtmp.h"
 
-/* Change ZZZS -> SZZZ */
-#define WRITE_DEPTH(_x, _y, d)					\
-   pwrite_32(irb, X_TILE(_x, _y), ((d) >> 8) | ((d) << 24))
+/* z24 depthbuffer functions. */
+#define INTEL_VALUE_TYPE GLuint
+#define INTEL_WRITE_DEPTH(offset, d) pwrite_32(irb, offset, d)
+#define INTEL_READ_DEPTH(offset) pread_32(irb, offset)
+#define INTEL_TAG(name) name##_z24
+#include "intel_depthtmp.h"
 
-/* Change SZZZ -> ZZZS */
-#define READ_DEPTH( d, _x, _y ) {				\
-   GLuint tmp = pread_32(irb, X_TILE(_x, _y));		\
-   d = (tmp << 8) | (tmp >> 24);				\
-}
-
-#define TAG(x) intel_XTile_##x##_z24_s8
-#include "depthtmp.h"
-
-/**
- ** 24/8-bit y-tile interleaved depth/stencil functions
- ** Note: we're actually reading back combined depth+stencil values.
- ** The wrappers in main/depthstencil.c are used to extract the depth
- ** and stencil values.
- **/
-#define VALUE_TYPE GLuint
-
-/* Change ZZZS -> SZZZ */
-#define WRITE_DEPTH(_x, _y, d)					\
-   pwrite_32(irb, Y_TILE(_x, _y), ((d) >> 8) | ((d) << 24))
-
-/* Change SZZZ -> ZZZS */
-#define READ_DEPTH( d, _x, _y ) {				\
-   GLuint tmp = pread_32(irb, Y_TILE(_x, _y));			\
-   d = (tmp << 8) | (tmp >> 24);				\
-}
-
-#define TAG(x) intel_YTile_##x##_z24_s8
-#include "depthtmp.h"
+/* z24s8 depthbuffer functions. */
+#define INTEL_VALUE_TYPE GLuint
+#define INTEL_WRITE_DEPTH(offset, d) pwrite_32(irb, offset, z24s8_to_s8z24(d))
+#define INTEL_READ_DEPTH(offset) s8z24_to_z24s8(pread_32(irb, offset))
+#define INTEL_TAG(name) name##_z24_s8
+#include "intel_depthtmp.h"
 
 
 /**
@@ -528,8 +413,6 @@ intel_renderbuffer_map(struct intel_context *intel, struct gl_renderbuffer *rb)
    if (irb == NULL || irb->region == NULL)
       return;
 
-   irb->pfPitch = irb->region->pitch;
-
    intel_set_span_functions(intel, rb);
 }
 
@@ -543,7 +426,6 @@ intel_renderbuffer_unmap(struct intel_context *intel,
       return;
 
    clear_span_cache(irb);
-   irb->pfPitch = 0;
 
    rb->GetRow = NULL;
    rb->PutRow = NULL;
@@ -696,8 +578,8 @@ intel_set_span_functions(struct intel_context *intel,
    else
       tiling = I915_TILING_NONE;
 
-   if (rb->_ActualFormat == GL_RGB5) {
-      /* 565 RGB */
+   switch (irb->texformat->MesaFormat) {
+   case MESA_FORMAT_RGB565:
       switch (tiling) {
       case I915_TILING_NONE:
       default:
@@ -710,38 +592,67 @@ intel_set_span_functions(struct intel_context *intel,
 	 intel_YTile_InitPointers_RGB565(rb);
 	 break;
       }
-   }
-   else if (rb->_ActualFormat == GL_RGB8) {
-      /* 8888 RGBx */
+      break;
+   case MESA_FORMAT_ARGB4444:
       switch (tiling) {
       case I915_TILING_NONE:
       default:
-	 intelInitPointers_xRGB8888(rb);
+	 intelInitPointers_ARGB4444(rb);
 	 break;
       case I915_TILING_X:
-	 intel_XTile_InitPointers_xRGB8888(rb);
+	 intel_XTile_InitPointers_ARGB4444(rb);
 	 break;
       case I915_TILING_Y:
-	 intel_YTile_InitPointers_xRGB8888(rb);
+	 intel_YTile_InitPointers_ARGB4444(rb);
 	 break;
       }
-   }
-   else if (rb->_ActualFormat == GL_RGBA8) {
-      /* 8888 RGBA */
+      break;
+   case MESA_FORMAT_ARGB1555:
       switch (tiling) {
       case I915_TILING_NONE:
       default:
-	 intelInitPointers_ARGB8888(rb);
+	 intelInitPointers_ARGB1555(rb);
 	 break;
       case I915_TILING_X:
-	 intel_XTile_InitPointers_ARGB8888(rb);
+	 intel_XTile_InitPointers_ARGB1555(rb);
 	 break;
       case I915_TILING_Y:
-	 intel_YTile_InitPointers_ARGB8888(rb);
+	 intel_YTile_InitPointers_ARGB1555(rb);
 	 break;
       }
-   }
-   else if (rb->_ActualFormat == GL_DEPTH_COMPONENT16) {
+      break;
+   case MESA_FORMAT_ARGB8888:
+      if (rb->AlphaBits == 0) { /* XXX: Need xRGB8888 Mesa format */
+	 /* 8888 RGBx */
+	 switch (tiling) {
+	 case I915_TILING_NONE:
+	 default:
+	    intelInitPointers_xRGB8888(rb);
+	    break;
+	 case I915_TILING_X:
+	    intel_XTile_InitPointers_xRGB8888(rb);
+	    break;
+	 case I915_TILING_Y:
+	    intel_YTile_InitPointers_xRGB8888(rb);
+	    break;
+	 }
+      } else {
+	 /* 8888 RGBA */
+	 switch (tiling) {
+	 case I915_TILING_NONE:
+	 default:
+	    intelInitPointers_ARGB8888(rb);
+	    break;
+	 case I915_TILING_X:
+	    intel_XTile_InitPointers_ARGB8888(rb);
+	    break;
+	 case I915_TILING_Y:
+	    intel_YTile_InitPointers_ARGB8888(rb);
+	    break;
+	 }
+      }
+      break;
+   case MESA_FORMAT_Z16:
       switch (tiling) {
       case I915_TILING_NONE:
       default:
@@ -754,38 +665,57 @@ intel_set_span_functions(struct intel_context *intel,
 	 intel_YTile_InitDepthPointers_z16(rb);
 	 break;
       }
-   }
-   else if (rb->_ActualFormat == GL_DEPTH_COMPONENT24 ||        /* XXX FBO remove */
-            rb->_ActualFormat == GL_DEPTH24_STENCIL8_EXT) {
-      switch (tiling) {
-      case I915_TILING_NONE:
-      default:
-	 intelInitDepthPointers_z24_s8(rb);
-	 break;
-      case I915_TILING_X:
-	 intel_XTile_InitDepthPointers_z24_s8(rb);
-	 break;
-      case I915_TILING_Y:
-	 intel_YTile_InitDepthPointers_z24_s8(rb);
-	 break;
+      break;
+   case MESA_FORMAT_S8_Z24:
+      /* There are a few different ways SW asks us to access the S8Z24 data:
+       * Z24 depth-only depth reads
+       * S8Z24 depth reads
+       * S8Z24 stencil reads.
+       */
+      if (rb->_ActualFormat == GL_DEPTH_COMPONENT24) {
+	 switch (tiling) {
+	 case I915_TILING_NONE:
+	 default:
+	    intelInitDepthPointers_z24(rb);
+	    break;
+	 case I915_TILING_X:
+	    intel_XTile_InitDepthPointers_z24(rb);
+	    break;
+	 case I915_TILING_Y:
+	    intel_YTile_InitDepthPointers_z24(rb);
+	    break;
+	 }
+      } else if (rb->_ActualFormat == GL_DEPTH24_STENCIL8_EXT) {
+	 switch (tiling) {
+	 case I915_TILING_NONE:
+	 default:
+	    intelInitDepthPointers_z24_s8(rb);
+	    break;
+	 case I915_TILING_X:
+	    intel_XTile_InitDepthPointers_z24_s8(rb);
+	    break;
+	 case I915_TILING_Y:
+	    intel_YTile_InitDepthPointers_z24_s8(rb);
+	    break;
+	 }
+      } else if (rb->_ActualFormat == GL_STENCIL_INDEX8_EXT) {
+	 switch (tiling) {
+	 case I915_TILING_NONE:
+	 default:
+	    intelInitStencilPointers_z24_s8(rb);
+	    break;
+	 case I915_TILING_X:
+	    intel_XTile_InitStencilPointers_z24_s8(rb);
+	    break;
+	 case I915_TILING_Y:
+	    intel_YTile_InitStencilPointers_z24_s8(rb);
+	    break;
+	 }
       }
-   }
-   else if (rb->_ActualFormat == GL_STENCIL_INDEX8_EXT) {
-      switch (tiling) {
-      case I915_TILING_NONE:
-      default:
-	 intelInitStencilPointers_z24_s8(rb);
-	 break;
-      case I915_TILING_X:
-	 intel_XTile_InitStencilPointers_z24_s8(rb);
-	 break;
-      case I915_TILING_Y:
-	 intel_YTile_InitStencilPointers_z24_s8(rb);
-	 break;
-      }
-   }
-   else {
+      break;
+   default:
       _mesa_problem(NULL,
-                    "Unexpected _ActualFormat in intelSetSpanFunctions");
+                    "Unexpected MesaFormat in intelSetSpanFunctions");
+      break;
    }
 }
diff --git a/shared/intel_spantmp.h b/shared/intel_spantmp.h
new file mode 100644
index 0000000..ead0b1c
--- /dev/null
+++ b/shared/intel_spantmp.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright © 2009 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+/**
+ * Wrapper around the spantmp2.h macrofest to generate span code for
+ * all the tiling styles (untiled, X-tiled and Y-tiled).
+ */
+
+#define SPANTMP_PIXEL_FMT INTEL_PIXEL_FMT
+#define SPANTMP_PIXEL_TYPE INTEL_PIXEL_TYPE
+#define PUT_VALUE(_x, _y, v) INTEL_WRITE_VALUE(NO_TILE(_x, _y), v)
+#define GET_VALUE(_x, _y) INTEL_READ_VALUE(NO_TILE(_x, _y))
+#define TAG(x) INTEL_TAG(intel##x)
+#define TAG2(x, y) INTEL_TAG(intel##x)##y
+#include "spantmp2.h"
+
+#define SPANTMP_PIXEL_FMT INTEL_PIXEL_FMT
+#define SPANTMP_PIXEL_TYPE INTEL_PIXEL_TYPE
+#define PUT_VALUE(_x, _y, v) INTEL_WRITE_VALUE(X_TILE(_x, _y), v)
+#define GET_VALUE(_x, _y) INTEL_READ_VALUE(X_TILE(_x, _y))
+#define TAG(x) INTEL_TAG(intel_XTile_##x)
+#define TAG2(x, y) INTEL_TAG(intel_XTile_##x)##y
+#include "spantmp2.h"
+
+#define SPANTMP_PIXEL_FMT INTEL_PIXEL_FMT /* Y-tiled variant: pixels are addressed via Y_TILE() */
+#define SPANTMP_PIXEL_TYPE INTEL_PIXEL_TYPE
+#define PUT_VALUE(_x, _y, v) INTEL_WRITE_VALUE(Y_TILE(_x, _y), v)
+#define GET_VALUE(_x, _y) INTEL_READ_VALUE(Y_TILE(_x, _y))
+#define TAG(x) INTEL_TAG(intel_YTile_##x)
+#define TAG2(x, y) INTEL_TAG(intel_YTile_##x)##y
+#include "spantmp2.h"
+
+#undef INTEL_PIXEL_FMT
+#undef INTEL_PIXEL_TYPE
+#undef INTEL_WRITE_VALUE
+#undef INTEL_READ_VALUE
+#undef INTEL_TAG
diff --git a/i915/intel_state.c b/shared/intel_state.c
index 8d96e9b..4ee7423 100644
--- a/i915/intel_state.c
+++ b/shared/intel_state.c
@@ -38,30 +38,30 @@
 #include "intel_regions.h"
 #include "swrast/swrast.h"
 
-int 
-intel_translate_shadow_compare_func( GLenum func )
+int
+intel_translate_shadow_compare_func(GLenum func)
 {
-   switch(func) {
+   switch (func) {
    case GL_NEVER: 
-       return COMPAREFUNC_ALWAYS; 
+       return COMPAREFUNC_ALWAYS;
    case GL_LESS: 
-       return COMPAREFUNC_LEQUAL; 
+       return COMPAREFUNC_LEQUAL;
    case GL_LEQUAL: 
        return COMPAREFUNC_LESS;
    case GL_GREATER: 
-       return COMPAREFUNC_GEQUAL; 
+       return COMPAREFUNC_GEQUAL;
    case GL_GEQUAL: 
-      return COMPAREFUNC_GREATER; 
+      return COMPAREFUNC_GREATER;
    case GL_NOTEQUAL: 
-      return COMPAREFUNC_EQUAL; 
+      return COMPAREFUNC_EQUAL;
    case GL_EQUAL: 
-      return COMPAREFUNC_NOTEQUAL; 
+      return COMPAREFUNC_NOTEQUAL;
    case GL_ALWAYS: 
-       return COMPAREFUNC_NEVER; 
+       return COMPAREFUNC_NEVER;
    }
 
    fprintf(stderr, "Unknown value in %s: %x\n", __FUNCTION__, func);
-   return COMPAREFUNC_NEVER; 
+   return COMPAREFUNC_NEVER;
 }
 
 int
@@ -198,7 +198,7 @@ intel_translate_logic_op(GLenum opcode)
 
 
 static void
-intelClearColor(GLcontext * ctx, const GLfloat color[4])
+intelClearColor(GLcontext *ctx, const GLfloat color[4])
 {
    struct intel_context *intel = intel_context(ctx);
    GLubyte clear[4];
@@ -218,7 +218,7 @@ intelClearColor(GLcontext * ctx, const GLfloat color[4])
 /* Fallback to swrast for select and feedback.
  */
 static void
-intelRenderMode(GLcontext * ctx, GLenum mode)
+intelRenderMode(GLcontext *ctx, GLenum mode)
 {
    struct intel_context *intel = intel_context(ctx);
    FALLBACK(intel, INTEL_FALLBACK_RENDERMODE, (mode != GL_RENDER));
diff --git a/shared/intel_swapbuffers.c b/shared/intel_swapbuffers.c
index c135166..7d035b9 100644
--- a/shared/intel_swapbuffers.c
+++ b/shared/intel_swapbuffers.c
@@ -43,7 +43,6 @@
 GLuint
 intelFixupVblank(struct intel_context *intel, __DRIdrawablePrivate *dPriv)
 {
-
    if (!intel->intelScreen->driScrnPriv->dri2.enabled &&
        intel->intelScreen->driScrnPriv->ddx_version.minor >= 7) {
       volatile drm_i915_sarea_t *sarea = intel->sarea;
@@ -77,11 +76,14 @@ intelFixupVblank(struct intel_context *intel, __DRIdrawablePrivate *dPriv)
 
       return flags;
    } else {
-	return dPriv->vblFlags & ~VBLANK_FLAG_SECONDARY;
+      return dPriv->vblFlags & ~VBLANK_FLAG_SECONDARY;
    }
 }
 
 
+/**
+ * Called from driSwapBuffers()
+ */
 void
 intelSwapBuffers(__DRIdrawablePrivate * dPriv)
 {
@@ -130,7 +132,6 @@ intelSwapBuffers(__DRIdrawablePrivate * dPriv)
 	 intel_fb->swap_ust = ust;
       }
       drmCommandNone(intel->driFd, DRM_I915_GEM_THROTTLE);
-
    }
    else {
       /* XXX this shouldn't be an error but we can't handle it for now */
@@ -138,6 +139,10 @@ intelSwapBuffers(__DRIdrawablePrivate * dPriv)
    }
 }
 
+
+/**
+ * Called from driCopySubBuffer()
+ */
 void
 intelCopySubBuffer(__DRIdrawablePrivate * dPriv, int x, int y, int w, int h)
 {
diff --git a/shared/intel_tex.c b/shared/intel_tex.c
index e64d8a1..ae0994b 100644
--- a/shared/intel_tex.c
+++ b/shared/intel_tex.c
@@ -93,7 +93,7 @@ intelFreeTextureImageData(GLcontext * ctx, struct gl_texture_image *texImage)
 static void *
 do_memcpy(void *dest, const void *src, size_t n)
 {
-   if ((((unsigned) src) & 63) || (((unsigned) dest) & 63)) {
+   if ((((unsigned long) src) & 63) || (((unsigned long) dest) & 63)) {
       return __memcpy(dest, src, n);
    }
    else
diff --git a/shared/intel_tex.h b/shared/intel_tex.h
index 742ccc0..f5372d8 100644
--- a/shared/intel_tex.h
+++ b/shared/intel_tex.h
@@ -149,6 +149,8 @@ void intelSetTexOffset(__DRIcontext *pDRICtx, GLint texname,
 		       unsigned long long offset, GLint depth, GLuint pitch);
 void intelSetTexBuffer(__DRIcontext *pDRICtx,
 		       GLint target, __DRIdrawable *pDraw);
+void intelSetTexBuffer2(__DRIcontext *pDRICtx,
+			GLint target, GLint format, __DRIdrawable *pDraw);
 
 GLuint intel_finalize_mipmap_tree(struct intel_context *intel, GLuint unit);
 
diff --git a/shared/intel_tex_copy.c b/shared/intel_tex_copy.c
index a7143b8..90bbb8c 100644
--- a/shared/intel_tex_copy.c
+++ b/shared/intel_tex_copy.c
@@ -104,7 +104,7 @@ do_copy_texsubimage(struct intel_context *intel,
       return GL_FALSE;
    }
 
-   intel_glFlush(ctx);
+   intelFlush(ctx);
    LOCK_HARDWARE(intel);
    {
       GLuint image_offset = intel_miptree_image_offset(intelImage->mt,
@@ -155,7 +155,6 @@ do_copy_texsubimage(struct intel_context *intel,
    }
 
    UNLOCK_HARDWARE(intel);
-   intel_glFlush(ctx);
 
    /* GL_SGIS_generate_mipmap */
    if (intelImage->level == texObj->BaseLevel && texObj->GenerateMipmap) {
@@ -232,6 +231,14 @@ intelCopyTexImage2D(GLcontext * ctx, GLenum target, GLint level,
    if (border)
       goto fail;
 
+   /* Setup or redefine the texture object, mipmap tree and texture
+    * image.  Don't populate yet.
+    */
+   ctx->Driver.TexImage2D(ctx, target, level, internalFormat,
+                          width, height, border,
+                          GL_RGBA, CHAN_TYPE, NULL,
+                          &ctx->DefaultPacking, texObj, texImage);
+
    srcx = x;
    srcy = y;
    dstx = 0;
@@ -242,15 +249,6 @@ intelCopyTexImage2D(GLcontext * ctx, GLenum target, GLint level,
 				   &width, &height))
       return;
 
-   /* Setup or redefine the texture object, mipmap tree and texture
-    * image.  Don't populate yet.  
-    */
-   ctx->Driver.TexImage2D(ctx, target, level, internalFormat,
-                          width, height, border,
-                          GL_RGBA, CHAN_TYPE, NULL,
-                          &ctx->DefaultPacking, texObj, texImage);
-
-
    if (!do_copy_texsubimage(intel_context(ctx), target,
                             intel_texture_image(texImage),
                             internalFormat, 0, 0, x, y, width, height))
diff --git a/shared/intel_tex_format.c b/shared/intel_tex_format.c
index 5e418ac..3322a71 100644
--- a/shared/intel_tex_format.c
+++ b/shared/intel_tex_format.c
@@ -1,13 +1,18 @@
 #include "intel_context.h"
 #include "intel_tex.h"
+#include "intel_chipset.h"
 #include "main/texformat.h"
 #include "main/enums.h"
 
-/* It works out that this function is fine for all the supported
+
+/**
+ * Choose hardware texture format given the user's glTexImage parameters.
+ *
+ * It works out that this function is fine for all the supported
  * hardware.  However, there is still a need to map the formats onto
  * hardware descriptors.
- */
-/* Note that the i915 can actually support many more formats than
+ *
+ * Note that the i915 can actually support many more formats than
  * these if we take the step of simply swizzling the colors
  * immediately after sampling...
  */
@@ -16,7 +21,12 @@ intelChooseTextureFormat(GLcontext * ctx, GLint internalFormat,
                          GLenum format, GLenum type)
 {
    struct intel_context *intel = intel_context(ctx);
-   const GLboolean do32bpt = (intel->ctx.Visual.rgbBits == 32);
+   const GLboolean do32bpt = (intel->ctx.Visual.rgbBits >= 24);
+
+#if 0
+   printf("%s intFmt=0x%x format=0x%x type=0x%x\n",
+          __FUNCTION__, internalFormat, format, type);
+#endif
 
    switch (internalFormat) {
    case 4:
@@ -151,20 +161,36 @@ intelChooseTextureFormat(GLcontext * ctx, GLint internalFormat,
    case GL_SRGB8_EXT:
    case GL_SRGB_ALPHA_EXT:
    case GL_SRGB8_ALPHA8_EXT:
-   case GL_SLUMINANCE_EXT:
-   case GL_SLUMINANCE8_EXT:
-   case GL_SLUMINANCE_ALPHA_EXT:
-   case GL_SLUMINANCE8_ALPHA8_EXT:
    case GL_COMPRESSED_SRGB_EXT:
    case GL_COMPRESSED_SRGB_ALPHA_EXT:
    case GL_COMPRESSED_SLUMINANCE_EXT:
    case GL_COMPRESSED_SLUMINANCE_ALPHA_EXT:
-       return &_mesa_texformat_srgba8;
+      return &_mesa_texformat_sargb8;
+   case GL_SLUMINANCE_EXT:
+   case GL_SLUMINANCE8_EXT:
+      if (IS_G4X(intel->intelScreen->deviceID))
+         return &_mesa_texformat_sl8;
+      else
+         return &_mesa_texformat_sargb8;
+   case GL_SLUMINANCE_ALPHA_EXT:
+   case GL_SLUMINANCE8_ALPHA8_EXT:
+      if (IS_G4X(intel->intelScreen->deviceID))
+         return &_mesa_texformat_sla8;
+      else
+         return &_mesa_texformat_sargb8;
    case GL_COMPRESSED_SRGB_S3TC_DXT1_EXT:
    case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT:
    case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT:
    case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT:
       return &_mesa_texformat_srgb_dxt1;
+
+   /* i915 could also do this */
+   case GL_DUDV_ATI:
+   case GL_DU8DV8_ATI:
+      return &_mesa_texformat_dudv8;
+   case GL_RGBA_SNORM:
+   case GL_RGBA8_SNORM:
+      return &_mesa_texformat_signed_rgba8888_rev;
 #endif
 
    default:
diff --git a/shared/intel_tex_image.c b/shared/intel_tex_image.c
index 90894f9..5e61e9e 100644
--- a/shared/intel_tex_image.c
+++ b/shared/intel_tex_image.c
@@ -12,6 +12,7 @@
 #include "main/simple_list.h"
 #include "main/texcompress.h"
 #include "main/texformat.h"
+#include "main/texgetimage.h"
 #include "main/texobj.h"
 #include "main/texstore.h"
 #include "main/teximage.h"
@@ -207,11 +208,12 @@ try_pbo_upload(struct intel_context *intel,
    if (!pbo ||
        intel->ctx._ImageTransferState ||
        unpack->SkipPixels || unpack->SkipRows) {
-      _mesa_printf("%s: failure 1\n", __FUNCTION__);
+      DBG("%s: failure 1\n", __FUNCTION__);
       return GL_FALSE;
    }
 
-   src_offset = (GLuint) pixels;
+   /* note: potential 64-bit ptr to 32-bit int cast */
+   src_offset = (GLuint) (unsigned long) pixels;
 
    if (unpack->RowLength > 0)
       src_stride = unpack->RowLength;
@@ -262,11 +264,12 @@ try_pbo_zcopy(struct intel_context *intel,
    if (!pbo ||
        intel->ctx._ImageTransferState ||
        unpack->SkipPixels || unpack->SkipRows) {
-      _mesa_printf("%s: failure 1\n", __FUNCTION__);
+      DBG("%s: failure 1\n", __FUNCTION__);
       return GL_FALSE;
    }
 
-   src_offset = (GLuint) pixels;
+   /* note: potential 64-bit ptr to 32-bit int cast */
+   src_offset = (GLuint) (unsigned long) pixels;
 
    if (unpack->RowLength > 0)
       src_stride = unpack->RowLength;
@@ -280,7 +283,7 @@ try_pbo_zcopy(struct intel_context *intel,
    dst_stride = intelImage->mt->pitch;
 
    if (src_stride != dst_stride || dst_offset != 0 || src_offset != 0) {
-      _mesa_printf("%s: failure 2\n", __FUNCTION__);
+      DBG("%s: failure 2\n", __FUNCTION__);
       return GL_FALSE;
    }
 
@@ -312,8 +315,8 @@ intelTexImage(GLcontext * ctx,
    GLint postConvWidth = width;
    GLint postConvHeight = height;
    GLint texelBytes, sizeInBytes;
-   GLuint dstRowStride, srcRowStride = texImage->RowStride;
-
+   GLuint dstRowStride = 0, srcRowStride = texImage->RowStride;
+   GLboolean needs_map;
 
    DBG("%s target %s level %d %dx%dx%d border %d\n", __FUNCTION__,
        _mesa_lookup_enum_by_nr(target), level, width, height, depth, border);
@@ -479,13 +482,21 @@ intelTexImage(GLcontext * ctx,
 
    LOCK_HARDWARE(intel);
 
+   /* There are two cases where we need a mapping of the miptree: when the
+    * user-supplied data is mapped as well (non-PBO, memcpy upload), or when
+    * we're going to do (software) mipmap generation.
+    */
+   needs_map = (pixels != NULL) || (level == texObj->BaseLevel &&
+				  texObj->GenerateMipmap);
+
    if (intelImage->mt) {
-      texImage->Data = intel_miptree_image_map(intel,
-                                               intelImage->mt,
-                                               intelImage->face,
-                                               intelImage->level,
-                                               &dstRowStride,
-                                               intelImage->base.ImageOffsets);
+      if (needs_map)
+         texImage->Data = intel_miptree_image_map(intel,
+                                                  intelImage->mt,
+                                                  intelImage->face,
+                                                  intelImage->level,
+                                                  &dstRowStride,
+                                                  intelImage->base.ImageOffsets);
       texImage->RowStride = dstRowStride / intelImage->mt->cpp;
    }
    else {
@@ -505,8 +516,9 @@ intelTexImage(GLcontext * ctx,
    }
 
    DBG("Upload image %dx%dx%d row_len %d "
-       "pitch %d\n",
-       width, height, depth, width * texelBytes, dstRowStride);
+       "pitch %d pixels %d compressed %d\n",
+       width, height, depth, width * texelBytes, dstRowStride,
+       pixels ? 1 : 0, compressed);
 
    /* Copy data.  Would like to know when it's ok for us to eg. use
     * the blitter to copy.  Or, use the hardware to do the format
@@ -519,7 +531,7 @@ intelTexImage(GLcontext * ctx,
 	       _mesa_copy_rect(texImage->Data, dst->cpp, dst->pitch,
 			       0, 0,
 			       intelImage->mt->level[level].width,
-			       intelImage->mt->level[level].height/4,
+			       (intelImage->mt->level[level].height+3)/4,
 			       pixels,
 			       srcRowStride,
 			       0, 0);
@@ -535,17 +547,18 @@ intelTexImage(GLcontext * ctx,
 						   format, type, pixels, unpack)) {
 	   _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage");
        }
-   }
 
-   /* GL_SGIS_generate_mipmap */
-   if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      intel_generate_mipmap(ctx, target, texObj);
+       /* GL_SGIS_generate_mipmap */
+       if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
+	  intel_generate_mipmap(ctx, target, texObj);
+       }
    }
 
    _mesa_unmap_teximage_pbo(ctx, unpack);
 
    if (intelImage->mt) {
-      intel_miptree_image_unmap(intel, intelImage->mt);
+      if (needs_map)
+         intel_miptree_image_unmap(intel, intelImage->mt);
       texImage->Data = NULL;
    }
 
@@ -624,6 +637,12 @@ intel_get_tex_image(GLcontext * ctx, GLenum target, GLint level,
    struct intel_context *intel = intel_context(ctx);
    struct intel_texture_image *intelImage = intel_texture_image(texImage);
 
+   /* If we're reading from a texture that has been rendered to, we need to
+    * make sure rendering is complete.
+    * We could probably predicate this on texObj->_RenderToTexture.
+    */
+   intelFlush(ctx);
+
    /* Map */
    if (intelImage->mt) {
       /* Image is stored in hardware format in a buffer managed by the
@@ -712,7 +731,9 @@ intelSetTexOffset(__DRIcontext *pDRICtx, GLint texname,
 }
 
 void
-intelSetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
+intelSetTexBuffer2(__DRIcontext *pDRICtx, GLint target,
+		   GLint glx_texture_format,
+		   __DRIdrawable *dPriv)
 {
    struct intel_framebuffer *intel_fb = dPriv->driverPrivate;
    struct intel_context *intel = pDRICtx->driverPrivate;
@@ -743,7 +764,10 @@ intelSetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
 
    type = GL_BGRA;
    format = GL_UNSIGNED_BYTE;
-   internalFormat = (rb->region->cpp == 3 ? 3 : 4);
+   if (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT)
+      internalFormat = GL_RGB;
+   else
+      internalFormat = GL_RGBA;
 
    mt = intel_miptree_create_for_region(intel, target,
 					internalFormat,
@@ -783,3 +807,12 @@ intelSetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
 
    _mesa_unlock_texture(&intel->ctx, texObj);
 }
+
+void
+intelSetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
+{
+   /* The old interface didn't have the format argument, so pass RGBA to
+    * copy our implementation's behavior at the time.
+    */
+   intelSetTexBuffer2(pDRICtx, target, GLX_TEXTURE_FORMAT_RGBA_EXT, dPriv);
+}