Import i915 and i965 dri drivers from mesa 7.0.4 (Debian's 7.0.3-7). [ref: 7.0.4]

author: Luc Verhaegen <libv@skynet.be> 2010-03-12 19:46:04 +0100
committer: Luc Verhaegen <libv@skynet.be> 2010-03-12 19:46:04 +0100
commit: 5c28087bfde504266a79dbbc8aef480009d88d2f (patch)
tree: 91c96846540390f5cb3c0daf36b17f1f65af9621
parent: 6e23622cb869c14d82f8c901c4bbea80ded6220e (diff)
72 files changed, 2816 insertions, 496 deletions
diff --git a/configure.ac b/configure.ac
index 70d46ac..4723f7f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,7 +1,7 @@
 # Process this file with autoconf to produce a configure script
 
 AC_PREREQ(2.57)
-AC_INIT([mesa-dri-i9xx], 7.0.3, [], mesa-dri-i9xx)
+AC_INIT([mesa-dri-i9xx], 7.0.4, [], mesa-dri-i9xx)
 
 AM_INIT_AUTOMAKE([dist-bzip2])
 
@@ -16,8 +16,9 @@ AC_PROG_CC
 AC_HEADER_STDC
 
 PKG_CHECK_MODULES([DRM], [libdrm >= 2.3.0])
-PKG_CHECK_MODULES([DRI], [libmesadri >= 7.0.3 libmesadri < 7.1.0
-			  libmesadricommon >= 7.0.3 libmesadricommon < 7.1.0])
+# needs MESA_FORMAT_SRGB_DXT1
+PKG_CHECK_MODULES([DRI], [libmesadri >= 7.0.4 libmesadri < 7.1.0
+			  libmesadricommon >= 7.0.4 libmesadricommon < 7.1.0])
 
 AC_OUTPUT([
 	Makefile
diff --git a/i915/i915_context.h b/i915/i915_context.h
index ec15501..7b51d77 100644
--- a/i915/i915_context.h
+++ b/i915/i915_context.h
@@ -29,6 +29,7 @@
 #define I915CONTEXT_INC
 
 #include "intel_context.h"
+#include "i915_reg.h"
 
 #define I915_FALLBACK_TEXTURE		 0x1000
 #define I915_FALLBACK_COLORMASK		 0x2000
@@ -103,6 +104,7 @@
 
 #define I915_PROGRAM_SIZE      192
 
+#define I915_MAX_INSN          (I915_MAX_TEX_INSN+I915_MAX_ALU_INSN)
 
 /* Hardware version of a parsed fragment program.  "Derived" from the
  * mesa fragment_program struct.
@@ -153,6 +155,10 @@ struct i915_fragment_program {
 				 */
 
 
+   /* Track which R registers are "live" for each instruction.
+    * A register is live between the time it's written to and the last time
+    * it's read. */
+   GLuint usedRegs[I915_MAX_INSN];
 
    /* Helpers for i915_fragprog.c:
     */
diff --git a/i915/i915_fragprog.c b/i915/i915_fragprog.c
index a28c8bb..c46ef24 100644
--- a/i915/i915_fragprog.c
+++ b/i915/i915_fragprog.c
@@ -42,7 +42,20 @@
 #include "program.h"
 #include "programopt.h"
 
-
+/* Coefficients for the quadratic SIN/COS approximation, picked by swizzle */
+static const GLfloat sin_quad_constants[2][4] = {
+   {
+      2.0,                  /* [0].x: scale applied after FRC */
+      -1.0,                 /* [0].y: bias applied after FRC */
+      0.5,                  /* [0].z: offset added before FRC for SIN */
+      0.75                  /* [0].w: offset added before FRC for COS */
+   }, {
+      4.0,                  /* [1].x: weight of x in the first estimate */
+      -4.0,                 /* [1].y: weight of x * abs(x) */
+      1.0 / (2.0 * M_PI),   /* [1].z: multiplier applied to the input angle */
+      0.2225                /* [1].w: weight of the correction term */
+   }
+};
 
 /* 1, -1/3!, 1/5!, -1/7! */
 static const GLfloat sin_constants[4] = {  1.0, 
@@ -91,7 +104,7 @@ static GLuint src_vector( struct i915_fragment_program *p,
 	    break;
 	 case FRAG_ATTRIB_FOGC:
 	    src = i915_emit_decl( p,  REG_TYPE_T, T_FOG_W, D0_CHANNEL_W ); 
-	    src = swizzle( src, W, W, W, W );
+	    src = swizzle(src, W, ZERO, ZERO, ONE);
 	    break;
 	 case FRAG_ATTRIB_TEX0:
 	 case FRAG_ATTRIB_TEX1:
@@ -211,7 +224,7 @@ do {								\
    GLuint coord = src_vector( p, &inst->SrcReg[0], program);	\
    /* Texel lookup */						\
 								\
-   i915_emit_texld( p,						\
+   i915_emit_texld( p, get_live_regs(p, inst),						\
 	       get_result_vector( p, inst ),			\
 	       get_result_flags( inst ),			\
 	       sampler,						\
@@ -234,6 +247,43 @@ do {									\
 #define EMIT_2ARG_ARITH( OP ) EMIT_ARITH( OP, 2 )
 #define EMIT_3ARG_ARITH( OP ) EMIT_ARITH( OP, 3 )
 
+/* Backwards liveness scan: p->usedRegs[i] gets one bit per R register that
+ * must not be reused at instruction i.  TODO: consider moving this into core.
+ */
+static void calc_live_regs( struct i915_fragment_program *p )
+{
+    const struct gl_fragment_program *program = p->ctx->FragmentProgram._Current;
+    GLuint regsUsed = 0xffff0000;   /* bits 16..31 pre-set: never reported free */
+    GLint i;
+    for (i = program->Base.NumInstructions - 1; i >= 0; i--) {
+        struct prog_instruction *inst = &program->Base.Instructions[i];
+        int opArgs = _mesa_num_inst_src_regs(inst->Opcode);
+        int a;
+
+        /* Only a write to all four channels ends the old value's life; a
+         * partial write leaves the other channels needed by later reads. */
+        if (inst->DstReg.File == PROGRAM_TEMPORARY && inst->DstReg.Index < 16 &&
+            inst->DstReg.WriteMask == WRITEMASK_XYZW)
+            regsUsed &= ~(1u << inst->DstReg.Index);
+        /* Register is read from: mark as live for this and preceding ops */
+        for (a = 0; a < opArgs; a++) {
+            if (inst->SrcReg[a].File == PROGRAM_TEMPORARY &&
+                (GLuint) inst->SrcReg[a].Index < 16)   /* also rejects negatives */
+                regsUsed |= 1u << inst->SrcReg[a].Index;
+        }
+
+        p->usedRegs[i] = regsUsed;
+    }
+}
+
+static GLuint get_live_regs( struct i915_fragment_program *p, 
+                             const struct prog_instruction *inst )
+{
+    /* inst's offset within the current program's array picks the entry */
+    const struct gl_fragment_program *cur = p->ctx->FragmentProgram._Current;
+
+    return p->usedRegs[(GLuint) (inst - cur->Base.Instructions)];
+}
+ 
 
 /* Possible concerns:
  *
@@ -267,9 +317,18 @@ static void upload_program( struct i915_fragment_program *p )
       return;
    }
 
+   if (program->Base.NumInstructions > I915_MAX_INSN) {
+       i915_program_error( p, "Exceeded max instructions" );
+       return;
+    }
+
+   /* Not always needed:
+    */
+   calc_live_regs(p);
+
    while (1) {
       GLuint src0, src1, src2, flags;
-      GLuint tmp = 0;
+      GLuint tmp = 0, consts0 = 0, consts1 = 0;
 
       switch (inst->Opcode) {
       case OPCODE_ABS: 
@@ -297,67 +356,87 @@ static void upload_program( struct i915_fragment_program *p )
 	 break;
 
       case OPCODE_COS:
-	 src0 = src_vector( p, &inst->SrcReg[0], program);
-	 tmp = i915_get_utemp( p );
-
-	 i915_emit_arith( p, 
-			 A0_MUL,
-			 tmp, A0_DEST_CHANNEL_X, 0,
-			 src0, 
-			 i915_emit_const1f(p, 1.0/(M_PI * 2)),
-			 0);
-
-	 i915_emit_arith( p, 
-			 A0_MOD,
+         src0 = src_vector(p, &inst->SrcReg[0], program);
+         tmp = i915_get_utemp(p);
+	 consts0 = i915_emit_const4fv(p, sin_quad_constants[0]);
+	 consts1 = i915_emit_const4fv(p, sin_quad_constants[1]);
+
+	 /* Reduce range from repeating about [-pi,pi] to [-1,1] */
+         i915_emit_arith(p,
+                         A0_MAD,
+                         tmp, A0_DEST_CHANNEL_X, 0,
+                         src0,
+			 swizzle(consts1, Z, ZERO, ZERO, ZERO), /* 1/(2pi) */
+			 swizzle(consts0, W, ZERO, ZERO, ZERO)); /* .75 */
+
+         i915_emit_arith(p, A0_FRC, tmp, A0_DEST_CHANNEL_X, 0, tmp, 0, 0);
+
+	 i915_emit_arith(p,
+			 A0_MAD,
 			 tmp, A0_DEST_CHANNEL_X, 0,
-			 tmp, 
-			 0, 0 );
+			 tmp,
+			 swizzle(consts0, X, ZERO, ZERO, ZERO), /* 2 */
+			 swizzle(consts0, Y, ZERO, ZERO, ZERO)); /* -1 */
 
-	 /* By choosing different taylor constants, could get rid of this mul:
+	 /* Compute COS with the same calculation used for SIN, but a
+	  * different source range has been mapped to [-1,1] this time.
 	  */
-	 i915_emit_arith( p, 
-			 A0_MUL,
-			 tmp, A0_DEST_CHANNEL_X, 0,
-			 tmp, 
-			 i915_emit_const1f(p, (M_PI * 2)),
+
+	 /* tmp.y = abs(tmp.x); {x, abs(x), 0, 0} */
+	 i915_emit_arith(p,
+                         A0_MAX,
+			 tmp, A0_DEST_CHANNEL_Y, 0,
+			 swizzle(tmp, ZERO, X, ZERO, ZERO),
+			 negate(swizzle(tmp, ZERO, X, ZERO, ZERO), 0, 1, 0, 0),
 			 0);
 
-	 /* 
-	  * t0.xy = MUL x.xx11, x.x1111  ; x^2, x, 1, 1
-	  * t0 = MUL t0.xyxy t0.xx11 ; x^4, x^3, x^2, 1
-	  * t0 = MUL t0.xxz1 t0.z111    ; x^6 x^4 x^2 1
-	  * result = DP4 t0, cos_constants
-	  */
-	 i915_emit_arith( p, 
+	 /* tmp.y = tmp.y * tmp.x; {x, x * abs(x), 0, 0} */
+	 i915_emit_arith(p,
 			 A0_MUL,
-			 tmp, A0_DEST_CHANNEL_XY, 0,
-			 swizzle(tmp, X,X,ONE,ONE), 
-			 swizzle(tmp, X,ONE,ONE,ONE), 0);
+			 tmp, A0_DEST_CHANNEL_Y, 0,
+			 swizzle(tmp, ZERO, X, ZERO, ZERO),
+			 tmp,
+			 0);
 
-	 i915_emit_arith( p, 
-			 A0_MUL,
-			 tmp, A0_DEST_CHANNEL_XYZ, 0,
-			 swizzle(tmp, X,Y,X,ONE), 
-			 swizzle(tmp, X,X,ONE,ONE), 0);
+	 /* tmp.x = tmp.xy DP sin_quad_constants[1].xy */
+         i915_emit_arith(p,
+                         A0_DP3,
+                         tmp, A0_DEST_CHANNEL_X, 0,
+			 tmp,
+                         swizzle(consts1, X, Y, ZERO, ZERO),
+			 0);
 
-	 i915_emit_arith( p, 
-			 A0_MUL,
-			 tmp, A0_DEST_CHANNEL_XYZ, 0,
-			 swizzle(tmp, X,X,Z,ONE), 
-			 swizzle(tmp, Z,ONE,ONE,ONE), 0);
-	    
-	 i915_emit_arith( p, 
-			 A0_DP4,
-			 get_result_vector( p, inst ), 
-			 get_result_flags( inst ), 0,
-			 swizzle(tmp, ONE,Z,Y,X),
-			 i915_emit_const4fv( p, cos_constants ), 0);
+	 /* tmp.x now contains a first approximation (y).  Now, weight it
+	  * against tmp.y**2 to get closer.
+	  */
+	 i915_emit_arith(p,
+                         A0_MAX,
+			 tmp, A0_DEST_CHANNEL_Y, 0,
+			 swizzle(tmp, ZERO, X, ZERO, ZERO),
+			 negate(swizzle(tmp, ZERO, X, ZERO, ZERO), 0, 1, 0, 0),
+			 0);
 
-	 break;
+	 /* tmp.y = tmp.x * tmp.y - tmp.x; {y, y * abs(y) - y, 0, 0} */
+	 i915_emit_arith(p,
+			 A0_MAD,
+			 tmp, A0_DEST_CHANNEL_Y, 0,
+			 swizzle(tmp, ZERO, X, ZERO, ZERO),
+			 swizzle(tmp, ZERO, Y, ZERO, ZERO),
+			 negate(swizzle(tmp, ZERO, X, ZERO, ZERO), 0, 1, 0, 0));
 
-      case OPCODE_DP3: 
-	 EMIT_2ARG_ARITH( A0_DP3 );
-	 break;
+	 /* result = .2225 * tmp.y + tmp.x = .2225 * (y * abs(y) - y) + y */
+	 i915_emit_arith(p,
+			 A0_MAD,
+                         get_result_vector(p, inst),
+                         get_result_flags(inst), 0,
+			 swizzle(consts1, W, W, W, W),
+			 swizzle(tmp, Y, Y, Y, Y),
+			 swizzle(tmp, X, X, X, X));
+         break;
+
+      case OPCODE_DP3:
+         EMIT_2ARG_ARITH(A0_DP3);
+         break;
 
       case OPCODE_DP4: 
 	 EMIT_2ARG_ARITH( A0_DP4 );
@@ -414,11 +493,9 @@ static void upload_program( struct i915_fragment_program *p )
 	 src0 = src_vector( p, &inst->SrcReg[0], program);
 	 tmp = i915_get_utemp( p );
 
-	 i915_emit_texld( p,
-			 tmp, A0_DEST_CHANNEL_ALL, /* use a dummy dest reg */
-			 0,
-			 src0,
-			 T0_TEXKILL );
+	 i915_emit_texld(p, get_live_regs(p, inst),
+			 tmp, A0_DEST_CHANNEL_ALL,   /* use a dummy dest reg */
+			 0, src0, T0_TEXKILL);
 	 break;
 
       case OPCODE_LG2: 
@@ -638,62 +715,86 @@ static void upload_program( struct i915_fragment_program *p )
 	 break;
 
       case OPCODE_SIN:
-	 src0 = src_vector( p, &inst->SrcReg[0], program);
-	 tmp = i915_get_utemp( p );
+         src0 = src_vector(p, &inst->SrcReg[0], program);
+         tmp = i915_get_utemp(p);
+	 consts0 = i915_emit_const4fv(p, sin_quad_constants[0]);
+	 consts1 = i915_emit_const4fv(p, sin_quad_constants[1]);
+
+	 /* Reduce range from repeating about [-pi,pi] to [-1,1] */
+         i915_emit_arith(p,
+                         A0_MAD,
+                         tmp, A0_DEST_CHANNEL_X, 0,
+                         src0,
+			 swizzle(consts1, Z, ZERO, ZERO, ZERO), /* 1/(2pi) */
+			 swizzle(consts0, Z, ZERO, ZERO, ZERO)); /* .5 */
+
+         i915_emit_arith(p, A0_FRC, tmp, A0_DEST_CHANNEL_X, 0, tmp, 0, 0);
+	 i915_emit_arith(p,
+			 A0_MAD,
+			 tmp, A0_DEST_CHANNEL_X, 0,
+			 tmp,
+			 swizzle(consts0, X, ZERO, ZERO, ZERO), /* 2 */
+			 swizzle(consts0, Y, ZERO, ZERO, ZERO)); /* -1 */
 
-	 i915_emit_arith( p, 
+	 /* Compute sin using a quadratic and quartic.  It gives continuity
+	  * that repeating the Taylor series lacks every 2*pi, and has
+	  * reduced error.
+	  *
+	  * The idea was described at:
+	  * http://www.devmaster.net/forums/showthread.php?t=5784
+	  */
+	 /* tmp.y = abs(tmp.x); {x, abs(x), 0, 0} */
+	 i915_emit_arith(p,
+                         A0_MAX,
+			 tmp, A0_DEST_CHANNEL_Y, 0,
+			 swizzle(tmp, ZERO, X, ZERO, ZERO),
+			 negate(swizzle(tmp, ZERO, X, ZERO, ZERO), 0, 1, 0, 0),
+			 0);
+
+	 /* tmp.y = tmp.y * tmp.x; {x, x * abs(x), 0, 0} */
+	 i915_emit_arith(p,
 			 A0_MUL,
-			 tmp, A0_DEST_CHANNEL_X, 0,
-			 src0, 
-			 i915_emit_const1f(p, 1.0/(M_PI * 2)),
+			 tmp, A0_DEST_CHANNEL_Y, 0,
+			 swizzle(tmp, ZERO, X, ZERO, ZERO),
+			 tmp,
 			 0);
 
-	 i915_emit_arith( p, 
-			 A0_MOD,
-			 tmp, A0_DEST_CHANNEL_X, 0,
-			 tmp, 
-			 0, 0 );
+	 /* tmp.x = tmp.xy DP sin_quad_constants[1].xy */
+         i915_emit_arith(p,
+                         A0_DP3,
+                         tmp, A0_DEST_CHANNEL_X, 0,
+			 tmp,
+                         swizzle(consts1, X, Y, ZERO, ZERO),
+			 0);
 
-	 /* By choosing different taylor constants, could get rid of this mul:
+	 /* tmp.x now contains a first approximation (y).  Now, weight it
+	  * against tmp.y**2 to get closer.
 	  */
-	 i915_emit_arith( p, 
-			 A0_MUL,
-			 tmp, A0_DEST_CHANNEL_X, 0,
-			 tmp, 
-			 i915_emit_const1f(p, (M_PI * 2)),
+	 i915_emit_arith(p,
+                         A0_MAX,
+			 tmp, A0_DEST_CHANNEL_Y, 0,
+			 swizzle(tmp, ZERO, X, ZERO, ZERO),
+			 negate(swizzle(tmp, ZERO, X, ZERO, ZERO), 0, 1, 0, 0),
 			 0);
 
-	 /* 
-	  * t0.xy = MUL x.xx11, x.x1111  ; x^2, x, 1, 1
-	  * t0 = MUL t0.xyxy t0.xx11 ; x^4, x^3, x^2, x
-	  * t1 = MUL t0.xyyw t0.yz11    ; x^7 x^5 x^3 x
-	  * result = DP4 t1.wzyx, sin_constants
-	  */
-	 i915_emit_arith( p, 
-			 A0_MUL,
-			 tmp, A0_DEST_CHANNEL_XY, 0,
-			 swizzle(tmp, X,X,ONE,ONE), 
-			 swizzle(tmp, X,ONE,ONE,ONE), 0);
+	 /* tmp.y = tmp.x * tmp.y - tmp.x; {y, y * abs(y) - y, 0, 0} */
+	 i915_emit_arith(p,
+			 A0_MAD,
+			 tmp, A0_DEST_CHANNEL_Y, 0,
+			 swizzle(tmp, ZERO, X, ZERO, ZERO),
+			 swizzle(tmp, ZERO, Y, ZERO, ZERO),
+			 negate(swizzle(tmp, ZERO, X, ZERO, ZERO), 0, 1, 0, 0));
 
-	 i915_emit_arith( p, 
-			 A0_MUL,
-			 tmp, A0_DEST_CHANNEL_ALL, 0,
-			 swizzle(tmp, X,Y,X,Y), 
-			 swizzle(tmp, X,X,ONE,ONE), 0);
+	 /* result = .2225 * tmp.y + tmp.x = .2225 * (y * abs(y) - y) + y */
+	 i915_emit_arith(p,
+			 A0_MAD,
+                         get_result_vector(p, inst),
+                         get_result_flags(inst), 0,
+			 swizzle(consts1, W, W, W, W),
+			 swizzle(tmp, Y, Y, Y, Y),
+			 swizzle(tmp, X, X, X, X));
 
-	 i915_emit_arith( p, 
-			 A0_MUL,
-			 tmp, A0_DEST_CHANNEL_ALL, 0,
-			 swizzle(tmp, X,Y,Y,W), 
-			 swizzle(tmp, X,Z,ONE,ONE), 0);
-	    
-	 i915_emit_arith( p, 
-			 A0_DP4,
-			 get_result_vector( p, inst ), 
-			 get_result_flags( inst ), 0,
-			 swizzle(tmp, W, Z, Y, X ),
-			 i915_emit_const4fv( p, sin_constants ), 0);
-	 break;
+         break;
 
       case OPCODE_SLT: 
 	 EMIT_2ARG_ARITH( A0_SLT );
diff --git a/i915/i915_program.c b/i915/i915_program.c
index 6849112..31be4b5 100644
--- a/i915/i915_program.c
+++ b/i915/i915_program.c
@@ -194,27 +194,43 @@ GLuint i915_emit_arith( struct i915_fragment_program *p,
    return dest;
 }
 
+static GLuint get_free_rreg (struct i915_fragment_program *p, 
+                             GLuint live_regs)
+{
+    /* ffs() is 1-based; 0 means live_regs has no clear bit left */
+    int first_free = ffs(~live_regs);
+    if (first_free != 0)
+        return UREG(REG_TYPE_R, first_free - 1);
+    i915_program_error(p, "Can't find free R reg");
+    return UREG_BAD;
+}
+
 GLuint i915_emit_texld( struct i915_fragment_program *p,
+			GLuint live_regs,               
 			GLuint dest,
 			GLuint destmask,
 			GLuint sampler,
 			GLuint coord,
 			GLuint op )
 {
-   if (coord != UREG(GET_UREG_TYPE(coord), GET_UREG_NR(coord))) {
-      /* No real way to work around this in the general case - need to
-       * allocate and declare a new temporary register (a utemp won't
-       * do).  Will fallback for now.
-       */
-      i915_program_error(p, "Can't (yet) swizzle TEX arguments");
-      return 0;
-   }
-
-   /* Don't worry about saturate as we only support  
+    if (coord != UREG(GET_UREG_TYPE(coord), GET_UREG_NR(coord))) {
+        /* With the help of the "needed registers" table created earlier, pick
+         * a register we can MOV the swizzled TC to (since TEX doesn't support
+         * swizzled sources) */
+        GLuint swizCoord = get_free_rreg(p, live_regs);
+        if (swizCoord == UREG_BAD) 
+            return 0;
+
+        i915_emit_arith( p, A0_MOV, swizCoord, A0_DEST_CHANNEL_ALL, 0, coord, 0, 0 );
+        coord = swizCoord;
+    }
+
+   /* Don't worry about saturate as we only support texture formats
+    * that are always in the 0..1 range.
     */
    if (destmask != A0_DEST_CHANNEL_ALL) {
       GLuint tmp = i915_get_utemp(p);
-      i915_emit_texld( p, tmp, A0_DEST_CHANNEL_ALL, sampler, coord, op );
+      i915_emit_texld( p, 0, tmp, A0_DEST_CHANNEL_ALL, sampler, coord, op );
       i915_emit_arith( p, A0_MOV, dest, destmask, 0, tmp, 0, 0 );
       return dest;
    }
diff --git a/i915/i915_program.h b/i915/i915_program.h
index 8891a17..d9760f9 100644
--- a/i915/i915_program.h
+++ b/i915/i915_program.h
@@ -110,6 +110,7 @@ extern void i915_release_utemps( struct i915_fragment_program *p );
 
 
 extern GLuint i915_emit_texld( struct i915_fragment_program *p,
+			      GLuint live_regs, 
 			      GLuint dest,
 			      GLuint destmask,
 			      GLuint sampler,
diff --git a/i915/i915_texprog.c b/i915/i915_texprog.c
index f6a8b02..c467fe1 100644
--- a/i915/i915_texprog.c
+++ b/i915/i915_texprog.c
@@ -69,7 +69,7 @@ static GLuint get_source( struct i915_fragment_program *p,
 	 if (p->VB->TexCoordPtr[unit]->size == 4)
 	    op = T0_TEXLDP;
 
-	 p->src_texture = i915_emit_texld( p, tmp, A0_DEST_CHANNEL_ALL, 
+	 p->src_texture = i915_emit_texld( p, 0, tmp, A0_DEST_CHANNEL_ALL, 
 					  sampler, texcoord, op );
       }
 
diff --git a/i915/i915_texstate.c b/i915/i915_texstate.c
index a19d4b6..08c561e 100644
--- a/i915/i915_texstate.c
+++ b/i915/i915_texstate.c
@@ -454,7 +454,12 @@ static void i915SetTexImages( i915ContextPtr i915,
 
    case MESA_FORMAT_Z16:
       t->intel.texelBytes = 2;
-      textureFormat = (MAPSURF_16BIT | MT_16BIT_L16);
+      if (tObj->DepthMode == GL_ALPHA)
+	  textureFormat = (MAPSURF_16BIT | MT_16BIT_A16);
+      else if (tObj->DepthMode == GL_INTENSITY)
+	  textureFormat = (MAPSURF_16BIT | MT_16BIT_I16);
+      else
+	  textureFormat = (MAPSURF_16BIT | MT_16BIT_L16);
       break;
 
    case MESA_FORMAT_RGBA_DXT1:
@@ -737,6 +742,9 @@ static GLboolean enable_tex_common( GLcontext *ctx, GLuint unit )
       return GL_FALSE;
    }
 
+   if (tObj->Target == GL_TEXTURE_1D &&
+       tObj->CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB)
+      return GL_FALSE;
 
    /* Update state if this is a different texture object to last
     * time.
diff --git a/i915/intel_context.c b/i915/intel_context.c
index bb5ce64..9bca64a 100644
--- a/i915/intel_context.c
+++ b/i915/intel_context.c
@@ -117,6 +117,8 @@ const GLubyte *intelGetString( GLcontext *ctx, GLenum name )
 	 chipset = "Intel(R) 865G"; break;
       case PCI_CHIP_I915_G:
 	 chipset = "Intel(R) 915G"; break;
+      case PCI_CHIP_E7221_G:
+	 chipset = "Intel (R) E7221G (i915)"; break;
       case PCI_CHIP_I915_GM:
 	 chipset = "Intel(R) 915GM"; break;
       case PCI_CHIP_I945_G:
diff --git a/i915/intel_context.h b/i915/intel_context.h
index 50e6178..634d581 100644
--- a/i915/intel_context.h
+++ b/i915/intel_context.h
@@ -361,6 +361,8 @@ do {									\
 #define SUBPIXEL_X 0.125
 #define SUBPIXEL_Y 0.125
 
+/* Round value up to a multiple of alignment (a power of two; evaluated twice). */
+#define ALIGN(value, alignment)  (((value) + (alignment) - 1) & ~((alignment) - 1))
 #define INTEL_FIREVERTICES(intel)		\
 do {						\
    if ((intel)->prim.flush)			\
@@ -451,6 +453,7 @@ extern int INTEL_DEBUG;
 #define PCI_CHIP_I855_GM		0x3582
 #define PCI_CHIP_I865_G			0x2572
 #define PCI_CHIP_I915_G			0x2582
+#define PCI_CHIP_E7221_G		0x258A
 #define PCI_CHIP_I915_GM		0x2592
 #define PCI_CHIP_I945_G			0x2772
 #define PCI_CHIP_I945_GM		0x27A2
diff --git a/i915/intel_pixel.c b/i915/intel_pixel.c
index 535cbfc..c5005ba 100644
--- a/i915/intel_pixel.c
+++ b/i915/intel_pixel.c
@@ -228,7 +228,7 @@ intelTryReadPixels( GLcontext *ctx,
       __DRIdrawablePrivate *dPriv = intel->driDrawable;
       int nbox = dPriv->numClipRects;
       int src_offset = intel->readRegion->offset;
-      int src_pitch = intel->intelScreen->front.pitch;
+      int src_pitch = intel->intelScreen->front.pitch / intel->intelScreen->cpp; /* in pixels */
       int dst_offset = intelAgpOffsetFromVirtual( intel, pixels);
       drm_clip_rect_t *box = dPriv->pClipRects;
       int i;
@@ -308,7 +308,7 @@ static void do_draw_pix( GLcontext *ctx,
    int nbox = dPriv->numClipRects;
    int i;
    int src_offset = intelAgpOffsetFromVirtual( intel, pixels);
-   int src_pitch = pitch;
+   int src_pitch = pitch;  /* in pixels */
 
    assert(src_offset != ~0);  /* should be caught earlier */
 
@@ -339,7 +339,7 @@ static void do_draw_pix( GLcontext *ctx,
             intelEmitCopyBlitLocked( intel,
                                      intel->intelScreen->cpp,
                                      src_pitch, src_offset,
-                                     intel->intelScreen->front.pitch,
+                                     intel->intelScreen->front.pitch / intel->intelScreen->cpp, /* in pixels */
                                      intel->drawRegion->offset,
                                      bx - x, by - y,
                                      bx, by,
@@ -364,7 +364,7 @@ intelTryDrawPixels( GLcontext *ctx,
    GLint pitch = unpack->RowLength ? unpack->RowLength : width;
    GLuint dest;
    GLuint cpp = intel->intelScreen->cpp;
-   GLint size = width * pitch * cpp;
+   GLint size = height * pitch * cpp;
 
    if (INTEL_DEBUG & DEBUG_PIXEL)
       fprintf(stderr, "%s\n", __FUNCTION__);
diff --git a/i915/intel_screen.c b/i915/intel_screen.c
index ca8610b..a66cfd6 100644
--- a/i915/intel_screen.c
+++ b/i915/intel_screen.c
@@ -53,7 +53,7 @@ DRI_CONF_BEGIN
     DRI_CONF_SECTION_END
     DRI_CONF_SECTION_QUALITY
        DRI_CONF_FORCE_S3TC_ENABLE(false)
-       DRI_CONF_ALLOW_LARGE_TEXTURES(1)
+       DRI_CONF_ALLOW_LARGE_TEXTURES(2)
       DRI_CONF_SECTION_END
 DRI_CONF_END;
 const GLuint __driNConfigOptions = 4;
@@ -511,6 +511,7 @@ static GLboolean intelCreateContext( const __GLcontextModes *mesaVis,
 				sharedContextPrivate );
 
    case PCI_CHIP_I915_G:
+   case PCI_CHIP_E7221_G: 
    case PCI_CHIP_I915_GM:
    case PCI_CHIP_I945_G:
    case PCI_CHIP_I945_GM:
diff --git a/i915/intel_state.c b/i915/intel_state.c
index e5988a5..b333ec5 100644
--- a/i915/intel_state.c
+++ b/i915/intel_state.c
@@ -189,12 +189,12 @@ static void intelDrawBuffer(GLcontext *ctx, GLenum mode )
    if ( intel->sarea->pf_current_page == 1 ) 
       front ^= 1;
    
-   intelSetFrontClipRects( intel );
-
    if (front) {
+      intelSetFrontClipRects( intel );
       intel->drawRegion = &intel->intelScreen->front;
       intel->readRegion = &intel->intelScreen->front;
    } else {
+      intelSetBackClipRects( intel );
       intel->drawRegion = &intel->intelScreen->back;
       intel->readRegion = &intel->intelScreen->back;
    }
diff --git a/i915/intel_tex.c b/i915/intel_tex.c
index 5bd2806..8460134 100644
--- a/i915/intel_tex.c
+++ b/i915/intel_tex.c
@@ -759,7 +759,7 @@ int intelUploadTexImages( intelContextPtr intel,
 			  GLuint face)
 {
    const int numLevels = t->base.lastLevel - t->base.firstLevel + 1;
-   const struct gl_texture_image *firstImage = t->image[face][t->base.firstLevel].image;
+   const struct gl_texture_image *firstImage = t->image[face][0].image;
    int pitch = firstImage->RowStride * firstImage->TexFormat->TexelBytes;
 
    /* Can we texture out of the existing client data? */
diff --git a/i915/intel_tris.c b/i915/intel_tris.c
index b2787ee..3c5ed47 100644
--- a/i915/intel_tris.c
+++ b/i915/intel_tris.c
@@ -202,12 +202,19 @@ static void intel_wpos_triangle( intelContextPtr intel,
 {
    GLuint offset = intel->wpos_offset;
    GLuint size = intel->wpos_size;
-   
-   __memcpy( ((char *)v0) + offset, v0, size );
-   __memcpy( ((char *)v1) + offset, v1, size );
-   __memcpy( ((char *)v2) + offset, v2, size );
+   GLfloat *v0_wpos = (GLfloat *)((char *)v0 + offset);
+   GLfloat *v1_wpos = (GLfloat *)((char *)v1 + offset);
+   GLfloat *v2_wpos = (GLfloat *)((char *)v2 + offset);
+
+   __memcpy(v0_wpos, v0, size);
+   __memcpy(v1_wpos, v1, size);
+   __memcpy(v2_wpos, v2, size);
 
-   intel_draw_triangle( intel, v0, v1, v2 );
+   v0_wpos[1] = -v0_wpos[1] + intel->driDrawable->h;
+   v1_wpos[1] = -v1_wpos[1] + intel->driDrawable->h;
+   v2_wpos[1] = -v2_wpos[1] + intel->driDrawable->h;
+
+   intel_draw_triangle(intel, v0, v1, v2);
 }
 
 
@@ -217,9 +224,14 @@ static void intel_wpos_line( intelContextPtr intel,
 {
    GLuint offset = intel->wpos_offset;
    GLuint size = intel->wpos_size;
+   GLfloat *v0_wpos = (GLfloat *)((char *)v0 + offset);
+   GLfloat *v1_wpos = (GLfloat *)((char *)v1 + offset);
+
+   __memcpy(v0_wpos, v0, size);
+   __memcpy(v1_wpos, v1, size);
 
-   __memcpy( ((char *)v0) + offset, v0, size );
-   __memcpy( ((char *)v1) + offset, v1, size );
+   v0_wpos[1] = -v0_wpos[1] + intel->driDrawable->h;
+   v1_wpos[1] = -v1_wpos[1] + intel->driDrawable->h;
 
    intel_draw_line( intel, v0, v1 );
 }
@@ -230,8 +242,10 @@ static void intel_wpos_point( intelContextPtr intel,
 {
    GLuint offset = intel->wpos_offset;
    GLuint size = intel->wpos_size;
+   GLfloat *v0_wpos = (GLfloat *)((char *)v0 + offset);
 
-   __memcpy( ((char *)v0) + offset, v0, size );
+   __memcpy(v0_wpos, v0, size);
+   v0_wpos[1] = -v0_wpos[1] + intel->driDrawable->h;
 
    intel_draw_point( intel, v0 );
 }
diff --git a/i965/Makefile.am b/i965/Makefile.am
index 163ad0f..55d7cbc 100644
--- a/i965/Makefile.am
+++ b/i965/Makefile.am
@@ -71,6 +71,7 @@ i965_dri_la_SOURCES = \
 	brw_wm_emit.c \
 	brw_wm_fp.c \
 	brw_wm_iz.c \
+	brw_wm_glsl.c \
 	brw_wm_pass0.c \
 	brw_wm_pass1.c \
 	brw_wm_pass2.c \
diff --git a/i965/brw_aub_playback.c b/i965/brw_aub_playback.c
index 99d9475..3a6c4dd 100644
--- a/i965/brw_aub_playback.c
+++ b/i965/brw_aub_playback.c
@@ -144,14 +144,14 @@ static struct {
    { CMD_STATE_BASE_ADDRESS, "STATE_BASE_ADDRESS",  1 },
    { CMD_STATE_INSN_POINTER, "STATE_INSN_POINTER",  1 },
    { CMD_PIPELINE_SELECT_965, "PIPELINE_SELECT", 0, },
-   { CMD_PIPELINE_SELECT_IGD, "PIPELINE_SELECT", 0,},
+   { CMD_PIPELINE_SELECT_GM45, "PIPELINE_SELECT", 0,},
    { CMD_PIPELINED_STATE_POINTERS, "PIPELINED_STATE_POINTERS", 1 },
    { CMD_BINDING_TABLE_PTRS, "BINDING_TABLE_PTRS", 1 },
    { CMD_VERTEX_BUFFER, "VERTEX_BUFFER", 1 },
    { CMD_VERTEX_ELEMENT, "VERTEX_ELEMENT", 1 },
    { CMD_INDEX_BUFFER, "INDEX_BUFFER", 1 },
    { CMD_VF_STATISTICS_965, "VF_STATISTICS", 0 },
-   { CMD_VF_STATISTICS_IGD, "VF_STATISTICS", 0 },
+   { CMD_VF_STATISTICS_GM45, "VF_STATISTICS", 0 },
    { CMD_DRAW_RECT, "DRAW_RECT", 1 },
    { CMD_BLEND_CONSTANT_COLOR, "BLEND_CONSTANT_COLOR", 1 },
    { CMD_CHROMA_KEY, "CHROMA_KEY", 1 },
diff --git a/i965/brw_cc.c b/i965/brw_cc.c
index 8a1d152..1d7a3cb 100644
--- a/i965/brw_cc.c
+++ b/i965/brw_cc.c
@@ -76,8 +76,8 @@ static void upload_cc_unit( struct brw_context *brw )
       cc.cc1.stencil_write_mask = brw->attribs.Stencil->WriteMask[0];
       cc.cc1.stencil_test_mask = brw->attribs.Stencil->ValueMask[0];
 
-      if (brw->attribs.Stencil->TestTwoSide) {
-	 cc.cc0.bf_stencil_enable = brw->attribs.Stencil->TestTwoSide;
+      if (brw->attribs.Stencil->_TestTwoSide) {
+	 cc.cc0.bf_stencil_enable = brw->attribs.Stencil->_TestTwoSide;
 	 cc.cc0.bf_stencil_func = intel_translate_compare_func(brw->attribs.Stencil->Function[1]);
 	 cc.cc0.bf_stencil_fail_op = intel_translate_stencil_op(brw->attribs.Stencil->FailFunc[1]);
 	 cc.cc0.bf_stencil_pass_depth_fail_op = intel_translate_stencil_op(brw->attribs.Stencil->ZFailFunc[1]);
@@ -90,7 +90,8 @@ static void upload_cc_unit( struct brw_context *brw )
       /* Not really sure about this:
        */
       if (brw->attribs.Stencil->WriteMask[0] ||
-	  (brw->attribs.Stencil->TestTwoSide && brw->attribs.Stencil->WriteMask[1]))
+	  (brw->attribs.Stencil->_TestTwoSide &&
+	   brw->attribs.Stencil->WriteMask[1]))
 	 cc.cc0.stencil_write_enable = 1;
    }
 
diff --git a/i965/brw_clip.h b/i965/brw_clip.h
index 49b2770..2a65697 100644
--- a/i965/brw_clip.h
+++ b/i965/brw_clip.h
@@ -42,7 +42,7 @@
  * up polygon offset and flatshading at this point:
  */
 struct brw_clip_prog_key {
-   GLuint attrs:16;		
+   GLuint attrs:32;		
    GLuint primitive:4;
    GLuint nr_userclip:3;
    GLuint do_flat_shading:1;
@@ -51,7 +51,7 @@ struct brw_clip_prog_key {
    GLuint fill_ccw:2;		/* includes cull information */
    GLuint offset_cw:1;
    GLuint offset_ccw:1;
-   GLuint pad0:1;
+   GLuint pad0:17;
 
    GLuint copy_bfc_cw:1;
    GLuint copy_bfc_ccw:1;
diff --git a/i965/brw_clip_line.c b/i965/brw_clip_line.c
index 8318227..ff9003e 100644
--- a/i965/brw_clip_line.c
+++ b/i965/brw_clip_line.c
@@ -146,6 +146,15 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
    brw_clip_init_planes(c);
    brw_clip_init_clipmask(c);
 
+   /* -ve rhw workaround */
+   if (!(BRW_IS_GM45(p->brw) || BRW_IS_G4X(p->brw))) { 
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+      brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
+              brw_imm_ud(1<<20));
+      brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(0x3f));
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+   }
+
    plane_loop = brw_DO(p, BRW_EXECUTE_1);
    {
       /* if (planemask & 1)
diff --git a/i965/brw_clip_state.c b/i965/brw_clip_state.c
index 37a25a9..b715aca 100644
--- a/i965/brw_clip_state.c
+++ b/i965/brw_clip_state.c
@@ -43,7 +43,8 @@ static void upload_clip_unit( struct brw_context *brw )
    memset(&clip, 0, sizeof(clip));
 
    /* CACHE_NEW_CLIP_PROG */
-   clip.thread0.grf_reg_count = ((brw->clip.prog_data->total_grf-1) & ~15) / 16;
+   clip.thread0.grf_reg_count =
+      ALIGN(brw->clip.prog_data->total_grf, 16) / 16 - 1;
    clip.thread0.kernel_start_pointer = brw->clip.prog_gs_offset >> 6;
    clip.thread3.urb_entry_read_length = brw->clip.prog_data->urb_read_length;
    clip.thread3.const_urb_entry_read_length = brw->clip.prog_data->curb_read_length;
@@ -55,7 +56,7 @@ static void upload_clip_unit( struct brw_context *brw )
    /* BRW_NEW_URB_FENCE */
    clip.thread4.nr_urb_entries = brw->urb.nr_clip_entries; 
    clip.thread4.urb_entry_allocation_size = brw->urb.vsize - 1;
-   clip.thread4.max_threads = 0; /* Hmm, maybe the max is 1 or 2 threads */
+   clip.thread4.max_threads = 1; /* 2 threads */
 
    if (INTEL_DEBUG & DEBUG_STATS)
       clip.thread4.stats_enable = 1; 
@@ -73,7 +74,7 @@ static void upload_clip_unit( struct brw_context *brw )
    clip.clip5.vertex_position_space = BRW_CLIP_NDCSPACE;
    clip.clip5.api_mode = BRW_CLIP_API_OGL;   
 
-   if (BRW_IS_IGD(brw))
+   if (BRW_IS_GM45(brw) || BRW_IS_G4X(brw))
       clip.clip5.negative_w_clip_test = 1;
 
    clip.clip6.clipper_viewport_state_ptr = 0;
diff --git a/i965/brw_clip_tri.c b/i965/brw_clip_tri.c
index 0fc7306..a602000 100644
--- a/i965/brw_clip_tri.c
+++ b/i965/brw_clip_tri.c
@@ -42,6 +42,20 @@
 #include "brw_util.h"
 #include "brw_clip.h"
 
+static struct brw_reg get_tmp( struct brw_clip_compile *c )
+{
+   /* Build the register for the current last_tmp, then advance last_tmp and
+    * raise prog_data.total_grf whenever last_tmp has moved past it. */
+   const struct brw_reg reg = brw_vec4_grf(c->last_tmp, 0);
+   c->last_tmp += 1;
+   if (c->prog_data.total_grf < c->last_tmp)
+      c->prog_data.total_grf = c->last_tmp;
+   return reg;
+}
+/* Reset the temporary allocator: last_tmp is rewound to first_tmp. */
+static void release_tmps( struct brw_clip_compile *c )
+{
+   c->last_tmp = c->first_tmp;
+}
 
 
 void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, 
@@ -435,15 +449,104 @@ static void maybe_do_clip_tri( struct brw_clip_compile *c )
    brw_ENDIF(p, do_clip);
 }
 
-
+static void brw_clip_test( struct brw_clip_compile *c )
+{
+    struct brw_reg t = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
+    struct brw_reg t1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
+    struct brw_reg t2 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
+    struct brw_reg t3 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
+
+    struct brw_reg v0 = get_tmp(c);
+    struct brw_reg v1 = get_tmp(c);
+    struct brw_reg v2 = get_tmp(c);
+
+    struct brw_indirect vt0 = brw_indirect(0, 0);
+    struct brw_indirect vt1 = brw_indirect(1, 0);
+    struct brw_indirect vt2 = brw_indirect(2, 0);
+
+    struct brw_compile *p = &c->func;
+
+    brw_MOV(p, get_addr_reg(vt0), brw_address(c->reg.vertex[0]));
+    brw_MOV(p, get_addr_reg(vt1), brw_address(c->reg.vertex[1]));
+    brw_MOV(p, get_addr_reg(vt2), brw_address(c->reg.vertex[2]));
+    brw_MOV(p, v0, deref_4f(vt0, c->offset[VERT_RESULT_HPOS]));
+    brw_MOV(p, v1, deref_4f(vt1, c->offset[VERT_RESULT_HPOS]));
+    brw_MOV(p, v2, deref_4f(vt2, c->offset[VERT_RESULT_HPOS]));
+
+    /* test nearz, xmin, ymin plane */
+    brw_CMP(p, t1, BRW_CONDITIONAL_LE, negate(v0), get_element(v0, 3)); 
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, t2, BRW_CONDITIONAL_LE, negate(v1), get_element(v1, 3)); 
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, t3, BRW_CONDITIONAL_LE, negate(v2), get_element(v2, 3)); 
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_XOR(p, t, t1, t2);
+    brw_XOR(p, t1, t2, t3);
+    brw_OR(p, t, t, t1);
+
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, 
+	    get_element(t, 0), brw_imm_ud(0));
+    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<5)));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, 
+	    get_element(t, 1), brw_imm_ud(0));
+    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<3)));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, 
+	    get_element(t, 2), brw_imm_ud(0));
+    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<1)));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+    /* test farz, xmax, ymax plane */
+    brw_CMP(p, t1, BRW_CONDITIONAL_L, v0, get_element(v0, 3)); 
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, t2, BRW_CONDITIONAL_L, v1, get_element(v1, 3)); 
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, t3, BRW_CONDITIONAL_L, v2, get_element(v2, 3)); 
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+    brw_XOR(p, t, t1, t2);
+    brw_XOR(p, t1, t2, t3);
+    brw_OR(p, t, t, t1);
+
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, 
+	    get_element(t, 0), brw_imm_ud(0));
+    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<4)));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, 
+	    get_element(t, 1), brw_imm_ud(0));
+    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<2)));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, 
+	    get_element(t, 2), brw_imm_ud(0));
+    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<0)));
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+    release_tmps(c);
+}
 
 
 void brw_emit_tri_clip( struct brw_clip_compile *c )
 {
+   struct brw_instruction *neg_rhw;
+   struct brw_compile *p = &c->func;
    brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
    brw_clip_tri_init_vertices(c);
    brw_clip_init_clipmask(c);
 
+   /* If the negative-rhw workaround bit (bit 20 of R0.2) is set,
+      run the explicit clip test. */
+   if (!(BRW_IS_GM45(p->brw) || BRW_IS_G4X(p->brw))) {   
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+      brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), 
+              brw_imm_ud(1<<20));
+      neg_rhw = brw_IF(p, BRW_EXECUTE_1); 
+      {
+          brw_clip_test(c);
+       }
+      brw_ENDIF(p, neg_rhw);
+   }
+
    /* Can't push into do_clip_tri because with polygon (or quad)
     * flatshading, need to apply the flatshade here because we don't
     * respect the PV when converting to trifan for emit:
@@ -462,6 +565,3 @@ void brw_emit_tri_clip( struct brw_clip_compile *c )
     */
    brw_clip_kill_thread(c);
 }
-
-
-
diff --git a/i965/brw_clip_unfilled.c b/i965/brw_clip_unfilled.c
index 918e000..57ebf38 100644
--- a/i965/brw_clip_unfilled.c
+++ b/i965/brw_clip_unfilled.c
@@ -220,8 +220,8 @@ static void apply_one_offset( struct brw_clip_compile *c,
 			  struct brw_indirect vert )
 {
    struct brw_compile *p = &c->func;
-   struct brw_reg pos = deref_4f(vert, c->offset[VERT_RESULT_HPOS]);
-   struct brw_reg z = get_element(pos, 2);
+   struct brw_reg z = deref_1f(vert, c->header_position_offset +
+			       2 * type_sz(BRW_REGISTER_TYPE_F));
 
    brw_ADD(p, z, z, vec1(c->reg.offset));
 }
diff --git a/i965/brw_clip_util.c b/i965/brw_clip_util.c
index 41d9b75..0ca2a67 100644
--- a/i965/brw_clip_util.c
+++ b/i965/brw_clip_util.c
@@ -272,6 +272,7 @@ void brw_clip_kill_thread(struct brw_clip_compile *c)
 
 
 
+
 struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c )
 {
    return brw_address(c->reg.fixed_planes);
@@ -327,8 +328,7 @@ void brw_clip_init_clipmask( struct brw_clip_compile *c )
    
    /* Shift so that lowest outcode bit is rightmost: 
     */
-   brw_MOV(p, c->reg.planemask, incoming);
-   brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(26));
+   brw_SHR(p, c->reg.planemask, incoming, brw_imm_ud(26));
 
    if (c->key.nr_userclip) {
       struct brw_reg tmp = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UD);
@@ -342,15 +342,5 @@ void brw_clip_init_clipmask( struct brw_clip_compile *c )
       
       release_tmp(c, tmp);
    }
-
-   if (!BRW_IS_IGD(p->brw)) {
-       /* Test for -ve rhw workaround 
-        */
-       brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-       brw_AND(p, vec1(brw_null_reg()), incoming, brw_imm_ud(1<<20));
-       brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(0x3f));
-   }
-
-   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 }
 
diff --git a/i965/brw_context.c b/i965/brw_context.c
index 397a9bd..e19d36e 100644
--- a/i965/brw_context.c
+++ b/i965/brw_context.c
@@ -44,6 +44,8 @@
 #include "api_noop.h"
 #include "vtxfmt.h"
 
+#include "shader/shader_api.h"
+
 /***************************************
  * Mesa's Driver Functions
  ***************************************/
@@ -60,12 +62,21 @@ static const struct dri_extension brw_extensions[] =
     { NULL,                                NULL }
 };
 
+static void brwUseProgram(GLcontext *ctx, GLuint program)
+{
+   _mesa_use_program(ctx, program);
+}
 
+static void brwInitProgFuncs( struct dd_function_table *functions )
+{
+   functions->UseProgram = brwUseProgram;
+}
 static void brwInitDriverFunctions( struct dd_function_table *functions )
 {
    intelInitDriverFunctions( functions );
    brwInitTextureFuncs( functions );
    brwInitFragProgFuncs( functions );
+   brwInitProgFuncs( functions );
 }
 
 
diff --git a/i965/brw_curbe.c b/i965/brw_curbe.c
index 5bf0ed5..3343bed 100644
--- a/i965/brw_curbe.c
+++ b/i965/brw_curbe.c
@@ -305,7 +305,7 @@ static void upload_constant_buffer(struct brw_context *brw)
       
       if (!brw_pool_alloc(pool, 
 			  bufsz,
-			  6,
+			  1 << 6,
 			  &brw->curbe.gs_offset)) {
 	 _mesa_printf("out of GS memory for curbe\n");
 	 assert(0);
diff --git a/i965/brw_defines.h b/i965/brw_defines.h
index 101828b..018da89 100644
--- a/i965/brw_defines.h
+++ b/i965/brw_defines.h
@@ -240,6 +240,8 @@
 #define BRW_FRONTWINDING_CW      0
 #define BRW_FRONTWINDING_CCW     1
 
+#define BRW_SPRITE_POINT_ENABLE  16
+
 #define BRW_INDEX_BYTE     0
 #define BRW_INDEX_WORD     1
 #define BRW_INDEX_DWORD    2
@@ -816,7 +818,7 @@
 #define CMD_STATE_BASE_ADDRESS        0x6101
 #define CMD_STATE_INSN_POINTER        0x6102
 #define CMD_PIPELINE_SELECT_965       0x6104
-#define CMD_PIPELINE_SELECT_IGD       0x6904
+#define CMD_PIPELINE_SELECT_GM45      0x6904
 
 #define CMD_PIPELINED_STATE_POINTERS  0x7800
 #define CMD_BINDING_TABLE_PTRS        0x7801
@@ -824,7 +826,7 @@
 #define CMD_VERTEX_ELEMENT            0x7809
 #define CMD_INDEX_BUFFER              0x780a
 #define CMD_VF_STATISTICS_965         0x780b
-#define CMD_VF_STATISTICS_IGD         0x680b
+#define CMD_VF_STATISTICS_GM45        0x680b
 
 #define CMD_DRAW_RECT                 0x7900
 #define CMD_BLEND_CONSTANT_COLOR      0x7901
@@ -848,9 +850,12 @@
 #define R02_PRIM_END    0x1
 #define R02_PRIM_START  0x2
 
-#define BRW_IS_IGD(brw)     ((brw)->intel.intelScreen->deviceID == PCI_CHIP_IGD_GM)
-#define CMD_PIPELINE_SELECT(brw)       ((BRW_IS_IGD(brw)) ? CMD_PIPELINE_SELECT_IGD : CMD_PIPELINE_SELECT_965)
-#define CMD_VF_STATISTICS(brw)         ((BRW_IS_IGD(brw)) ? CMD_VF_STATISTICS_IGD : CMD_VF_STATISTICS_965)
-#define URB_SIZES(brw)                 ((BRW_IS_IGD(brw)) ? 384 : 256)  /* 512 bit unit */
+#define BRW_IS_GM45(brw)                ((brw)->intel.intelScreen->deviceID == PCI_CHIP_GM45_GM)
+#define BRW_IS_G4X(brw)                 (((brw)->intel.intelScreen->deviceID == PCI_CHIP_IGD_E_G) || \
+                                         ((brw)->intel.intelScreen->deviceID == PCI_CHIP_G45_G) || \
+                                         ((brw)->intel.intelScreen->deviceID == PCI_CHIP_Q45_G))
+#define CMD_PIPELINE_SELECT(brw)       ((BRW_IS_GM45(brw) || BRW_IS_G4X(brw)) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965)
+#define CMD_VF_STATISTICS(brw)         ((BRW_IS_GM45(brw) || BRW_IS_G4X(brw)) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965)
+#define URB_SIZES(brw)                 ((BRW_IS_GM45(brw) || BRW_IS_G4X(brw)) ? 384 : 256)  /* 512 bit unit */
 
 #endif
diff --git a/i965/brw_draw_upload.c b/i965/brw_draw_upload.c
index 6150cac..133d8f4 100644
--- a/i965/brw_draw_upload.c
+++ b/i965/brw_draw_upload.c
@@ -291,7 +291,7 @@ static void get_space( struct brw_context *brw,
 		       struct gl_buffer_object **vbo_return,
 		       GLuint *offset_return )
 {
-   size = (size + 63) & ~63;
+   size = ALIGN(size, 64);
    
    if (brw->vb.upload.offset + size > BRW_UPLOAD_INIT_SIZE)
       wrap_buffers(brw, size);
@@ -593,6 +593,31 @@ void brw_upload_indices( struct brw_context *brw,
 				 ib_size,
 				 index_buffer->ptr,
 				 bufferobj);
+   } else {
+      /* If the index buffer isn't aligned to its element size, we have to
+       * rebase it into a temporary.
+       */
+       if ((get_size(index_buffer->type) - 1) & offset) {
+           struct gl_buffer_object *vbo;
+           GLuint voffset;
+           GLubyte *map = ctx->Driver.MapBuffer(ctx,
+                                                GL_ELEMENT_ARRAY_BUFFER_ARB,
+                                                GL_DYNAMIC_DRAW_ARB,
+                                                bufferobj);
+           map += offset;
+           get_space(brw, ib_size, &vbo, &voffset);
+           
+           ctx->Driver.BufferSubData(ctx,
+                                     GL_ELEMENT_ARRAY_BUFFER_ARB,
+                                     voffset,
+                                     ib_size,
+                                     map,
+                                     vbo);
+           ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER_ARB, bufferobj);
+
+           bufferobj = vbo;
+           offset = voffset;
+       }
    }
 
    /* Emit the indexbuffer packet:
diff --git a/i965/brw_eu.h b/i965/brw_eu.h
index 9d46aac..c138d15 100644
--- a/i965/brw_eu.h
+++ b/i965/brw_eu.h
@@ -335,14 +335,14 @@ static __inline struct brw_reg brw_imm_ud( GLuint ud )
 static __inline struct brw_reg brw_imm_uw( GLushort uw )
 {
    struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UW);
-   imm.dw1.ud = uw;
+   imm.dw1.ud = uw | (uw << 16);
    return imm;
 }
 
 static __inline struct brw_reg brw_imm_w( GLshort w )
 {
    struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_W);
-   imm.dw1.d = w;
+   imm.dw1.d = w | (w << 16);
    return imm;
 }
 
@@ -649,6 +649,16 @@ static __inline struct brw_reg deref_1uw(struct brw_indirect ptr, GLint offset)
    return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_UW);
 }
 
+static __inline struct brw_reg deref_1d(struct brw_indirect ptr, GLint offset)
+{
+   return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_D);
+}
+
+static __inline struct brw_reg deref_1ud(struct brw_indirect ptr, GLint offset)
+{
+   return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_UD);
+}
+
 static __inline struct brw_reg get_addr_reg(struct brw_indirect ptr)
 {
    return brw_address_reg(ptr.addr_subnr);
@@ -669,7 +679,10 @@ static __inline struct brw_indirect brw_indirect( GLuint addr_subnr, GLint offse
    return ptr;
 }
 
-
+static __inline struct brw_instruction *current_insn( struct brw_compile *p)
+{
+	return &p->store[p->nr_insn];
+}
 
 void brw_pop_insn_state( struct brw_compile *p );
 void brw_push_insn_state( struct brw_compile *p );
@@ -809,9 +822,11 @@ void brw_ENDIF(struct brw_compile *p,
 struct brw_instruction *brw_DO(struct brw_compile *p,
 			       GLuint execute_size);
 
-void brw_WHILE(struct brw_compile *p, 
+struct brw_instruction *brw_WHILE(struct brw_compile *p, 
 	       struct brw_instruction *patch_insn);
 
+struct brw_instruction *brw_BREAK(struct brw_compile *p);
+struct brw_instruction *brw_CONT(struct brw_compile *p);
 /* Forward jumps:
  */
 void brw_land_fwd_jump(struct brw_compile *p, 
@@ -861,5 +876,6 @@ void brw_math_invert( struct brw_compile *p,
 		      struct brw_reg dst,
 		      struct brw_reg src);
 
-
+void brw_set_src1( struct brw_instruction *insn,
+                          struct brw_reg reg );
 #endif
diff --git a/i965/brw_eu_emit.c b/i965/brw_eu_emit.c
index 1c717e4..d35b184 100644
--- a/i965/brw_eu_emit.c
+++ b/i965/brw_eu_emit.c
@@ -164,7 +164,7 @@ static void brw_set_src0( struct brw_instruction *insn,
 }
 
 
-static void brw_set_src1( struct brw_instruction *insn,
+void brw_set_src1( struct brw_instruction *insn,
 			  struct brw_reg reg )
 {
    assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
@@ -186,7 +186,7 @@ static void brw_set_src1( struct brw_instruction *insn,
        * in the future:
        */
       assert (reg.address_mode == BRW_ADDRESS_DIRECT);
-      assert (reg.file == BRW_GENERAL_REGISTER_FILE);
+      //assert (reg.file == BRW_GENERAL_REGISTER_FILE);
 
       if (insn->header.access_mode == BRW_ALIGN_1) {
 	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
@@ -329,14 +329,14 @@ static void brw_set_sampler_message(struct brw_context *brw,
 {
    brw_set_src1(insn, brw_imm_d(0));
 
-   if (BRW_IS_IGD(brw)) {
-      insn->bits3.sampler_igd.binding_table_index = binding_table_index;
-      insn->bits3.sampler_igd.sampler = sampler;
-      insn->bits3.sampler_igd.msg_type = msg_type;
-      insn->bits3.sampler_igd.response_length = response_length;
-      insn->bits3.sampler_igd.msg_length = msg_length;
-      insn->bits3.sampler_igd.end_of_thread = eot;
-      insn->bits3.sampler_igd.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
+   if (BRW_IS_GM45(brw) || BRW_IS_G4X(brw)) {
+      insn->bits3.sampler_gm45_g4x.binding_table_index = binding_table_index;
+      insn->bits3.sampler_gm45_g4x.sampler = sampler;
+      insn->bits3.sampler_gm45_g4x.msg_type = msg_type;
+      insn->bits3.sampler_gm45_g4x.response_length = response_length;
+      insn->bits3.sampler_gm45_g4x.msg_length = msg_length;
+      insn->bits3.sampler_gm45_g4x.end_of_thread = eot;
+      insn->bits3.sampler_gm45_g4x.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
    } else {
       insn->bits3.sampler.binding_table_index = binding_table_index;
       insn->bits3.sampler.sampler = sampler;
@@ -608,6 +608,34 @@ void brw_ENDIF(struct brw_compile *p,
    }
 }
 
+struct brw_instruction *brw_BREAK(struct brw_compile *p)
+{
+   struct brw_instruction *insn;
+   insn = next_insn(p, BRW_OPCODE_BREAK);
+   brw_set_dest(insn, brw_ip_reg());
+   brw_set_src0(insn, brw_ip_reg());
+   brw_set_src1(insn, brw_imm_d(0x0));
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.execution_size = BRW_EXECUTE_8;
+   insn->header.mask_control = BRW_MASK_DISABLE;
+   insn->bits3.if_else.pad0 = 0;
+   return insn;
+}
+
+struct brw_instruction *brw_CONT(struct brw_compile *p)
+{
+   struct brw_instruction *insn;
+   insn = next_insn(p, BRW_OPCODE_CONTINUE);
+   brw_set_dest(insn, brw_ip_reg());
+   brw_set_src0(insn, brw_ip_reg());
+   brw_set_src1(insn, brw_imm_d(0x0));
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.execution_size = BRW_EXECUTE_8;
+   insn->header.mask_control = BRW_MASK_DISABLE;
+   insn->bits3.if_else.pad0 = 0;
+   return insn;
+}
+
 /* DO/WHILE loop:
  */
 struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
@@ -619,13 +647,15 @@ struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
 
       /* Override the defaults for this instruction:
        */
-      brw_set_dest(insn, retype(brw_vec1_grf(0,0), BRW_REGISTER_TYPE_UD));
-      brw_set_src0(insn, retype(brw_vec1_grf(0,0), BRW_REGISTER_TYPE_UD));
-      brw_set_src1(insn, retype(brw_vec1_grf(0,0), BRW_REGISTER_TYPE_UD));
+      brw_set_dest(insn, brw_null_reg());
+      brw_set_src0(insn, brw_null_reg());
+      brw_set_src1(insn, brw_null_reg());
 
       insn->header.compression_control = BRW_COMPRESSION_NONE;
       insn->header.execution_size = execute_size;
+      insn->header.predicate_control = BRW_PREDICATE_NONE;
       /* insn->header.mask_control = BRW_MASK_ENABLE; */
+      insn->header.mask_control = BRW_MASK_DISABLE;
 
       return insn;
    }
@@ -633,7 +663,7 @@ struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
 
 
 
-void brw_WHILE(struct brw_compile *p, 
+struct brw_instruction *brw_WHILE(struct brw_compile *p, 
 	       struct brw_instruction *do_insn)
 {
    struct brw_instruction *insn;
@@ -657,14 +687,16 @@ void brw_WHILE(struct brw_compile *p,
       insn->header.execution_size = do_insn->header.execution_size;
 
       assert(do_insn->header.opcode == BRW_OPCODE_DO);
-      insn->bits3.if_else.jump_count = do_insn - insn;
+      insn->bits3.if_else.jump_count = do_insn - insn + 1;
       insn->bits3.if_else.pop_count = 0;
       insn->bits3.if_else.pad0 = 0;
    }
 
 /*    insn->header.mask_control = BRW_MASK_ENABLE; */
 
+   insn->header.mask_control = BRW_MASK_DISABLE;
    p->current->header.predicate_control = BRW_PREDICATE_NONE;   
+   return insn;
 }
 
 
diff --git a/i965/brw_gs.h b/i965/brw_gs.h
index 29a4e80..18a4537 100644
--- a/i965/brw_gs.h
+++ b/i965/brw_gs.h
@@ -40,11 +40,11 @@
 #define MAX_GS_VERTS (4)	     
 
 struct brw_gs_prog_key {
+   GLuint attrs:32;
    GLuint primitive:4;
-   GLuint attrs:16;		
    GLuint hint_gs_always:1;
    GLuint need_gs_prog:1;
-   GLuint pad:10;
+   GLuint pad:26;
 };
 
 struct brw_gs_compile {
diff --git a/i965/brw_gs_state.c b/i965/brw_gs_state.c
index 5826c01..5db4dd4 100644
--- a/i965/brw_gs_state.c
+++ b/i965/brw_gs_state.c
@@ -46,7 +46,8 @@ static void upload_gs_unit( struct brw_context *brw )
 
    /* CACHE_NEW_GS_PROG */
    if (brw->gs.prog_active) {
-      gs.thread0.grf_reg_count = ((brw->gs.prog_data->total_grf-1) & ~15) / 16;
+      gs.thread0.grf_reg_count =
+	 ALIGN(brw->gs.prog_data->total_grf, 16) / 16 - 1;
       gs.thread0.kernel_start_pointer = brw->gs.prog_gs_offset >> 6;
       gs.thread3.urb_entry_read_length = brw->gs.prog_data->urb_read_length;
    }
diff --git a/i965/brw_metaops.c b/i965/brw_metaops.c
index 6e030f1..cd6d287 100644
--- a/i965/brw_metaops.c
+++ b/i965/brw_metaops.c
@@ -195,7 +195,7 @@ static void init_metaops_state( struct brw_context *brw )
 				  vp_prog, strlen(vp_prog),
 				  brw->metaops.vp);
 
-   brw->metaops.attribs.VertexProgram->Current = brw->metaops.vp;
+   brw->metaops.attribs.VertexProgram->_Current = brw->metaops.vp;
    brw->metaops.attribs.VertexProgram->_Enabled = GL_TRUE;
 
    brw->metaops.attribs.FragmentProgram->_Current = brw->metaops.fp;
diff --git a/i965/brw_misc_state.c b/i965/brw_misc_state.c
index fe476c9..8f23316 100644
--- a/i965/brw_misc_state.c
+++ b/i965/brw_misc_state.c
@@ -249,7 +249,7 @@ static void upload_depthbuffer(struct brw_context *brw)
    memset(&bd, 0, sizeof(bd));
 
    bd.header.bits.opcode = CMD_DEPTH_BUFFER;
-   bd.header.bits.length = BRW_IS_IGD(brw) ? (sizeof(bd)/4-2) : (sizeof(bd)/4-3);
+   bd.header.bits.length = (BRW_IS_GM45(brw) || BRW_IS_G4X(brw)) ? (sizeof(bd)/4-2) : (sizeof(bd)/4-3);
    bd.dword1.bits.pitch = (region->pitch * region->cpp) - 1;
    
    switch (region->cpp) {
@@ -366,7 +366,7 @@ static void upload_aa_line_parameters(struct brw_context *brw)
 {
    struct brw_aa_line_parameters balp;
    
-   if (!BRW_IS_IGD(brw))
+   if (!(BRW_IS_GM45(brw) || BRW_IS_G4X(brw)))
       return;
 
    /* use legacy aa line coverage computation */
diff --git a/i965/brw_program.c b/i965/brw_program.c
index 752fe49..389fd89 100644
--- a/i965/brw_program.c
+++ b/i965/brw_program.c
@@ -125,6 +125,9 @@ static void brwProgramStringNotify( GLcontext *ctx,
       struct brw_vertex_program *vp = (struct brw_vertex_program *)brw->vertex_program;
       if (p == vp)
 	 brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
+      if (p->program.IsPositionInvariant) {
+	 _mesa_insert_mvp_code(ctx, &p->program);
+      }
       p->id = brw->program_id++;      
       p->param_state = p->program.Base.Parameters->StateFlags;
 
diff --git a/i965/brw_sf.c b/i965/brw_sf.c
index 6dcfa62..83e2314 100644
--- a/i965/brw_sf.c
+++ b/i965/brw_sf.c
@@ -74,6 +74,11 @@ static void compile_sf_prog( struct brw_context *brw,
       if (c.key.attrs & (1<<i)) {
 	 c.attr_to_idx[i] = idx;
 	 c.idx_to_attr[idx] = i;
+	 if (i >= VERT_RESULT_TEX0 && i <= VERT_RESULT_TEX7) {
+		 c.point_attrs[i].CoordReplace = 
+			brw->attribs.Point->CoordReplace[i - VERT_RESULT_TEX0];
+	 } else
+		 c.point_attrs[i].CoordReplace = GL_FALSE;
 	 idx++;
       }
    
@@ -90,7 +95,10 @@ static void compile_sf_prog( struct brw_context *brw,
       break;
    case SF_POINTS:
       c.nr_verts = 1;
-      brw_emit_point_setup( &c, GL_TRUE );
+      if (key->do_point_sprite)
+	  brw_emit_point_sprite_setup( &c, GL_TRUE );
+      else
+	  brw_emit_point_setup( &c, GL_TRUE );
       break;
    case SF_UNFILLED_TRIS:
       c.nr_verts = 3;
@@ -162,7 +170,8 @@ static void upload_sf_prog( struct brw_context *brw )
       break;
    }
 
-
+   key.do_point_sprite = brw->attribs.Point->PointSprite;
+   key.SpriteOrigin = brw->attribs.Point->SpriteOrigin;
    /* _NEW_LIGHT */
    key.do_flat_shading = (brw->attribs.Light->ShadeModel == GL_FLAT);
    key.do_twoside_color = (brw->attribs.Light->Enabled && brw->attribs.Light->Model.TwoSide);
@@ -179,7 +188,7 @@ static void upload_sf_prog( struct brw_context *brw )
 
 const struct brw_tracked_state brw_sf_prog = {
    .dirty = {
-      .mesa  = (_NEW_LIGHT|_NEW_POLYGON),
+      .mesa  = (_NEW_LIGHT|_NEW_POLYGON|_NEW_POINT),
       .brw   = (BRW_NEW_REDUCED_PRIMITIVE),
       .cache = CACHE_NEW_VS_PROG
    },
diff --git a/i965/brw_sf.h b/i965/brw_sf.h
index b321cda..1aadc71 100644
--- a/i965/brw_sf.h
+++ b/i965/brw_sf.h
@@ -45,14 +45,19 @@
 #define SF_UNFILLED_TRIS   3
 
 struct brw_sf_prog_key {
+   GLuint attrs:32;
    GLuint primitive:2;
    GLuint do_twoside_color:1;
    GLuint do_flat_shading:1;
-   GLuint attrs:16;
    GLuint frontface_ccw:1;
-   GLuint pad:11;
+   GLuint do_point_sprite:1;
+   GLuint pad:10;
+   GLenum SpriteOrigin;
 };
 
+struct brw_sf_point_tex {
+	GLboolean CoordReplace;	
+};
 
 struct brw_sf_compile {
    struct brw_compile func;
@@ -94,12 +99,14 @@ struct brw_sf_compile {
 
    GLubyte attr_to_idx[VERT_RESULT_MAX];   
    GLubyte idx_to_attr[VERT_RESULT_MAX];   
+   struct brw_sf_point_tex point_attrs[VERT_RESULT_MAX];
 };
 
  
 void brw_emit_tri_setup( struct brw_sf_compile *c, GLboolean allocate );
 void brw_emit_line_setup( struct brw_sf_compile *c, GLboolean allocate );
 void brw_emit_point_setup( struct brw_sf_compile *c, GLboolean allocate );
+void brw_emit_point_sprite_setup( struct brw_sf_compile *c, GLboolean allocate );
 void brw_emit_anyprim_setup( struct brw_sf_compile *c );
 
 #endif
diff --git a/i965/brw_sf_emit.c b/i965/brw_sf_emit.c
index 94be815..2f06cc5 100644
--- a/i965/brw_sf_emit.c
+++ b/i965/brw_sf_emit.c
@@ -503,6 +503,90 @@ void brw_emit_line_setup( struct brw_sf_compile *c, GLboolean allocate)
    } 
 }
 
+void brw_emit_point_sprite_setup( struct brw_sf_compile *c, GLboolean allocate )
+{
+   struct brw_compile *p = &c->func;
+   GLuint i;
+
+   c->nr_verts = 1;
+
+   if (allocate)
+       alloc_regs(c);
+
+   copy_z_inv_w(c);
+   for (i = 0; i < c->nr_setup_regs; i++)
+   {
+      struct brw_sf_point_tex *tex = &c->point_attrs[c->idx_to_attr[2*i]];
+      struct brw_reg a0 = offset(c->vert[0], i);
+      GLushort pc, pc_persp, pc_linear;
+      GLboolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
+            
+      if (pc_persp)
+      {				
+	  if (!tex->CoordReplace) {
+	      brw_set_predicate_control_flag_value(p, pc_persp);
+	      brw_MUL(p, a0, a0, c->inv_w[0]);
+	  }
+      }
+
+      if (tex->CoordReplace) {
+	  /* Calculate 1.0/PointWidth */
+	  brw_math(&c->func,
+		  c->tmp,
+		  BRW_MATH_FUNCTION_INV,
+		  BRW_MATH_SATURATE_NONE,
+		  0,
+		  c->dx0,
+		  BRW_MATH_DATA_SCALAR,
+		  BRW_MATH_PRECISION_FULL);
+
+	  if (c->key.SpriteOrigin == GL_LOWER_LEFT) {
+	   	brw_MUL(p, c->m1Cx, c->tmp, c->inv_w[0]);
+		brw_MOV(p, vec1(suboffset(c->m1Cx, 1)), brw_imm_f(0.0));
+	  	brw_MUL(p, c->m2Cy, c->tmp, negate(c->inv_w[0]));
+		brw_MOV(p, vec1(suboffset(c->m2Cy, 0)), brw_imm_f(0.0));
+	  } else {
+	   	brw_MUL(p, c->m1Cx, c->tmp, c->inv_w[0]);
+		brw_MOV(p, vec1(suboffset(c->m1Cx, 1)), brw_imm_f(0.0));
+	  	brw_MUL(p, c->m2Cy, c->tmp, c->inv_w[0]);
+		brw_MOV(p, vec1(suboffset(c->m2Cy, 0)), brw_imm_f(0.0));
+	  }
+      } else {
+	  brw_MOV(p, c->m1Cx, brw_imm_ud(0));
+	  brw_MOV(p, c->m2Cy, brw_imm_ud(0));
+      }
+
+      {
+	 brw_set_predicate_control_flag_value(p, pc); 
+	 if (tex->CoordReplace) {
+	     if (c->key.SpriteOrigin == GL_LOWER_LEFT) {
+		 brw_MUL(p, c->m3C0, c->inv_w[0], brw_imm_f(1.0));
+		 brw_MOV(p, vec1(suboffset(c->m3C0, 0)), brw_imm_f(0.0));
+	     }
+	     else
+		 brw_MOV(p, c->m3C0, brw_imm_f(0.0));
+	 } else {
+	 	brw_MOV(p, c->m3C0, a0); /* constant value */
+	 }
+
+	 /* Copy m0..m3 to URB. 
+	  */
+	 brw_urb_WRITE(p, 
+		       brw_null_reg(),
+		       0,
+		       brw_vec8_grf(0, 0),
+		       0, 	/* allocate */
+		       1,	/* used */
+		       4, 	/* msg len */
+		       0,	/* response len */
+		       last, 	/* eot */
+		       last, 	/* writes complete */
+		       i*4,	/* urb destination offset */
+		       BRW_URB_SWIZZLE_TRANSPOSE);
+      }
+   }
+}
+
 /* Points setup - several simplifications as all attributes are
  * constant across the face of the point (point sprites excluded!)
  */
@@ -569,6 +653,7 @@ void brw_emit_anyprim_setup( struct brw_sf_compile *c )
    struct brw_compile *p = &c->func;
    struct brw_reg ip = brw_ip_reg();
    struct brw_reg payload_prim = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0);
+   struct brw_reg payload_attr = get_element_ud(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), 0); 
    struct brw_reg primmask;
    struct brw_instruction *jmp;
    struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
@@ -623,6 +708,19 @@ void brw_emit_anyprim_setup( struct brw_sf_compile *c )
    }
    brw_land_fwd_jump(p, jmp); 
 
+   brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
+   brw_AND(p, v1_null_ud, payload_attr, brw_imm_ud(1<<BRW_SPRITE_POINT_ENABLE));
+   jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
+   {
+      saveflag = p->flag_value;
+      brw_push_insn_state(p); 
+      brw_emit_point_sprite_setup( c, GL_FALSE );
+      brw_pop_insn_state(p);
+      p->flag_value = saveflag;
+      /* note - thread killed in subroutine */
+   }
+   brw_land_fwd_jump(p, jmp); 
+
    brw_emit_point_setup( c, GL_FALSE );
 }
 
diff --git a/i965/brw_sf_state.c b/i965/brw_sf_state.c
index 2fd75a0..7445d59 100644
--- a/i965/brw_sf_state.c
+++ b/i965/brw_sf_state.c
@@ -38,6 +38,8 @@
 
 static void upload_sf_vp(struct brw_context *brw)
 {
+   GLcontext *ctx = &brw->intel.ctx;
+   const GLfloat depth_scale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
    struct brw_sf_viewport sfv;
 
    memset(&sfv, 0, sizeof(sfv));
@@ -47,14 +49,14 @@ static void upload_sf_vp(struct brw_context *brw)
       /* _NEW_VIEWPORT, BRW_NEW_METAOPS */
 
       if (!brw->metaops.active) {
-	 const GLfloat *v = brw->intel.ctx.Viewport._WindowMap.m;
+	 const GLfloat *v = ctx->Viewport._WindowMap.m;
 	 
 	 sfv.viewport.m00 =   v[MAT_SX];
 	 sfv.viewport.m11 = - v[MAT_SY];
-	 sfv.viewport.m22 =   v[MAT_SZ] * brw->intel.depth_scale;
+	 sfv.viewport.m22 =   v[MAT_SZ] * depth_scale;
 	 sfv.viewport.m30 =   v[MAT_TX];
 	 sfv.viewport.m31 = - v[MAT_TY] + brw->intel.driDrawable->h;
-	 sfv.viewport.m32 =   v[MAT_TZ] * brw->intel.depth_scale;
+	 sfv.viewport.m32 =   v[MAT_TZ] * depth_scale;
       }
       else {
 	 sfv.viewport.m00 =   1;
@@ -118,7 +120,7 @@ static void upload_sf_unit( struct brw_context *brw )
    memset(&sf, 0, sizeof(sf));
 
    /* CACHE_NEW_SF_PROG */
-   sf.thread0.grf_reg_count = ((brw->sf.prog_data->total_grf-1) & ~15) / 16;
+   sf.thread0.grf_reg_count = ALIGN(brw->sf.prog_data->total_grf, 16) / 16 - 1;
    sf.thread0.kernel_start_pointer = brw->sf.prog_gs_offset >> 6;
    sf.thread3.urb_entry_read_length = brw->sf.prog_data->urb_read_length;
 
@@ -184,6 +186,7 @@ static void upload_sf_unit( struct brw_context *brw )
    /* _NEW_POINT */
    sf.sf6.point_rast_rule = 1;	/* opengl conventions */
    sf.sf7.point_size = brw->attribs.Point->_Size * (1<<3);
+   sf.sf7.sprite_point = brw->attribs.Point->PointSprite;
    sf.sf7.use_point_size_state = !brw->attribs.Point->_Attenuated;
    sf.sf7.aa_line_distance_mode = 0;
 
diff --git a/i965/brw_state_cache.c b/i965/brw_state_cache.c
index 71c6938..5739c5c 100644
--- a/i965/brw_state_cache.c
+++ b/i965/brw_state_cache.c
@@ -149,7 +149,7 @@ GLuint brw_upload_cache( struct brw_cache *cache,
    GLuint hash = hash_key(key, key_size);
    void *tmp = _mesa_malloc(key_size + cache->aux_size);
    
-   if (!brw_pool_alloc(cache->pool, data_size, 6, &offset)) {
+   if (!brw_pool_alloc(cache->pool, data_size, 1 << 6, &offset)) {
       /* Should not be possible: 
        */
       _mesa_printf("brw_pool_alloc failed\n");
diff --git a/i965/brw_state_pool.c b/i965/brw_state_pool.c
index b9926f2..cf7cdd0 100644
--- a/i965/brw_state_pool.c
+++ b/i965/brw_state_pool.c
@@ -41,10 +41,9 @@ GLboolean brw_pool_alloc( struct brw_mem_pool *pool,
 			  GLuint align,
 			  GLuint *offset_return)
 {
-   GLuint align_mask = (1<<align)-1;
-   GLuint fixup = ((pool->offset + align_mask) & ~align_mask) - pool->offset;
+   GLuint fixup = ALIGN(pool->offset, align) - pool->offset;
 
-   size = (size + 3) & ~3;
+   size = ALIGN(size, 4);
 
    if (pool->offset + fixup + size >= pool->size) {
       _mesa_printf("%s failed\n", __FUNCTION__);
diff --git a/i965/brw_structs.h b/i965/brw_structs.h
index a799122..ee0e309 100644
--- a/i965/brw_structs.h
+++ b/i965/brw_structs.h
@@ -1362,7 +1362,7 @@ struct brw_instruction
          GLuint msg_target:4;
          GLuint pad1:3;
          GLuint end_of_thread:1;
-      } sampler_igd; 
+      } sampler_gm45_g4x; 
 
       struct brw_urb_immediate urb;
 
diff --git a/i965/brw_tex.c b/i965/brw_tex.c
index 9d4b986..ad29316 100644
--- a/i965/brw_tex.c
+++ b/i965/brw_tex.c
@@ -154,13 +154,19 @@ brwChooseTextureFormat( GLcontext *ctx, GLint internalFormat,
 
    case GL_RGB_S3TC:
    case GL_RGB4_S3TC:
+   case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
+       return &_mesa_texformat_rgb_dxt1;
+
+   case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
+       return &_mesa_texformat_rgba_dxt1;
+
    case GL_RGBA_S3TC:
    case GL_RGBA4_S3TC:
    case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
+       return &_mesa_texformat_rgba_dxt3;
+
    case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
-   case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
-   case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
-     return &_mesa_texformat_rgb_dxt1; /* there is no rgba support? */
+       return &_mesa_texformat_rgba_dxt5;
 
    case GL_DEPTH_COMPONENT:
    case GL_DEPTH_COMPONENT16:
@@ -168,6 +174,25 @@ brwChooseTextureFormat( GLcontext *ctx, GLint internalFormat,
    case GL_DEPTH_COMPONENT32:
       return &_mesa_texformat_z16;
 
+   case GL_SRGB_EXT:
+   case GL_SRGB8_EXT:
+   case GL_SRGB_ALPHA_EXT:
+   case GL_SRGB8_ALPHA8_EXT:
+   case GL_SLUMINANCE_EXT:
+   case GL_SLUMINANCE8_EXT:
+   case GL_SLUMINANCE_ALPHA_EXT:
+   case GL_SLUMINANCE8_ALPHA8_EXT:
+   case GL_COMPRESSED_SRGB_EXT:
+   case GL_COMPRESSED_SRGB_ALPHA_EXT:
+   case GL_COMPRESSED_SLUMINANCE_EXT:
+   case GL_COMPRESSED_SLUMINANCE_ALPHA_EXT:
+	return &_mesa_texformat_srgba8;
+   case GL_COMPRESSED_SRGB_S3TC_DXT1_EXT:
+   case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT:
+   case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT:
+   case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT:
+     return &_mesa_texformat_srgb_dxt1;
+
    default:
       fprintf(stderr, "unexpected texture format %s in %s\n", 
 	      _mesa_lookup_enum_by_nr(internalFormat),
diff --git a/i965/brw_tex_layout.c b/i965/brw_tex_layout.c
index d4888a4..427a132 100644
--- a/i965/brw_tex_layout.c
+++ b/i965/brw_tex_layout.c
@@ -37,7 +37,6 @@
 #include "intel_tex_layout.h"
 #include "macros.h"
 
-
 GLboolean brw_miptree_layout( struct intel_context *intel, struct intel_mipmap_tree *mt )
 {
    /* XXX: these vary depending on image format: 
@@ -53,11 +52,20 @@ GLboolean brw_miptree_layout( struct intel_context *intel, struct intel_mipmap_t
       GLuint pack_x_pitch, pack_x_nr;
       GLuint pack_y_pitch;
       GLuint level;
+      GLuint align_h = 2;
+      GLuint align_w = 4;
 
-      mt->pitch = ((mt->width0 * mt->cpp + 3) & ~3) / mt->cpp;
       mt->total_height = 0;
+      
+      if (mt->compressed) {
+          align_w = intel_compressed_alignment(mt->internal_format);
+          mt->pitch = ALIGN(width, align_w);
+          pack_y_pitch = (height + 3) / 4;
+      } else {
+          mt->pitch = ALIGN(mt->width0 * mt->cpp, 4) / mt->cpp;
+          pack_y_pitch = ALIGN(mt->height0, align_h);
+      }
 
-      pack_y_pitch = MAX2(mt->height0, 2);
       pack_x_pitch = mt->pitch;
       pack_x_nr = 1;
 
@@ -83,20 +91,30 @@ GLboolean brw_miptree_layout( struct intel_context *intel, struct intel_mipmap_t
 
 
 	 mt->total_height += y;
-
-	 if (pack_x_pitch > 4) {
-	    pack_x_pitch >>= 1;
-	    pack_x_nr <<= 1;
-	    assert(pack_x_pitch * pack_x_nr <= mt->pitch);
-	 }
-
-	 if (pack_y_pitch > 2) {
-	    pack_y_pitch >>= 1;
-	 }
-
 	 width  = minify(width);
 	 height = minify(height);
 	 depth  = minify(depth);
+
+    if (mt->compressed) {
+        pack_y_pitch = (height + 3) / 4;
+        
+        if (pack_x_pitch > ALIGN(width, align_w)) {
+            pack_x_pitch = ALIGN(width, align_w);
+            pack_x_nr <<= 1;
+        }
+    } else {
+        if (pack_x_pitch > 4) {
+            pack_x_pitch >>= 1;
+            pack_x_nr <<= 1;
+            assert(pack_x_pitch * pack_x_nr <= mt->pitch);
+        }
+
+        if (pack_y_pitch > 2) {
+            pack_y_pitch >>= 1;
+            pack_y_pitch = ALIGN(pack_y_pitch, align_h);
+        }
+    }
+
       }
       break;
    }
diff --git a/i965/brw_urb.c b/i965/brw_urb.c
index 4ca6e99..76d0c29 100644
--- a/i965/brw_urb.c
+++ b/i965/brw_urb.c
@@ -53,7 +53,7 @@ static const struct {
    GLuint min_entry_size;
    GLuint max_entry_size;
 } limits[CS+1] = {
-   { 8, 32, 1, 5 },			/* vs */
+   { 16, 32, 1, 5 },			/* vs */
    { 4, 8,  1, 5 },			/* gs */
    { 6, 8,  1, 5 },			/* clp */
    { 1, 8,  1, 12 },		        /* sf */
diff --git a/i965/brw_vs.h b/i965/brw_vs.h
index fdb5785..36636b5 100644
--- a/i965/brw_vs.h
+++ b/i965/brw_vs.h
@@ -67,6 +67,12 @@ struct brw_vs_compile {
    struct brw_reg r1;
    struct brw_reg regs[PROGRAM_ADDRESS+1][128];
    struct brw_reg tmp;
+   struct brw_reg stack;
+
+   struct {	
+       GLboolean used_in_src;
+       struct brw_reg reg;
+   } output_regs[128];
 
    struct brw_reg userplane[6];
 
diff --git a/i965/brw_vs_emit.c b/i965/brw_vs_emit.c
index c38e998..6d41205 100644
--- a/i965/brw_vs_emit.c
+++ b/i965/brw_vs_emit.c
@@ -134,6 +134,16 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 					     WRITEMASK_X);
       reg++;
    }
+
+   for (i = 0; i < 128; i++) {
+       if (c->output_regs[i].used_in_src) {
+            c->output_regs[i].reg = brw_vec8_grf(reg, 0);
+            reg++;
+        }
+   }
+
+   c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
+   reg += 2;
  
    
    /* Some opcodes need an internal temporary:
@@ -213,57 +223,79 @@ static void unalias2( struct brw_vs_compile *c,
    }
 }
 
+/* Set dst to 1.0 where "arg0 <cond> arg1" holds and to 0.0 elsewhere.
+ *
+ * Registers are statically allocated, so dst may be the same register as
+ * arg0 or arg1.  The comparison is therefore emitted before dst is written.
+ */
+static void emit_sop( struct brw_compile *p,
+                      struct brw_reg dst,
+                      struct brw_reg arg0,
+                      struct brw_reg arg1,
+                      GLuint cond)
+{
+   brw_push_insn_state(p);
+   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
+
+   /* Write all values to zero:
+    */
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+   brw_MOV(p, dst, brw_imm_f(0.0f));
+
+   /* Where the test succeeded, overwrite with 1:
+    */
+   brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+   brw_MOV(p, dst, brw_imm_f(1.0f));
+   brw_pop_insn_state(p);
+}
 
+static void emit_seq( struct brw_compile *p,
+                      struct brw_reg dst,
+                      struct brw_reg arg0,
+                      struct brw_reg arg1 )
+{
+   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
+}
 
-
+static void emit_sne( struct brw_compile *p,
+                      struct brw_reg dst,
+                      struct brw_reg arg0,
+                      struct brw_reg arg1 )
+{
+   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
+}
 static void emit_slt( struct brw_compile *p, 
 		      struct brw_reg dst,
 		      struct brw_reg arg0,
 		      struct brw_reg arg1 )
 {
-   /* Could be done with an if/else/endif, but this method uses half
-    * the instructions.  Note that we are careful to reference the
-    * arguments before writing the dest.  That means we emit the
-    * instructions in an odd order and have to play with the flag
-    * values.
-    */
-   brw_push_insn_state(p);
-   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
-
-   /* Write all values to 1:
-    */
-   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-   brw_MOV(p, dst, brw_imm_f(1.0));
+   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
+}
 
-   /* Where the test succeeded, overwite with zero:
-    */
-   brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-   brw_MOV(p, dst, brw_imm_f(0.0));
-   brw_pop_insn_state(p);
+static void emit_sle( struct brw_compile *p, 
+		      struct brw_reg dst,
+		      struct brw_reg arg0,
+		      struct brw_reg arg1 )
+{
+   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
 }
 
+static void emit_sgt( struct brw_compile *p, 
+		      struct brw_reg dst,
+		      struct brw_reg arg0,
+		      struct brw_reg arg1 )
+{
+   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
+}
 
 static void emit_sge( struct brw_compile *p, 
 		      struct brw_reg dst,
 		      struct brw_reg arg0,
 		      struct brw_reg arg1 )
 {
-   brw_push_insn_state(p);
-   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
-
-   /* Write all values to zero:
-    */
-   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-   brw_MOV(p, dst, brw_imm_f(0));
-
-   /* Where the test succeeded, overwite with 1:
-    */
-   brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-   brw_MOV(p, dst, brw_imm_f(1.0));
-   brw_pop_insn_state(p);
+  emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
 }
 
-
 static void emit_max( struct brw_compile *p, 
 		      struct brw_reg dst,
 		      struct brw_reg arg0,
@@ -592,9 +610,13 @@ static struct brw_reg get_reg( struct brw_vs_compile *c,
    case PROGRAM_TEMPORARY:
    case PROGRAM_INPUT:
    case PROGRAM_OUTPUT:
-   case PROGRAM_STATE_VAR:
       assert(c->regs[file][index].nr != 0);
       return c->regs[file][index];
+   case PROGRAM_STATE_VAR:
+   case PROGRAM_CONSTANT:
+   case PROGRAM_UNIFORM:
+      assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
+      return c->regs[PROGRAM_STATE_VAR][index];
    case PROGRAM_ADDRESS:
       assert(index == 0);
       return c->regs[file][index];
@@ -668,28 +690,28 @@ static void emit_arl( struct brw_vs_compile *c,
  * account.
  */
 static struct brw_reg get_arg( struct brw_vs_compile *c,
-			       struct prog_src_register src )
+			       struct prog_src_register *src )
 {
    struct brw_reg reg;
 
-   if (src.File == PROGRAM_UNDEFINED)
+   if (src->File == PROGRAM_UNDEFINED)
       return brw_null_reg();
 
-   if (src.RelAddr) 
-      reg = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
+   if (src->RelAddr) 
+      reg = deref(c, c->regs[PROGRAM_STATE_VAR][0], src->Index);
    else
-      reg = get_reg(c, src.File, src.Index);
+      reg = get_reg(c, src->File, src->Index);
 
    /* Convert 3-bit swizzle to 2-bit.  
     */
-   reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src.Swizzle, 0),
-				       GET_SWZ(src.Swizzle, 1),
-				       GET_SWZ(src.Swizzle, 2),
-				       GET_SWZ(src.Swizzle, 3));
+   reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
+				       GET_SWZ(src->Swizzle, 1),
+				       GET_SWZ(src->Swizzle, 2),
+				       GET_SWZ(src->Swizzle, 3));
 
    /* Note this is ok for non-swizzle instructions: 
     */
-   reg.negate = src.NegateBase ? 1 : 0;   
+   reg.negate = src->NegateBase ? 1 : 0;   
 
    return reg;
 }
@@ -845,7 +867,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * Later, clipping will detect ucp[6] and ensure the primitive is
        * clipped against all fixed planes.
        */
-      if (!BRW_IS_IGD(p->brw) && !c->key.know_w_is_one) {
+      if (!(BRW_IS_GM45(p->brw) || BRW_IS_G4X(p->brw)) && !c->key.know_w_is_one) {
 	 brw_CMP(p,
 		 vec8(brw_null_reg()),
 		 BRW_CONDITIONAL_L,
@@ -891,17 +913,50 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 
 }
 
-
-
+static void 
+post_vs_emit( struct brw_vs_compile *c, struct brw_instruction *end_inst )
+{
+   GLuint nr_insns = c->vp->program.Base.NumInstructions;
+   GLuint insn, target_insn;
+   struct prog_instruction *inst1, *inst2;
+   struct brw_instruction *brw_inst1, *brw_inst2;
+   int offset;
+   for (insn = 0; insn < nr_insns; insn++) {
+       inst1 = &c->vp->program.Base.Instructions[insn];
+       brw_inst1 = inst1->Data;
+       switch (inst1->Opcode) {
+	   case OPCODE_CAL:
+	   case OPCODE_BRA:
+	       target_insn = inst1->BranchTarget;
+	       inst2 = &c->vp->program.Base.Instructions[target_insn];
+	       brw_inst2 = inst2->Data;
+	       offset = brw_inst2 - brw_inst1;
+	       brw_set_src1(brw_inst1, brw_imm_d(offset*16));
+	       break;
+	   case OPCODE_END:
+	       offset = end_inst - brw_inst1;
+	       brw_set_src1(brw_inst1, brw_imm_d(offset*16));
+	       break;
+	   default:
+	       break;
+       }
+   }
+}
 
 /* Emit the fragment program instructions here.
  */
-void brw_vs_emit( struct brw_vs_compile *c )
+void brw_vs_emit(struct brw_vs_compile *c )
 {
+#define MAX_IFSN 32
    struct brw_compile *p = &c->func;
    GLuint nr_insns = c->vp->program.Base.NumInstructions;
-   GLuint insn;
+   GLuint insn, if_insn = 0;
+   struct brw_instruction *end_inst;
+   struct brw_instruction *if_inst[MAX_IFSN];
+   struct brw_indirect stack_index = brw_indirect(0, 0);   
 
+   GLuint index;
+   GLuint file;
 
    if (INTEL_DEBUG & DEBUG_VS) {
       _mesa_printf("\n\n\nvs-emit:\n");
@@ -912,9 +967,24 @@ void brw_vs_emit( struct brw_vs_compile *c )
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_set_access_mode(p, BRW_ALIGN_16);
    
+   /* Message registers can't be read, so copy the output into GRF register
+      if they are used in source registers */
+   for (insn = 0; insn < nr_insns; insn++) {
+       GLuint i;
+       struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
+       for (i = 0; i < 3; i++) {
+	   struct prog_src_register *src = &inst->SrcReg[i];
+	   GLuint index = src->Index;
+	   GLuint file = src->File;	
+	   if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
+	       c->output_regs[index].used_in_src = GL_TRUE;
+       }
+   }
+
    /* Static register allocation
     */
    brw_vs_alloc_regs(c);
+   brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
 
    for (insn = 0; insn < nr_insns; insn++) {
 
@@ -924,17 +994,29 @@ void brw_vs_emit( struct brw_vs_compile *c )
       
       /* Get argument regs.  SWZ is special and does this itself.
        */
+      inst->Data = &p->store[p->nr_insn];
       if (inst->Opcode != OPCODE_SWZ)
-	 for (i = 0; i < 3; i++) 
-	    args[i] = get_arg(c, inst->SrcReg[i]);
+	  for (i = 0; i < 3; i++) {
+	      struct prog_src_register *src = &inst->SrcReg[i];
+	      index = src->Index;
+	      file = src->File;	
+	      if (file == PROGRAM_OUTPUT&&c->output_regs[index].used_in_src)
+		  args[i] = c->output_regs[index].reg;
+	      else
+		  args[i] = get_arg(c, src);
+	  }
 
       /* Get dest regs.  Note that it is possible for a reg to be both
        * dst and arg, given the static allocation of registers.  So
        * care needs to be taken emitting multi-operation instructions.
-       */
-      dst = get_dst(c, inst->DstReg);
+       */ 
+      index = inst->DstReg.Index;
+      file = inst->DstReg.File;
+      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
+	  dst = c->output_regs[index].reg;
+      else
+	  dst = get_dst(c, inst->DstReg);
 
-      
       switch (inst->Opcode) {
       case OPCODE_ABS:
 	 brw_MOV(p, dst, brw_abs(args[0]));
@@ -1003,12 +1085,25 @@ void brw_vs_emit( struct brw_vs_compile *c )
       case OPCODE_RSQ:
 	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
+
+      case OPCODE_SEQ:
+         emit_seq(p, dst, args[0], args[1]);
+         break;
+      case OPCODE_SNE:
+         emit_sne(p, dst, args[0], args[1]);
+         break;
       case OPCODE_SGE:
 	 emit_sge(p, dst, args[0], args[1]);
 	 break;
+      case OPCODE_SGT:
+         emit_sgt(p, dst, args[0], args[1]);
+        break;
       case OPCODE_SLT:
 	 emit_slt(p, dst, args[0], args[1]);
 	 break;
+      case OPCODE_SLE:
+         emit_sle(p, dst, args[0], args[1]);
+         break;
       case OPCODE_SUB:
 	 brw_ADD(p, dst, args[0], negate(args[1]));
 	 break;
@@ -1021,21 +1116,65 @@ void brw_vs_emit( struct brw_vs_compile *c )
       case OPCODE_XPD:
 	 emit_xpd(p, dst, args[0], args[1]);
 	 break;
+      case OPCODE_IF:
+         assert(if_insn < MAX_IFSN);
+         if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
+         break;
+      case OPCODE_ELSE:
+         /* ELSE without an open IF would index if_inst[] at (GLuint)-1. */
+         assert(if_insn > 0);
+         if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
+         break;
+      case OPCODE_ENDIF:
+         assert(if_insn > 0);
+         brw_ENDIF(p, if_inst[--if_insn]);
+         break;
+      case OPCODE_BRA:
+         brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+         brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+         brw_set_predicate_control_flag_value(p, 0xff);
+         break;
+      case OPCODE_CAL:
+         brw_set_access_mode(p, BRW_ALIGN_1);
+         brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
+         brw_set_access_mode(p, BRW_ALIGN_16);
+         brw_ADD(p, get_addr_reg(stack_index),
+                 get_addr_reg(stack_index), brw_imm_d(4));
+         inst->Data = &p->store[p->nr_insn];
+         brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+         break;
+      case OPCODE_RET:
+         brw_ADD(p, get_addr_reg(stack_index),
+                 get_addr_reg(stack_index), brw_imm_d(-4));
+         brw_set_access_mode(p, BRW_ALIGN_1);
+         brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
+         brw_set_access_mode(p, BRW_ALIGN_16);
+         /* Do not fall through: the IP add under OPCODE_END is patched by
+          * post_vs_emit() for END only and must not be emitted for RET. */
+         break;
       case OPCODE_END:	
+         brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+        break;
       case OPCODE_PRINT:
+      case OPCODE_BGNSUB:
+      case OPCODE_ENDSUB:
 	 break;
       default:
+	 _mesa_printf("Unsupport opcode %d in vertex shader\n", inst->Opcode);
 	 break;
       }
 
+      if (inst->DstReg.File == PROGRAM_OUTPUT
+	      &&inst->DstReg.Index != VERT_RESULT_HPOS
+	      &&c->output_regs[inst->DstReg.Index].used_in_src)
+	  brw_MOV(p, get_dst(c, inst->DstReg), dst);
+
       release_tmps(c);
    }
 
+   end_inst = &p->store[p->nr_insn];
    emit_vertex_write(c);
-
+   post_vs_emit(c, end_inst);
+   for (insn = 0; insn < nr_insns; insn++)
+       c->vp->program.Base.Instructions[insn].Data = NULL;
 }
-
-
-
-
-
diff --git a/i965/brw_vs_state.c b/i965/brw_vs_state.c
index c225bf8..f561979 100644
--- a/i965/brw_vs_state.c
+++ b/i965/brw_vs_state.c
@@ -44,7 +44,7 @@ static void upload_vs_unit( struct brw_context *brw )
 
    /* CACHE_NEW_VS_PROG */
    vs.thread0.kernel_start_pointer = brw->vs.prog_gs_offset >> 6;
-   vs.thread0.grf_reg_count = ((brw->vs.prog_data->total_grf-1) & ~15) / 16;
+   vs.thread0.grf_reg_count = ALIGN(brw->vs.prog_data->total_grf, 16) / 16 - 1;
    vs.thread3.urb_entry_read_length = brw->vs.prog_data->urb_read_length;
    vs.thread3.const_urb_entry_read_length = brw->vs.prog_data->curb_read_length;
    vs.thread3.dispatch_grf_start_reg = 1;
diff --git a/i965/brw_vs_tnl.c b/i965/brw_vs_tnl.c
index 14483b3..c06ef5c 100644
--- a/i965/brw_vs_tnl.c
+++ b/i965/brw_vs_tnl.c
@@ -524,10 +524,13 @@ static void emit_op3fn(struct tnl_program *p,
    GLuint nr = p->program->Base.NumInstructions++;
       
    if (nr >= p->nr_instructions) {
+      int new_nr_instructions = p->nr_instructions * 2;
+
       p->program->Base.Instructions = 
 	 _mesa_realloc(p->program->Base.Instructions,
 		       sizeof(struct prog_instruction) * p->nr_instructions,
-		       sizeof(struct prog_instruction) * (p->nr_instructions *= 2));
+		       sizeof(struct prog_instruction) * new_nr_instructions);
+      p->nr_instructions = new_nr_instructions;
    }
 
    {      
@@ -1167,6 +1170,11 @@ static void build_fog( struct tnl_program *p )
    }
    else {
       input = swizzle1(register_input(p, VERT_ATTRIB_FOG), X);
+      if (p->state->fog_option &&
+	  p->state->tnl_do_vertex_fog)
+	  input = swizzle1(register_input(p, VERT_ATTRIB_FOG), X);
+      else
+	  input = register_input(p, VERT_ATTRIB_FOG);
    }
 
    if (p->state->fog_option &&
@@ -1575,7 +1583,7 @@ static void update_tnl_program( struct brw_context *brw )
    struct gl_vertex_program *old = brw->tnl_program;
 
    /* _NEW_PROGRAM */
-   if (brw->attribs.VertexProgram->_Enabled) 
+   if (brw->attribs.VertexProgram->_Current) 
       return;
       
    /* Grab all the relevent state and put it in a single structure:
@@ -1622,7 +1630,8 @@ const struct brw_tracked_state brw_tnl_vertprog = {
 	       _NEW_FOG | 
 	       _NEW_HINT | 
 	       _NEW_POINT | 
-	       _NEW_TEXTURE),
+	       _NEW_TEXTURE |
+          _NEW_TEXTURE_MATRIX),
       .brw = (BRW_NEW_FRAGMENT_PROGRAM | 
 	      BRW_NEW_INPUT_VARYING),
       .cache = 0
@@ -1638,8 +1647,8 @@ static void update_active_vertprog( struct brw_context *brw )
    const struct gl_vertex_program *prev = brw->vertex_program;
 
    /* NEW_PROGRAM */
-   if (brw->attribs.VertexProgram->_Enabled) {
-      brw->vertex_program = brw->attribs.VertexProgram->Current;
+   if (brw->attribs.VertexProgram->_Current) {
+      brw->vertex_program = brw->attribs.VertexProgram->_Current;
    }
    else {
       /* BRW_NEW_TNL_PROGRAM */
diff --git a/i965/brw_wm.c b/i965/brw_wm.c
index f80ba17..b2ad0f7 100644
--- a/i965/brw_wm.c
+++ b/i965/brw_wm.c
@@ -29,7 +29,7 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
              
-
+#include "main/texformat.h"
 #include "brw_context.h"
 #include "brw_util.h"
 #include "brw_wm.h"
@@ -66,7 +66,11 @@ GLuint brw_wm_nr_args( GLuint opcode )
    case OPCODE_POW:
    case OPCODE_SUB:
    case OPCODE_SGE:
+   case OPCODE_SGT:
+   case OPCODE_SLE:
    case OPCODE_SLT:
+   case OPCODE_SEQ:
+   case OPCODE_SNE:
    case OPCODE_ADD:
    case OPCODE_MAX:
    case OPCODE_MIN:
@@ -150,46 +154,50 @@ static void do_wm_prog( struct brw_context *brw,
    c->fp = fp;
    c->env_param = brw->intel.ctx.FragmentProgram.Parameters;
 
-   /* Augment fragment program.  Add instructions for pre- and
-    * post-fragment-program tasks such as interpolation and fogging.
-    */
-   brw_wm_pass_fp(c);
+   if (brw_wm_is_glsl(&c->fp->program)) {
+       brw_wm_glsl_emit(brw, c);
+   } else {
+       /* Augment fragment program.  Add instructions for pre- and
+	* post-fragment-program tasks such as interpolation and fogging.
+	*/
+       brw_wm_pass_fp(c);
    
-   /* Translate to intermediate representation.  Build register usage
-    * chains.
-    */
-   brw_wm_pass0(c);
-
-   /* Dead code removal.
-    */
-   brw_wm_pass1(c);
-
-   /* Hal optimization
-    */
-   brw_wm_pass_hal (c);
+       /* Translate to intermediate representation.  Build register usage
+	* chains.
+	*/
+       brw_wm_pass0(c);
+
+       /* Dead code removal.
+	*/
+       brw_wm_pass1(c);
+
+       /* Hal optimization
+	*/
+       brw_wm_pass_hal (c);
    
-   /* Register allocation.
-    */
-   c->grf_limit = BRW_WM_MAX_GRF/2;
-
-   /* This is where we start emitting gen4 code:
-    */
-   brw_init_compile(brw, &c->func);    
-
-   brw_wm_pass2(c);
-
-   c->prog_data.total_grf = c->max_wm_grf;
-   if (c->last_scratch) {
-      c->prog_data.total_scratch =
-	 c->last_scratch + 0x40;
-   } else {
-      c->prog_data.total_scratch = 0;
+       /* Register allocation.
+	*/
+       c->grf_limit = BRW_WM_MAX_GRF/2;
+
+       /* This is where we start emitting gen4 code:
+	*/
+       brw_init_compile(brw, &c->func);    
+
+       brw_wm_pass2(c);
+
+       c->prog_data.total_grf = c->max_wm_grf;
+       if (c->last_scratch) {
+	   c->prog_data.total_scratch =
+	       c->last_scratch + 0x40;
+       } else {
+	   c->prog_data.total_scratch = 0;
+       }
+
+       /* Emit GEN4 code.
+	*/
+       brw_wm_emit(c);
    }
 
-   /* Emit GEN4 code.
-    */
-   brw_wm_emit(c);
-
    /* get the program
     */
    program = brw_get_program(&c->func, &program_size);
@@ -242,7 +250,8 @@ static void brw_wm_populate_key( struct brw_context *brw,
       lookup |= IZ_STENCIL_TEST_ENABLE_BIT;
 
       if (brw->attribs.Stencil->WriteMask[0] ||
-	  (brw->attribs.Stencil->TestTwoSide && brw->attribs.Stencil->WriteMask[1]))
+	  (brw->attribs.Stencil->_TestTwoSide &&
+	   brw->attribs.Stencil->WriteMask[1]))
 	 lookup |= IZ_STENCIL_WRITE_ENABLE_BIT;
    }
 
@@ -284,7 +293,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
 
 
    /* BRW_NEW_WM_INPUT_DIMENSIONS */
-   key->projtex_mask = brw->wm.input_size_masks[4-1]; 
+   key->projtex_mask = brw->wm.input_size_masks[4-1] >> (FRAG_ATTRIB_TEX0 - FRAG_ATTRIB_WPOS); 
 
    /* _NEW_LIGHT */
    key->flat_shade = (brw->attribs.Light->ShadeModel == GL_FLAT);
@@ -301,11 +310,38 @@ static void brw_wm_populate_key( struct brw_context *brw,
 	    key->shadowtex_mask |= 1<<i;
 	 }
 
-	 if (t->Image[0][t->BaseLevel]->InternalFormat == GL_YCBCR_MESA)
+	 if (t->Image[0][t->BaseLevel]->InternalFormat == GL_YCBCR_MESA) {
 	    key->yuvtex_mask |= 1<<i;
+	    if (t->Image[0][t->BaseLevel]->TexFormat->MesaFormat == 
+		    MESA_FORMAT_YCBCR)
+		key->yuvtex_swap_mask |= 1<< i;
+	 }
       }
    }
-	  
+
+   /* _NEW_BUFFERS */
+   /*
+    * Include the draw buffer origin and height so that we can calculate
+    * fragment position values relative to the bottom left of the drawable,
+    * from the incoming screen origin relative position we get as part of our
+    * payload.
+    *
+    * We could avoid recompiling by including this as a constant referenced by
+    * our program, but if we were to do that it would also be nice to handle
+    * getting that constant updated at batchbuffer submit time (when we
+    * hold the lock and know where the buffer really is) rather than at emit
+    * time when we don't hold the lock and are just guessing.  We could also
+    * just avoid using this as key data if the program doesn't use
+    * fragment.position.
+    *
+    * This pretty much becomes moot with DRI2 and redirected buffers anyway,
+    * as our origins will always be zero then.
+    */
+   if (brw->intel.driDrawable != NULL) {
+      key->origin_x = brw->intel.driDrawable->x;
+      key->origin_y = brw->intel.driDrawable->y;
+      key->drawable_height = brw->intel.driDrawable->h;
+   }
 
    /* Extra info:
     */
@@ -344,6 +380,7 @@ const struct brw_tracked_state brw_wm_prog = {
 		_NEW_POLYGON |
 		_NEW_LINE |
 		_NEW_LIGHT |
+		_NEW_BUFFERS |
 		_NEW_TEXTURE),
       .brw   = (BRW_NEW_FRAGMENT_PROGRAM |
 		BRW_NEW_WM_INPUT_DIMENSIONS |
diff --git a/i965/brw_wm.h b/i965/brw_wm.h
index f5fddfd..9fb231d 100644
--- a/i965/brw_wm.h
+++ b/i965/brw_wm.h
@@ -69,9 +69,12 @@ struct brw_wm_prog_key {
    GLuint runtime_check_aads_emit:1;
    
    GLuint yuvtex_mask:8;
-   GLuint pad1:24;
+   GLuint yuvtex_swap_mask:8;	/* UV swapped */
+   GLuint pad1:16;
 
    GLuint program_string_id:32;
+   GLuint origin_x, origin_y;
+   GLuint drawable_height;
 };
 
 
@@ -194,6 +197,7 @@ struct brw_wm_compile {
    GLuint nr_fp_insns;
    GLuint fp_temp;
    GLuint fp_interp_emitted;
+   GLuint fp_deriv_emitted;
 
    struct prog_src_register pixel_xy;
    struct prog_src_register delta_xy;
@@ -231,6 +235,15 @@ struct brw_wm_compile {
    GLuint grf_limit;
    GLuint max_wm_grf;
    GLuint last_scratch;
+
+   struct {
+	GLboolean inited;
+	struct brw_reg reg;
+   } wm_regs[PROGRAM_PAYLOAD+1][256][4];
+   struct brw_reg stack;
+   struct brw_reg emit_mask_reg;
+   GLuint reg_index;
+   GLuint tmp_index;
 };
 
 
@@ -259,4 +272,6 @@ void brw_wm_lookup_iz( GLuint line_aa,
 		       GLuint lookup,
 		       struct brw_wm_prog_key *key );
 
+GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp);
+void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c);
 #endif
diff --git a/i965/brw_wm_emit.c b/i965/brw_wm_emit.c
index 80bd576..fd66631 100644
--- a/i965/brw_wm_emit.c
+++ b/i965/brw_wm_emit.c
@@ -122,26 +122,30 @@ static void emit_delta_xy(struct brw_compile *p,
    }
 }
 
-static void emit_wpos_xy(struct brw_compile *p,
-			   const struct brw_reg *dst,
-			   GLuint mask,
-			   const struct brw_reg *arg0)
+static void emit_wpos_xy(struct brw_wm_compile *c,
+			 const struct brw_reg *dst,
+			 GLuint mask,
+			 const struct brw_reg *arg0)
 {
-   /* Calc delta X,Y by subtracting origin in r1 from the pixel
-    * centers.
+   struct brw_compile *p = &c->func;
+
+   /* Calculate the pixel offset from window bottom left into destination
+    * X and Y channels.
     */
    if (mask & WRITEMASK_X) {
-      brw_MOV(p,
+      /* X' = X - origin */
+      brw_ADD(p,
 	      dst[0],
-	      retype(arg0[0], BRW_REGISTER_TYPE_UW));
+	      retype(arg0[0], BRW_REGISTER_TYPE_W),
+	      brw_imm_d(0 - c->key.origin_x));
    }
 
    if (mask & WRITEMASK_Y) {
-      /* TODO -- window_height - Y */
-      brw_MOV(p,
+      /* Y' = height - (Y - origin_y) = height + origin_y - Y */
+      brw_ADD(p,
 	      dst[1],
-	      negate(retype(arg0[1], BRW_REGISTER_TYPE_UW)));
-
+	      negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
+	      brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
    }
 }
 
@@ -219,6 +223,10 @@ static void emit_pinterp( struct brw_compile *p,
       if (mask & (1<<i)) {
 	 brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
 	 brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
+      }
+   }
+   for(i = 0; i < 4; i++ ) {
+      if (mask & (1<<i)) {
 	 brw_MUL(p, dst[i], dst[i], w[3]);
       }
    }
@@ -229,20 +237,20 @@ static void emit_cinterp( struct brw_compile *p,
 			 GLuint mask,
 			 const struct brw_reg *arg0 )
 {
-   struct brw_reg interp[4];
-   GLuint nr = arg0[0].nr;
-   GLuint i;
-
-   interp[0] = brw_vec1_grf(nr, 0);
-   interp[1] = brw_vec1_grf(nr, 4);
-   interp[2] = brw_vec1_grf(nr+1, 0);
-   interp[3] = brw_vec1_grf(nr+1, 4);
-
-   for(i = 0; i < 4; i++ ) {
-      if (mask & (1<<i)) {
-	 brw_MOV(p, dst[i], suboffset(interp[i],3));	/* TODO: optimize away like other moves */
-      }
-   }
+	struct brw_reg interp[4];
+	GLuint nr = arg0[0].nr;
+	GLuint i;
+
+	interp[0] = brw_vec1_grf(nr, 0);
+	interp[1] = brw_vec1_grf(nr, 4);
+	interp[2] = brw_vec1_grf(nr+1, 0);
+	interp[3] = brw_vec1_grf(nr+1, 4);
+
+	for(i = 0; i < 4; i++ ) {
+		if (mask & (1<<i)) {
+			brw_MOV(p, dst[i], suboffset(interp[i],3));	/* TODO: optimize away like other moves */
+		}
+	}
 }
 
 
@@ -343,11 +351,10 @@ static void emit_lrp( struct brw_compile *p,
       }
    }
 }
-
-
-static void emit_slt( struct brw_compile *p, 
+static void emit_sop( struct brw_compile *p, 
 		      const struct brw_reg *dst,
 		      GLuint mask,
+		      GLuint cond,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
@@ -356,34 +363,66 @@ static void emit_slt( struct brw_compile *p,
    for (i = 0; i < 4; i++) {
       if (mask & (1<<i)) {	
 	 brw_MOV(p, dst[i], brw_imm_f(0));
-	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
+	 brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
 	 brw_MOV(p, dst[i], brw_imm_f(1.0));
 	 brw_set_predicate_control_flag_value(p, 0xff);
       }
    }
 }
 
-/* Isn't this just the same as the above with the args swapped?
- */
-static void emit_sge( struct brw_compile *p, 
+static void emit_slt( struct brw_compile *p, 
 		      const struct brw_reg *dst,
 		      GLuint mask,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
-   GLuint i;
+	 emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
+}
 
-   for (i = 0; i < 4; i++) {
-      if (mask & (1<<i)) {	
-	 brw_MOV(p, dst[i], brw_imm_f(0));
-	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
-	 brw_MOV(p, dst[i], brw_imm_f(1.0));
-	 brw_set_predicate_control_flag_value(p, 0xff);
-      }
-   }
+static void emit_sle( struct brw_compile *p, 
+		      const struct brw_reg *dst,
+		      GLuint mask,
+		      const struct brw_reg *arg0,
+		      const struct brw_reg *arg1 )
+{
+	 emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
+}
+
+static void emit_sgt( struct brw_compile *p, 
+		      const struct brw_reg *dst,
+		      GLuint mask,
+		      const struct brw_reg *arg0,
+		      const struct brw_reg *arg1 )
+{
+	 emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
+}
+
+static void emit_sge( struct brw_compile *p, 
+		      const struct brw_reg *dst,
+		      GLuint mask,
+		      const struct brw_reg *arg0,
+		      const struct brw_reg *arg1 )
+{
+	 emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
 }
 
+static void emit_seq( struct brw_compile *p, 
+		      const struct brw_reg *dst,
+		      GLuint mask,
+		      const struct brw_reg *arg0,
+		      const struct brw_reg *arg1 )
+{
+	 emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
+}
 
+static void emit_sne( struct brw_compile *p, 
+		      const struct brw_reg *dst,
+		      GLuint mask,
+		      const struct brw_reg *arg0,
+		      const struct brw_reg *arg1 )
+{
+	 emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
+}
 
 static void emit_cmp( struct brw_compile *p, 
 		      const struct brw_reg *dst,
@@ -465,6 +504,9 @@ static void emit_dp3( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
+   if (!(mask & WRITEMASK_XYZW))
+      return; /* Do not emit dead code*/
+
    assert((mask & WRITEMASK_XYZW) == WRITEMASK_X);
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
@@ -482,6 +524,9 @@ static void emit_dp4( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
+   if (!(mask & WRITEMASK_XYZW))
+      return; /* Do not emit dead code*/
+
    assert((mask & WRITEMASK_XYZW) == WRITEMASK_X);
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
@@ -500,6 +545,9 @@ static void emit_dph( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
+   if (!(mask & WRITEMASK_XYZW))
+      return; /* Do not emit dead code*/
+
    assert((mask & WRITEMASK_XYZW) == WRITEMASK_X);
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
@@ -543,8 +591,11 @@ static void emit_math1( struct brw_compile *p,
 			GLuint mask,
 			const struct brw_reg *arg0 )
 {
-   assert((mask & WRITEMASK_XYZW) == WRITEMASK_X ||
-	  function == BRW_MATH_FUNCTION_SINCOS);
+   if (!(mask & WRITEMASK_XYZW))
+      return; /* Do not emit dead code*/
+
+   //assert((mask & WRITEMASK_XYZW) == WRITEMASK_X ||
+   //	  function == BRW_MATH_FUNCTION_SINCOS);
    
    brw_MOV(p, brw_message_reg(2), arg0[0]);
 
@@ -567,6 +618,9 @@ static void emit_math2( struct brw_compile *p,
 			const struct brw_reg *arg0,
 			const struct brw_reg *arg1)
 {
+   if (!(mask & WRITEMASK_XYZW))
+      return; /* Do not emit dead code*/
+
    assert((mask & WRITEMASK_XYZW) == WRITEMASK_X);
 
    brw_push_insn_state(p);
@@ -670,7 +724,6 @@ static void emit_tex( struct brw_wm_compile *c,
 	      responseLength,
 	      msgLength,
 	      0);	
-
 }
 
 
@@ -1081,7 +1134,7 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 break;
 
       case WM_WPOSXY:
-	 emit_wpos_xy(p, dst, dst_flags, args[0]);
+	 emit_wpos_xy(c, dst, dst_flags, args[0]);
 	 break;
 
       case WM_PIXELW:
@@ -1209,9 +1262,21 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 emit_slt(p, dst, dst_flags, args[0], args[1]);
 	 break;
 
+      case OPCODE_SLE:
+	 emit_sle(p, dst, dst_flags, args[0], args[1]);
+	break;
+      case OPCODE_SGT:
+	 emit_sgt(p, dst, dst_flags, args[0], args[1]);
+	break;
       case OPCODE_SGE:
 	 emit_sge(p, dst, dst_flags, args[0], args[1]);
 	 break;
+      case OPCODE_SEQ:
+	 emit_seq(p, dst, dst_flags, args[0], args[1]);
+	break;
+      case OPCODE_SNE:
+	 emit_sne(p, dst, dst_flags, args[0], args[1]);
+	break;
 
       case OPCODE_LIT:
 	 emit_lit(p, dst, dst_flags, args[0]);
@@ -1232,7 +1297,8 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 break;
 
       default:
-	 assert(0);
+	_mesa_printf("unsupport opcode %d in fragment program\n", 
+		inst->opcode);
       }
       
       for (i = 0; i < 4; i++)
diff --git a/i965/brw_wm_fp.c b/i965/brw_wm_fp.c
index dc57fd2..f895f96 100644
--- a/i965/brw_wm_fp.c
+++ b/i965/brw_wm_fp.c
@@ -144,7 +144,7 @@ static struct prog_dst_register dst_undef( void )
 
 static struct prog_dst_register get_temp( struct brw_wm_compile *c )
 {
-   int bit = ffs( ~c->fp_temp );
+   int bit = _mesa_ffs( ~c->fp_temp );
 
    if (!bit) {
       _mesa_printf("%s: out of temporaries\n", __FILE__);
@@ -158,7 +158,7 @@ static struct prog_dst_register get_temp( struct brw_wm_compile *c )
 
 static void release_temp( struct brw_wm_compile *c, struct prog_dst_register temp )
 {
-   c->fp_temp &= ~1<<(temp.Index + 1 - FIRST_INTERNAL_TEMP);
+   c->fp_temp &= ~(1 << (temp.Index - FIRST_INTERNAL_TEMP)); /* clear only bit (Index - FIRST_INTERNAL_TEMP) of fp_temp */
 }
 
 
@@ -176,6 +176,7 @@ static struct prog_instruction *emit_insn(struct brw_wm_compile *c,
 {
    struct prog_instruction *inst = get_fp_inst(c);
    *inst = *inst0;
+   inst->Data = (void *)inst0;
    return inst;
 }
 
@@ -201,7 +202,6 @@ static struct prog_instruction * emit_op(struct brw_wm_compile *c,
    inst->SrcReg[0] = src0;
    inst->SrcReg[1] = src1;
    inst->SrcReg[2] = src2;
-   
    return inst;
 }
    
@@ -361,6 +361,37 @@ static void emit_interp( struct brw_wm_compile *c,
    c->fp_interp_emitted |= 1<<idx;
 }
 
+/* Lower DDX: emit an OPCODE_DDX reading the payload slot named by SrcReg[0]. */
+static void emit_ddx( struct brw_wm_compile *c, const struct prog_instruction *inst )
+{
+    const GLuint slot = inst->SrcReg[0].Index;
+    struct prog_src_register payload = src_reg(PROGRAM_PAYLOAD, slot);
+
+    /* Record in fp_deriv_emitted that this slot feeds a derivative. */
+    c->fp_deriv_emitted |= 1<<slot;
+    emit_op(c, OPCODE_DDX,
+            inst->DstReg,
+            0, 0, 0,
+            payload,
+            get_pixel_w(c),
+            src_undef());
+}
+
+/* Lower DDY: emit an OPCODE_DDY reading the payload slot named by SrcReg[0]. */
+static void emit_ddy( struct brw_wm_compile *c, const struct prog_instruction *inst )
+{
+    const GLuint slot = inst->SrcReg[0].Index;
+    struct prog_src_register payload = src_reg(PROGRAM_PAYLOAD, slot);
+
+    /* Record in fp_deriv_emitted that this slot feeds a derivative. */
+    c->fp_deriv_emitted |= 1<<slot;
+    emit_op(c, OPCODE_DDY,
+            inst->DstReg,
+            0, 0, 0,
+            payload,
+            get_pixel_w(c),
+            src_undef());
+}
 
 /***********************************************************************
  * Hacks to extend the program parameter and constant lists.
@@ -463,17 +494,20 @@ static void precalc_dst( struct brw_wm_compile *c,
 
 
    if (dst.WriteMask & WRITEMASK_XZ) {
+      struct prog_instruction *swz;
       GLuint z = GET_SWZ(src0.Swizzle, Z);
 
       /* dst.xz = swz src0.1zzz
        */
-      emit_op(c,
-	      OPCODE_SWZ,
-	      dst_mask(dst, WRITEMASK_XZ),
-	      inst->SaturateMode, 0, 0,
-	      src_swizzle(src0, SWIZZLE_ONE, z, z, z),
-	      src_undef(),
-	      src_undef());
+      swz = emit_op(c,
+		    OPCODE_SWZ,
+		    dst_mask(dst, WRITEMASK_XZ),
+		    inst->SaturateMode, 0, 0,
+		    src_swizzle(src0, SWIZZLE_ONE, z, z, z),
+		    src_undef(),
+		    src_undef());
+      /* Avoid letting negation flag of src0 affect our 1 constant. */
+      swz->SrcReg[0].NegateBase &= ~NEGATE_X;
    }
    if (dst.WriteMask & WRITEMASK_W) {
       /* dst.w = mov src1.w
@@ -496,15 +530,19 @@ static void precalc_lit( struct brw_wm_compile *c,
    struct prog_dst_register dst = inst->DstReg;
    
    if (dst.WriteMask & WRITEMASK_XW) {
+      struct prog_instruction *swz;
+
       /* dst.xw = swz src0.1111
        */
-      emit_op(c,
-	      OPCODE_SWZ,
-	      dst_mask(dst, WRITEMASK_XW),
-	      0, 0, 0,
-	      src_swizzle1(src0, SWIZZLE_ONE),
-	      src_undef(),
-	      src_undef());
+      swz = emit_op(c,
+		    OPCODE_SWZ,
+		    dst_mask(dst, WRITEMASK_XW),
+		    0, 0, 0,
+		    src_swizzle1(src0, SWIZZLE_ONE),
+		    src_undef(),
+		    src_undef());
+      /* Avoid letting the negation flag of src0 affect our 1 constant. */
+      swz->SrcReg[0].NegateBase = 0;
    }
 
 
@@ -618,17 +656,21 @@ static void precalc_tex( struct brw_wm_compile *c,
 	      src_undef());
    }
    else {
+       GLboolean  swap_uv = c->key.yuvtex_swap_mask & (1<<inst->TexSrcUnit);
+
       /* 
 	 CONST C0 = { -.5, -.0625,  -.5, 1.164 }
 	 CONST C1 = { 1.596, -0.813, 2.018, -.391 }
 	 UYV     = TEX ...
 	 UYV.xyz = ADD UYV,     C0
 	 UYV.y   = MUL UYV.y,   C0.w
-	 RGB.xyz = MAD UYV.xxz, C1,   UYV.y
+ 	 if (UV swapped)
+	    RGB.xyz = MAD UYV.zzx, C1,   UYV.y
+	 else
+	    RGB.xyz = MAD UYV.xxz, C1,   UYV.y 
 	 RGB.y   = MAD UYV.z,   C1.w, RGB.y
       */
       struct prog_dst_register dst = inst->DstReg;
-      struct prog_src_register src0 = inst->SrcReg[0];
       struct prog_dst_register tmp = get_temp(c);
       struct prog_src_register tmpsrc = src_reg_from_dst(tmp);
       struct prog_src_register C0 = search_or_add_const4f( c,  -.5, -.0625, -.5, 1.164 );
@@ -642,7 +684,7 @@ static void precalc_tex( struct brw_wm_compile *c,
 	      inst->SaturateMode,
 	      inst->TexSrcUnit,
 	      inst->TexSrcTarget,
-	      src0,
+	      coord,
 	      src_undef(),
 	      src_undef());
 
@@ -658,6 +700,7 @@ static void precalc_tex( struct brw_wm_compile *c,
 
       /* YUV.y   = MUL YUV.y, C0.w
        */
+
       emit_op(c,
 	      OPCODE_MUL,
 	      dst_mask(tmp, WRITEMASK_Y),
@@ -666,13 +709,18 @@ static void precalc_tex( struct brw_wm_compile *c,
 	      src_swizzle1(C0, W),
 	      src_undef());
 
-      /* RGB.xyz = MAD YUV.xxz, C1, YUV.y
+      /* 
+       * if (UV swapped)
+       *     RGB.xyz = MAD YUV.zzx, C1, YUV.y
+       * else
+       *     RGB.xyz = MAD YUV.xxz, C1, YUV.y
        */
+
       emit_op(c,
 	      OPCODE_MAD,
 	      dst_mask(dst, WRITEMASK_XYZ),
 	      0, 0, 0,
-	      src_swizzle(tmpsrc, X,X,Z,Z),
+	      swap_uv?src_swizzle(tmpsrc, Z,Z,X,X):src_swizzle(tmpsrc, X,X,Z,Z),
 	      C1,
 	      src_swizzle1(tmpsrc, Y));
 
@@ -689,7 +737,8 @@ static void precalc_tex( struct brw_wm_compile *c,
       release_temp(c, tmp);
    }
 
-   if (inst->TexSrcTarget == GL_TEXTURE_RECTANGLE_NV) 
+   if ((inst->TexSrcTarget == TEXTURE_RECT_INDEX) ||
+       (inst->TexSrcTarget == TEXTURE_CUBE_INDEX))
       release_temp(c, tmpcoord);
 }
 
@@ -710,7 +759,7 @@ static GLboolean projtex( struct brw_wm_compile *c,
       return 0;  /* ut2004 gun rendering !?! */
    else if (src.File == PROGRAM_INPUT && 
 	    GET_SWZ(src.Swizzle, W) == W &&
-	    (c->key.projtex_mask & (1<<src.Index)) == 0)
+           (c->key.projtex_mask & (1<<(src.Index + FRAG_ATTRIB_WPOS - FRAG_ATTRIB_TEX0))) == 0)
       return 0;
    else
       return 1;
@@ -939,7 +988,11 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
       case OPCODE_LIT:
 	 precalc_lit(c, inst);
 	 break;
-     
+
+      case OPCODE_TEX:
+	 precalc_tex(c, inst);
+	 break;
+
       case OPCODE_TXP:
 	 precalc_txp(c, inst);
 	 break;
@@ -957,8 +1010,16 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
 	  */
 	 out->DstReg.WriteMask = 0;
 	 break;
-
+      case OPCODE_DDX:
+	 emit_ddx(c, inst);
+	 break;
+      case OPCODE_DDY:
+         emit_ddy(c, inst);
+	break;
       case OPCODE_END:
+	 emit_fog(c);
+	 emit_fb_write(c);
+	 break;
       case OPCODE_PRINT:
 	 break;
 	 
@@ -967,15 +1028,11 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
 	 break;
       }
    }
-   
-   emit_fog(c);
-   emit_fb_write(c);
-
 
    if (INTEL_DEBUG & DEBUG_WM) {
-      _mesa_printf("\n\n\npass_fp:\n");
-      print_insns( c->prog_instructions, c->nr_fp_insns );
-      _mesa_printf("\n");
+	   _mesa_printf("\n\n\npass_fp:\n");
+	   print_insns( c->prog_instructions, c->nr_fp_insns );
+	   _mesa_printf("\n");
    }
 }
 
diff --git a/i965/brw_wm_glsl.c b/i965/brw_wm_glsl.c
new file mode 100644
index 0000000..5a1f80d
--- /dev/null
+++ b/i965/brw_wm_glsl.c
@@ -0,0 +1,1370 @@
+#include "macros.h"
+#include "shader/prog_parameter.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_wm.h"
+
+/* Only a guess; a proper flag in gl_fragment_program is needed later. */
+GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
+{
+    int n; /* Heuristic: any opcode listed below makes this return GL_TRUE. */
+    for (n = 0; n < fp->Base.NumInstructions; n++) {
+	const struct prog_instruction *cur = &fp->Base.Instructions[n];
+	switch (cur->Opcode) {
+	case OPCODE_IF:
+	case OPCODE_INT:
+	case OPCODE_ENDIF:
+	case OPCODE_CAL:
+	case OPCODE_BRK:
+	case OPCODE_RET:
+	case OPCODE_DDX:
+	case OPCODE_DDY:
+	case OPCODE_BGNLOOP:
+	    return GL_TRUE;
+	default:
+	    continue; /* not one of the listed opcodes, keep scanning */
+	}
+    }
+    return GL_FALSE;
+}
+
+/* Store reg in the wm_regs slot (file, index, component) and flag it inited. */
+static void set_reg(struct brw_wm_compile *c, int file, int index, int component, struct brw_reg reg)
+{
+    c->wm_regs[file][index][component].inited = GL_TRUE;
+    c->wm_regs[file][index][component].reg = reg;
+}
+
+/* Index of the lowest channel set in the write mask; 4 when none is set. */
+static int get_scalar_dst_index(struct prog_instruction *inst)
+{
+    int chan = 0;
+    while (chan < 4 && !(inst->DstReg.WriteMask & (1<<chan)))
+	chan++;
+    return chan;
+}
+
+static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
+{
+    struct brw_reg tmp = brw_vec8_grf(c->tmp_index, 0); /* vec8 view of the current temp GRF */
+    c->tmp_index--;	/* the next temp comes from the register below */
+    return tmp;
+}
+
+static void release_tmps(struct brw_wm_compile *c)
+{
+    c->tmp_index = 127; /* rewind the temp cursor; alloc_tmp() counts down from this GRF number */
+}
+
+/*
+ * Look up (binding a fresh GRF on first use) the register backing one
+ * component of a program register; `nr` is accepted but not used here.
+ */
+static struct brw_reg 
+get_reg(struct brw_wm_compile *c, int file, int index, int component, int nr, GLuint neg, GLuint abs)
+{
+    struct brw_reg result;
+
+    /* An undefined file has no storage: hand back the null register. */
+    if (file == PROGRAM_UNDEFINED)
+	return brw_null_reg();
+
+    /* Constants and uniforms share the slots of PROGRAM_STATE_VAR. */
+    if (file == PROGRAM_CONSTANT || file == PROGRAM_UNIFORM)
+	file = PROGRAM_STATE_VAR;
+
+    if (c->wm_regs[file][index][component].inited) {
+	result = c->wm_regs[file][index][component].reg;
+    } else {
+	/* First use of this slot: bind it to GRF reg_index and advance. */
+	result = brw_vec8_grf(c->reg_index, 0);
+	set_reg(c, file, index, component, result);
+	c->reg_index++;
+    }
+
+    /* neg is tested one bit per component; abs is applied last. */
+    if (neg & (1<< component))
+	result = negate(result);
+    if (abs)
+	result = brw_abs(result);
+    return result;
+}
+
+static void prealloc_reg(struct brw_wm_compile *c)
+{
+    int i, j;
+    struct brw_reg reg;
+    int nr_interp_regs = 0;
+    GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
+    /* Pre-assign GRFs, in order: PAYLOAD_DEPTH slots, parameters, payload inputs. */
+    for (i = 0; i < 4; i++) {
+	reg = (i < c->key.nr_depth_regs) 
+	    ? brw_vec8_grf(i*2, 0) : brw_vec8_grf(0, 0); /* slots past nr_depth_regs alias GRF 0 */
+	set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
+    }
+    c->reg_index += 2*c->key.nr_depth_regs;
+    {
+	int nr_params = c->fp->program.Base.Parameters->NumParameters;
+	struct gl_program_parameter_list *plist = 
+	    c->fp->program.Base.Parameters;
+	int index = 0;
+	c->prog_data.nr_params = 4*nr_params; /* four values per parameter */
+	for (i = 0; i < nr_params; i++) {
+	    for (j = 0; j < 4; j++, index++) {
+		reg = brw_vec1_grf(c->reg_index + index/8, 
+			index%8); /* eight scalars packed per GRF */
+		c->prog_data.param[index] = 
+		    &plist->ParameterValues[i][j];
+		set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
+	    }
+	}
+	c->nr_creg = 2*((4*nr_params+15)/16); /* round up to 16 scalars, two GRFs per 16 */
+	c->reg_index += c->nr_creg;
+    }
+    for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
+	if (inputs & (1<<i)) {
+	    nr_interp_regs++;
+	    reg = brw_vec8_grf(c->reg_index, 0);
+	    for (j = 0; j < 4; j++)
+		set_reg(c, PROGRAM_PAYLOAD, i, j, reg); /* all four components share one base reg */
+	    c->reg_index += 2;
+	    /* each enabled input advances reg_index by two GRFs */
+	}
+    }
+    c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
+    c->prog_data.urb_read_length = nr_interp_regs * 2;
+    c->prog_data.curb_read_length = c->nr_creg;
+    c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
+    c->reg_index++;
+    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
+    c->reg_index += 2;
+}
+
+/* Destination lookup: get_reg() on DstReg with no negate/abs modifiers. */
+static struct brw_reg get_dst_reg(struct brw_wm_compile *c, struct prog_instruction *inst, int component, int nr)
+{
+    const struct prog_dst_register *d = &inst->DstReg;
+    return get_reg(c, d->File, d->Index, component, nr, 0, 0);
+}
+
+/* Source lookup: map channel `index` through the swizzle, then get_reg(). */
+static struct brw_reg get_src_reg(struct brw_wm_compile *c, struct prog_src_register *src, int index, int nr)
+{
+    /* The swizzle picks which stored component feeds this channel. */
+    const int swizzled = GET_SWZ(src->Swizzle, index);
+    return get_reg(c, src->File, src->Index, swizzled, nr, src->NegateBase, src->Abs);
+}
+
+/* ABS: for each written channel, MOV brw_abs(src0) into dst, honouring SaturateMode. */
+static void emit_abs( struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    int chan;
+    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
+    for (chan = 0; chan < 4; chan++) {
+	struct brw_reg out, in;
+	if (!(inst->DstReg.WriteMask & (1<<chan)))
+	    continue;
+	out = get_dst_reg(c, inst, chan, 1);
+	in = get_src_reg(c, &inst->SrcReg[0], chan, 1);
+	brw_MOV(p, out, brw_abs(in));
+    }
+    brw_set_saturate(p, 0);
+}
+
+/* INT: RNDD per written channel. NOTE(review): RNDD looks like round-down, not truncation - confirm. */
+static void emit_int( struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    int chan;
+    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
+    for (chan = 0; chan < 4; chan++) {
+	struct brw_reg out, in;
+	if (!(writemask & (1<<chan)))
+	    continue;
+	out = get_dst_reg(c, inst, chan, 1) ;
+	in = get_src_reg(c, &inst->SrcReg[0], chan, 1);
+	brw_RNDD(p, out, in);
+    }
+    brw_set_saturate(p, 0);
+}
+
+/* MOV: copy src0 to dst on each written channel, honouring SaturateMode. */
+static void emit_mov( struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    int chan;
+    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
+    for (chan = 0; chan < 4; chan++) {
+	struct brw_reg out, in;
+	if (!(writemask & (1<<chan)))
+	    continue;
+	out = get_dst_reg(c, inst, chan, 1);
+	in = get_src_reg(c, &inst->SrcReg[0], chan, 1);
+	brw_MOV(p, out, in);
+    }
+    brw_set_saturate(p, 0);
+}
+
+static void emit_pixel_xy(struct brw_wm_compile *c,
+		struct prog_instruction *inst)
+{
+    struct brw_reg r1 = brw_vec1_grf(1, 0);
+    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
+    /* r1_uw: GRF 1 viewed as unsigned words; source of both ADDs below. */
+    struct brw_reg dst0, dst1;
+    struct brw_compile *p = &c->func;
+    GLuint mask = inst->DstReg.WriteMask;
+    /* Both destinations are looked up regardless of the write mask. */
+    dst0 = get_dst_reg(c, inst, 0, 1);
+    dst1 = get_dst_reg(c, inst, 1, 1);
+    /* Calculate pixel centers by adding 1 or 0 to each of the
+     * micro-tile coordinates passed in r1.
+     */
+    if (mask & WRITEMASK_X) {
+	brw_ADD(p,
+		vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
+		stride(suboffset(r1_uw, 4), 2, 4, 0),
+		brw_imm_v(0x10101010));
+    }
+    /* Y reads from suboffset 5 and adds a different immediate vector. */
+    if (mask & WRITEMASK_Y) {
+	brw_ADD(p,
+		vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
+		stride(suboffset(r1_uw, 5), 2, 4, 0),
+		brw_imm_v(0x11001100));
+    }
+    /* Components Z and W are never written by this function. */
+}
+
+static void emit_delta_xy(struct brw_wm_compile *c,
+		struct prog_instruction *inst)
+{
+    struct brw_reg r1 = brw_vec1_grf(1, 0);
+    struct brw_reg dst0, dst1, src0, src1;
+    struct brw_compile *p = &c->func;
+    GLuint mask = inst->DstReg.WriteMask;
+    /* Registers for X and Y are fetched up front, whatever the write mask. */
+    dst0 = get_dst_reg(c, inst, 0, 1);
+    dst1 = get_dst_reg(c, inst, 1, 1);
+    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+    src1 = get_src_reg(c, &inst->SrcReg[0], 1, 1);
+    /* Calc delta X,Y by subtracting origin in r1 from the pixel
+     * centers.
+     */
+    if (mask & WRITEMASK_X) {
+	brw_ADD(p,
+		dst0,
+		retype(src0, BRW_REGISTER_TYPE_UW),
+		negate(r1));
+    }
+    /* Same for Y, using the element after r1 (suboffset 1). */
+    if (mask & WRITEMASK_Y) {
+	brw_ADD(p,
+		dst1,
+		retype(src1, BRW_REGISTER_TYPE_UW),
+		negate(suboffset(r1,1)));
+	/* sources are read as UW, presumably matching emit_pixel_xy()'s output - confirm */
+    }
+    /* Z and W are left untouched. */
+}
+
+
+static void fire_fb_write( struct brw_wm_compile *c,
+                           GLuint base_reg,
+                           GLuint nr )
+{
+    struct brw_compile *p = &c->func;
+    /* Copy GRF 1 into message register base_reg+1, then issue brw_fb_WRITE. */
+    /* Pass through control information:
+     */
+    /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
+    {
+	brw_push_insn_state(p);
+	brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
+	brw_MOV(p,
+		brw_message_reg(base_reg + 1),
+		brw_vec8_grf(1, 0));
+	brw_pop_insn_state(p);
+    }
+    /* Send framebuffer write message: */
+    brw_fb_WRITE(p,
+	    retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
+	    base_reg,
+	    retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
+	    0,              /* render surface always 0 */
+	    nr,             /* caller-supplied count, passed through unchanged */
+	    0,
+	    1);
+}
+
+static void emit_fb_write(struct brw_wm_compile *c,
+		struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    int nr = 2;
+    int channel;
+    struct brw_reg src0;//, src1, src2, dst;
+    /* nr tracks the next message register; it is handed to fire_fb_write() at the end. */
+    /* Reserve a space for AA - may not be needed:
+     */
+    if (c->key.aa_dest_stencil_reg)
+	nr += 1;
+    {
+	brw_push_insn_state(p);
+	for (channel = 0; channel < 4; channel++) {
+	    src0 = get_src_reg(c,  &inst->SrcReg[0], channel, 1);
+	    /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
+	    /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
+	    brw_MOV(p, brw_message_reg(nr + channel), src0);
+	}
+	/* skip over the regs populated above: */
+	nr += 8;
+	brw_pop_insn_state(p);
+    }
+    /* NOTE(review): four MOVs fill nr..nr+3 yet nr advances by 8 - confirm the message layout. */
+   if (c->key.source_depth_to_render_target)
+   {
+      if (c->key.computes_depth) {
+         src0 = get_src_reg(c, &inst->SrcReg[2], 2, 1);
+         brw_MOV(p, brw_message_reg(nr), src0);
+      } else {
+         src0 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
+         brw_MOV(p, brw_message_reg(nr), src0);
+      }
+      /* either way one source was stored at nr, and nr moves on by two */
+      nr += 2;
+   }
+    /* base register 0; nr is forwarded as fire_fb_write()'s count argument */
+    fire_fb_write(c, 0, nr);
+}
+
+static void emit_pixel_w( struct brw_wm_compile *c,
+		struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    GLuint mask = inst->DstReg.WriteMask;
+    if (mask & WRITEMASK_W) {
+	struct brw_reg dst, src0, delta0, delta1;
+	struct brw_reg interp3;
+	/* Nothing is emitted unless W is in the write mask. */
+	dst = get_dst_reg(c, inst, 3, 1);
+	src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+	delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
+	delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
+	/* interp3: sub-register 4 of the GRF following src0's. */
+	interp3 = brw_vec1_grf(src0.nr+1, 4);
+	/* Calc 1/w - just linterp wpos[3] optimized by putting the
+	 * result straight into a message reg.
+	 */
+	brw_LINE(p, brw_null_reg(), interp3, delta0);
+	brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
+	/* message register 2 is the one named in the INV call below */
+	/* Calc w */
+	brw_math_16( p, dst,
+		BRW_MATH_FUNCTION_INV,
+		BRW_MATH_SATURATE_NONE,
+		2, brw_null_reg(),
+		BRW_MATH_PRECISION_FULL);
+    }
+}
+
+/* LINTERP: per written channel, brw_LINE followed by brw_MAC on vec1
+ * operands taken from the two GRFs starting at src0's register number. */
+static void emit_linterp(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    struct brw_reg coef[4];
+    struct brw_reg base, dx, dy, out;
+    GLuint grf;
+    int chan;
+
+    base = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+    dx = get_src_reg(c, &inst->SrcReg[1], 0, 1);
+    dy = get_src_reg(c, &inst->SrcReg[1], 1, 1);
+    grf = base.nr;
+
+    /* Two channels per register, at sub-registers 0 and 4. */
+    for (chan = 0; chan < 4; chan++)
+	coef[chan] = brw_vec1_grf(grf + chan / 2, (chan % 2) * 4);
+
+    for (chan = 0; chan < 4; chan++) {
+	if (!(writemask & (1<<chan)))
+	    continue;
+	out = get_dst_reg(c, inst, chan, 1);
+	brw_LINE(p, brw_null_reg(), coef[chan], dx);
+	brw_MAC(p, out, suboffset(coef[chan],1), dy);
+    }
+}
+
+/* CINTERP: per written channel, MOV from suboffset 3 of that channel's
+ * vec1 operand (two GRFs starting at src0's register number). */
+static void emit_cinterp(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    struct brw_reg coef[4];
+    struct brw_reg base, out;
+    GLuint grf;
+    int chan;
+
+    base = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+    grf = base.nr;
+
+    /* Two channels per register, at sub-registers 0 and 4. */
+    for (chan = 0; chan < 4; chan++)
+	coef[chan] = brw_vec1_grf(grf + chan / 2, (chan % 2) * 4);
+
+    for (chan = 0; chan < 4; chan++) {
+	if (!(writemask & (1<<chan)))
+	    continue;
+	out = get_dst_reg(c, inst, chan, 1);
+	brw_MOV(p, out, suboffset(coef[chan],3));
+    }
+}
+
+/* PINTERP: like emit_linterp (LINE + MAC per written channel), followed by
+ * a MUL of the result with channel 3 of SrcReg[2]. */
+static void emit_pinterp(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    struct brw_reg coef[4];
+    struct brw_reg base, dx, dy, scale, out;
+    GLuint grf;
+    int chan;
+
+    /* Lookup order matters: get_src_reg() may bind new registers. */
+    base = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+    dx = get_src_reg(c, &inst->SrcReg[1], 0, 1);
+    dy = get_src_reg(c, &inst->SrcReg[1], 1, 1);
+    scale = get_src_reg(c, &inst->SrcReg[2], 3, 1);
+    grf = base.nr;
+
+    /* Two channels per register, at sub-registers 0 and 4. */
+    for (chan = 0; chan < 4; chan++)
+	coef[chan] = brw_vec1_grf(grf + chan / 2, (chan % 2) * 4);
+
+    for (chan = 0; chan < 4; chan++) {
+	if (!(writemask & (1<<chan)))
+	    continue;
+	out = get_dst_reg(c, inst, chan, 1);
+	brw_LINE(p, brw_null_reg(), coef[chan], dx);
+	brw_MAC(p, out, suboffset(coef[chan],1), dy);
+	/* scale the interpolated value by the fetched SrcReg[2] channel */
+	brw_MUL(p, out, out, scale);
+    }
+}
+
+/* XPD: per written channel, MUL(-a[next2], b[next1]) then MAC(a[next1], b[next2]). */
+static void emit_xpd(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    int chan;
+    for (chan = 0; chan < 4; chan++) {
+	const GLuint next1 = (chan+1)%3;
+	const GLuint next2 = (chan+2)%3;
+	struct brw_reg out, lhs, rhs;
+	if (!(writemask & (1<<chan)))
+	    continue;
+	out = get_dst_reg(c, inst, chan, 1);
+	lhs = negate(get_src_reg(c, &inst->SrcReg[0], next2, 1));
+	rhs = get_src_reg(c, &inst->SrcReg[1], next1, 1);
+	brw_MUL(p, brw_null_reg(), lhs, rhs);
+	lhs = get_src_reg(c, &inst->SrcReg[0], next1, 1);
+	rhs = get_src_reg(c, &inst->SrcReg[1], next2, 1);
+	brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
+	brw_MAC(p, out, lhs, rhs);
+	brw_set_saturate(p, 0);
+    }
+    brw_set_saturate(p, 0);
+}
+
+/* DP3: three-term MUL/MAC/MAC chain written to the first enabled channel. */
+static void emit_dp3(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    struct brw_reg a[3], b[3], out;
+    int k = 0;
+    while (k < 3) {
+	a[k] = get_src_reg(c, &inst->SrcReg[0], k, 1);
+	b[k] = get_src_reg(c, &inst->SrcReg[1], k, 1);
+	k++;
+    }
+    out = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
+    brw_MUL(p, brw_null_reg(), a[0], b[0]);
+    brw_MAC(p, brw_null_reg(), a[1], b[1]);
+    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+    brw_MAC(p, out, a[2], b[2]);
+    brw_set_saturate(p, 0);
+}
+
+/* DP4: four-term MUL/MAC chain written to the first enabled channel. */
+static void emit_dp4(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    struct brw_reg a[4], b[4], out;
+    int k;
+    for (k = 0; k < 4; k++) {
+	a[k] = get_src_reg(c, &inst->SrcReg[0], k, 1);
+	b[k] = get_src_reg(c, &inst->SrcReg[1], k, 1);
+    }
+    out = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
+    brw_MUL(p, brw_null_reg(), a[0], b[0]);
+    brw_MAC(p, brw_null_reg(), a[1], b[1]);
+    brw_MAC(p, brw_null_reg(), a[2], b[2]);
+    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+    brw_MAC(p, out, a[3], b[3]);
+    brw_set_saturate(p, 0);
+}
+
+/* DPH: dst = src0.x*src1.x + src0.y*src1.y + src0.z*src1.z + src1.w. */
+static void emit_dph(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_reg src0[4], src1[4], dst;
+    int i;
+    struct brw_compile *p = &c->func;
+    for (i = 0; i < 4; i++) {
+	src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
+	src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
+    }
+    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
+    brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
+    brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
+    brw_MAC(p, dst, src0[2], src1[2]);	/* dst now holds the three-term sum */
+    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+    brw_ADD(p, dst, dst, src1[3]);	/* add src1.w to the sum instead of overwriting it */
+    brw_set_saturate(p, 0);
+}
+
+/* Shared helper for one-operand math: load channel 0 of src0 into message
+ * register 2 and issue brw_math() for `func`, writing the first enabled channel. */
+static void emit_math1(struct brw_wm_compile *c, struct prog_instruction *inst, GLuint func)
+{
+    struct brw_compile *p = &c->func;
+    struct brw_reg operand, result;
+    const GLuint sat = (inst->SaturateMode != SATURATE_OFF)
+	? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE;
+
+    operand = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+    result = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
+    brw_MOV(p, brw_message_reg(2), operand);
+    brw_math(p, result, func, sat,
+	    2,
+	    brw_null_reg(),
+	    BRW_MATH_DATA_VECTOR,
+	    BRW_MATH_PRECISION_FULL);
+}
+
+/* RCP: emit_math1() with BRW_MATH_FUNCTION_INV. */
+static void emit_rcp(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
+}
+
+/* RSQ: emit_math1() with BRW_MATH_FUNCTION_RSQ. */
+static void emit_rsq(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
+}
+
+/* SIN: emit_math1() with BRW_MATH_FUNCTION_SIN. */
+static void emit_sin(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
+}
+
+/* COS: emit_math1() with BRW_MATH_FUNCTION_COS. */
+static void emit_cos(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
+}
+
+/* EX2: emit_math1() with BRW_MATH_FUNCTION_EXP. */
+static void emit_ex2(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
+}
+
+/* LG2: emit_math1() with BRW_MATH_FUNCTION_LOG. */
+static void emit_lg2(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
+}
+
+/* ADD: dst = src0 + src1 on each written channel, honouring SaturateMode. */
+static void emit_add(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    int chan;
+    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+    for (chan = 0; chan < 4; chan++) {
+	struct brw_reg out, lhs, rhs;
+	if (!(writemask & (1<<chan)))
+	    continue;
+	out = get_dst_reg(c, inst, chan, 1);
+	lhs = get_src_reg(c, &inst->SrcReg[0], chan, 1);
+	rhs = get_src_reg(c, &inst->SrcReg[1], chan, 1);
+	brw_ADD(p, out, lhs, rhs);
+    }
+    brw_set_saturate(p, 0);
+}
+
+/* SUB: emitted as ADD of src0 and negate(src1) on each written channel. */
+static void emit_sub(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    int chan;
+    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+    for (chan = 0; chan < 4; chan++) {
+	struct brw_reg out, lhs, rhs;
+	if (!(writemask & (1<<chan)))
+	    continue;
+	out = get_dst_reg(c, inst, chan, 1);
+	lhs = get_src_reg(c, &inst->SrcReg[0], chan, 1);
+	rhs = get_src_reg(c, &inst->SrcReg[1], chan, 1);
+	brw_ADD(p, out, lhs, negate(rhs));
+    }
+    brw_set_saturate(p, 0);
+}
+
+/* MUL: dst = src0 * src1 on each written channel, honouring SaturateMode. */
+static void emit_mul(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    int chan;
+    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+    for (chan = 0; chan < 4; chan++) {
+	struct brw_reg out, lhs, rhs;
+	if (!(writemask & (1<<chan)))
+	    continue;
+	out = get_dst_reg(c, inst, chan, 1);
+	lhs = get_src_reg(c, &inst->SrcReg[0], chan, 1);
+	rhs = get_src_reg(c, &inst->SrcReg[1], chan, 1);
+	brw_MUL(p, out, lhs, rhs);
+    }
+    brw_set_saturate(p, 0);
+}
+
+/* FRC: brw_FRC on each written channel; saturate is reset only if it was set. */
+static void emit_frc(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    int chan;
+    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+    for (chan = 0; chan < 4; chan++) {
+	struct brw_reg out, in;
+	if (!(writemask & (1<<chan)))
+	    continue;
+	out = get_dst_reg(c, inst, chan, 1);
+	in = get_src_reg(c, &inst->SrcReg[0], chan, 1);
+	brw_FRC(p, out, in);
+    }
+    if (inst->SaturateMode != SATURATE_OFF)
+	brw_set_saturate(p, 0);
+}
+
+/* FLR: brw_RNDD on each written channel, honouring SaturateMode. */
+static void emit_flr(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    int chan;
+    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+    for (chan = 0; chan < 4; chan++) {
+	struct brw_reg out, in;
+	if (!(writemask & (1<<chan)))
+	    continue;
+	out = get_dst_reg(c, inst, chan, 1);
+	in = get_src_reg(c, &inst->SrcReg[0], chan, 1);
+	brw_RNDD(p, out, in);
+    }
+    brw_set_saturate(p, 0);
+}
+
+/* MAX: MOV src0 into dst, then a predicated MOV of src1 after CMP(L, src0, src1). */
+static void emit_max(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    const int saturate = (inst->SaturateMode != SATURATE_OFF) ? 1 : 0;
+    int chan;
+    brw_push_insn_state(p);
+    for (chan = 0; chan < 4; chan++) {
+	struct brw_reg out, lhs, rhs;
+	if (!(writemask & (1<<chan)))
+	    continue;
+	out = get_dst_reg(c, inst, chan, 1);
+	lhs = get_src_reg(c, &inst->SrcReg[0], chan, 1);
+	rhs = get_src_reg(c, &inst->SrcReg[1], chan, 1);
+	brw_set_saturate(p, saturate);
+	brw_MOV(p, out, lhs);
+	brw_set_saturate(p, 0);
+	brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, lhs, rhs);
+	brw_set_saturate(p, saturate);
+	brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+	brw_MOV(p, out, rhs);
+	brw_set_saturate(p, 0);
+	brw_set_predicate_control_flag_value(p, 0xff);
+    }
+    brw_pop_insn_state(p);
+}
+
+/* MIN: MOV src0 into dst, then a predicated MOV of src1 after CMP(L, src1, src0). */
+static void emit_min(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    const int saturate = (inst->SaturateMode != SATURATE_OFF) ? 1 : 0;
+    int chan;
+    brw_push_insn_state(p);
+    for (chan = 0; chan < 4; chan++) {
+	struct brw_reg out, lhs, rhs;
+	if (!(writemask & (1<<chan)))
+	    continue;
+	out = get_dst_reg(c, inst, chan, 1);
+	lhs = get_src_reg(c, &inst->SrcReg[0], chan, 1);
+	rhs = get_src_reg(c, &inst->SrcReg[1], chan, 1);
+	brw_set_saturate(p, saturate);
+	brw_MOV(p, out, lhs);
+	brw_set_saturate(p, 0);
+	brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, rhs, lhs);
+	brw_set_saturate(p, saturate);
+	brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+	brw_MOV(p, out, rhs);
+	brw_set_saturate(p, 0);
+	brw_set_predicate_control_flag_value(p, 0xff);
+    }
+    brw_pop_insn_state(p);
+}
+
+/* POW: operands go to message registers 2 and 3, then brw_math() is issued
+ * with BRW_MATH_FUNCTION_POW, writing the first enabled channel. */
+static void emit_pow(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    struct brw_reg result, base, exponent;
+    const GLuint sat = (inst->SaturateMode != SATURATE_OFF)
+	? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE;
+
+    /* Keep the dst, src0, src1 lookup order: lookups may bind registers. */
+    result = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
+    base = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+    exponent = get_src_reg(c, &inst->SrcReg[1], 0, 1);
+    brw_MOV(p, brw_message_reg(2), base);
+    brw_MOV(p, brw_message_reg(3), exponent);
+    brw_math(p, result, BRW_MATH_FUNCTION_POW, sat,
+	    2,
+	    brw_null_reg(),
+	    BRW_MATH_DATA_VECTOR,
+	    BRW_MATH_PRECISION_FULL);
+}
+
+/* LRP: per enabled channel, (1 - src0) * src2 followed by a MAC of src0 * src1. */
+static void emit_lrp(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    struct brw_reg dst, weight, arg1, arg2, safe1, safe2;
+    int chan;
+    for (chan = 0; chan < 4; chan++) {
+	if (writemask & (1<<chan)) {
+	    dst = get_dst_reg(c, inst, chan, 1);
+	    weight = get_src_reg(c, &inst->SrcReg[0], chan, 1);
+	    arg1 = get_src_reg(c, &inst->SrcReg[1], chan, 1);
+	    /* dst is overwritten before src1/src2 are read, so either one is
+	     * copied to a temporary when it shares dst's register number. */
+	    if (arg1.nr != dst.nr) {
+		safe1 = arg1;
+	    } else {
+		safe1 = alloc_tmp(c);
+		brw_MOV(p, safe1, arg1);
+	    }
+	    arg2 = get_src_reg(c, &inst->SrcReg[2], chan, 1);
+	    if (arg2.nr != dst.nr) {
+		safe2 = arg2;
+	    } else {
+		safe2 = alloc_tmp(c);
+		brw_MOV(p, safe2, arg2);
+	    }
+	    brw_ADD(p, dst, negate(weight), brw_imm_f(1.0));
+	    brw_MUL(p, brw_null_reg(), dst, safe2);
+	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+	    brw_MAC(p, dst, weight, safe1);
+	    brw_set_saturate(p, 0);
+	}
+	release_tmps(c);
+    }
+}
+
+static void emit_kil(struct brw_wm_compile *c)
+{	/* KIL_NV: emit_mask_reg = ~mask_reg(1), then AND it into GRF 0 element 0. */
+	struct brw_compile *p = &c->func;
+	struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); /* GRF 0, element 0, viewed as UW */
+	brw_push_insn_state(p);
+	brw_set_mask_control(p, BRW_MASK_DISABLE);
+	brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
+	brw_AND(p, depth, c->emit_mask_reg, depth);
+	brw_pop_insn_state(p);
+}
+
+/* MAD: per enabled channel, dst = src0 * src1, then dst += src2 (saturated if asked). */
+static void emit_mad(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    struct brw_reg dst, fac0, fac1, addend;
+    int chan;
+    for (chan = 0; chan < 4; chan++) {
+	if (!(writemask & (1<<chan)))
+	    continue;
+	dst = get_dst_reg(c, inst, chan, 1);
+	fac0 = get_src_reg(c, &inst->SrcReg[0], chan, 1);
+	fac1 = get_src_reg(c, &inst->SrcReg[1], chan, 1);
+	addend = get_src_reg(c, &inst->SrcReg[2], chan, 1);
+	/* NOTE(review): src2 is read after dst is written; no aliasing guard here. */
+	brw_MUL(p, dst, fac0, fac1);
+	/* Saturation applies to the final ADD only. */
+	brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+	brw_ADD(p, dst, dst, addend);
+	brw_set_saturate(p, 0);
+    }
+}
+
+/* Set-on-compare helper shared by SLT/SLE/SGT/SGE/SEQ/SNE: each enabled
+ * channel of dst becomes 1.0 where "src0 <cond> src1" holds, else 0.0. */
+static void emit_sop(struct brw_wm_compile *c, struct prog_instruction *inst, GLuint cond)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    struct brw_reg dst, lhs, rhs;
+    int chan;
+    brw_push_insn_state(p);
+    for (chan = 0; chan < 4; chan++) {
+	if (!(writemask & (1<<chan)))
+	    continue;
+	dst = get_dst_reg(c, inst, chan, 1);
+	lhs = get_src_reg(c, &inst->SrcReg[0], chan, 1);
+	rhs = get_src_reg(c, &inst->SrcReg[1], chan, 1);
+	brw_CMP(p, brw_null_reg(), cond, lhs, rhs);
+	brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+	brw_MOV(p, dst, brw_imm_f(0.0));
+	brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+	brw_MOV(p, dst, brw_imm_f(1.0));
+    }
+    brw_pop_insn_state(p);
+}
+
+/* SLT: dst = (src0 < src1) ? 1.0 : 0.0 per enabled channel, via emit_sop(). */
+static void emit_slt(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    emit_sop(c, inst, BRW_CONDITIONAL_L);
+}
+
+/* SLE: dst = (src0 <= src1) ? 1.0 : 0.0 per enabled channel, via emit_sop(). */
+static void emit_sle(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    emit_sop(c, inst, BRW_CONDITIONAL_LE);
+}
+
+/* SGT: dst = (src0 > src1) ? 1.0 : 0.0 per enabled channel, via emit_sop(). */
+static void emit_sgt(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    emit_sop(c, inst, BRW_CONDITIONAL_G);
+}
+
+/* SGE: dst = (src0 >= src1) ? 1.0 : 0.0 per enabled channel, via emit_sop(). */
+static void emit_sge(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    emit_sop(c, inst, BRW_CONDITIONAL_GE);
+}
+
+/* SEQ: dst = (src0 == src1) ? 1.0 : 0.0 per enabled channel, via emit_sop(). */
+static void emit_seq(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    emit_sop(c, inst, BRW_CONDITIONAL_EQ);
+}
+
+/* SNE: dst = (src0 != src1) ? 1.0 : 0.0 per enabled channel, via emit_sop(). */
+static void emit_sne(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
+}
+
+/* DDX: per enabled channel, dst = interp[chan] * w.  interp[] references
+ * elements 0 and 4 in each of the two GRFs starting at src0's register;
+ * w is component 3 of SrcReg[1].  Saturation covers both the MOV and the MUL. */
+static void emit_ddx(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    struct brw_reg interp[4], dst, base, w;
+    GLuint grf, chan;
+    base = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+    w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
+    grf = base.nr;
+    interp[0] = brw_vec1_grf(grf, 0);
+    interp[1] = brw_vec1_grf(grf, 4);
+    interp[2] = brw_vec1_grf(grf+1, 0);
+    interp[3] = brw_vec1_grf(grf+1, 4);
+    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
+    for (chan = 0; chan < 4; chan++) {
+	if (!(writemask & (1<<chan)))
+	    continue;
+	dst = get_dst_reg(c, inst, chan, 1);
+	brw_MOV(p, dst, interp[chan]);
+	brw_MUL(p, dst, dst, w);
+    }
+    brw_set_saturate(p, 0);
+}
+
+/* DDY: per enabled channel, dst = suboffset(interp[chan], 1) * w.  interp[]
+ * references elements 0 and 4 in each of the two GRFs starting at src0's
+ * register; w is component 3 of SrcReg[1].  Saturation covers MOV and MUL. */
+static void emit_ddy(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    struct brw_reg interp[4], dst, base, w;
+    GLuint grf, chan;
+
+    base = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+    grf = base.nr;
+    w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
+    interp[0] = brw_vec1_grf(grf, 0);
+    interp[1] = brw_vec1_grf(grf, 4);
+    interp[2] = brw_vec1_grf(grf+1, 0);
+    interp[3] = brw_vec1_grf(grf+1, 4);
+    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
+    for (chan = 0; chan < 4; chan++) {
+	if (!(writemask & (1<<chan)))
+	    continue;
+	dst = get_dst_reg(c, inst, chan, 1);
+	brw_MOV(p, dst, suboffset(interp[chan], 1));
+	brw_MUL(p, dst, dst, w);
+    }
+    brw_set_saturate(p, 0);
+}
+
+/* WPOSXY: compute the pixel offset from the window's bottom-left corner
+ * into the X and Y channels of dst, honouring the write mask.
+ */
+static void emit_wpos_xy(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    const GLuint writemask = inst->DstReg.WriteMask;
+    struct brw_reg out_x, out_y, in_x, in_y;
+
+    /* All four registers are looked up whether or not the channel is written. */
+    out_x = get_dst_reg(c, inst, 0, 1);
+    out_y = get_dst_reg(c, inst, 1, 1);
+    in_x = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+    in_y = get_src_reg(c, &inst->SrcReg[0], 1, 1);
+
+    if (writemask & WRITEMASK_X) {
+	/* X' = X - origin_x */
+	brw_ADD(p,
+		out_x,
+		retype(in_x, BRW_REGISTER_TYPE_W),
+		brw_imm_d(0 - c->key.origin_x));
+    }
+
+    if (writemask & WRITEMASK_Y) {
+	/* Y' = height - 1 - (Y - origin_y)
+	 *    = (origin_y + height - 1) - Y */
+	brw_ADD(p,
+		out_y,
+		negate(retype(in_y, BRW_REGISTER_TYPE_W)),
+		brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
+    }
+}
+
+/* TODO
+   BIAS on SIMD8 not working yet...
+ */
+/* TXB: texture lookup using the SAMPLE_BIAS message.  The coordinate goes to
+ * m2..m4 (components the target does not use are zeroed), src[3] to m5 and
+ * zero to m6 before the sampler message is issued. */
+static void emit_txb(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    struct brw_reg dst[4], coord[4], payload_reg;
+    GLuint chan;
+    payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
+    for (chan = 0; chan < 4; chan++)
+	dst[chan] = get_dst_reg(c, inst, chan, 1);
+    for (chan = 0; chan < 4; chan++)
+	coord[chan] = get_src_reg(c, &inst->SrcReg[0], chan, 1);
+    /* Every target passes the first coordinate in m2. */
+    brw_MOV(p, brw_message_reg(2), coord[0]);
+    switch (inst->TexSrcTarget) {
+	case TEXTURE_1D_INDEX:
+	    brw_MOV(p, brw_message_reg(3), brw_imm_f(0));
+	    brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
+	    break;
+	case TEXTURE_2D_INDEX:
+	case TEXTURE_RECT_INDEX:
+	    brw_MOV(p, brw_message_reg(3), coord[1]);
+	    brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
+	    break;
+	default:
+	    brw_MOV(p, brw_message_reg(3), coord[1]);
+	    brw_MOV(p, brw_message_reg(4), coord[2]);
+	    break;
+    }
+    brw_MOV(p, brw_message_reg(5), coord[3]);
+    brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
+    brw_SAMPLE(p,
+	    retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
+	    1,
+	    retype(payload_reg, BRW_REGISTER_TYPE_UW),
+	    inst->TexSrcUnit + 1, /* surface */
+	    inst->TexSrcUnit,     /* sampler */
+	    inst->DstReg.WriteMask,
+	    BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,
+	    4,
+	    4,
+	    0);
+}
+
+/* TEX: texture lookup.  The 1, 2 or 3 coordinates the target consumes are
+ * copied to message registers starting at m2 and a SIMD8 sample message is
+ * issued.  For texture units flagged in key.shadowtex_mask, m5 is zeroed and
+ * m6 carries src[2], a larger length (6 instead of 4) is passed to
+ * brw_SAMPLE, and dst.w is overwritten with 1.0 after the lookup.
+ */
+static void emit_tex(struct brw_wm_compile *c, struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    struct brw_reg dst[4], src[4], payload_reg;
+    GLuint i, nr_coords;
+    GLboolean shadow = (c->key.shadowtex_mask & (1<<inst->TexSrcUnit)) ? 1 : 0;
+
+    payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
+
+    for (i = 0; i < 4; i++)
+	dst[i] = get_dst_reg(c, inst, i, 1);
+    for (i = 0; i < 4; i++)
+	src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
+
+    /* Number of coordinates the texture target consumes. */
+    switch (inst->TexSrcTarget) {
+	case TEXTURE_1D_INDEX:
+	    nr_coords = 1;
+	    break;
+	case TEXTURE_2D_INDEX:
+	case TEXTURE_RECT_INDEX:
+	    nr_coords = 2;
+	    break;
+	default:
+	    nr_coords = 3;
+	    break;
+    }
+
+    /* Coordinate i goes to message register 2 + i.  Registers past the
+     * last coordinate are not written here.
+     */
+    for (i = 0; i < nr_coords; i++)
+	brw_MOV(p, brw_message_reg(2 + i), src[i]);
+
+    /* Shadow lookups fill two more message registers. */
+    if (shadow) {
+	brw_MOV(p, brw_message_reg(5), brw_imm_f(0));
+	brw_MOV(p, brw_message_reg(6), src[2]);
+    }
+
+    /* NOTE(review): the lengths passed to brw_SAMPLE are fixed (4, and 4 or
+     * 6) regardless of nr_coords -- confirm against the sampler message layout.
+     */
+    brw_SAMPLE(p,
+	    retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
+	    1,
+	    retype(payload_reg, BRW_REGISTER_TYPE_UW),
+	    inst->TexSrcUnit + 1, /* surface */
+	    inst->TexSrcUnit,     /* sampler */
+	    inst->DstReg.WriteMask,
+	    BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,
+	    4,
+	    shadow ? 6 : 4,
+	    0);
+
+    if (shadow)
+	brw_MOV(p, dst[3], brw_imm_f(1.0));
+}
+
+/* Back-patch subroutine calls: for every OPCODE_CAL in the original program,
+ * set src1 of the native instruction stored in its Data pointer to the
+ * distance (in native instructions, times 16) to the call target's. */
+static void post_wm_emit( struct brw_wm_compile *c )
+{
+    struct prog_instruction *insns = c->fp->program.Base.Instructions;
+    const GLuint count = c->fp->program.Base.NumInstructions;
+    struct prog_instruction *call, *target;
+    struct brw_instruction *call_hw, *target_hw;
+    GLuint idx, target_idx;
+    int delta;
+    for (idx = 0; idx < count; idx++) {
+	call = &insns[idx];
+	call_hw = call->Data;
+	if (call->Opcode != OPCODE_CAL)
+	    continue;
+	target_idx = call->BranchTarget;
+	target = &insns[target_idx];
+	target_hw = target->Data;
+	delta = target_hw - call_hw;
+	brw_set_src1(call_hw, brw_imm_d(delta*16));
+    }
+}
+
+/* Emit native code for a GLSL fragment program: dispatch each entry of
+ * c->prog_instructions to its emit_*() helper or to inline flow-control code,
+ * then patch CAL targets and reset the Data pointers used for bookkeeping.
+ */
+static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
+{
+#define MAX_IFSN 32
+#define MAX_LOOP_DEPTH 32
+    struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
+    struct brw_instruction *inst0, *inst1;
+    int i, if_insn = 0, loop_insn = 0;
+    struct brw_compile *p = &c->func;
+    struct brw_indirect stack_index = brw_indirect(0, 0);
+
+    brw_init_compile(brw, &c->func);
+    c->reg_index = 0;
+    prealloc_reg(c);
+    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+    brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
+
+    for (i = 0; i < c->nr_fp_insns; i++) {
+	struct prog_instruction *inst = &c->prog_instructions[i];
+	struct prog_instruction *orig_inst;
+	/* Remember where this instruction's native code starts (read by post_wm_emit). */
+	if ((orig_inst = inst->Data) != 0)
+	    orig_inst->Data = current_insn(p);
+	if (inst->CondUpdate)
+	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+	else
+	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
+
+	switch (inst->Opcode) {
+	    case WM_PIXELXY:
+		emit_pixel_xy(c, inst);
+		break;
+	    case WM_DELTAXY:
+		emit_delta_xy(c, inst);
+		break;
+	    case WM_PIXELW:
+		emit_pixel_w(c, inst);
+		break;
+	    case WM_LINTERP:
+		emit_linterp(c, inst);
+		break;
+	    case WM_PINTERP:
+		emit_pinterp(c, inst);
+		break;
+	    case WM_CINTERP:
+		emit_cinterp(c, inst);
+		break;
+	    case WM_WPOSXY:
+		emit_wpos_xy(c, inst);
+		break;
+	    case WM_FB_WRITE:
+		emit_fb_write(c, inst);
+		break;
+	    case OPCODE_ABS:
+		emit_abs(c, inst);
+		break;
+	    case OPCODE_ADD:
+		emit_add(c, inst);
+		break;
+	    case OPCODE_SUB:
+		emit_sub(c, inst);
+		break;
+	    case OPCODE_FRC:
+		emit_frc(c, inst);
+		break;
+	    case OPCODE_FLR:
+		emit_flr(c, inst);
+		break;
+	    case OPCODE_LRP:
+		emit_lrp(c, inst);
+		break;
+	    case OPCODE_INT:
+		emit_int(c, inst);
+		break;
+	    case OPCODE_MOV:
+		emit_mov(c, inst);
+		break;
+	    case OPCODE_DP3:
+		emit_dp3(c, inst);
+		break;
+	    case OPCODE_DP4:
+		emit_dp4(c, inst);
+		break;
+	    case OPCODE_XPD:
+		emit_xpd(c, inst);
+		break;
+	    case OPCODE_DPH:
+		emit_dph(c, inst);
+		break;
+	    case OPCODE_RCP:
+		emit_rcp(c, inst);
+		break;
+	    case OPCODE_RSQ:
+		emit_rsq(c, inst);
+		break;
+	    case OPCODE_SIN:
+		emit_sin(c, inst);
+		break;
+	    case OPCODE_COS:
+		emit_cos(c, inst);
+		break;
+	    case OPCODE_EX2:
+		emit_ex2(c, inst);
+		break;
+	    case OPCODE_LG2:
+		emit_lg2(c, inst);
+		break;
+	    case OPCODE_MAX:
+		emit_max(c, inst);
+		break;
+	    case OPCODE_MIN:
+		emit_min(c, inst);
+		break;
+	    case OPCODE_DDX:
+		emit_ddx(c, inst);
+		break;
+	    case OPCODE_DDY:
+		emit_ddy(c, inst);
+		break;
+	    case OPCODE_SLT:
+		emit_slt(c, inst);
+		break;
+	    case OPCODE_SLE:
+		emit_sle(c, inst);
+		break;
+	    case OPCODE_SGT:
+		emit_sgt(c, inst);
+		break;
+	    case OPCODE_SGE:
+		emit_sge(c, inst);
+		break;
+	    case OPCODE_SEQ:
+		emit_seq(c, inst);
+		break;
+	    case OPCODE_SNE:
+		emit_sne(c, inst);
+		break;
+	    case OPCODE_MUL:
+		emit_mul(c, inst);
+		break;
+	    case OPCODE_POW:
+		emit_pow(c, inst);
+		break;
+	    case OPCODE_MAD:
+		emit_mad(c, inst);
+		break;
+	    case OPCODE_TEX:
+		emit_tex(c, inst);
+		break;
+	    case OPCODE_TXB:
+		emit_txb(c, inst);
+		break;
+	    case OPCODE_KIL_NV:
+		emit_kil(c);
+		break;
+	    case OPCODE_IF:
+		assert(if_insn < MAX_IFSN);
+		if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
+		break;
+	    case OPCODE_ELSE:
+		assert(if_insn > 0);	/* ELSE must follow an open IF */
+		if_inst[if_insn-1]  = brw_ELSE(p, if_inst[if_insn-1]);
+		break;
+	    case OPCODE_ENDIF:
+		assert(if_insn > 0);
+		brw_ENDIF(p, if_inst[--if_insn]);
+		break;
+	    case OPCODE_BGNSUB:
+	    case OPCODE_ENDSUB:
+		break;
+	    case OPCODE_CAL:
+		brw_push_insn_state(p);
+		brw_set_mask_control(p, BRW_MASK_DISABLE);
+		brw_set_access_mode(p, BRW_ALIGN_1);
+		brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
+		brw_set_access_mode(p, BRW_ALIGN_16);
+		brw_ADD(p, get_addr_reg(stack_index), get_addr_reg(stack_index), brw_imm_d(4));
+		orig_inst = inst->Data;
+		orig_inst->Data = &p->store[p->nr_insn];	/* the ip ADD below; post_wm_emit() rewrites its src1 */
+		brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+		brw_pop_insn_state(p);
+		break;
+	    case OPCODE_RET:
+		brw_push_insn_state(p);
+		brw_set_mask_control(p, BRW_MASK_DISABLE);
+		brw_ADD(p, get_addr_reg(stack_index), get_addr_reg(stack_index), brw_imm_d(-4));
+		brw_set_access_mode(p, BRW_ALIGN_1);
+		brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
+		brw_set_access_mode(p, BRW_ALIGN_16);
+		brw_pop_insn_state(p);
+		break;
+	    case OPCODE_BGNLOOP:
+		assert(loop_insn < MAX_LOOP_DEPTH);
+		loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
+		break;
+	    case OPCODE_BRK:
+		brw_BREAK(p);
+		brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+		break;
+	    case OPCODE_CONT:
+		brw_CONT(p);
+		brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+		break;
+	    case OPCODE_ENDLOOP:
+		assert(loop_insn > 0);
+		loop_insn--;
+		inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
+		/* Patch every BREAK/CONTINUE emitted since the matching DO.
+		 * NOTE(review): this also revisits those of nested loops. */
+		while (inst0 > loop_inst[loop_insn]) {
+		    inst0--;
+		    if (inst0->header.opcode == BRW_OPCODE_BREAK) {
+			inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
+			inst0->bits3.if_else.pop_count = 0;
+		    } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
+			inst0->bits3.if_else.jump_count = inst1 - inst0;
+			inst0->bits3.if_else.pop_count = 0;
+		    }
+		}
+		break;
+	    default:
+		_mesa_printf("unsupported IR in fragment shader %d\n", inst->Opcode);
+	}
+	if (inst->CondUpdate)
+	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+	else
+	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    }
+    post_wm_emit(c);
+    for (i = 0; i < c->fp->program.Base.NumInstructions; i++)
+	c->fp->program.Base.Instructions[i].Data = NULL;
+}
+
+void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
+{   /* Entry point: run the fp pass, emit native code, then record register usage. */
+    brw_wm_pass_fp(c);
+    c->tmp_index = 127; /* presumably where alloc_tmp() starts handing out temporaries -- TODO confirm */
+    brw_wm_emit_glsl(brw, c);
+    c->prog_data.total_grf = c->reg_index; /* NOTE(review): taken from reg_index only; confirm registers tracked by tmp_index are covered */
+    c->prog_data.total_scratch = 0;
+}
diff --git a/i965/brw_wm_pass0.c b/i965/brw_wm_pass0.c
index 00f6f6b..1bfae5a 100644
--- a/i965/brw_wm_pass0.c
+++ b/i965/brw_wm_pass0.c
@@ -168,6 +168,7 @@ static const struct brw_wm_ref *pass0_get_reg( struct brw_wm_compile *c,
       case PROGRAM_PAYLOAD:
       case PROGRAM_TEMPORARY:
       case PROGRAM_OUTPUT:
+      case PROGRAM_VARYING:
 	 break;
 
       case PROGRAM_LOCAL_PARAM:
@@ -179,6 +180,8 @@ static const struct brw_wm_ref *pass0_get_reg( struct brw_wm_compile *c,
 	 break;
 
       case PROGRAM_STATE_VAR:
+      case PROGRAM_UNIFORM:
+      case PROGRAM_CONSTANT:
       case PROGRAM_NAMED_PARAM: {
 	 struct gl_program_parameter_list *plist = c->fp->program.Base.Parameters;
 	 
@@ -197,6 +200,7 @@ static const struct brw_wm_ref *pass0_get_reg( struct brw_wm_compile *c,
 	    break;
 	    
 	 case PROGRAM_STATE_VAR:
+	 case PROGRAM_UNIFORM:
 	    /* These may change from run to run:
 	     */
 	    ref = get_param_ref(c, &plist->ParameterValues[idx][component] );
diff --git a/i965/brw_wm_pass1.c b/i965/brw_wm_pass1.c
index d668def..f6f3a38 100644
--- a/i965/brw_wm_pass1.c
+++ b/i965/brw_wm_pass1.c
@@ -150,12 +150,17 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       case OPCODE_FLR:
       case OPCODE_FRC:
       case OPCODE_MOV:
+      case OPCODE_SWZ:
 	 read0 = writemask;
 	 break;
 
       case OPCODE_SUB:
       case OPCODE_SLT:
+      case OPCODE_SLE:
       case OPCODE_SGE:
+      case OPCODE_SGT:
+      case OPCODE_SEQ:
+      case OPCODE_SNE:
       case OPCODE_ADD:
       case OPCODE_MAX:
       case OPCODE_MIN:
@@ -253,11 +258,9 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 	 read0 = WRITEMASK_XYW;
 	 break;
 
-      case OPCODE_SWZ:
       case OPCODE_DST:
       case OPCODE_TXP:
       default:
-	 assert(0);
 	 break;
       }
 
diff --git a/i965/brw_wm_pass2.c b/i965/brw_wm_pass2.c
index a1edbd6..c1ce6a9 100644
--- a/i965/brw_wm_pass2.c
+++ b/i965/brw_wm_pass2.c
@@ -328,7 +328,7 @@ void brw_wm_pass2( struct brw_wm_compile *c )
    c->state = PASS2_DONE;
 
    if (INTEL_DEBUG & DEBUG_WM) {
-      brw_wm_print_program(c, "pass2/done");
+       brw_wm_print_program(c, "pass2/done");
    }
 }
 
diff --git a/i965/brw_wm_sampler_state.c b/i965/brw_wm_sampler_state.c
index 794c7d9..3c0952a 100644
--- a/i965/brw_wm_sampler_state.c
+++ b/i965/brw_wm_sampler_state.c
@@ -54,7 +54,7 @@ static GLuint translate_wrap_mode( GLenum wrap )
    case GL_REPEAT: 
       return BRW_TEXCOORDMODE_WRAP;
    case GL_CLAMP:  
-      return BRW_TEXCOORDMODE_CLAMP_BORDER; /* conform likes it this way */
+      return BRW_TEXCOORDMODE_CLAMP;
    case GL_CLAMP_TO_EDGE: 
       return BRW_TEXCOORDMODE_CLAMP; /* conform likes it this way */
    case GL_CLAMP_TO_BORDER: 
diff --git a/i965/brw_wm_state.c b/i965/brw_wm_state.c
index 5b4f2ab..7856da0 100644
--- a/i965/brw_wm_state.c
+++ b/i965/brw_wm_state.c
@@ -34,6 +34,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "brw_wm.h"
 #include "bufmgr.h"
 
 /***********************************************************************
@@ -62,7 +63,7 @@ static void upload_wm_unit(struct brw_context *brw )
    memset(&wm, 0, sizeof(wm));
 
    /* CACHE_NEW_WM_PROG */
-   wm.thread0.grf_reg_count = ((brw->wm.prog_data->total_grf-1) & ~15) / 16;
+   wm.thread0.grf_reg_count = ALIGN(brw->wm.prog_data->total_grf, 16) / 16 - 1;
    wm.thread0.kernel_start_pointer = brw->wm.prog_gs_offset >> 6;
    wm.thread3.dispatch_grf_start_reg = brw->wm.prog_data->first_curbe_grf;
    wm.thread3.urb_entry_read_length = brw->wm.prog_data->urb_read_length;
@@ -71,7 +72,7 @@ static void upload_wm_unit(struct brw_context *brw )
    wm.wm5.max_threads = max_threads;      
 
    if (brw->wm.prog_data->total_scratch) {
-      GLuint per_thread = (brw->wm.prog_data->total_scratch + 1023) / 1024;
+      GLuint per_thread = ALIGN(brw->wm.prog_data->total_scratch, 1024);
       GLuint total = per_thread * (max_threads + 1);
 
       /* Scratch space -- just have to make sure there is sufficient
@@ -134,9 +135,13 @@ static void upload_wm_unit(struct brw_context *brw )
       if (fp->UsesKill || 
 	  brw->attribs.Color->AlphaEnabled) 
 	 wm.wm5.program_uses_killpixel = 1; 
+      
+      if (brw_wm_is_glsl(fp))
+	  wm.wm5.enable_8_pix = 1;
+      else
+	  wm.wm5.enable_16_pix = 1;
    }
 
-   wm.wm5.enable_16_pix = 1;
    wm.wm5.thread_dispatch_enable = 1;	/* AKA: color_write */
    wm.wm5.legacy_line_rast = 0;
    wm.wm5.legacy_global_depth_bias = 0;
diff --git a/i965/brw_wm_surface_state.c b/i965/brw_wm_surface_state.c
index d24c618..12cd089 100644
--- a/i965/brw_wm_surface_state.c
+++ b/i965/brw_wm_surface_state.c
@@ -69,7 +69,7 @@ static GLuint translate_tex_target( GLenum target )
 }
 
 
-static GLuint translate_tex_format( GLuint mesa_format )
+static GLuint translate_tex_format( GLuint mesa_format, GLenum depth_mode )
 {
    switch( mesa_format ) {
    case MESA_FORMAT_L8:
@@ -114,11 +114,29 @@ static GLuint translate_tex_format( GLuint mesa_format )
       return BRW_SURFACEFORMAT_FXT1;
 
    case MESA_FORMAT_Z16:
-      return BRW_SURFACEFORMAT_L16_UNORM;
+       if (depth_mode == GL_INTENSITY)
+	   return BRW_SURFACEFORMAT_I16_UNORM;
+       else if (depth_mode == GL_ALPHA)
+	   return BRW_SURFACEFORMAT_A16_UNORM;
+       else
+	   return BRW_SURFACEFORMAT_L16_UNORM;
 
-   case MESA_FORMAT_RGBA_DXT1:
    case MESA_FORMAT_RGB_DXT1:
-      return BRW_SURFACEFORMAT_DXT1_RGB;
+       return BRW_SURFACEFORMAT_DXT1_RGB;
+
+   case MESA_FORMAT_RGBA_DXT1:
+       return BRW_SURFACEFORMAT_BC1_UNORM;
+       
+   case MESA_FORMAT_RGBA_DXT3:
+       return BRW_SURFACEFORMAT_BC2_UNORM;
+       
+   case MESA_FORMAT_RGBA_DXT5:
+       return BRW_SURFACEFORMAT_BC3_UNORM;
+
+   case MESA_FORMAT_SRGBA8:
+      return BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB;
+   case MESA_FORMAT_SRGB_DXT1:
+      return BRW_SURFACEFORMAT_BC1_UNORM_SRGB;
 
    default:
       assert(0);
@@ -141,7 +159,7 @@ void brw_update_texture_surface( GLcontext *ctx,
 
    surf->ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;   
    surf->ss0.surface_type = translate_tex_target(tObj->Target);
-   surf->ss0.surface_format = translate_tex_format(firstImage->TexFormat->MesaFormat);
+   surf->ss0.surface_format = translate_tex_format(firstImage->TexFormat->MesaFormat, tObj->DepthMode);
 
    /* This is ok for all textures with channel width 8bit or less:
     */
@@ -181,11 +199,8 @@ static void upload_wm_surfaces(struct brw_context *brw )
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = &brw->intel;
-   struct brw_surface_binding_table bind;
    GLuint i;
 
-   memcpy(&bind, &brw->wm.bind, sizeof(bind));
-      
    {
       struct brw_surface_state surf;
       struct intel_region *region = brw->state.draw_region;
diff --git a/i965/bufmgr_fake.c b/i965/bufmgr_fake.c
index fb4903d..3b7229c 100644
--- a/i965/bufmgr_fake.c
+++ b/i965/bufmgr_fake.c
@@ -169,7 +169,7 @@ static GLboolean alloc_from_pool( struct intel_context *intel,
    if (!block)
       return GL_FALSE;
 
-   sz = (buf->size + align-1) & ~(align-1);
+   sz = ALIGN(buf->size, align);
 
    block->mem = mmAllocMem(pool->heap, 
 			   sz, 
diff --git a/i965/intel_batchbuffer.c b/i965/intel_batchbuffer.c
index 64885ed..ab61d07 100644
--- a/i965/intel_batchbuffer.c
+++ b/i965/intel_batchbuffer.c
@@ -36,7 +36,7 @@ static void intel_batchbuffer_reset( struct intel_batchbuffer *batch )
    assert(batch->map == NULL);
 
    batch->offset = (unsigned long)batch->ptr;
-   batch->offset = (batch->offset + 63) & ~63;
+   batch->offset = ALIGN(batch->offset, 64);
    batch->ptr = (unsigned char *) batch->offset;
 
    if (BATCH_SZ - batch->offset < BATCH_REFILL) {
@@ -216,7 +216,7 @@ void intel_batchbuffer_align( struct intel_batchbuffer *batch,
 			      GLuint sz )
 {
    unsigned long ptr = (unsigned long) batch->ptr;
-   unsigned long aptr = (ptr + align) & ~((unsigned long)align-1);
+   unsigned long aptr = ALIGN(ptr, align);
    GLuint fixup = aptr - ptr;
 
    if (intel_batchbuffer_space(batch) < fixup + sz)
diff --git a/i965/intel_batchbuffer.h b/i965/intel_batchbuffer.h
index 25e0a65..b794772 100644
--- a/i965/intel_batchbuffer.h
+++ b/i965/intel_batchbuffer.h
@@ -84,7 +84,7 @@ void intel_batchbuffer_release_space(struct intel_batchbuffer *batch,
 static inline GLuint 
 intel_batchbuffer_space( struct intel_batchbuffer *batch )
 {
-   return (BATCH_SZ - BATCH_RESERVED) - (batch->ptr - (batch->map + batch->offset));
+   return (BATCH_SZ - BATCH_RESERVED) - (batch->ptr - batch->map);
 }
 
 
diff --git a/i965/intel_blit.c b/i965/intel_blit.c
index f88cbb2..d2068aa 100644
--- a/i965/intel_blit.c
+++ b/i965/intel_blit.c
@@ -373,7 +373,7 @@ void intelClearWithBlit(GLcontext *ctx, GLbitfield flags)
    clear_depth = 0;
 
    if (flags & BUFFER_BIT_DEPTH) {
-      clear_depth = (GLuint)(ctx->Depth.Clear * intel->ClearDepth);
+      clear_depth = (GLuint)(ctx->Depth.Clear * ctx->DrawBuffer->_DepthMax);
    }
 
    if (flags & BUFFER_BIT_STENCIL) {
@@ -537,7 +537,8 @@ intelEmitImmediateColorExpandBlit(struct intel_context *intel,
 {
    struct xy_setup_blit setup;
    struct xy_text_immediate_blit text;
-   int dwords = ((src_size + 7) & ~7) / 4;
+   int dwords = ALIGN(src_size, 8) / 4;
+   uint32_t opcode, br13;
 
    assert( logic_op - GL_CLEAR >= 0 );
    assert( logic_op - GL_CLEAR < 0x10 );
diff --git a/i965/intel_buffers.c b/i965/intel_buffers.c
index de6a867..769f75c 100644
--- a/i965/intel_buffers.c
+++ b/i965/intel_buffers.c
@@ -545,15 +545,14 @@ static void intelDrawBuffer(GLcontext *ctx, GLenum mode )
    if ( intel->sarea->pf_current_page == 1 ) 
       front ^= 1;
    
-   intelSetFrontClipRects( intel );
-
-
    if (front) {
+      intelSetFrontClipRects(intel);
       if (intel->draw_region != intel->front_region) {
 	 intel_region_release(intel, &intel->draw_region);
 	 intel_region_reference(&intel->draw_region, intel->front_region);
       }
    } else {
+      intelSetBackClipRects(intel);
       if (intel->draw_region != intel->back_region) {
 	 intel_region_release(intel, &intel->draw_region);
 	 intel_region_reference(&intel->draw_region, intel->back_region);
diff --git a/i965/intel_context.c b/i965/intel_context.c
index 5ee5282..65b3c99 100644
--- a/i965/intel_context.c
+++ b/i965/intel_context.c
@@ -66,6 +66,7 @@
 int INTEL_DEBUG = (0);
 #endif
 
+#define need_GL_NV_point_sprite
 #define need_GL_ARB_multisample
 #define need_GL_ARB_point_parameters
 #define need_GL_ARB_texture_compression
@@ -81,6 +82,13 @@ int INTEL_DEBUG = (0);
 #define need_GL_EXT_fog_coord
 #define need_GL_EXT_multi_draw_arrays
 #define need_GL_EXT_secondary_color
+#define need_GL_ATI_separate_stencil
+#define need_GL_EXT_point_parameters
+#define need_GL_VERSION_2_0
+#define need_GL_VERSION_2_1
+#define need_GL_ARB_shader_objects
+#define need_GL_ARB_vertex_shader
+
 #include "extension_helper.h"
 
 #ifndef VERBOSE
@@ -118,9 +126,17 @@ static const GLubyte *intelGetString( GLcontext *ctx, GLenum name )
       case PCI_CHIP_I965_GM:
 	 chipset = "Intel(R) 965GM"; break;
          break;
-      case PCI_CHIP_IGD_GM:
+      case PCI_CHIP_GM45_GM:
+	 chipset = "Mobile Intel® GM45 Express Chipset";
+	 break;
+      case PCI_CHIP_IGD_E_G:
 	 chipset = "Intel(R) Integrated Graphics Device";
          break;
+      case PCI_CHIP_Q45_G:
+	  chipset = "Intel(R) Q45/Q43"; break;
+      case PCI_CHIP_G45_G:
+	  chipset = "Intel(R) G45/G43"; break;
+
       default:
 	 chipset = "Unknown Intel Chipset"; break;
       }
@@ -146,6 +162,7 @@ const struct dri_extension card_extensions[] =
     { "GL_ARB_multisample",                GL_ARB_multisample_functions },
     { "GL_ARB_multitexture",               NULL },
     { "GL_ARB_point_parameters",           GL_ARB_point_parameters_functions },
+    { "GL_NV_point_sprite",                GL_NV_point_sprite_functions },
     { "GL_ARB_texture_border_clamp",       NULL },
     { "GL_ARB_texture_compression",        GL_ARB_texture_compression_functions },
     { "GL_ARB_texture_cube_map",           NULL },
@@ -158,6 +175,8 @@ const struct dri_extension card_extensions[] =
     { "GL_NV_texture_rectangle",           NULL },
     { "GL_EXT_texture_rectangle",          NULL },
     { "GL_ARB_texture_rectangle",          NULL },
+    { "GL_ARB_point_sprite",               NULL},
+    { "GL_ARB_point_parameters",	   NULL }, 
     { "GL_ARB_vertex_buffer_object",       GL_ARB_vertex_buffer_object_functions },
     { "GL_ARB_vertex_program",             GL_ARB_vertex_program_functions },
     { "GL_ARB_window_pos",                 GL_ARB_window_pos_functions },
@@ -171,18 +190,33 @@ const struct dri_extension card_extensions[] =
     { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
     { "GL_EXT_multi_draw_arrays",          GL_EXT_multi_draw_arrays_functions },
     { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
+    { "GL_ATI_separate_stencil",           GL_ATI_separate_stencil_functions },
     { "GL_EXT_stencil_wrap",               NULL },
+    /* Do not enable this extension.  It conflicts with GL_ATI_separate_stencil
+     * and 2.0's separate stencil, because mesa's computed _TestTwoSide will
+     * only reflect whether it's enabled through this extension, even if the
+     * application is using the other interfaces.
+     */
+/*{ "GL_EXT_stencil_two_side",           GL_EXT_stencil_two_side_functions },*/
     { "GL_EXT_texture_edge_clamp",         NULL },
     { "GL_EXT_texture_env_combine",        NULL },
     { "GL_EXT_texture_env_dot3",           NULL },
     { "GL_EXT_texture_filter_anisotropic", NULL },
     { "GL_EXT_texture_lod_bias",           NULL },
+    { "GL_EXT_texture_sRGB",               NULL },
     { "GL_3DFX_texture_compression_FXT1",  NULL },
     { "GL_APPLE_client_storage",           NULL },
     { "GL_MESA_pack_invert",               NULL },
     { "GL_MESA_ycbcr_texture",             NULL },
     { "GL_NV_blend_square",                NULL },
     { "GL_SGIS_generate_mipmap",           NULL },
+    { "GL_ARB_shading_language_100",       GL_VERSION_2_0_functions},
+    { "GL_ARB_shading_language_120",       GL_VERSION_2_1_functions},
+    { "GL_ARB_shader_objects",             GL_ARB_shader_objects_functions},
+    { "GL_ARB_vertex_shader",              GL_ARB_vertex_shader_functions},
+    { "GL_ARB_fragment_shader",            NULL },
+    /* XXX not implemented yet, needed to compile the builtin glsl lib */
+    { "GL_ARB_draw_buffers",               NULL },
     { NULL,                                NULL }
 };
 
@@ -399,17 +433,10 @@ GLboolean intelInitContext( struct intel_context *intel,
    switch(mesaVis->depthBits) {
    case 0:			/* what to do in this case? */
    case 16:
-      intel->depth_scale = 1.0/0xffff;
       intel->polygon_offset_scale = 1.0/0xffff;
-      intel->depth_clear_mask = ~0;
-      intel->ClearDepth = 0xffff;
       break;
    case 24:
-      intel->depth_scale = 1.0/0xffffff;
       intel->polygon_offset_scale = 2.0/0xffffff; /* req'd to pass glean */
-      intel->depth_clear_mask = 0x00ffffff;
-      intel->stencil_clear_mask = 0xff000000;
-      intel->ClearDepth = 0x00ffffff;
       break;
    default:
       assert(0); 
@@ -551,6 +578,8 @@ void intelDestroyContext(__DRIcontextPrivate *driContextPriv)
 #endif
 
       /* free the Mesa context */
+      intel->ctx.VertexProgram.Current = NULL;
+      intel->ctx.FragmentProgram.Current = NULL;
       _mesa_destroy_context(&intel->ctx);
    }
 
diff --git a/i965/intel_context.h b/i965/intel_context.h
index a244757..76ee119 100644
--- a/i965/intel_context.h
+++ b/i965/intel_context.h
@@ -183,12 +183,8 @@ struct intel_context
 
    GLubyte clear_chan[4];
    GLuint ClearColor;
-   GLuint ClearDepth;
 
-   GLfloat depth_scale;
    GLfloat polygon_offset_scale; /* dependent on depth_scale, bpp */
-   GLuint depth_clear_mask;
-   GLuint stencil_clear_mask;
 
    GLboolean hw_stencil;
    GLboolean hw_stipple;
@@ -269,6 +265,8 @@ void UNLOCK_HARDWARE( struct intel_context *intel );
 #define SUBPIXEL_X 0.125
 #define SUBPIXEL_Y 0.125
 
+/* Round value up to a multiple of alignment (a power of two; evaluated twice). */
+#define ALIGN(value, alignment)  (((value) + (alignment) - 1) & ~((alignment) - 1))
 /* ================================================================
  * Color packing:
  */
@@ -387,8 +385,11 @@ extern int INTEL_DEBUG;
 #define PCI_CHIP_I946_GZ		0x2972
 #define PCI_CHIP_I965_GM                0x2A02
 
-#define PCI_CHIP_IGD_GM       0x2A42
+#define PCI_CHIP_GM45_GM                0x2A42
 
+#define PCI_CHIP_IGD_E_G                0x2E02
+#define PCI_CHIP_Q45_G                  0x2E12
+#define PCI_CHIP_G45_G                  0x2E22
 
 /* ================================================================
  * intel_context.c:
diff --git a/i965/intel_mipmap_tree.c b/i965/intel_mipmap_tree.c
index 8548bc8..5177802 100644
--- a/i965/intel_mipmap_tree.c
+++ b/i965/intel_mipmap_tree.c
@@ -75,7 +75,7 @@ struct intel_mipmap_tree *intel_miptree_create( struct intel_context *intel,
    mt->width0 = width0;
    mt->height0 = height0;
    mt->depth0 = depth0;
-   mt->cpp = compressed ? 2 : cpp;
+   mt->cpp = cpp;
    mt->compressed = compressed;
 
    switch (intel->intelScreen->deviceID) {
@@ -128,7 +128,7 @@ int intel_miptree_pitch_align (struct intel_context *intel,
 			       int pitch)
 {
    if (!mt->compressed)
-      pitch = ((pitch * mt->cpp + 3) & ~3) / mt->cpp;
+      pitch = ALIGN(pitch * mt->cpp, 4) / mt->cpp;
 
    return pitch;
 }
@@ -234,7 +234,7 @@ GLuint intel_miptree_image_offset(struct intel_mipmap_tree *mt,
 
 
 
-
+extern GLuint intel_compressed_alignment(GLenum);
 /* Upload data for a particular image.
  */
 GLboolean intel_miptree_image_data(struct intel_context *intel, 
@@ -249,6 +249,17 @@ GLboolean intel_miptree_image_data(struct intel_context *intel,
    GLuint dst_offset = intel_miptree_image_offset(dst, face, level);
    const GLuint *dst_depth_offset = intel_miptree_depth_offsets(dst, level);
    GLuint i;
+   GLuint width, height, alignment;
+
+   width = dst->level[level].width;
+   height = dst->level[level].height;
+
+   if (dst->compressed) {
+       alignment = intel_compressed_alignment(dst->internal_format);
+       src_row_pitch = ALIGN(src_row_pitch, alignment);
+       width = ALIGN(width, alignment);
+       height = (height + 3) / 4;
+   }
 
    DBG("%s\n", __FUNCTION__);
    for (i = 0; i < depth; i++) {
@@ -260,8 +271,8 @@ GLboolean intel_miptree_image_data(struct intel_context *intel,
 			     src,
 			     src_row_pitch,
 			     0, 0,	/* source x,y */
-			     dst->level[level].width,
-			     dst->level[level].height))
+			     width,
+			     height))
 	 return GL_FALSE;
       src += src_image_pitch;
    }
diff --git a/i965/intel_pixel_bitmap.c b/i965/intel_pixel_bitmap.c
index df9d688..54a88c5 100644
--- a/i965/intel_pixel_bitmap.c
+++ b/i965/intel_pixel_bitmap.c
@@ -91,11 +91,6 @@ static void set_bit( GLubyte *dest,
    dest[bit/8] |= 1 << (bit % 8);
 }
 
-static int align(int x, int align)
-{
-   return (x + align - 1) & ~(align - 1);
-}
-
 /* Extract a rectangle's worth of data from the bitmap.  Called
  * per-cliprect.
  */
@@ -147,7 +142,7 @@ static GLuint get_bitmap_rect(GLsizei width, GLsizei height,
       }
 
       if (row_align)
-	 bit = (bit + row_align - 1) & ~(row_align - 1);
+	 bit = ALIGN(bit, row_align);
    }
 
    return count;
@@ -169,11 +164,8 @@ do_blit_bitmap( GLcontext *ctx,
    struct intel_context *intel = intel_context(ctx);
    struct intel_region *dst = intel_drawbuf_region(intel);
    GLfloat tmpColor[4];
-
-   union {
-      GLuint ui;
-      GLubyte ub[4];
-   } color;
+   GLubyte ubcolor[4];
+   GLuint color8888, color565;
 
    if (!dst)
        return GL_FALSE;
@@ -190,10 +182,14 @@ do_blit_bitmap( GLcontext *ctx,
        ADD_3V(tmpColor, tmpColor, ctx->Current.RasterSecondaryColor);
    }
 
-   UNCLAMPED_FLOAT_TO_CHAN(color.ub[0], tmpColor[2]);
-   UNCLAMPED_FLOAT_TO_CHAN(color.ub[1], tmpColor[1]);
-   UNCLAMPED_FLOAT_TO_CHAN(color.ub[2], tmpColor[0]);
-   UNCLAMPED_FLOAT_TO_CHAN(color.ub[3], tmpColor[3]);
+   UNCLAMPED_FLOAT_TO_UBYTE(ubcolor[0], tmpColor[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(ubcolor[1], tmpColor[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(ubcolor[2], tmpColor[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(ubcolor[3], tmpColor[3]);
+
+   color8888 = INTEL_PACKCOLOR8888(ubcolor[0], ubcolor[1], ubcolor[2], ubcolor[3]);
+   color565 = INTEL_PACKCOLOR565(ubcolor[0], ubcolor[1], ubcolor[2]);
+ 
 
    /* Does zoom apply to bitmaps?
     */
@@ -235,10 +231,10 @@ do_blit_bitmap( GLcontext *ctx,
       dsty = dPriv->y + (dPriv->h - dsty - height);  
       dstx = dPriv->x + dstx;
 
-      dest_rect.x1 = dstx;
-      dest_rect.y1 = dsty;
-      dest_rect.x2 = dstx + width;
-      dest_rect.y2 = dsty + height;
+      dest_rect.x1 = dstx < 0 ? 0 : dstx;
+      dest_rect.y1 = dsty < 0 ? 0 : dsty;
+      dest_rect.x2 = dstx + width < 0 ? 0 : dstx + width;
+      dest_rect.y2 = dsty + height < 0 ? 0 : dsty + height;
 
       for (i = 0; i < nbox; i++) {
          drm_clip_rect_t rect;
@@ -268,7 +264,7 @@ do_blit_bitmap( GLcontext *ctx,
 	    for (px = 0; px < box_w; px += DX) { 
 	       int h = MIN2(DY, box_h - py);
 	       int w = MIN2(DX, box_w - px); 
-	       GLuint sz = align(align(w,8) * h, 64)/8;
+	       GLuint sz = ALIGN(ALIGN(w,8) * h, 64)/8;
 	       GLenum logic_op = ctx->Color.ColorLogicOpEnabled ?
 		  ctx->Color.LogicOp : GL_COPY;
 
@@ -292,7 +288,7 @@ do_blit_bitmap( GLcontext *ctx,
 						  dst->cpp,
 						  (GLubyte *)stipple, 
 						  sz,
-						  color.ui,
+						  (dst->cpp == 2) ? color565 : color8888,
 						  dst->pitch,
 						  dst->buffer,
 						  0,
diff --git a/i965/intel_screen.c b/i965/intel_screen.c
index 5dac50d..b08531c 100644
--- a/i965/intel_screen.c
+++ b/i965/intel_screen.c
@@ -53,7 +53,7 @@ DRI_CONF_BEGIN
     DRI_CONF_SECTION_END
     DRI_CONF_SECTION_QUALITY
        DRI_CONF_FORCE_S3TC_ENABLE(false)
-       DRI_CONF_ALLOW_LARGE_TEXTURES(1)
+       DRI_CONF_ALLOW_LARGE_TEXTURES(2)
       DRI_CONF_SECTION_END
 DRI_CONF_END;
 const GLuint __driNConfigOptions = 4;
diff --git a/i965/intel_tex_validate.c b/i965/intel_tex_validate.c
index cb23b9d..775b689 100644
--- a/i965/intel_tex_validate.c
+++ b/i965/intel_tex_validate.c
@@ -122,6 +122,29 @@ static void intel_texture_invalidate_cb( struct intel_context *intel,
    intel_texture_invalidate( (struct intel_texture_object *) ptr );
 }
 
+#include "texformat.h"
+static GLuint intel_compressed_num_bytes(GLenum mesaFormat)
+{
+    /* Value the caller uses as the miptree cpp for a compressed format;
+     * stays 0 for any format not listed below. */
+    GLuint bytes = 0;
+
+    switch (mesaFormat) {
+    case MESA_FORMAT_RGB_FXT1:
+    case MESA_FORMAT_RGBA_FXT1:
+    case MESA_FORMAT_RGB_DXT1:
+    case MESA_FORMAT_RGBA_DXT1:
+        bytes = 2;
+        break;
+    case MESA_FORMAT_RGBA_DXT3:
+    case MESA_FORMAT_RGBA_DXT5:
+        bytes = 4;
+        break; /* explicit: do not rely on falling into default */
+    default:
+        break;
+    }
+    return bytes;
+}
 
 /*  
  */
@@ -132,7 +155,8 @@ GLuint intel_finalize_mipmap_tree( struct intel_context *intel,
    GLuint face, i;
    GLuint nr_faces = 0;
    struct gl_texture_image *firstImage;
-
+   GLuint cpp = 0;
+   
    if( tObj == intel->frame_buffer_texobj )
       return GL_FALSE;
    
@@ -165,6 +189,12 @@ GLuint intel_finalize_mipmap_tree( struct intel_context *intel,
 
 
 
+   if (firstImage->IsCompressed) {
+       cpp = intel_compressed_num_bytes(firstImage->TexFormat->MesaFormat);
+   } else {
+       cpp = firstImage->TexFormat->TexelBytes;
+   }
+       
    /* Check tree can hold all active levels.  Check tree matches
     * target, imageFormat, etc.
     */
@@ -176,7 +206,7 @@ GLuint intel_finalize_mipmap_tree( struct intel_context *intel,
 	intelObj->mt->width0 != firstImage->Width ||
 	intelObj->mt->height0 != firstImage->Height ||
 	intelObj->mt->depth0 != firstImage->Depth ||
-	intelObj->mt->cpp != firstImage->TexFormat->TexelBytes ||
+	intelObj->mt->cpp != cpp ||
 	intelObj->mt->compressed != firstImage->IsCompressed)) 
    {
       intel_miptree_destroy(intel, intelObj->mt);
@@ -199,7 +229,7 @@ GLuint intel_finalize_mipmap_tree( struct intel_context *intel,
 					  firstImage->Width,
 					  firstImage->Height,
 					  firstImage->Depth,
-					  firstImage->TexFormat->TexelBytes,
+					  cpp,
 					  firstImage->IsCompressed);
 
       /* Tell the buffer manager that we will manage the backing
diff --git a/shared/intel_tex_layout.c b/shared/intel_tex_layout.c
index 39a443c..cc73694 100644
--- a/shared/intel_tex_layout.c
+++ b/shared/intel_tex_layout.c
@@ -34,10 +34,21 @@
 #include "intel_tex_layout.h"
 #include "macros.h"
 
-
-static int align(int value, int alignment)
+GLuint intel_compressed_alignment(GLenum internalFormat)
 {
-   return (value + alignment - 1) & ~(alignment - 1);
+    GLuint alignment = 4;   /* default for every format not listed below */
+    /* Callers in this patch ALIGN() compressed widths/pitches to this value. */
+    switch (internalFormat) {
+    case GL_COMPRESSED_RGB_FXT1_3DFX:
+    case GL_COMPRESSED_RGBA_FXT1_3DFX:
+        alignment = 8;   /* both FXT1 formats */
+        break;
+
+    default:
+        break;
+    }
+
+    return alignment;
 }
 
 void i945_miptree_layout_2d( struct intel_context *intel, struct intel_mipmap_tree *mt )
@@ -51,17 +62,30 @@ void i945_miptree_layout_2d( struct intel_context *intel, struct intel_mipmap_tr
 
    mt->pitch = mt->width0;
 
+   if (mt->compressed) {
+       align_w = intel_compressed_alignment(mt->internal_format);
+       mt->pitch = ALIGN(mt->width0, align_w);
+   }
+
    /* May need to adjust pitch to accomodate the placement of
     * the 2nd mipmap.  This occurs when the alignment
     * constraints of mipmap placement push the right edge of the
     * 2nd mipmap out past the width of its parent.
     */
    if (mt->first_level != mt->last_level) {
-      GLuint mip1_width = align(minify(mt->width0), align_w)
-			+ minify(minify(mt->width0));
+       GLuint mip1_width;
+
+       if (mt->compressed) {
+           mip1_width = ALIGN(minify(mt->width0), align_w)
+               + ALIGN(minify(minify(mt->width0)), align_w);
+       } else {
+           mip1_width = ALIGN(minify(mt->width0), align_w)
+               + minify(minify(mt->width0));
+       }
 
-      if (mip1_width > mt->width0)
-	 mt->pitch = mip1_width;
+       if (mip1_width > mt->pitch) {
+           mt->pitch = mip1_width;
+       }
    }
 
    /* Pitch must be a whole number of dwords, even though we
@@ -79,7 +103,7 @@ void i945_miptree_layout_2d( struct intel_context *intel, struct intel_mipmap_tr
       if (mt->compressed)
 	 img_height = MAX2(1, height/4);
       else
-	 img_height = align(height, align_h);
+	 img_height = ALIGN(height, align_h);
 
 
       /* Because the images are packed better, the final offset
@@ -90,7 +114,7 @@ void i945_miptree_layout_2d( struct intel_context *intel, struct intel_mipmap_tr
       /* Layout_below: step right after second mipmap.
        */
       if (level == mt->first_level + 1) {
-	 x += align(width, align_w);
+	 x += ALIGN(width, align_w);
       }
       else {
 	 y += img_height;
diff --git a/shared/intel_tex_layout.h b/shared/intel_tex_layout.h
index 46151db..193699d 100644
--- a/shared/intel_tex_layout.h
+++ b/shared/intel_tex_layout.h
@@ -39,3 +39,4 @@ static GLuint minify( GLuint d )
 }
 
 extern void i945_miptree_layout_2d( struct intel_context *intel, struct intel_mipmap_tree *mt );
+extern GLuint intel_compressed_alignment(GLenum);
author	Luc Verhaegen <libv@skynet.be>	2010-03-12 19:46:04 +0100
committer	Luc Verhaegen <libv@skynet.be>	2010-03-12 19:46:04 +0100
commit	5c28087bfde504266a79dbbc8aef480009d88d2f (patch)
tree	91c96846540390f5cb3c0daf36b17f1f65af9621
parent	6e23622cb869c14d82f8c901c4bbea80ded6220e (diff)