Import radeon, r200, r300 and r600 dri drivers from mesa 7.7.0.

author: Luc Verhaegen <libv@skynet.be> 2010-03-14 22:15:41 +0000
committer: Luc Verhaegen <libv@skynet.be> 2010-03-14 22:15:41 +0000
commit: 534eb0f6eea95ff5851d3cb74663679fcd375572 (patch)
tree: fe6b3c1c482725e17e0fed3ad9d9004e8870c988
parent: 0c8469d1892b441c38d1cb09d6bbf85692c89e92 (diff)
115 files changed, 9048 insertions, 4906 deletions
diff --git a/configure.ac b/configure.ac
index d9be032..5cd936f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,7 +1,7 @@
 # Process this file with autoconf to produce a configure script
 
 AC_PREREQ(2.57)
-AC_INIT([mesa-dri-radeon], 7.6.0, [], mesa-dri-radeon)
+AC_INIT([mesa-dri-radeon], 7.7.0, [], mesa-dri-radeon)
 
 AM_INIT_AUTOMAKE([dist-bzip2])
 
@@ -16,12 +16,11 @@ AC_PROG_CC
 AC_HEADER_STDC
 
 PKG_CHECK_MODULES([DRM], [libdrm >= 2.3.0])
-# we now need dri_metaops.h
-PKG_CHECK_MODULES([DRI], [libmesadri >= 7.6.0 libmesadri < 7.7.0
-			  libmesadricommon >= 7.6.0 libmesadricommon < 7.7.0])
+PKG_CHECK_MODULES([DRI], [libmesadri >= 7.7.0 libmesadri < 7.8.0
+			  libmesadricommon >= 7.7.0 libmesadricommon < 7.8.0])
 
 # libdrm 2.4.17 changed the api significantly.
-PKG_CHECK_MODULES([LIBDRM_RADEON], [libdrm_radeon libdrm <= 2.4.16],
+PKG_CHECK_MODULES([LIBDRM_RADEON], [libdrm_radeon libdrm >= 2.4.17],
 		 HAVE_LIBDRM_RADEON=yes, HAVE_LIBDRM_RADEON=no)
 AM_CONDITIONAL(HAVE_LIBDRM_RADEON, test "x$HAVE_LIBDRM_RADEON" = xyes)
 
diff --git a/r200/Makefile.am b/r200/Makefile.am
index 804b285..7cdc634 100644
--- a/r200/Makefile.am
+++ b/r200/Makefile.am
@@ -1,7 +1,6 @@
 AM_CFLAGS = -DIN_DRI_DRIVER -DGLX_DIRECT_RENDERING -DGLX_INDIRECT_RENDERING
 
-R200_CFLAGS = -DRADEON_COMMON=1 -DRADEON_COMMON_FOR_R200
-R200_CFLAGS += -I../radeon -I../radeon/server
+R200_CFLAGS = -DRADEON_R200 -I../radeon -I../radeon/server
 
 r200_dri_la_LTLIBRARIES = r200_dri.la
 r200_dri_la_CFLAGS = $(AM_CFLAGS) $(DRM_CFLAGS) $(DRI_CFLAGS) $(R200_CFLAGS)
@@ -41,5 +40,7 @@ if HAVE_LIBDRM_RADEON
 r200_dri_la_CFLAGS += -DHAVE_LIBDRM_RADEON=1 $(LIBDRM_RADEON_CFLAGS)
 r200_dri_la_LDFLAGS += $(LIBDRM_RADEON_LIBS)
 r200_dri_la_SOURCES += \
-	../radeon/radeon_cs_space_drm.c
+	../radeon/radeon_cs_space_drm.c \
+	../radeon/radeon_bo.c \
+	../radeon/radeon_cs.c
 endif
diff --git a/r200/r200_context.c b/r200/r200_context.c
index 3ddb5bf..5f985d6 100644
--- a/r200/r200_context.c
+++ b/r200/r200_context.c
@@ -75,7 +75,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define need_GL_NV_vertex_program
 #define need_GL_ARB_point_parameters
 #define need_GL_EXT_framebuffer_object
-#include "extension_helper.h"
+#include "main/remap_helper.h"
 
 #define DRIVER_DATE	"20060602"
 
@@ -115,7 +115,7 @@ static const GLubyte *r200GetString( GLcontext *ctx, GLenum name )
 
 /* Extension strings exported by the R200 driver.
  */
-const struct dri_extension card_extensions[] =
+static const struct dri_extension card_extensions[] =
 {
     { "GL_ARB_multitexture",               NULL },
     { "GL_ARB_occlusion_query",		   GL_ARB_occlusion_query_functions},
@@ -146,31 +146,31 @@ const struct dri_extension card_extensions[] =
     { NULL,                                NULL }
 };
 
-const struct dri_extension blend_extensions[] = {
+static const struct dri_extension blend_extensions[] = {
     { "GL_EXT_blend_equation_separate",    GL_EXT_blend_equation_separate_functions },
     { "GL_EXT_blend_func_separate",        GL_EXT_blend_func_separate_functions },
     { NULL,                                NULL }
 };
 
-const struct dri_extension ARB_vp_extension[] = {
+static const struct dri_extension ARB_vp_extension[] = {
     { "GL_ARB_vertex_program",             GL_ARB_vertex_program_functions }
 };
 
-const struct dri_extension NV_vp_extension[] = {
+static const struct dri_extension NV_vp_extension[] = {
     { "GL_NV_vertex_program",              GL_NV_vertex_program_functions }
 };
 
-const struct dri_extension ATI_fs_extension[] = {
+static const struct dri_extension ATI_fs_extension[] = {
     { "GL_ATI_fragment_shader",            GL_ATI_fragment_shader_functions }
 };
 
-const struct dri_extension point_extensions[] = {
+static const struct dri_extension point_extensions[] = {
     { "GL_ARB_point_sprite",               NULL },
     { "GL_ARB_point_parameters",           GL_ARB_point_parameters_functions },
     { NULL,                                NULL }
 };
 
-const struct dri_extension mm_extensions[] = {
+static const struct dri_extension mm_extensions[] = {
   { "GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions },
   { NULL, NULL }
 };
@@ -325,9 +325,9 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
    _mesa_init_driver_functions(&functions);
    r200InitDriverFuncs(&functions);
    r200InitIoctlFuncs(&functions);
-   r200InitStateFuncs(&functions, screen->kernel_mm);
+   r200InitStateFuncs(&functions);
    r200InitTextureFuncs(&functions);
-   r200InitShaderFuncs(&functions); 
+   r200InitShaderFuncs(&functions);
    radeonInitQueryObjFunctions(&functions);
 
    if (!radeonInitContext(&rmesa->radeon, &functions,
diff --git a/r200/r200_state.c b/r200/r200_state.c
index 76852e3..6d99c03 100644
--- a/r200/r200_state.c
+++ b/r200/r200_state.c
@@ -1578,13 +1578,6 @@ static void r200ClearStencil( GLcontext *ctx, GLint s )
  * Window position and viewport transformation
  */
 
-/*
- * To correctly position primitives:
- */
-#define SUBPIXEL_X 0.125
-#define SUBPIXEL_Y 0.125
-
-
 /**
  * Called when window size or position changes or viewport or depth range
  * state is changed.  We update the hardware viewport state here.
@@ -1609,9 +1602,9 @@ void r200UpdateWindow( GLcontext *ctx )
    }
 
    float_ui32_type sx = { v[MAT_SX] };
-   float_ui32_type tx = { v[MAT_TX] + xoffset + SUBPIXEL_X };
+   float_ui32_type tx = { v[MAT_TX] + xoffset };
    float_ui32_type sy = { v[MAT_SY] * y_scale };
-   float_ui32_type ty = { (v[MAT_TY] * y_scale) + y_bias + SUBPIXEL_Y };
+   float_ui32_type ty = { (v[MAT_TY] * y_scale) + y_bias };
    float_ui32_type sz = { v[MAT_SZ] * depthScale };
    float_ui32_type tz = { v[MAT_TZ] * depthScale };
 
@@ -1680,8 +1673,8 @@ void r200UpdateViewportOffset( GLcontext *ctx )
    float_ui32_type tx;
    float_ui32_type ty;
 
-   tx.f = v[MAT_TX] + xoffset + SUBPIXEL_X;
-   ty.f = (- v[MAT_TY]) + yoffset + SUBPIXEL_Y;
+   tx.f = v[MAT_TX] + xoffset;
+   ty.f = (- v[MAT_TY]) + yoffset;
 
    if ( rmesa->hw.vpt.cmd[VPT_SE_VPORT_XOFFSET] != tx.ui32 ||
 	rmesa->hw.vpt.cmd[VPT_SE_VPORT_YOFFSET] != ty.ui32 )
@@ -2483,7 +2476,7 @@ static void r200PolygonStipple( GLcontext *ctx, const GLubyte *mask )
 }
 /* Initialize the driver's state functions.
  */
-void r200InitStateFuncs( struct dd_function_table *functions, GLboolean dri2 )
+void r200InitStateFuncs( struct dd_function_table *functions )
 {
    functions->UpdateState		= r200InvalidateState;
    functions->LightingSpaceChange	= r200LightingSpaceChange;
@@ -2517,10 +2510,7 @@ void r200InitStateFuncs( struct dd_function_table *functions, GLboolean dri2 )
    functions->LogicOpcode		= r200LogicOpCode;
    functions->PolygonMode		= r200PolygonMode;
    functions->PolygonOffset		= r200PolygonOffset;
-   if (dri2)
-      functions->PolygonStipple		= r200PolygonStipple;
-   else
-      functions->PolygonStipple		= radeonPolygonStipplePreKMS;
+   functions->PolygonStipple		= r200PolygonStipple;
    functions->PointParameterfv		= r200PointParameter;
    functions->PointSize			= r200PointSize;
    functions->RenderMode		= r200RenderMode;
diff --git a/r200/r200_state.h b/r200/r200_state.h
index 9c62f0a..7b9b0c1 100644
--- a/r200/r200_state.h
+++ b/r200/r200_state.h
@@ -38,7 +38,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_context.h"
 
 extern void r200InitState( r200ContextPtr rmesa );
-extern void r200InitStateFuncs( struct dd_function_table *functions, GLboolean dri2 );
+extern void r200InitStateFuncs( struct dd_function_table *functions );
 extern void r200InitTnlFuncs( GLcontext *ctx );
 
 extern void r200UpdateMaterial( GLcontext *ctx );
diff --git a/r200/r200_state_init.c b/r200/r200_state_init.c
index 7697306..6c5a0b7 100644
--- a/r200/r200_state_init.c
+++ b/r200/r200_state_init.c
@@ -529,16 +529,18 @@ static void ctx_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
    atom->cmd[CTX_RB3D_CNTL] &= ~(0xf << 10);
    if (rrb->cpp == 4)
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB8888;
-   else switch (rrb->base._ActualFormat) {
-   case GL_RGB5:
+   else switch (rrb->base.Format) {
+   case MESA_FORMAT_RGB565:
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_RGB565;
 	break;
-   case GL_RGBA4:
+   case MESA_FORMAT_ARGB4444:
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB4444;
 	break;
-   case GL_RGB5_A1:
+   case MESA_FORMAT_ARGB1555:
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB1555;
 	break;
+   default:
+	_mesa_problem(ctx, "Unexpected format in ctx_emit_cs");
    }
 
    cbpitch = (rrb->pitch / rrb->cpp);
@@ -638,7 +640,7 @@ static void tex_emit(GLcontext *ctx, struct radeon_state_atom *atom)
    OUT_BATCH_TABLE(atom->cmd, 10);
 
    if (t && t->mt && !t->image_override) {
-     OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
+     OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, get_base_teximage_offset(t),
 		  RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
    } else if (!t) {
      /* workaround for old CS mechanism */
@@ -885,10 +887,8 @@ void r200InitState( r200ContextPtr rmesa )
          }
       }
    }
-   /* polygon stipple is done with irq for non-kms */
-   if (rmesa->radeon.radeonScreen->kernel_mm) {
-       ALLOC_STATE( stp, always, STP_STATE_SIZE, "STP/stp", 0 );
-   }
+
+   ALLOC_STATE( stp, always, STP_STATE_SIZE, "STP/stp", 0 );
 
    for (i = 0; i < 6; i++)
       if (rmesa->radeon.radeonScreen->kernel_mm)
@@ -1120,12 +1120,11 @@ void r200InitState( r200ContextPtr rmesa )
    rmesa->hw.sci.cmd[SCI_CMD_1] = CP_PACKET0(R200_RE_TOP_LEFT, 0);
    rmesa->hw.sci.cmd[SCI_CMD_2] = CP_PACKET0(R200_RE_WIDTH_HEIGHT, 0);
 
-   if (rmesa->radeon.radeonScreen->kernel_mm) {
-
-	rmesa->hw.stp.cmd[STP_CMD_0] = CP_PACKET0(RADEON_RE_STIPPLE_ADDR, 0);
-	rmesa->hw.stp.cmd[STP_DATA_0] = 0;
-	rmesa->hw.stp.cmd[STP_CMD_1] = CP_PACKET0_ONE(RADEON_RE_STIPPLE_DATA, 31);
+   rmesa->hw.stp.cmd[STP_CMD_0] = CP_PACKET0(RADEON_RE_STIPPLE_ADDR, 0);
+   rmesa->hw.stp.cmd[STP_DATA_0] = 0;
+   rmesa->hw.stp.cmd[STP_CMD_1] = CP_PACKET0_ONE(RADEON_RE_STIPPLE_DATA, 31);
 
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
         rmesa->hw.mtl[0].emit = mtl_emit;
         rmesa->hw.mtl[1].emit = mtl_emit;
 
diff --git a/r200/r200_tcl.c b/r200/r200_tcl.c
index c702910..e7d48a7 100644
--- a/r200/r200_tcl.c
+++ b/r200/r200_tcl.c
@@ -509,25 +509,26 @@ static GLboolean r200_run_tcl_render( GLcontext *ctx,
 	 prog to a not enabled output however, so just don't mess with it.
 	 We only need to change compsel. */
       GLuint out_compsel = 0;
-      GLuint vp_out = rmesa->curr_vp_hw->mesa_program.Base.OutputsWritten;
+      const GLbitfield64 vp_out =
+	 rmesa->curr_vp_hw->mesa_program.Base.OutputsWritten;
 
       vimap_rev = &rmesa->curr_vp_hw->inputmap_rev[0];
-      assert(vp_out & (1 << VERT_RESULT_HPOS));
+      assert(vp_out & BITFIELD64_BIT(VERT_RESULT_HPOS));
       out_compsel = R200_OUTPUT_XYZW;
-      if (vp_out & (1 << VERT_RESULT_COL0)) {
+      if (vp_out & BITFIELD64_BIT(VERT_RESULT_COL0)) {
 	 out_compsel |= R200_OUTPUT_COLOR_0;
       }
-      if (vp_out & (1 << VERT_RESULT_COL1)) {
+      if (vp_out & BITFIELD64_BIT(VERT_RESULT_COL1)) {
 	 out_compsel |= R200_OUTPUT_COLOR_1;
       }
-      if (vp_out & (1 << VERT_RESULT_FOGC)) {
+      if (vp_out & BITFIELD64_BIT(VERT_RESULT_FOGC)) {
          out_compsel |= R200_OUTPUT_DISCRETE_FOG;
       }
-      if (vp_out & (1 << VERT_RESULT_PSIZ)) {
+      if (vp_out & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
 	 out_compsel |= R200_OUTPUT_PT_SIZE;
       }
       for (i = VERT_RESULT_TEX0; i < VERT_RESULT_TEX6; i++) {
-	 if (vp_out & (1 << i)) {
+	 if (vp_out & BITFIELD64_BIT(i)) {
 	    out_compsel |= R200_OUTPUT_TEX_0 << (i - VERT_RESULT_TEX0);
 	 }
       }
diff --git a/r200/r200_tex.c b/r200/r200_tex.c
index 36d9e37..a417721 100644
--- a/r200/r200_tex.c
+++ b/r200/r200_tex.c
@@ -38,7 +38,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/enums.h"
 #include "main/image.h"
 #include "main/simple_list.h"
-#include "main/texformat.h"
 #include "main/texstore.h"
 #include "main/teximage.h"
 #include "main/texobj.h"
@@ -386,16 +385,7 @@ static void r200TexParameter( GLcontext *ctx, GLenum target,
    case GL_TEXTURE_MAX_LEVEL:
    case GL_TEXTURE_MIN_LOD:
    case GL_TEXTURE_MAX_LOD:
-      /* This isn't the most efficient solution but there doesn't appear to
-       * be a nice alternative.  Since there's no LOD clamping,
-       * we just have to rely on loading the right subset of mipmap levels
-       * to simulate a clamped LOD.
-       */
-      if (t->mt) {
-         radeon_miptree_unreference(t->mt);
-	 t->mt = 0;
-	 t->validated = GL_FALSE;
-      }
+      t->validated = GL_FALSE;
       break;
 
    default:
@@ -414,7 +404,7 @@ static void r200DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
 	      (void *)texObj,
 	      _mesa_lookup_enum_by_nr(texObj->Target));
    }
-   
+
    if (rmesa) {
       int i;
       radeon_firevertices(&rmesa->radeon);
@@ -426,11 +416,9 @@ static void r200DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
 	 }
       }      
    }
-   
-   if (t->mt) {
-      radeon_miptree_unreference(t->mt);
-      t->mt = 0;
-   }
+
+   radeon_miptree_unreference(&t->mt);
+
    _mesa_delete_texture_object(ctx, texObj);
 }
 
diff --git a/r200/r200_texstate.c b/r200/r200_texstate.c
index c948347..7782404 100644
--- a/r200/r200_texstate.c
+++ b/r200/r200_texstate.c
@@ -36,7 +36,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/imports.h"
 #include "main/context.h"
 #include "main/macros.h"
-#include "main/texformat.h"
 #include "main/teximage.h"
 #include "main/texobj.h"
 #include "main/enums.h"
@@ -825,20 +824,14 @@ void r200SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_fo
 		radeon_bo_unref(rImage->bo);
 		rImage->bo = NULL;
 	}
-	if (t->mt) {
-		radeon_miptree_unreference(t->mt);
-		t->mt = NULL;
-	}
-	if (rImage->mt) {
-		radeon_miptree_unreference(rImage->mt);
-		rImage->mt = NULL;
-	}
+
+	radeon_miptree_unreference(&t->mt);
+	radeon_miptree_unreference(&rImage->mt);
+
 	_mesa_init_teximage_fields(radeon->glCtx, target, texImage,
 				   rb->base.Width, rb->base.Height, 1, 0, rb->cpp);
 	texImage->RowStride = rb->pitch / rb->cpp;
-	texImage->TexFormat = radeonChooseTextureFormat(radeon->glCtx,
-							internalFormat,
-							type, format, 0);
+
 	rImage->bo = rb->bo;
 	radeon_bo_ref(rImage->bo);
 	t->bo = rb->bo;
@@ -1426,10 +1419,9 @@ void set_re_cntl_d3d( GLcontext *ctx, int unit, GLboolean use_d3d )
  */
 static void setup_hardware_state(r200ContextPtr rmesa, radeonTexObj *t)
 {
-   int firstlevel = t->mt ? t->mt->firstLevel : 0;
-   const struct gl_texture_image *firstImage = t->base.Image[0][firstlevel];
+   const struct gl_texture_image *firstImage = t->base.Image[0][t->minLod];
    GLint log2Width, log2Height, log2Depth, texelBytes;
-   
+
    if ( t->bo ) {
        return;
    }
@@ -1437,11 +1429,11 @@ static void setup_hardware_state(r200ContextPtr rmesa, radeonTexObj *t)
    log2Width  = firstImage->WidthLog2;
    log2Height = firstImage->HeightLog2;
    log2Depth  = firstImage->DepthLog2;
-   texelBytes = firstImage->TexFormat->TexelBytes;
+   texelBytes = _mesa_get_format_bytes(firstImage->TexFormat);
 
 
    if (!t->image_override) {
-      if (VALID_FORMAT(firstImage->TexFormat->MesaFormat)) {
+      if (VALID_FORMAT(firstImage->TexFormat)) {
 	 const struct tx_table *table = _mesa_little_endian() ? tx_table_le :
 	    tx_table_be;
 	 
@@ -1449,17 +1441,17 @@ static void setup_hardware_state(r200ContextPtr rmesa, radeonTexObj *t)
 			     R200_TXFORMAT_ALPHA_IN_MAP);
 	 t->pp_txfilter &= ~R200_YUV_TO_RGB;
 	 
-	 t->pp_txformat |= table[ firstImage->TexFormat->MesaFormat ].format;
-	 t->pp_txfilter |= table[ firstImage->TexFormat->MesaFormat ].filter;
+	 t->pp_txformat |= table[ firstImage->TexFormat ].format;
+	 t->pp_txfilter |= table[ firstImage->TexFormat ].filter;
       } else {
 	 _mesa_problem(NULL, "unexpected texture format in %s",
 		       __FUNCTION__);
 	 return;
       }
    }
-   
+
    t->pp_txfilter &= ~R200_MAX_MIP_LEVEL_MASK;
-   t->pp_txfilter |= (t->mt->lastLevel - t->mt->firstLevel) << R200_MAX_MIP_LEVEL_SHIFT;
+   t->pp_txfilter |= (t->maxLod - t->minLod) << R200_MAX_MIP_LEVEL_SHIFT;
 	
    t->pp_txformat &= ~(R200_TXFORMAT_WIDTH_MASK |
 		       R200_TXFORMAT_HEIGHT_MASK |
@@ -1504,7 +1496,7 @@ static void setup_hardware_state(r200ContextPtr rmesa, radeonTexObj *t)
 		   | ((firstImage->Height - 1) << R200_PP_TX_HEIGHTMASK_SHIFT));
 
    if ( !t->image_override ) {
-      if (firstImage->IsCompressed)
+      if (_mesa_is_format_compressed(firstImage->TexFormat))
          t->pp_txpitch = (firstImage->Width + 63) & ~(63);
       else
          t->pp_txpitch = ((firstImage->Width * texelBytes) + 63) & ~(63);
diff --git a/r300/Makefile.am b/r300/Makefile.am
index a678ddb..236710a 100644
--- a/r300/Makefile.am
+++ b/r300/Makefile.am
@@ -2,8 +2,7 @@ SUBDIRS = compiler
 
 AM_CFLAGS = -DIN_DRI_DRIVER -DGLX_DIRECT_RENDERING -DGLX_INDIRECT_RENDERING
 
-R300_CFLAGS = -DCOMPILE_R300 -DR200_MERGED=0 -DRADEON_COMMON=1 -DRADEON_COMMON_FOR_R300
-R300_CFLAGS += -I../radeon -I../radeon/server
+R300_CFLAGS = -DRADEON_R300 -I../radeon -I../radeon/server
 
 r300_dri_la_LTLIBRARIES = r300_dri.la
 r300_dri_la_CFLAGS = $(AM_CFLAGS) $(DRM_CFLAGS) $(DRI_CFLAGS) $(R300_CFLAGS)
@@ -37,6 +36,7 @@ r300_dri_la_SOURCES = \
 	r300_vertprog.c \
 	r300_fragprog_common.c \
 	r300_shader.c \
+	radeon_mesa_to_rc.c \
 	r300_emit.c \
 	r300_swtcl.c
 
@@ -44,5 +44,7 @@ if HAVE_LIBDRM_RADEON
 r300_dri_la_CFLAGS += -DHAVE_LIBDRM_RADEON=1 $(LIBDRM_RADEON_CFLAGS)
 r300_dri_la_LDFLAGS += $(LIBDRM_RADEON_LIBS)
 r300_dri_la_SOURCES += \
-	../radeon/radeon_cs_space_drm.c
+	../radeon/radeon_cs_space_drm.c \
+	../radeon/radeon_bo.c \
+	../radeon/radeon_cs.c
 endif
diff --git a/r300/compiler/Makefile.am b/r300/compiler/Makefile.am
index e24a719..c3dc39f 100644
--- a/r300/compiler/Makefile.am
+++ b/r300/compiler/Makefile.am
@@ -5,10 +5,17 @@ libr300compiler_la_CFLAGS = $(AM_CFLAGS) $(DRI_CFLAGS)
 libr300compiler_la_SOURCES = \
 	radeon_code.c \
 	radeon_compiler.c \
-	radeon_nqssadce.c \
 	radeon_program.c \
+	radeon_program_print.c \
+	radeon_opcodes.c \
 	radeon_program_alu.c \
 	radeon_program_pair.c \
+	radeon_pair_translate.c \
+	radeon_pair_schedule.c \
+	radeon_pair_regalloc.c \
+	radeon_dataflow.c \
+	radeon_dataflow_deadcode.c \
+	radeon_dataflow_swizzles.c \
 	r3xx_fragprog.c \
 	r300_fragprog.c \
 	r300_fragprog_swizzle.c \
diff --git a/r300/compiler/SConscript b/r300/compiler/SConscript
new file mode 100644
index 0000000..46075a8
--- /dev/null
+++ b/r300/compiler/SConscript
@@ -0,0 +1,37 @@
+Import('*')
+
+env = env.Clone()
+env.Append(CPPPATH = '#/include')
+env.Append(CPPPATH = '#/src/mesa')
+
+# temporary fix
+env['CFLAGS'] = str(env['CFLAGS']).replace('-Werror=declaration-after-statement', '')
+
+r300compiler = env.ConvenienceLibrary(
+    target = 'r300compiler',
+    source = [
+        'radeon_code.c',
+        'radeon_compiler.c',
+        'radeon_program.c',
+        'radeon_program_print.c',
+        'radeon_opcodes.c',
+        'radeon_program_alu.c',
+        'radeon_program_pair.c',
+        'radeon_pair_translate.c',
+        'radeon_pair_schedule.c',
+        'radeon_pair_regalloc.c',
+        'radeon_dataflow.c',
+        'radeon_dataflow_deadcode.c',
+        'radeon_dataflow_swizzles.c',
+        'r3xx_fragprog.c',
+        'r300_fragprog.c',
+        'r300_fragprog_swizzle.c',
+        'r300_fragprog_emit.c',
+        'r500_fragprog.c',
+        'r500_fragprog_emit.c',
+        'r3xx_vertprog.c',
+        'r3xx_vertprog_dump.c',
+        'memory_pool.c',
+    ])
+
+Return('r300compiler')
diff --git a/r300/compiler/r300_fragprog.c b/r300/compiler/r300_fragprog.c
index 6c9fba4..aa69b0f 100644
--- a/r300/compiler/r300_fragprog.c
+++ b/r300/compiler/r300_fragprog.c
@@ -27,17 +27,17 @@
 
 #include "r300_fragprog.h"
 
-#include "shader/prog_parameter.h"
+#include <stdio.h>
 
 #include "../r300_reg.h"
 
-static struct prog_src_register shadow_ambient(struct radeon_compiler * c, int tmu)
+static struct rc_src_register shadow_ambient(struct radeon_compiler * c, int tmu)
 {
-	struct prog_src_register reg = { 0, };
+	struct rc_src_register reg = { 0, };
 
-	reg.File = PROGRAM_STATE_VAR;
+	reg.File = RC_FILE_CONSTANT;
 	reg.Index = rc_constants_add_state(&c->Program.Constants, RC_STATE_SHADOW_AMBIENT, tmu);
-	reg.Swizzle = SWIZZLE_WWWW;
+	reg.Swizzle = RC_SWIZZLE_WWWW;
 	return reg;
 }
 
@@ -47,7 +47,7 @@ static struct prog_src_register shadow_ambient(struct radeon_compiler * c, int t
  *  - extract operand swizzles
  *  - introduce a temporary register when write masks are needed
  */
-GLboolean r300_transform_TEX(
+int r300_transform_TEX(
 	struct radeon_compiler * c,
 	struct rc_instruction* inst,
 	void* data)
@@ -55,77 +55,77 @@ GLboolean r300_transform_TEX(
 	struct r300_fragment_program_compiler *compiler =
 		(struct r300_fragment_program_compiler*)data;
 
-	if (inst->I.Opcode != OPCODE_TEX &&
-	    inst->I.Opcode != OPCODE_TXB &&
-	    inst->I.Opcode != OPCODE_TXP &&
-	    inst->I.Opcode != OPCODE_KIL)
-		return GL_FALSE;
+	if (inst->U.I.Opcode != RC_OPCODE_TEX &&
+	    inst->U.I.Opcode != RC_OPCODE_TXB &&
+	    inst->U.I.Opcode != RC_OPCODE_TXP &&
+	    inst->U.I.Opcode != RC_OPCODE_KIL)
+		return 0;
 
 	/* ARB_shadow & EXT_shadow_funcs */
-	if (inst->I.Opcode != OPCODE_KIL &&
-	    c->Program.ShadowSamplers & (1 << inst->I.TexSrcUnit)) {
-		GLuint comparefunc = GL_NEVER + compiler->state.unit[inst->I.TexSrcUnit].texture_compare_func;
+	if (inst->U.I.Opcode != RC_OPCODE_KIL &&
+	    c->Program.ShadowSamplers & (1 << inst->U.I.TexSrcUnit)) {
+		rc_compare_func comparefunc = compiler->state.unit[inst->U.I.TexSrcUnit].texture_compare_func;
 
-		if (comparefunc == GL_NEVER || comparefunc == GL_ALWAYS) {
-			inst->I.Opcode = OPCODE_MOV;
+		if (comparefunc == RC_COMPARE_FUNC_NEVER || comparefunc == RC_COMPARE_FUNC_ALWAYS) {
+			inst->U.I.Opcode = RC_OPCODE_MOV;
 
-			if (comparefunc == GL_ALWAYS) {
-				inst->I.SrcReg[0].File = PROGRAM_BUILTIN;
-				inst->I.SrcReg[0].Swizzle = SWIZZLE_1111;
+			if (comparefunc == RC_COMPARE_FUNC_ALWAYS) {
+				inst->U.I.SrcReg[0].File = RC_FILE_NONE;
+				inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_1111;
 			} else {
-				inst->I.SrcReg[0] = shadow_ambient(c, inst->I.TexSrcUnit);
+				inst->U.I.SrcReg[0] = shadow_ambient(c, inst->U.I.TexSrcUnit);
 			}
 
-			return GL_TRUE;
+			return 1;
 		} else {
-			GLuint comparefunc = GL_NEVER + compiler->state.unit[inst->I.TexSrcUnit].texture_compare_func;
-			GLuint depthmode = compiler->state.unit[inst->I.TexSrcUnit].depth_texture_mode;
+			rc_compare_func comparefunc = compiler->state.unit[inst->U.I.TexSrcUnit].texture_compare_func;
+			unsigned int depthmode = compiler->state.unit[inst->U.I.TexSrcUnit].depth_texture_mode;
 			struct rc_instruction * inst_rcp = rc_insert_new_instruction(c, inst);
 			struct rc_instruction * inst_mad = rc_insert_new_instruction(c, inst_rcp);
 			struct rc_instruction * inst_cmp = rc_insert_new_instruction(c, inst_mad);
 			int pass, fail;
 
-			inst_rcp->I.Opcode = OPCODE_RCP;
-			inst_rcp->I.DstReg.File = PROGRAM_TEMPORARY;
-			inst_rcp->I.DstReg.Index = rc_find_free_temporary(c);
-			inst_rcp->I.DstReg.WriteMask = WRITEMASK_W;
-			inst_rcp->I.SrcReg[0] = inst->I.SrcReg[0];
-			inst_rcp->I.SrcReg[0].Swizzle = SWIZZLE_WWWW;
-
-			inst_cmp->I.DstReg = inst->I.DstReg;
-			inst->I.DstReg.File = PROGRAM_TEMPORARY;
-			inst->I.DstReg.Index = rc_find_free_temporary(c);
-			inst->I.DstReg.WriteMask = WRITEMASK_XYZW;
-
-			inst_mad->I.Opcode = OPCODE_MAD;
-			inst_mad->I.DstReg.File = PROGRAM_TEMPORARY;
-			inst_mad->I.DstReg.Index = rc_find_free_temporary(c);
-			inst_mad->I.SrcReg[0] = inst->I.SrcReg[0];
-			inst_mad->I.SrcReg[0].Swizzle = SWIZZLE_ZZZZ;
-			inst_mad->I.SrcReg[1].File = PROGRAM_TEMPORARY;
-			inst_mad->I.SrcReg[1].Index = inst_rcp->I.DstReg.Index;
-			inst_mad->I.SrcReg[1].Swizzle = SWIZZLE_WWWW;
-			inst_mad->I.SrcReg[2].File = PROGRAM_TEMPORARY;
-			inst_mad->I.SrcReg[2].Index = inst->I.DstReg.Index;
+			inst_rcp->U.I.Opcode = RC_OPCODE_RCP;
+			inst_rcp->U.I.DstReg.File = RC_FILE_TEMPORARY;
+			inst_rcp->U.I.DstReg.Index = rc_find_free_temporary(c);
+			inst_rcp->U.I.DstReg.WriteMask = RC_MASK_W;
+			inst_rcp->U.I.SrcReg[0] = inst->U.I.SrcReg[0];
+			inst_rcp->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_WWWW;
+
+			inst_cmp->U.I.DstReg = inst->U.I.DstReg;
+			inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
+			inst->U.I.DstReg.Index = rc_find_free_temporary(c);
+			inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
+
+			inst_mad->U.I.Opcode = RC_OPCODE_MAD;
+			inst_mad->U.I.DstReg.File = RC_FILE_TEMPORARY;
+			inst_mad->U.I.DstReg.Index = rc_find_free_temporary(c);
+			inst_mad->U.I.SrcReg[0] = inst->U.I.SrcReg[0];
+			inst_mad->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_ZZZZ;
+			inst_mad->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
+			inst_mad->U.I.SrcReg[1].Index = inst_rcp->U.I.DstReg.Index;
+			inst_mad->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_WWWW;
+			inst_mad->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
+			inst_mad->U.I.SrcReg[2].Index = inst->U.I.DstReg.Index;
 			if (depthmode == 0) /* GL_LUMINANCE */
-				inst_mad->I.SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z);
+				inst_mad->U.I.SrcReg[2].Swizzle = RC_MAKE_SWIZZLE(RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_Z);
 			else if (depthmode == 2) /* GL_ALPHA */
-				inst_mad->I.SrcReg[2].Swizzle = SWIZZLE_WWWW;
+				inst_mad->U.I.SrcReg[2].Swizzle = RC_SWIZZLE_WWWW;
 
 			/* Recall that SrcReg[0] is tex, SrcReg[2] is r and:
 			 *   r  < tex  <=>      -tex+r < 0
 			 *   r >= tex  <=> not (-tex+r < 0 */
-			if (comparefunc == GL_LESS || comparefunc == GL_GEQUAL)
-				inst_mad->I.SrcReg[2].Negate = inst_mad->I.SrcReg[2].Negate ^ NEGATE_XYZW;
+			if (comparefunc == RC_COMPARE_FUNC_LESS || comparefunc == RC_COMPARE_FUNC_GEQUAL)
+				inst_mad->U.I.SrcReg[2].Negate = inst_mad->U.I.SrcReg[2].Negate ^ RC_MASK_XYZW;
 			else
-				inst_mad->I.SrcReg[0].Negate = inst_mad->I.SrcReg[0].Negate ^ NEGATE_XYZW;
+				inst_mad->U.I.SrcReg[0].Negate = inst_mad->U.I.SrcReg[0].Negate ^ RC_MASK_XYZW;
 
-			inst_cmp->I.Opcode = OPCODE_CMP;
+			inst_cmp->U.I.Opcode = RC_OPCODE_CMP;
 			/* DstReg has been filled out above */
-			inst_cmp->I.SrcReg[0].File = PROGRAM_TEMPORARY;
-			inst_cmp->I.SrcReg[0].Index = inst_mad->I.DstReg.Index;
+			inst_cmp->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
+			inst_cmp->U.I.SrcReg[0].Index = inst_mad->U.I.DstReg.Index;
 
-			if (comparefunc == GL_LESS || comparefunc == GL_GREATER) {
+			if (comparefunc == RC_COMPARE_FUNC_LESS || comparefunc == RC_COMPARE_FUNC_GREATER) {
 				pass = 1;
 				fail = 2;
 			} else {
@@ -133,9 +133,9 @@ GLboolean r300_transform_TEX(
 				fail = 1;
 			}
 
-			inst_cmp->I.SrcReg[pass].File = PROGRAM_BUILTIN;
-			inst_cmp->I.SrcReg[pass].Swizzle = SWIZZLE_1111;
-			inst_cmp->I.SrcReg[fail] = shadow_ambient(c, inst->I.TexSrcUnit);
+			inst_cmp->U.I.SrcReg[pass].File = RC_FILE_NONE;
+			inst_cmp->U.I.SrcReg[pass].Swizzle = RC_SWIZZLE_1111;
+			inst_cmp->U.I.SrcReg[fail] = shadow_ambient(c, inst->U.I.TexSrcUnit);
 		}
 	}
 
@@ -143,52 +143,52 @@ GLboolean r300_transform_TEX(
 	 * instead of [0..Width]x[0..Height].
 	 * Add a scaling instruction.
 	 */
-	if (inst->I.Opcode != OPCODE_KIL && inst->I.TexSrcTarget == TEXTURE_RECT_INDEX) {
+	if (inst->U.I.Opcode != RC_OPCODE_KIL && inst->U.I.TexSrcTarget == RC_TEXTURE_RECT) {
 		struct rc_instruction * inst_mul = rc_insert_new_instruction(c, inst->Prev);
 
-		inst_mul->I.Opcode = OPCODE_MUL;
-		inst_mul->I.DstReg.File = PROGRAM_TEMPORARY;
-		inst_mul->I.DstReg.Index = rc_find_free_temporary(c);
-		inst_mul->I.SrcReg[0] = inst->I.SrcReg[0];
-		inst_mul->I.SrcReg[1].File = PROGRAM_STATE_VAR;
-		inst_mul->I.SrcReg[1].Index = rc_constants_add_state(&c->Program.Constants, RC_STATE_R300_TEXRECT_FACTOR, inst->I.TexSrcUnit);
+		inst_mul->U.I.Opcode = RC_OPCODE_MUL;
+		inst_mul->U.I.DstReg.File = RC_FILE_TEMPORARY;
+		inst_mul->U.I.DstReg.Index = rc_find_free_temporary(c);
+		inst_mul->U.I.SrcReg[0] = inst->U.I.SrcReg[0];
+		inst_mul->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
+		inst_mul->U.I.SrcReg[1].Index = rc_constants_add_state(&c->Program.Constants, RC_STATE_R300_TEXRECT_FACTOR, inst->U.I.TexSrcUnit);
 
-		reset_srcreg(&inst->I.SrcReg[0]);
-		inst->I.SrcReg[0].File = PROGRAM_TEMPORARY;
-		inst->I.SrcReg[0].Index = inst_mul->I.DstReg.Index;
+		reset_srcreg(&inst->U.I.SrcReg[0]);
+		inst->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
+		inst->U.I.SrcReg[0].Index = inst_mul->U.I.DstReg.Index;
 	}
 
 	/* Cannot write texture to output registers or with masks */
-	if (inst->I.Opcode != OPCODE_KIL &&
-	    (inst->I.DstReg.File != PROGRAM_TEMPORARY || inst->I.DstReg.WriteMask != WRITEMASK_XYZW)) {
+	if (inst->U.I.Opcode != RC_OPCODE_KIL &&
+	    (inst->U.I.DstReg.File != RC_FILE_TEMPORARY || inst->U.I.DstReg.WriteMask != RC_MASK_XYZW)) {
 		struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst);
 
-		inst_mov->I.Opcode = OPCODE_MOV;
-		inst_mov->I.DstReg = inst->I.DstReg;
-		inst_mov->I.SrcReg[0].File = PROGRAM_TEMPORARY;
-		inst_mov->I.SrcReg[0].Index = rc_find_free_temporary(c);
+		inst_mov->U.I.Opcode = RC_OPCODE_MOV;
+		inst_mov->U.I.DstReg = inst->U.I.DstReg;
+		inst_mov->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
+		inst_mov->U.I.SrcReg[0].Index = rc_find_free_temporary(c);
 
-		inst->I.DstReg.File = PROGRAM_TEMPORARY;
-		inst->I.DstReg.Index = inst_mov->I.SrcReg[0].Index;
-		inst->I.DstReg.WriteMask = WRITEMASK_XYZW;
+		inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
+		inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index;
+		inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
 	}
 
 
 	/* Cannot read texture coordinate from constants file */
-	if (inst->I.SrcReg[0].File != PROGRAM_TEMPORARY && inst->I.SrcReg[0].File != PROGRAM_INPUT) {
+	if (inst->U.I.SrcReg[0].File != RC_FILE_TEMPORARY && inst->U.I.SrcReg[0].File != RC_FILE_INPUT) {
 		struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
 
-		inst_mov->I.Opcode = OPCODE_MOV;
-		inst_mov->I.DstReg.File = PROGRAM_TEMPORARY;
-		inst_mov->I.DstReg.Index = rc_find_free_temporary(c);
-		inst_mov->I.SrcReg[0] = inst->I.SrcReg[0];
+		inst_mov->U.I.Opcode = RC_OPCODE_MOV;
+		inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
+		inst_mov->U.I.DstReg.Index = rc_find_free_temporary(c);
+		inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[0];
 
-		reset_srcreg(&inst->I.SrcReg[0]);
-		inst->I.SrcReg[0].File = PROGRAM_TEMPORARY;
-		inst->I.SrcReg[0].Index = inst_mov->I.DstReg.Index;
+		reset_srcreg(&inst->U.I.SrcReg[0]);
+		inst->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
+		inst->U.I.SrcReg[0].Index = inst_mov->U.I.DstReg.Index;
 	}
 
-	return GL_TRUE;
+	return 1;
 }
 
 /* just some random things... */
diff --git a/r300/compiler/r300_fragprog.h b/r300/compiler/r300_fragprog.h
index 0ac46db..418df36 100644
--- a/r300/compiler/r300_fragprog.h
+++ b/r300/compiler/r300_fragprog.h
@@ -33,9 +33,6 @@
 #ifndef __R300_FRAGPROG_H_
 #define __R300_FRAGPROG_H_
 
-#include "shader/program.h"
-#include "shader/prog_instruction.h"
-
 #include "radeon_compiler.h"
 #include "radeon_program.h"
 
@@ -44,6 +41,6 @@ extern void r300BuildFragmentProgramHwCode(struct r300_fragment_program_compiler
 
 extern void r300FragmentProgramDump(struct rX00_fragment_program_code *c);
 
-extern GLboolean r300_transform_TEX(struct radeon_compiler * c, struct rc_instruction* inst, void* data);
+extern int r300_transform_TEX(struct radeon_compiler * c, struct rc_instruction* inst, void* data);
 
 #endif
diff --git a/r300/compiler/r300_fragprog_emit.c b/r300/compiler/r300_fragprog_emit.c
index c7227bb..bbc0003 100644
--- a/r300/compiler/r300_fragprog_emit.c
+++ b/r300/compiler/r300_fragprog_emit.c
@@ -56,7 +56,6 @@ struct r300_emit_state {
 };
 
 #define PROG_CODE \
-	struct r300_emit_state * emit = (struct r300_emit_state*)data; \
 	struct r300_fragment_program_compiler *c = emit->compiler; \
 	struct r300_fragment_program_code *code = &c->code->code.r300
 
@@ -69,64 +68,76 @@ struct r300_emit_state {
 /**
  * Mark a temporary register as used.
  */
-static void use_temporary(struct r300_fragment_program_code *code, GLuint index)
+static void use_temporary(struct r300_fragment_program_code *code, unsigned int index)
 {
 	if (index > code->pixsize)
 		code->pixsize = index;
 }
 
+static unsigned int use_source(struct r300_fragment_program_code* code, struct radeon_pair_instruction_source src)
+{
+	if (src.File == RC_FILE_CONSTANT) {
+		return src.Index | (1 << 5);
+	} else if (src.File == RC_FILE_TEMPORARY) {
+		use_temporary(code, src.Index);
+		return src.Index;
+	}
+
+	return 0;
+}
+
 
-static GLuint translate_rgb_opcode(struct r300_fragment_program_compiler * c, GLuint opcode)
+static unsigned int translate_rgb_opcode(struct r300_fragment_program_compiler * c, rc_opcode opcode)
 {
 	switch(opcode) {
-	case OPCODE_CMP: return R300_ALU_OUTC_CMP;
-	case OPCODE_DP3: return R300_ALU_OUTC_DP3;
-	case OPCODE_DP4: return R300_ALU_OUTC_DP4;
-	case OPCODE_FRC: return R300_ALU_OUTC_FRC;
+	case RC_OPCODE_CMP: return R300_ALU_OUTC_CMP;
+	case RC_OPCODE_DP3: return R300_ALU_OUTC_DP3;
+	case RC_OPCODE_DP4: return R300_ALU_OUTC_DP4;
+	case RC_OPCODE_FRC: return R300_ALU_OUTC_FRC;
 	default:
 		error("translate_rgb_opcode(%i): Unknown opcode", opcode);
 		/* fall through */
-	case OPCODE_NOP:
+	case RC_OPCODE_NOP:
 		/* fall through */
-	case OPCODE_MAD: return R300_ALU_OUTC_MAD;
-	case OPCODE_MAX: return R300_ALU_OUTC_MAX;
-	case OPCODE_MIN: return R300_ALU_OUTC_MIN;
-	case OPCODE_REPL_ALPHA: return R300_ALU_OUTC_REPL_ALPHA;
+	case RC_OPCODE_MAD: return R300_ALU_OUTC_MAD;
+	case RC_OPCODE_MAX: return R300_ALU_OUTC_MAX;
+	case RC_OPCODE_MIN: return R300_ALU_OUTC_MIN;
+	case RC_OPCODE_REPL_ALPHA: return R300_ALU_OUTC_REPL_ALPHA;
 	}
 }
 
-static GLuint translate_alpha_opcode(struct r300_fragment_program_compiler * c, GLuint opcode)
+static unsigned int translate_alpha_opcode(struct r300_fragment_program_compiler * c, rc_opcode opcode)
 {
 	switch(opcode) {
-	case OPCODE_CMP: return R300_ALU_OUTA_CMP;
-	case OPCODE_DP3: return R300_ALU_OUTA_DP4;
-	case OPCODE_DP4: return R300_ALU_OUTA_DP4;
-	case OPCODE_EX2: return R300_ALU_OUTA_EX2;
-	case OPCODE_FRC: return R300_ALU_OUTA_FRC;
-	case OPCODE_LG2: return R300_ALU_OUTA_LG2;
+	case RC_OPCODE_CMP: return R300_ALU_OUTA_CMP;
+	case RC_OPCODE_DP3: return R300_ALU_OUTA_DP4;
+	case RC_OPCODE_DP4: return R300_ALU_OUTA_DP4;
+	case RC_OPCODE_EX2: return R300_ALU_OUTA_EX2;
+	case RC_OPCODE_FRC: return R300_ALU_OUTA_FRC;
+	case RC_OPCODE_LG2: return R300_ALU_OUTA_LG2;
 	default:
 		error("translate_rgb_opcode(%i): Unknown opcode", opcode);
 		/* fall through */
-	case OPCODE_NOP:
+	case RC_OPCODE_NOP:
 		/* fall through */
-	case OPCODE_MAD: return R300_ALU_OUTA_MAD;
-	case OPCODE_MAX: return R300_ALU_OUTA_MAX;
-	case OPCODE_MIN: return R300_ALU_OUTA_MIN;
-	case OPCODE_RCP: return R300_ALU_OUTA_RCP;
-	case OPCODE_RSQ: return R300_ALU_OUTA_RSQ;
+	case RC_OPCODE_MAD: return R300_ALU_OUTA_MAD;
+	case RC_OPCODE_MAX: return R300_ALU_OUTA_MAX;
+	case RC_OPCODE_MIN: return R300_ALU_OUTA_MIN;
+	case RC_OPCODE_RCP: return R300_ALU_OUTA_RCP;
+	case RC_OPCODE_RSQ: return R300_ALU_OUTA_RSQ;
 	}
 }
 
 /**
  * Emit one paired ALU instruction.
  */
-static GLboolean emit_alu(void* data, struct radeon_pair_instruction* inst)
+static int emit_alu(struct r300_emit_state * emit, struct rc_pair_instruction* inst)
 {
 	PROG_CODE;
 
 	if (code->alu.length >= R300_PFS_MAX_ALU_INST) {
 		error("Too many ALU instructions");
-		return GL_FALSE;
+		return 0;
 	}
 
 	int ip = code->alu.length++;
@@ -136,17 +147,13 @@ static GLboolean emit_alu(void* data, struct radeon_pair_instruction* inst)
 	code->alu.inst[ip].alpha_inst = translate_alpha_opcode(c, inst->Alpha.Opcode);
 
 	for(j = 0; j < 3; ++j) {
-		GLuint src = inst->RGB.Src[j].Index | (inst->RGB.Src[j].Constant << 5);
-		if (!inst->RGB.Src[j].Constant)
-			use_temporary(code, inst->RGB.Src[j].Index);
+		unsigned int src = use_source(code, inst->RGB.Src[j]);
 		code->alu.inst[ip].rgb_addr |= src << (6*j);
 
-		src = inst->Alpha.Src[j].Index | (inst->Alpha.Src[j].Constant << 5);
-		if (!inst->Alpha.Src[j].Constant)
-			use_temporary(code, inst->Alpha.Src[j].Index);
+		src = use_source(code, inst->Alpha.Src[j]);
 		code->alu.inst[ip].alpha_addr |= src << (6*j);
 
-		GLuint arg = r300FPTranslateRGBSwizzle(inst->RGB.Arg[j].Source, inst->RGB.Arg[j].Swizzle);
+		unsigned int arg = r300FPTranslateRGBSwizzle(inst->RGB.Arg[j].Source, inst->RGB.Arg[j].Swizzle);
 		arg |= inst->RGB.Arg[j].Abs << 6;
 		arg |= inst->RGB.Arg[j].Negate << 5;
 		code->alu.inst[ip].rgb_inst |= arg << (7*j);
@@ -186,27 +193,27 @@ static GLboolean emit_alu(void* data, struct radeon_pair_instruction* inst)
 	if (inst->Alpha.DepthWriteMask) {
 		code->alu.inst[ip].alpha_addr |= R300_ALU_DSTA_DEPTH;
 		emit->node_flags |= R300_W_OUT;
-		c->code->writes_depth = GL_TRUE;
+		c->code->writes_depth = 1;
 	}
 
-	return GL_TRUE;
+	return 1;
 }
 
 
 /**
  * Finish the current node without advancing to the next one.
  */
-static GLboolean finish_node(struct r300_emit_state * emit)
+static int finish_node(struct r300_emit_state * emit)
 {
 	struct r300_fragment_program_compiler * c = emit->compiler;
 	struct r300_fragment_program_code *code = &emit->compiler->code->code.r300;
 
 	if (code->alu.length == emit->node_first_alu) {
 		/* Generate a single NOP for this node */
-		struct radeon_pair_instruction inst;
-		_mesa_bzero(&inst, sizeof(inst));
+		struct rc_pair_instruction inst;
+		memset(&inst, 0, sizeof(inst));
 		if (!emit_alu(emit, &inst))
-			return GL_FALSE;
+			return 0;
 	}
 
 	unsigned alu_offset = emit->node_first_alu;
@@ -217,7 +224,7 @@ static GLboolean finish_node(struct r300_emit_state * emit)
 	if (code->tex.length == emit->node_first_tex) {
 		if (emit->current_node > 0) {
 			error("Node %i has no TEX instructions", emit->current_node);
-			return GL_FALSE;
+			return 0;
 		}
 
 		tex_end = 0;
@@ -240,7 +247,7 @@ static GLboolean finish_node(struct r300_emit_state * emit)
 			(tex_end << R300_TEX_SIZE_SHIFT) |
 			emit->node_flags;
 
-	return GL_TRUE;
+	return 1;
 }
 
 
@@ -248,79 +255,72 @@ static GLboolean finish_node(struct r300_emit_state * emit)
  * Begin a block of texture instructions.
  * Create the necessary indirection.
  */
-static GLboolean begin_tex(void* data)
+static int begin_tex(struct r300_emit_state * emit)
 {
 	PROG_CODE;
 
 	if (code->alu.length == emit->node_first_alu &&
 	    code->tex.length == emit->node_first_tex) {
-		return GL_TRUE;
+		return 1;
 	}
 
 	if (emit->current_node == 3) {
 		error("Too many texture indirections");
-		return GL_FALSE;
+		return 0;
 	}
 
 	if (!finish_node(emit))
-		return GL_FALSE;
+		return 0;
 
 	emit->current_node++;
 	emit->node_first_tex = code->tex.length;
 	emit->node_first_alu = code->alu.length;
 	emit->node_flags = 0;
-	return GL_TRUE;
+	return 1;
 }
 
 
-static GLboolean emit_tex(void* data, struct radeon_pair_texture_instruction* inst)
+static int emit_tex(struct r300_emit_state * emit, struct rc_instruction * inst)
 {
 	PROG_CODE;
 
 	if (code->tex.length >= R300_PFS_MAX_TEX_INST) {
 		error("Too many TEX instructions");
-		return GL_FALSE;
+		return 0;
 	}
 
-	GLuint unit = inst->TexSrcUnit;
-	GLuint dest = inst->DestIndex;
-	GLuint opcode;
+	unsigned int unit = inst->U.I.TexSrcUnit;
+	unsigned int dest = inst->U.I.DstReg.Index;
+	unsigned int opcode;
 
-	switch(inst->Opcode) {
-	case RADEON_OPCODE_KIL: opcode = R300_TEX_OP_KIL; break;
-	case RADEON_OPCODE_TEX: opcode = R300_TEX_OP_LD; break;
-	case RADEON_OPCODE_TXB: opcode = R300_TEX_OP_TXB; break;
-	case RADEON_OPCODE_TXP: opcode = R300_TEX_OP_TXP; break;
+	switch(inst->U.I.Opcode) {
+	case RC_OPCODE_KIL: opcode = R300_TEX_OP_KIL; break;
+	case RC_OPCODE_TEX: opcode = R300_TEX_OP_LD; break;
+	case RC_OPCODE_TXB: opcode = R300_TEX_OP_TXB; break;
+	case RC_OPCODE_TXP: opcode = R300_TEX_OP_TXP; break;
 	default:
-		error("Unknown texture opcode %i", inst->Opcode);
-		return GL_FALSE;
+		error("Unknown texture opcode %i", inst->U.I.Opcode);
+		return 0;
 	}
 
-	if (inst->Opcode == RADEON_OPCODE_KIL) {
+	if (inst->U.I.Opcode == RC_OPCODE_KIL) {
 		unit = 0;
 		dest = 0;
 	} else {
 		use_temporary(code, dest);
 	}
 
-	use_temporary(code, inst->SrcIndex);
+	use_temporary(code, inst->U.I.SrcReg[0].Index);
 
 	code->tex.inst[code->tex.length++] =
-		(inst->SrcIndex << R300_SRC_ADDR_SHIFT) |
+		(inst->U.I.SrcReg[0].Index << R300_SRC_ADDR_SHIFT) |
 		(dest << R300_DST_ADDR_SHIFT) |
 		(unit << R300_TEX_ID_SHIFT) |
 		(opcode << R300_TEX_INST_SHIFT);
-	return GL_TRUE;
+	return 1;
 }
 
 
-static const struct radeon_pair_handler pair_handler = {
-	.EmitPaired = &emit_alu,
-	.EmitTex = &emit_tex,
-	.BeginTexBlock = &begin_tex,
-	.MaxHwTemps = R300_PFS_NUM_TEMP_REGS
-};
-
 /**
  * Final compilation step: Turn the intermediate radeon_program into
  * machine-readable instructions.
@@ -329,13 +329,31 @@ void r300BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compi
 {
 	struct r300_emit_state emit;
 	struct r300_fragment_program_code *code = &compiler->code->code.r300;
+	struct rc_instruction * inst;
 
 	memset(&emit, 0, sizeof(emit));
 	emit.compiler = compiler;
 
-	_mesa_bzero(code, sizeof(struct r300_fragment_program_code));
+	memset(code, 0, sizeof(struct r300_fragment_program_code));
+
+	for(inst = compiler->Base.Program.Instructions.Next;
+	    inst != &compiler->Base.Program.Instructions && !compiler->Base.Error;
+	    inst = inst->Next) {
+		if (inst->Type == RC_INSTRUCTION_NORMAL) {
+			if (inst->U.I.Opcode == RC_OPCODE_BEGIN_TEX) {
+				begin_tex(&emit);
+				continue;
+			}
+
+			emit_tex(&emit, inst);
+		} else {
+			emit_alu(&emit, &inst->U.P);
+		}
+	}
+
+	if (code->pixsize >= R300_PFS_NUM_TEMP_REGS)
+		rc_error(&compiler->Base, "Too many hardware temporaries used.\n");
 
-	radeonPairProgram(compiler, &pair_handler, &emit);
 	if (compiler->Base.Error)
 		return;
 
diff --git a/r300/compiler/r300_fragprog_swizzle.c b/r300/compiler/r300_fragprog_swizzle.c
index 1b14cc3..cfa48a5 100644
--- a/r300/compiler/r300_fragprog_swizzle.c
+++ b/r300/compiler/r300_fragprog_swizzle.c
@@ -33,16 +33,17 @@
 
 #include "r300_fragprog_swizzle.h"
 
+#include <stdio.h>
+
 #include "../r300_reg.h"
-#include "radeon_nqssadce.h"
 #include "radeon_compiler.h"
 
-#define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, SWIZZLE_##y, SWIZZLE_##z, SWIZZLE_ZERO))
+#define MAKE_SWZ3(x, y, z) (RC_MAKE_SWIZZLE(RC_SWIZZLE_##x, RC_SWIZZLE_##y, RC_SWIZZLE_##z, RC_SWIZZLE_ZERO))
 
 struct swizzle_data {
-	GLuint hash; /**< swizzle value this matches */
-	GLuint base; /**< base value for hw swizzle */
-	GLuint stride; /**< difference in base between arg0/1/2 */
+	unsigned int hash; /**< swizzle value this matches */
+	unsigned int base; /**< base value for hw swizzle */
+	unsigned int stride; /**< difference in base between arg0/1/2 */
 };
 
 static const struct swizzle_data native_swizzles[] = {
@@ -65,15 +66,15 @@ static const int num_native_swizzles = sizeof(native_swizzles)/sizeof(native_swi
  * Find a native RGB swizzle that matches the given swizzle.
  * Returns 0 if none found.
  */
-static const struct swizzle_data* lookup_native_swizzle(GLuint swizzle)
+static const struct swizzle_data* lookup_native_swizzle(unsigned int swizzle)
 {
 	int i, comp;
 
 	for(i = 0; i < num_native_swizzles; ++i) {
 		const struct swizzle_data* sd = &native_swizzles[i];
 		for(comp = 0; comp < 3; ++comp) {
-			GLuint swz = GET_SWZ(swizzle, comp);
-			if (swz == SWIZZLE_NIL)
+			unsigned int swz = GET_SWZ(swizzle, comp);
+			if (swz == RC_SWIZZLE_UNUSED)
 				continue;
 			if (swz != GET_SWZ(sd->hash, comp))
 				break;
@@ -90,71 +91,72 @@ static const struct swizzle_data* lookup_native_swizzle(GLuint swizzle)
  * Check whether the given instruction supports the swizzle and negate
  * combinations in the given source register.
  */
-GLboolean r300FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg)
+static int r300_swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
 {
 	if (reg.Abs)
-		reg.Negate = NEGATE_NONE;
+		reg.Negate = RC_MASK_NONE;
 
-	if (opcode == OPCODE_KIL ||
-	    opcode == OPCODE_TEX ||
-	    opcode == OPCODE_TXB ||
-	    opcode == OPCODE_TXP) {
+	if (opcode == RC_OPCODE_KIL ||
+	    opcode == RC_OPCODE_TEX ||
+	    opcode == RC_OPCODE_TXB ||
+	    opcode == RC_OPCODE_TXP) {
 		int j;
 
 		if (reg.Abs || reg.Negate)
-			return GL_FALSE;
+			return 0;
 
 		for(j = 0; j < 4; ++j) {
-			GLuint swz = GET_SWZ(reg.Swizzle, j);
-			if (swz == SWIZZLE_NIL)
+			unsigned int swz = GET_SWZ(reg.Swizzle, j);
+			if (swz == RC_SWIZZLE_UNUSED)
 				continue;
 			if (swz != j)
-				return GL_FALSE;
+				return 0;
 		}
 
-		return GL_TRUE;
+		return 1;
 	}
 
-	GLuint relevant = 0;
+	unsigned int relevant = 0;
 	int j;
 
 	for(j = 0; j < 3; ++j)
-		if (GET_SWZ(reg.Swizzle, j) != SWIZZLE_NIL)
+		if (GET_SWZ(reg.Swizzle, j) != RC_SWIZZLE_UNUSED)
 			relevant |= 1 << j;
 
 	if ((reg.Negate & relevant) && ((reg.Negate & relevant) != relevant))
-		return GL_FALSE;
+		return 0;
 
 	if (!lookup_native_swizzle(reg.Swizzle))
-		return GL_FALSE;
+		return 0;
 
-	return GL_TRUE;
+	return 1;
 }
 
 
-/**
- * Generate MOV dst, src using only native swizzles.
- */
-void r300FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst, struct prog_src_register src)
+static void r300_swizzle_split(
+		struct rc_src_register src, unsigned int mask,
+		struct rc_swizzle_split * split)
 {
 	if (src.Abs)
-		src.Negate = NEGATE_NONE;
+		src.Negate = RC_MASK_NONE;
+
+	split->NumPhases = 0;
 
-	while(dst.WriteMask) {
+	while(mask) {
 		const struct swizzle_data *best_swizzle = 0;
-		GLuint best_matchcount = 0;
-		GLuint best_matchmask = 0;
+		unsigned int best_matchcount = 0;
+		unsigned int best_matchmask = 0;
 		int i, comp;
 
 		for(i = 0; i < num_native_swizzles; ++i) {
 			const struct swizzle_data *sd = &native_swizzles[i];
-			GLuint matchcount = 0;
-			GLuint matchmask = 0;
+			unsigned int matchcount = 0;
+			unsigned int matchmask = 0;
 			for(comp = 0; comp < 3; ++comp) {
-				if (!GET_BIT(dst.WriteMask, comp))
+				if (!GET_BIT(mask, comp))
 					continue;
-				GLuint swz = GET_SWZ(src.Swizzle, comp);
-				if (swz == SWIZZLE_NIL)
+				unsigned int swz = GET_SWZ(src.Swizzle, comp);
+				if (swz == RC_SWIZZLE_UNUSED)
 					continue;
 				if (swz == GET_SWZ(sd->hash, comp)) {
 					/* check if the negate bit of current component
@@ -170,34 +172,35 @@ void r300FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst,
 				best_swizzle = sd;
 				best_matchcount = matchcount;
 				best_matchmask = matchmask;
-				if (matchmask == (dst.WriteMask & WRITEMASK_XYZ))
+				if (matchmask == (mask & RC_MASK_XYZ))
 					break;
 			}
 		}
 
-		struct rc_instruction *inst = rc_insert_new_instruction(s->Compiler, s->IP->Prev);
-		inst->I.Opcode = OPCODE_MOV;
-		inst->I.DstReg = dst;
-		inst->I.DstReg.WriteMask &= (best_matchmask | WRITEMASK_W);
-		inst->I.SrcReg[0] = src;
-		inst->I.SrcReg[0].Negate = (best_matchmask & src.Negate) ? NEGATE_XYZW : NEGATE_NONE;
-		/* Note: We rely on NqSSA/DCE to set unused swizzle components to NIL */
+		if (mask & RC_MASK_W)
+			best_matchmask |= RC_MASK_W;
 
-		dst.WriteMask &= ~inst->I.DstReg.WriteMask;
+		split->Phase[split->NumPhases++] = best_matchmask;
+		mask &= ~best_matchmask;
 	}
 }
 
+struct rc_swizzle_caps r300_swizzle_caps = {
+	.IsNative = r300_swizzle_is_native,
+	.Split = r300_swizzle_split
+};
+
 
 /**
  * Translate an RGB (XYZ) swizzle into the hardware code for the given
  * instruction source.
  */
-GLuint r300FPTranslateRGBSwizzle(GLuint src, GLuint swizzle)
+unsigned int r300FPTranslateRGBSwizzle(unsigned int src, unsigned int swizzle)
 {
 	const struct swizzle_data* sd = lookup_native_swizzle(swizzle);
 
 	if (!sd) {
-		_mesa_printf("Not a native swizzle: %08x\n", swizzle);
+		fprintf(stderr, "Not a native swizzle: %08x\n", swizzle);
 		return 0;
 	}
 
@@ -209,15 +212,15 @@ GLuint r300FPTranslateRGBSwizzle(GLuint src, GLuint swizzle)
  * Translate an Alpha (W) swizzle into the hardware code for the given
  * instruction source.
  */
-GLuint r300FPTranslateAlphaSwizzle(GLuint src, GLuint swizzle)
+unsigned int r300FPTranslateAlphaSwizzle(unsigned int src, unsigned int swizzle)
 {
 	if (swizzle < 3)
 		return swizzle + 3*src;
 
 	switch(swizzle) {
-	case SWIZZLE_W: return R300_ALU_ARGA_SRC0A + src;
-	case SWIZZLE_ONE: return R300_ALU_ARGA_ONE;
-	case SWIZZLE_ZERO: return R300_ALU_ARGA_ZERO;
+	case RC_SWIZZLE_W: return R300_ALU_ARGA_SRC0A + src;
+	case RC_SWIZZLE_ONE: return R300_ALU_ARGA_ONE;
+	case RC_SWIZZLE_ZERO: return R300_ALU_ARGA_ZERO;
 	default: return R300_ALU_ARGA_ONE;
 	}
 }
diff --git a/r300/compiler/r300_fragprog_swizzle.h b/r300/compiler/r300_fragprog_swizzle.h
index 231bf4e..118476a 100644
--- a/r300/compiler/r300_fragprog_swizzle.h
+++ b/r300/compiler/r300_fragprog_swizzle.h
@@ -28,15 +28,11 @@
 #ifndef __R300_FRAGPROG_SWIZZLE_H_
 #define __R300_FRAGPROG_SWIZZLE_H_
 
-#include "main/glheader.h"
-#include "shader/prog_instruction.h"
+#include "radeon_swizzle.h"
 
-struct nqssadce_state;
+extern struct rc_swizzle_caps r300_swizzle_caps;
 
-GLboolean r300FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg);
-void r300FPBuildSwizzle(struct nqssadce_state*, struct prog_dst_register dst, struct prog_src_register src);
-
-GLuint r300FPTranslateRGBSwizzle(GLuint src, GLuint swizzle);
-GLuint r300FPTranslateAlphaSwizzle(GLuint src, GLuint swizzle);
+unsigned int r300FPTranslateRGBSwizzle(unsigned int src, unsigned int swizzle);
+unsigned int r300FPTranslateAlphaSwizzle(unsigned int src, unsigned int swizzle);
 
 #endif /* __R300_FRAGPROG_SWIZZLE_H_ */
diff --git a/r300/compiler/r3xx_fragprog.c b/r300/compiler/r3xx_fragprog.c
index 76c3a7e..5581f25 100644
--- a/r300/compiler/r3xx_fragprog.c
+++ b/r300/compiler/r3xx_fragprog.c
@@ -22,22 +22,21 @@
 
 #include "radeon_compiler.h"
 
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/prog_statevars.h"
+#include <stdio.h>
 
-#include "radeon_nqssadce.h"
+#include "radeon_dataflow.h"
 #include "radeon_program_alu.h"
 #include "r300_fragprog.h"
 #include "r300_fragprog_swizzle.h"
 #include "r500_fragprog.h"
 
 
-static void nqssadce_init(struct nqssadce_state* s)
+static void dataflow_outputs_mark_use(void * userdata, void * data,
+		void (*callback)(void *, unsigned int, unsigned int))
 {
-	struct r300_fragment_program_compiler * c = s->UserData;
-	s->Outputs[c->OutputColor].Sourced = WRITEMASK_XYZW;
-	s->Outputs[c->OutputDepth].Sourced = WRITEMASK_W;
+	struct r300_fragment_program_compiler * c = userdata;
+	callback(data, c->OutputColor, RC_MASK_XYZW);
+	callback(data, c->OutputDepth, RC_MASK_W);
 }
 
 static void rewrite_depth_out(struct r300_fragment_program_compiler * c)
@@ -45,35 +44,35 @@ static void rewrite_depth_out(struct r300_fragment_program_compiler * c)
 	struct rc_instruction *rci;
 
 	for (rci = c->Base.Program.Instructions.Next; rci != &c->Base.Program.Instructions; rci = rci->Next) {
-		struct prog_instruction * inst = &rci->I;
+		struct rc_sub_instruction * inst = &rci->U.I;
 
-		if (inst->DstReg.File != PROGRAM_OUTPUT || inst->DstReg.Index != c->OutputDepth)
+		if (inst->DstReg.File != RC_FILE_OUTPUT || inst->DstReg.Index != c->OutputDepth)
 			continue;
 
-		if (inst->DstReg.WriteMask & WRITEMASK_Z) {
-			inst->DstReg.WriteMask = WRITEMASK_W;
+		if (inst->DstReg.WriteMask & RC_MASK_Z) {
+			inst->DstReg.WriteMask = RC_MASK_W;
 		} else {
 			inst->DstReg.WriteMask = 0;
 			continue;
 		}
 
 		switch (inst->Opcode) {
-			case OPCODE_FRC:
-			case OPCODE_MOV:
-				inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]);
+			case RC_OPCODE_FRC:
+			case RC_OPCODE_MOV:
+				inst->SrcReg[0] = lmul_swizzle(RC_SWIZZLE_ZZZZ, inst->SrcReg[0]);
 				break;
-			case OPCODE_ADD:
-			case OPCODE_MAX:
-			case OPCODE_MIN:
-			case OPCODE_MUL:
-				inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]);
-				inst->SrcReg[1] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[1]);
+			case RC_OPCODE_ADD:
+			case RC_OPCODE_MAX:
+			case RC_OPCODE_MIN:
+			case RC_OPCODE_MUL:
+				inst->SrcReg[0] = lmul_swizzle(RC_SWIZZLE_ZZZZ, inst->SrcReg[0]);
+				inst->SrcReg[1] = lmul_swizzle(RC_SWIZZLE_ZZZZ, inst->SrcReg[1]);
 				break;
-			case OPCODE_CMP:
-			case OPCODE_MAD:
-				inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]);
-				inst->SrcReg[1] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[1]);
-				inst->SrcReg[2] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[2]);
+			case RC_OPCODE_CMP:
+			case RC_OPCODE_MAD:
+				inst->SrcReg[0] = lmul_swizzle(RC_SWIZZLE_ZZZZ, inst->SrcReg[0]);
+				inst->SrcReg[1] = lmul_swizzle(RC_SWIZZLE_ZZZZ, inst->SrcReg[1]);
+				inst->SrcReg[2] = lmul_swizzle(RC_SWIZZLE_ZZZZ, inst->SrcReg[2]);
 				break;
 			default:
 				// Scalar instructions needn't be reswizzled
@@ -89,11 +88,14 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 	if (c->is_r500) {
 		struct radeon_program_transformation transformations[] = {
 			{ &r500_transform_TEX, c },
+			{ &r500_transform_IF, 0 },
 			{ &radeonTransformALU, 0 },
 			{ &radeonTransformDeriv, 0 },
 			{ &radeonTransformTrigScale, 0 }
 		};
-		radeonLocalTransform(&c->Base, 4, transformations);
+		radeonLocalTransform(&c->Base, 5, transformations);
+
+		c->Base.SwizzleCaps = &r500_swizzle_caps;
 	} else {
 		struct radeon_program_transformation transformations[] = {
 			{ &r300_transform_TEX, c },
@@ -101,32 +103,66 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 			{ &radeonTransformTrigSimple, 0 }
 		};
 		radeonLocalTransform(&c->Base, 3, transformations);
+
+		c->Base.SwizzleCaps = &r300_swizzle_caps;
 	}
 
 	if (c->Base.Debug) {
-		_mesa_printf("Fragment Program: After native rewrite:\n");
+		fprintf(stderr, "Fragment Program: After native rewrite:\n");
 		rc_print_program(&c->Base.Program);
 		fflush(stderr);
 	}
 
-	if (c->is_r500) {
-		struct radeon_nqssadce_descr nqssadce = {
-			.Init = &nqssadce_init,
-			.IsNativeSwizzle = &r500FPIsNativeSwizzle,
-			.BuildSwizzle = &r500FPBuildSwizzle
-		};
-		radeonNqssaDce(&c->Base, &nqssadce, c);
-	} else {
-		struct radeon_nqssadce_descr nqssadce = {
-			.Init = &nqssadce_init,
-			.IsNativeSwizzle = &r300FPIsNativeSwizzle,
-			.BuildSwizzle = &r300FPBuildSwizzle
-		};
-		radeonNqssaDce(&c->Base, &nqssadce, c);
+	rc_dataflow_deadcode(&c->Base, &dataflow_outputs_mark_use, c);
+	if (c->Base.Error)
+		return;
+
+	if (c->Base.Debug) {
+		fprintf(stderr, "Fragment Program: After deadcode:\n");
+		rc_print_program(&c->Base.Program);
+		fflush(stderr);
+	}
+
+	rc_dataflow_swizzles(&c->Base);
+	if (c->Base.Error)
+		return;
+
+	if (c->Base.Debug) {
+		fprintf(stderr, "Compiler: after dataflow passes:\n");
+		rc_print_program(&c->Base.Program);
+		fflush(stderr);
+	}
+
+	rc_pair_translate(c);
+	if (c->Base.Error)
+		return;
+
+	if (c->Base.Debug) {
+		fprintf(stderr, "Compiler: after pair translate:\n");
+		rc_print_program(&c->Base.Program);
+		fflush(stderr);
 	}
 
+	rc_pair_schedule(c);
+	if (c->Base.Error)
+		return;
+
+	if (c->Base.Debug) {
+		fprintf(stderr, "Compiler: after pair scheduling:\n");
+		rc_print_program(&c->Base.Program);
+		fflush(stderr);
+	}
+
+	if (c->is_r500)
+		rc_pair_regalloc(c, 128);
+	else
+		rc_pair_regalloc(c, R300_PFS_NUM_TEMP_REGS);
+
+	if (c->Base.Error)
+		return;
+
 	if (c->Base.Debug) {
-		_mesa_printf("Compiler: after NqSSA-DCE:\n");
+		fprintf(stderr, "Compiler: after pair register allocation:\n");
 		rc_print_program(&c->Base.Program);
 		fflush(stderr);
 	}
diff --git a/r300/compiler/r3xx_vertprog.c b/r300/compiler/r3xx_vertprog.c
index dad27fc..1b2cb8d 100644
--- a/r300/compiler/r3xx_vertprog.c
+++ b/r300/compiler/r3xx_vertprog.c
@@ -22,13 +22,13 @@
 
 #include "radeon_compiler.h"
 
+#include <stdio.h>
+
 #include "../r300_reg.h"
 
-#include "radeon_nqssadce.h"
-#include "radeon_program.h"
+#include "radeon_dataflow.h"
 #include "radeon_program_alu.h"
-
-#include "shader/prog_print.h"
+#include "radeon_swizzle.h"
 
 
 /*
@@ -42,104 +42,83 @@
 			   t_swizzle(y),	\
 			   t_swizzle(y),	\
 			   t_src_class(vpi->SrcReg[x].File), \
-			   NEGATE_NONE) | (vpi->SrcReg[x].RelAddr << 4))
+			   RC_MASK_NONE) | (vpi->SrcReg[x].RelAddr << 4))
 
 
-static unsigned long t_dst_mask(GLuint mask)
+static unsigned long t_dst_mask(unsigned int mask)
 {
-	/* WRITEMASK_* is equivalent to VSF_FLAG_* */
-	return mask & WRITEMASK_XYZW;
+	/* RC_MASK_* is equivalent to VSF_FLAG_* */
+	return mask & RC_MASK_XYZW;
 }
 
-static unsigned long t_dst_class(gl_register_file file)
+static unsigned long t_dst_class(rc_register_file file)
 {
-
 	switch (file) {
-	case PROGRAM_TEMPORARY:
+	default:
+		fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
+		/* fall-through */
+	case RC_FILE_TEMPORARY:
 		return PVS_DST_REG_TEMPORARY;
-	case PROGRAM_OUTPUT:
+	case RC_FILE_OUTPUT:
 		return PVS_DST_REG_OUT;
-	case PROGRAM_ADDRESS:
+	case RC_FILE_ADDRESS:
 		return PVS_DST_REG_A0;
-		/*
-		   case PROGRAM_INPUT:
-		   case PROGRAM_LOCAL_PARAM:
-		   case PROGRAM_ENV_PARAM:
-		   case PROGRAM_NAMED_PARAM:
-		   case PROGRAM_STATE_VAR:
-		   case PROGRAM_WRITE_ONLY:
-		   case PROGRAM_ADDRESS:
-		 */
-	default:
-		fprintf(stderr, "problem in %s", __FUNCTION__);
-		_mesa_exit(-1);
-		return -1;
 	}
 }
 
 static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
-				 struct prog_dst_register *dst)
+				 struct rc_dst_register *dst)
 {
-	if (dst->File == PROGRAM_OUTPUT)
+	if (dst->File == RC_FILE_OUTPUT)
 		return vp->outputs[dst->Index];
 
 	return dst->Index;
 }
 
-static unsigned long t_src_class(gl_register_file file)
+static unsigned long t_src_class(rc_register_file file)
 {
 	switch (file) {
-	case PROGRAM_BUILTIN:
-	case PROGRAM_TEMPORARY:
+	default:
+		fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
+		/* fall-through */
+	case RC_FILE_NONE:
+	case RC_FILE_TEMPORARY:
 		return PVS_SRC_REG_TEMPORARY;
-	case PROGRAM_INPUT:
+	case RC_FILE_INPUT:
 		return PVS_SRC_REG_INPUT;
-	case PROGRAM_LOCAL_PARAM:
-	case PROGRAM_ENV_PARAM:
-	case PROGRAM_NAMED_PARAM:
-	case PROGRAM_CONSTANT:
-	case PROGRAM_STATE_VAR:
+	case RC_FILE_CONSTANT:
 		return PVS_SRC_REG_CONSTANT;
-		/*
-		   case PROGRAM_OUTPUT:
-		   case PROGRAM_WRITE_ONLY:
-		   case PROGRAM_ADDRESS:
-		 */
-	default:
-		fprintf(stderr, "problem in %s", __FUNCTION__);
-		_mesa_exit(-1);
-		return -1;
 	}
 }
 
-static GLboolean t_src_conflict(struct prog_src_register a, struct prog_src_register b)
+static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
 {
 	unsigned long aclass = t_src_class(a.File);
 	unsigned long bclass = t_src_class(b.File);
 
 	if (aclass != bclass)
-		return GL_FALSE;
+		return 0;
 	if (aclass == PVS_SRC_REG_TEMPORARY)
-		return GL_FALSE;
+		return 0;
 
 	if (a.RelAddr || b.RelAddr)
-		return GL_TRUE;
+		return 1;
 	if (a.Index != b.Index)
-		return GL_TRUE;
+		return 1;
 
-	return GL_FALSE;
+	return 0;
 }
 
-static INLINE unsigned long t_swizzle(GLubyte swizzle)
+static inline unsigned long t_swizzle(unsigned int swizzle)
 {
-	/* this is in fact a NOP as the Mesa SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */
+	/* this is in fact a NOP as the Mesa RC_SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */
 	return swizzle;
 }
 
 static unsigned long t_src_index(struct r300_vertex_program_code *vp,
-				 struct prog_src_register *src)
+				 struct rc_src_register *src)
 {
-	if (src->File == PROGRAM_INPUT) {
+	if (src->File == RC_FILE_INPUT) {
 		assert(vp->inputs[src->Index] != -1);
 		return vp->inputs[src->Index];
 	} else {
@@ -155,9 +134,9 @@ static unsigned long t_src_index(struct r300_vertex_program_code *vp,
 /* these two functions should probably be merged... */
 
 static unsigned long t_src(struct r300_vertex_program_code *vp,
-			   struct prog_src_register *src)
+			   struct rc_src_register *src)
 {
-	/* src->Negate uses the NEGATE_ flags from program_instruction.h,
+	/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
 	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
 	 */
 	return PVS_SRC_OPERAND(t_src_index(vp, src),
@@ -170,9 +149,9 @@ static unsigned long t_src(struct r300_vertex_program_code *vp,
 }
 
 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
-				  struct prog_src_register *src)
+				  struct rc_src_register *src)
 {
-	/* src->Negate uses the NEGATE_ flags from program_instruction.h,
+	/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
 	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
 	 */
 	return PVS_SRC_OPERAND(t_src_index(vp, src),
@@ -181,79 +160,79 @@ static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
 			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
 			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
 			       t_src_class(src->File),
-			       src->Negate ? NEGATE_XYZW : NEGATE_NONE) |
+			       src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
 	    (src->RelAddr << 4);
 }
 
-static GLboolean valid_dst(struct r300_vertex_program_code *vp,
-			   struct prog_dst_register *dst)
+static int valid_dst(struct r300_vertex_program_code *vp,
+			   struct rc_dst_register *dst)
 {
-	if (dst->File == PROGRAM_OUTPUT && vp->outputs[dst->Index] == -1) {
-		return GL_FALSE;
-	} else if (dst->File == PROGRAM_ADDRESS) {
+	if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
+		return 0;
+	} else if (dst->File == RC_FILE_ADDRESS) {
 		assert(dst->Index == 0);
 	}
 
-	return GL_TRUE;
+	return 1;
 }
 
 static void ei_vector1(struct r300_vertex_program_code *vp,
-				GLuint hw_opcode,
-				struct prog_instruction *vpi,
-				GLuint * inst)
+				unsigned int hw_opcode,
+				struct rc_sub_instruction *vpi,
+				unsigned int * inst)
 {
 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
-				     GL_FALSE,
-				     GL_FALSE,
+				     0,
+				     0,
 				     t_dst_index(vp, &vpi->DstReg),
 				     t_dst_mask(vpi->DstReg.WriteMask),
 				     t_dst_class(vpi->DstReg.File));
 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
-	inst[2] = __CONST(0, SWIZZLE_ZERO);
-	inst[3] = __CONST(0, SWIZZLE_ZERO);
+	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
+	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
 }
 
 static void ei_vector2(struct r300_vertex_program_code *vp,
-				GLuint hw_opcode,
-				struct prog_instruction *vpi,
-				GLuint * inst)
+				unsigned int hw_opcode,
+				struct rc_sub_instruction *vpi,
+				unsigned int * inst)
 {
 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
-				     GL_FALSE,
-				     GL_FALSE,
+				     0,
+				     0,
 				     t_dst_index(vp, &vpi->DstReg),
 				     t_dst_mask(vpi->DstReg.WriteMask),
 				     t_dst_class(vpi->DstReg.File));
 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
 	inst[2] = t_src(vp, &vpi->SrcReg[1]);
-	inst[3] = __CONST(1, SWIZZLE_ZERO);
+	inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
 }
 
 static void ei_math1(struct r300_vertex_program_code *vp,
-				GLuint hw_opcode,
-				struct prog_instruction *vpi,
-				GLuint * inst)
+				unsigned int hw_opcode,
+				struct rc_sub_instruction *vpi,
+				unsigned int * inst)
 {
 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
-				     GL_TRUE,
-				     GL_FALSE,
+				     1,
+				     0,
 				     t_dst_index(vp, &vpi->DstReg),
 				     t_dst_mask(vpi->DstReg.WriteMask),
 				     t_dst_class(vpi->DstReg.File));
 	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
-	inst[2] = __CONST(0, SWIZZLE_ZERO);
-	inst[3] = __CONST(0, SWIZZLE_ZERO);
+	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
+	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
 }
 
 static void ei_lit(struct r300_vertex_program_code *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst)
+				      struct rc_sub_instruction *vpi,
+				      unsigned int * inst)
 {
 	//LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
 
 	inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
-				     GL_TRUE,
-				     GL_FALSE,
+				     1,
+				     0,
 				     t_dst_index(vp, &vpi->DstReg),
 				     t_dst_mask(vpi->DstReg.WriteMask),
 				     t_dst_class(vpi->DstReg.File));
@@ -263,27 +242,27 @@ static void ei_lit(struct r300_vertex_program_code *vp,
 				  PVS_SRC_SELECT_FORCE_0,	// Z
 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
 				  t_src_class(vpi->SrcReg[0].File),
-				  vpi->SrcReg[0].Negate ? NEGATE_XYZW : NEGATE_NONE) |
+				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
 	    (vpi->SrcReg[0].RelAddr << 4);
 	inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
 				  PVS_SRC_SELECT_FORCE_0,	// Z
 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
 				  t_src_class(vpi->SrcReg[0].File),
-				  vpi->SrcReg[0].Negate ? NEGATE_XYZW : NEGATE_NONE) |
+				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
 	    (vpi->SrcReg[0].RelAddr << 4);
 	inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
 				  PVS_SRC_SELECT_FORCE_0,	// Z
 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
 				  t_src_class(vpi->SrcReg[0].File),
-				  vpi->SrcReg[0].Negate ? NEGATE_XYZW : NEGATE_NONE) |
+				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
 	    (vpi->SrcReg[0].RelAddr << 4);
 }
 
 static void ei_mad(struct r300_vertex_program_code *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst)
+				      struct rc_sub_instruction *vpi,
+				      unsigned int * inst)
 {
 	/* Remarks about hardware limitations of MAD
 	 * (please preserve this comment, as this information is _NOT_
@@ -311,22 +290,22 @@ static void ei_mad(struct r300_vertex_program_code *vp,
 	 * according to AMD docs, this should improve performance by one clock
 	 * as a nice side bonus.
 	 */
-	if (vpi->SrcReg[0].File == PROGRAM_TEMPORARY &&
-	    vpi->SrcReg[1].File == PROGRAM_TEMPORARY &&
-	    vpi->SrcReg[2].File == PROGRAM_TEMPORARY &&
+	if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
+	    vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
+	    vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
 	    vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
 	    vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
 	    vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
 		inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
-				GL_FALSE,
-				GL_TRUE,
+				0,
+				1,
 				t_dst_index(vp, &vpi->DstReg),
 				t_dst_mask(vpi->DstReg.WriteMask),
 				t_dst_class(vpi->DstReg.File));
 	} else {
 		inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
-				GL_FALSE,
-				GL_FALSE,
+				0,
+				0,
 				t_dst_index(vp, &vpi->DstReg),
 				t_dst_mask(vpi->DstReg.WriteMask),
 				t_dst_class(vpi->DstReg.File));
@@ -337,17 +316,17 @@ static void ei_mad(struct r300_vertex_program_code *vp,
 }
 
 static void ei_pow(struct r300_vertex_program_code *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst)
+				      struct rc_sub_instruction *vpi,
+				      unsigned int * inst)
 {
 	inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
-				     GL_TRUE,
-				     GL_FALSE,
+				     1,
+				     0,
 				     t_dst_index(vp, &vpi->DstReg),
 				     t_dst_mask(vpi->DstReg.WriteMask),
 				     t_dst_class(vpi->DstReg.File));
 	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
-	inst[2] = __CONST(0, SWIZZLE_ZERO);
+	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
 	inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
 }
 
@@ -362,8 +341,8 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi
 	compiler->SetHwInputOutput(compiler);
 
 	for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
-		struct prog_instruction *vpi = &rci->I;
-		GLuint *inst = compiler->code->body.d + compiler->code->length;
+		struct rc_sub_instruction *vpi = &rci->U.I;
+		unsigned int *inst = compiler->code->body.d + compiler->code->length;
 
 		/* Skip instructions writing to non-existing destination */
 		if (!valid_dst(compiler->code, &vpi->DstReg))
@@ -375,26 +354,26 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi
 		}
 
 		switch (vpi->Opcode) {
-		case OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
-		case OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
-		case OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
-		case OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
-		case OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
-		case OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
-		case OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
-		case OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
-		case OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
-		case OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
-		case OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
-		case OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
-		case OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
-		case OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
-		case OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
-		case OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
-		case OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
-		case OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
-		case OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
-		case OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
+		case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
+		case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
+		case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
+		case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
+		case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
+		case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
+		case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
+		case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
+		case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
+		case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
+		case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
+		case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
+		case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
+		case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
+		case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
+		case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
+		case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
+		case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
+		case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
+		case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
 		default:
 			rc_error(&compiler->Base, "Unknown opcode %i\n", vpi->Opcode);
 			return;
@@ -408,38 +387,37 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi
 }
 
 struct temporary_allocation {
-	GLuint Allocated:1;
-	GLuint HwTemp:15;
+	unsigned int Allocated:1;
+	unsigned int HwTemp:15;
 	struct rc_instruction * LastRead;
 };
 
 static void allocate_temporary_registers(struct r300_vertex_program_compiler * compiler)
 {
 	struct rc_instruction *inst;
-	GLuint num_orig_temps = 0;
-	GLboolean hwtemps[VSF_MAX_FRAGMENT_TEMPS];
+	unsigned int num_orig_temps = 0;
+	char hwtemps[VSF_MAX_FRAGMENT_TEMPS];
 	struct temporary_allocation * ta;
-	GLuint i, j;
+	unsigned int i, j;
 
 	compiler->code->num_temporaries = 0;
 	memset(hwtemps, 0, sizeof(hwtemps));
 
 	/* Pass 1: Count original temporaries and allocate structures */
 	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
-		GLuint numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode);
-		GLuint numdsts = _mesa_num_inst_dst_regs(inst->I.Opcode);
+		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
 
-		for (i = 0; i < numsrcs; ++i) {
-			if (inst->I.SrcReg[i].File == PROGRAM_TEMPORARY) {
-				if (inst->I.SrcReg[i].Index >= num_orig_temps)
-					num_orig_temps = inst->I.SrcReg[i].Index + 1;
+		for (i = 0; i < opcode->NumSrcRegs; ++i) {
+			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
+				if (inst->U.I.SrcReg[i].Index >= num_orig_temps)
+					num_orig_temps = inst->U.I.SrcReg[i].Index + 1;
 			}
 		}
 
-		if (numdsts) {
-			if (inst->I.DstReg.File == PROGRAM_TEMPORARY) {
-				if (inst->I.DstReg.Index >= num_orig_temps)
-					num_orig_temps = inst->I.DstReg.Index + 1;
+		if (opcode->HasDstReg) {
+			if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
+				if (inst->U.I.DstReg.Index >= num_orig_temps)
+					num_orig_temps = inst->U.I.DstReg.Index + 1;
 			}
 		}
 	}
@@ -450,32 +428,31 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
 
 	/* Pass 2: Determine original temporary lifetimes */
 	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
-		GLuint numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode);
+		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
 
-		for (i = 0; i < numsrcs; ++i) {
-			if (inst->I.SrcReg[i].File == PROGRAM_TEMPORARY)
-				ta[inst->I.SrcReg[i].Index].LastRead = inst;
+		for (i = 0; i < opcode->NumSrcRegs; ++i) {
+			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY)
+				ta[inst->U.I.SrcReg[i].Index].LastRead = inst;
 		}
 	}
 
 	/* Pass 3: Register allocation */
 	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
-		GLuint numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode);
-		GLuint numdsts = _mesa_num_inst_dst_regs(inst->I.Opcode);
+		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
 
-		for (i = 0; i < numsrcs; ++i) {
-			if (inst->I.SrcReg[i].File == PROGRAM_TEMPORARY) {
-				GLuint orig = inst->I.SrcReg[i].Index;
-				inst->I.SrcReg[i].Index = ta[orig].HwTemp;
+		for (i = 0; i < opcode->NumSrcRegs; ++i) {
+			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
+				unsigned int orig = inst->U.I.SrcReg[i].Index;
+				inst->U.I.SrcReg[i].Index = ta[orig].HwTemp;
 
 				if (ta[orig].Allocated && inst == ta[orig].LastRead)
-					hwtemps[ta[orig].HwTemp] = GL_FALSE;
+					hwtemps[ta[orig].HwTemp] = 0;
 			}
 		}
 
-		if (numdsts) {
-			if (inst->I.DstReg.File == PROGRAM_TEMPORARY) {
-				GLuint orig = inst->I.DstReg.Index;
+		if (opcode->HasDstReg) {
+			if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
+				unsigned int orig = inst->U.I.DstReg.Index;
 
 				if (!ta[orig].Allocated) {
 					for(j = 0; j < VSF_MAX_FRAGMENT_TEMPS; ++j) {
@@ -485,16 +462,16 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
 					if (j >= VSF_MAX_FRAGMENT_TEMPS) {
 						fprintf(stderr, "Out of hw temporaries\n");
 					} else {
-						ta[orig].Allocated = GL_TRUE;
+						ta[orig].Allocated = 1;
 						ta[orig].HwTemp = j;
-						hwtemps[j] = GL_TRUE;
+						hwtemps[j] = 1;
 
 						if (j >= compiler->code->num_temporaries)
 							compiler->code->num_temporaries = j + 1;
 					}
 				}
 
-				inst->I.DstReg.Index = ta[orig].HwTemp;
+				inst->U.I.DstReg.Index = ta[orig].HwTemp;
 			}
 		}
 	}
@@ -505,45 +482,45 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
  * Vertex engine cannot read two inputs or two constants at the same time.
  * Introduce intermediate MOVs to temporary registers to account for this.
  */
-static GLboolean transform_source_conflicts(
+static int transform_source_conflicts(
 	struct radeon_compiler *c,
 	struct rc_instruction* inst,
 	void* unused)
 {
-	GLuint num_operands = _mesa_num_inst_src_regs(inst->I.Opcode);
+	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
 
-	if (num_operands == 3) {
-		if (t_src_conflict(inst->I.SrcReg[1], inst->I.SrcReg[2])
-		    || t_src_conflict(inst->I.SrcReg[0], inst->I.SrcReg[2])) {
+	if (opcode->NumSrcRegs == 3) {
+		if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
+		    || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
 			int tmpreg = rc_find_free_temporary(c);
 			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
-			inst_mov->I.Opcode = OPCODE_MOV;
-			inst_mov->I.DstReg.File = PROGRAM_TEMPORARY;
-			inst_mov->I.DstReg.Index = tmpreg;
-			inst_mov->I.SrcReg[0] = inst->I.SrcReg[2];
-
-			reset_srcreg(&inst->I.SrcReg[2]);
-			inst->I.SrcReg[2].File = PROGRAM_TEMPORARY;
-			inst->I.SrcReg[2].Index = tmpreg;
+			inst_mov->U.I.Opcode = RC_OPCODE_MOV;
+			inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
+			inst_mov->U.I.DstReg.Index = tmpreg;
+			inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
+
+			reset_srcreg(&inst->U.I.SrcReg[2]);
+			inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
+			inst->U.I.SrcReg[2].Index = tmpreg;
 		}
 	}
 
-	if (num_operands >= 2) {
-		if (t_src_conflict(inst->I.SrcReg[1], inst->I.SrcReg[0])) {
+	if (opcode->NumSrcRegs >= 2) {
+		if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
 			int tmpreg = rc_find_free_temporary(c);
 			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
-			inst_mov->I.Opcode = OPCODE_MOV;
-			inst_mov->I.DstReg.File = PROGRAM_TEMPORARY;
-			inst_mov->I.DstReg.Index = tmpreg;
-			inst_mov->I.SrcReg[0] = inst->I.SrcReg[1];
-
-			reset_srcreg(&inst->I.SrcReg[1]);
-			inst->I.SrcReg[1].File = PROGRAM_TEMPORARY;
-			inst->I.SrcReg[1].Index = tmpreg;
+			inst_mov->U.I.Opcode = RC_OPCODE_MOV;
+			inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
+			inst_mov->U.I.DstReg.Index = tmpreg;
+			inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
+
+			reset_srcreg(&inst->U.I.SrcReg[1]);
+			inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
+			inst->U.I.SrcReg[1].Index = tmpreg;
 		}
 	}
 
-	return GL_TRUE;
+	return 1;
 }
 
 static void addArtificialOutputs(struct r300_vertex_program_compiler * compiler)
@@ -554,44 +531,52 @@ static void addArtificialOutputs(struct r300_vertex_program_compiler * compiler)
 		if ((compiler->RequiredOutputs & (1 << i)) &&
 		    !(compiler->Base.Program.OutputsWritten & (1 << i))) {
 			struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
-			inst->I.Opcode = OPCODE_MOV;
+			inst->U.I.Opcode = RC_OPCODE_MOV;
 
-			inst->I.DstReg.File = PROGRAM_OUTPUT;
-			inst->I.DstReg.Index = i;
-			inst->I.DstReg.WriteMask = WRITEMASK_XYZW;
+			inst->U.I.DstReg.File = RC_FILE_OUTPUT;
+			inst->U.I.DstReg.Index = i;
+			inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
 
-			inst->I.SrcReg[0].File = PROGRAM_CONSTANT;
-			inst->I.SrcReg[0].Index = 0;
-			inst->I.SrcReg[0].Swizzle = SWIZZLE_XYZW;
+			inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
+			inst->U.I.SrcReg[0].Index = 0;
+			inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
 
 			compiler->Base.Program.OutputsWritten |= 1 << i;
 		}
 	}
 }
 
-static void nqssadceInit(struct nqssadce_state* s)
+static void dataflow_outputs_mark_used(void * userdata, void * data,
+		void (*callback)(void *, unsigned int, unsigned int))
 {
-	struct r300_vertex_program_compiler * compiler = s->UserData;
+	struct r300_vertex_program_compiler * c = userdata;
 	int i;
 
-	for(i = 0; i < VERT_RESULT_MAX; ++i) {
-		if (compiler->RequiredOutputs & (1 << i))
-			s->Outputs[i].Sourced = WRITEMASK_XYZW;
+	for(i = 0; i < 32; ++i) {
+		if (c->RequiredOutputs & (1 << i))
+			callback(data, i, RC_MASK_XYZW);
 	}
 }
 
-static GLboolean swizzleIsNative(GLuint opcode, struct prog_src_register reg)
+static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
 {
 	(void) opcode;
 	(void) reg;
 
-	return GL_TRUE;
+	return 1;
 }
 
 
+static struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
+	.IsNative = &swizzle_is_native,
+	.Split = 0 /* should never be called */
+};
+
 
 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 {
+	compiler->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
+
 	addArtificialOutputs(compiler);
 
 	{
@@ -624,22 +609,22 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 		fflush(stderr);
 	}
 
-	{
-		struct radeon_nqssadce_descr nqssadce = {
-			.Init = &nqssadceInit,
-			.IsNativeSwizzle = &swizzleIsNative,
-			.BuildSwizzle = NULL
-		};
-		radeonNqssaDce(&compiler->Base, &nqssadce, compiler);
+	rc_dataflow_deadcode(&compiler->Base, &dataflow_outputs_mark_used, compiler);
+
+	if (compiler->Base.Debug) {
+		fprintf(stderr, "Vertex program after deadcode:\n");
+		rc_print_program(&compiler->Base.Program);
+		fflush(stderr);
+	}
 
-		/* We need this step for reusing temporary registers */
-		allocate_temporary_registers(compiler);
+	rc_dataflow_swizzles(&compiler->Base);
 
-		if (compiler->Base.Debug) {
-			fprintf(stderr, "Vertex program after NQSSADCE:\n");
-			rc_print_program(&compiler->Base.Program);
-			fflush(stderr);
-		}
+	allocate_temporary_registers(compiler);
+
+	if (compiler->Base.Debug) {
+		fprintf(stderr, "Vertex program after dataflow:\n");
+		rc_print_program(&compiler->Base.Program);
+		fflush(stderr);
 	}
 
 	translate_vertex_program(compiler);
diff --git a/r300/compiler/r3xx_vertprog_dump.c b/r300/compiler/r3xx_vertprog_dump.c
index 980ef3e..66f9b05 100644
--- a/r300/compiler/r3xx_vertprog_dump.c
+++ b/r300/compiler/r3xx_vertprog_dump.c
@@ -146,7 +146,7 @@ static void r300_vs_op_dump(uint32_t op)
 static void r300_vs_src_dump(uint32_t src)
 {
 	fprintf(stderr, " reg: %d%s swiz: %s%s/%s%s/%s%s/%s%s\n",
-			(src >> 5) & 0x7f, r300_vs_src_debug[src & 0x3],
+			(src >> 5) & 0xff, r300_vs_src_debug[src & 0x3],
 			src & (1 << 25) ? "-" : " ",
 			r300_vs_swiz_debug[(src >> 13) & 0x7],
 			src & (1 << 26) ? "-" : " ",
diff --git a/r300/compiler/r500_fragprog.c b/r300/compiler/r500_fragprog.c
index 7e2faed..d87acec 100644
--- a/r300/compiler/r500_fragprog.c
+++ b/r300/compiler/r500_fragprog.c
@@ -27,15 +27,17 @@
 
 #include "r500_fragprog.h"
 
+#include <stdio.h>
+
 #include "../r300_reg.h"
 
-static struct prog_src_register shadow_ambient(struct radeon_compiler * c, int tmu)
+static struct rc_src_register shadow_ambient(struct radeon_compiler * c, int tmu)
 {
-	struct prog_src_register reg = { 0, };
+	struct rc_src_register reg = { 0, };
 
-	reg.File = PROGRAM_STATE_VAR;
+	reg.File = RC_FILE_CONSTANT;
 	reg.Index = rc_constants_add_state(&c->Program.Constants, RC_STATE_SHADOW_AMBIENT, tmu);
-	reg.Swizzle = SWIZZLE_WWWW;
+	reg.Swizzle = RC_SWIZZLE_WWWW;
 	return reg;
 }
 
@@ -44,7 +46,7 @@ static struct prog_src_register shadow_ambient(struct radeon_compiler * c, int t
  *  - implement texture compare (shadow extensions)
  *  - extract non-native source / destination operands
  */
-GLboolean r500_transform_TEX(
+int r500_transform_TEX(
 	struct radeon_compiler * c,
 	struct rc_instruction * inst,
 	void* data)
@@ -52,77 +54,77 @@ GLboolean r500_transform_TEX(
 	struct r300_fragment_program_compiler *compiler =
 		(struct r300_fragment_program_compiler*)data;
 
-	if (inst->I.Opcode != OPCODE_TEX &&
-	    inst->I.Opcode != OPCODE_TXB &&
-	    inst->I.Opcode != OPCODE_TXP &&
-	    inst->I.Opcode != OPCODE_KIL)
-		return GL_FALSE;
+	if (inst->U.I.Opcode != RC_OPCODE_TEX &&
+	    inst->U.I.Opcode != RC_OPCODE_TXB &&
+	    inst->U.I.Opcode != RC_OPCODE_TXP &&
+	    inst->U.I.Opcode != RC_OPCODE_KIL)
+		return 0;
 
 	/* ARB_shadow & EXT_shadow_funcs */
-	if (inst->I.Opcode != OPCODE_KIL &&
-	    c->Program.ShadowSamplers & (1 << inst->I.TexSrcUnit)) {
-		GLuint comparefunc = GL_NEVER + compiler->state.unit[inst->I.TexSrcUnit].texture_compare_func;
+	if (inst->U.I.Opcode != RC_OPCODE_KIL &&
+	    c->Program.ShadowSamplers & (1 << inst->U.I.TexSrcUnit)) {
+		rc_compare_func comparefunc = compiler->state.unit[inst->U.I.TexSrcUnit].texture_compare_func;
 
-		if (comparefunc == GL_NEVER || comparefunc == GL_ALWAYS) {
-			inst->I.Opcode = OPCODE_MOV;
+		if (comparefunc == RC_COMPARE_FUNC_NEVER || comparefunc == RC_COMPARE_FUNC_ALWAYS) {
+			inst->U.I.Opcode = RC_OPCODE_MOV;
 
-			if (comparefunc == GL_ALWAYS) {
-				inst->I.SrcReg[0].File = PROGRAM_BUILTIN;
-				inst->I.SrcReg[0].Swizzle = SWIZZLE_1111;
+			if (comparefunc == RC_COMPARE_FUNC_ALWAYS) {
+				inst->U.I.SrcReg[0].File = RC_FILE_NONE;
+				inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_1111;
 			} else {
-				inst->I.SrcReg[0] = shadow_ambient(c, inst->I.TexSrcUnit);
+				inst->U.I.SrcReg[0] = shadow_ambient(c, inst->U.I.TexSrcUnit);
 			}
 
-			return GL_TRUE;
+			return 1;
 		} else {
-			GLuint comparefunc = GL_NEVER + compiler->state.unit[inst->I.TexSrcUnit].texture_compare_func;
-			GLuint depthmode = compiler->state.unit[inst->I.TexSrcUnit].depth_texture_mode;
+			rc_compare_func comparefunc = compiler->state.unit[inst->U.I.TexSrcUnit].texture_compare_func;
+			unsigned int depthmode = compiler->state.unit[inst->U.I.TexSrcUnit].depth_texture_mode;
 			struct rc_instruction * inst_rcp = rc_insert_new_instruction(c, inst);
 			struct rc_instruction * inst_mad = rc_insert_new_instruction(c, inst_rcp);
 			struct rc_instruction * inst_cmp = rc_insert_new_instruction(c, inst_mad);
 			int pass, fail;
 
-			inst_rcp->I.Opcode = OPCODE_RCP;
-			inst_rcp->I.DstReg.File = PROGRAM_TEMPORARY;
-			inst_rcp->I.DstReg.Index = rc_find_free_temporary(c);
-			inst_rcp->I.DstReg.WriteMask = WRITEMASK_W;
-			inst_rcp->I.SrcReg[0] = inst->I.SrcReg[0];
-			inst_rcp->I.SrcReg[0].Swizzle = SWIZZLE_WWWW;
-
-			inst_cmp->I.DstReg = inst->I.DstReg;
-			inst->I.DstReg.File = PROGRAM_TEMPORARY;
-			inst->I.DstReg.Index = rc_find_free_temporary(c);
-			inst->I.DstReg.WriteMask = WRITEMASK_XYZW;
-
-			inst_mad->I.Opcode = OPCODE_MAD;
-			inst_mad->I.DstReg.File = PROGRAM_TEMPORARY;
-			inst_mad->I.DstReg.Index = rc_find_free_temporary(c);
-			inst_mad->I.SrcReg[0] = inst->I.SrcReg[0];
-			inst_mad->I.SrcReg[0].Swizzle = SWIZZLE_ZZZZ;
-			inst_mad->I.SrcReg[1].File = PROGRAM_TEMPORARY;
-			inst_mad->I.SrcReg[1].Index = inst_rcp->I.DstReg.Index;
-			inst_mad->I.SrcReg[1].Swizzle = SWIZZLE_WWWW;
-			inst_mad->I.SrcReg[2].File = PROGRAM_TEMPORARY;
-			inst_mad->I.SrcReg[2].Index = inst->I.DstReg.Index;
+			inst_rcp->U.I.Opcode = RC_OPCODE_RCP;
+			inst_rcp->U.I.DstReg.File = RC_FILE_TEMPORARY;
+			inst_rcp->U.I.DstReg.Index = rc_find_free_temporary(c);
+			inst_rcp->U.I.DstReg.WriteMask = RC_MASK_W;
+			inst_rcp->U.I.SrcReg[0] = inst->U.I.SrcReg[0];
+			inst_rcp->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_WWWW;
+
+			inst_cmp->U.I.DstReg = inst->U.I.DstReg;
+			inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
+			inst->U.I.DstReg.Index = rc_find_free_temporary(c);
+			inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
+
+			inst_mad->U.I.Opcode = RC_OPCODE_MAD;
+			inst_mad->U.I.DstReg.File = RC_FILE_TEMPORARY;
+			inst_mad->U.I.DstReg.Index = rc_find_free_temporary(c);
+			inst_mad->U.I.SrcReg[0] = inst->U.I.SrcReg[0];
+			inst_mad->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_ZZZZ;
+			inst_mad->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
+			inst_mad->U.I.SrcReg[1].Index = inst_rcp->U.I.DstReg.Index;
+			inst_mad->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_WWWW;
+			inst_mad->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
+			inst_mad->U.I.SrcReg[2].Index = inst->U.I.DstReg.Index;
 			if (depthmode == 0) /* GL_LUMINANCE */
-				inst_mad->I.SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z);
+				inst_mad->U.I.SrcReg[2].Swizzle = RC_MAKE_SWIZZLE(RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_Z);
 			else if (depthmode == 2) /* GL_ALPHA */
-				inst_mad->I.SrcReg[2].Swizzle = SWIZZLE_WWWW;
+				inst_mad->U.I.SrcReg[2].Swizzle = RC_SWIZZLE_WWWW;
 
 			/* Recall that SrcReg[0] is tex, SrcReg[2] is r and:
 			 *   r  < tex  <=>      -tex+r < 0
 			 *   r >= tex  <=> not (-tex+r < 0 */
-			if (comparefunc == GL_LESS || comparefunc == GL_GEQUAL)
-				inst_mad->I.SrcReg[2].Negate = inst_mad->I.SrcReg[2].Negate ^ NEGATE_XYZW;
+			if (comparefunc == RC_COMPARE_FUNC_LESS || comparefunc == RC_COMPARE_FUNC_GEQUAL)
+				inst_mad->U.I.SrcReg[2].Negate = inst_mad->U.I.SrcReg[2].Negate ^ RC_MASK_XYZW;
 			else
-				inst_mad->I.SrcReg[0].Negate = inst_mad->I.SrcReg[0].Negate ^ NEGATE_XYZW;
+				inst_mad->U.I.SrcReg[0].Negate = inst_mad->U.I.SrcReg[0].Negate ^ RC_MASK_XYZW;
 
-			inst_cmp->I.Opcode = OPCODE_CMP;
+			inst_cmp->U.I.Opcode = RC_OPCODE_CMP;
 			/* DstReg has been filled out above */
-			inst_cmp->I.SrcReg[0].File = PROGRAM_TEMPORARY;
-			inst_cmp->I.SrcReg[0].Index = inst_mad->I.DstReg.Index;
+			inst_cmp->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
+			inst_cmp->U.I.SrcReg[0].Index = inst_mad->U.I.DstReg.Index;
 
-			if (comparefunc == GL_LESS || comparefunc == GL_GREATER) {
+			if (comparefunc == RC_COMPARE_FUNC_LESS || comparefunc == RC_COMPARE_FUNC_GREATER) {
 				pass = 1;
 				fail = 2;
 			} else {
@@ -130,131 +132,161 @@ GLboolean r500_transform_TEX(
 				fail = 1;
 			}
 
-			inst_cmp->I.SrcReg[pass].File = PROGRAM_BUILTIN;
-			inst_cmp->I.SrcReg[pass].Swizzle = SWIZZLE_1111;
-			inst_cmp->I.SrcReg[fail] = shadow_ambient(c, inst->I.TexSrcUnit);
+			inst_cmp->U.I.SrcReg[pass].File = RC_FILE_NONE;
+			inst_cmp->U.I.SrcReg[pass].Swizzle = RC_SWIZZLE_1111;
+			inst_cmp->U.I.SrcReg[fail] = shadow_ambient(c, inst->U.I.TexSrcUnit);
 		}
 	}
 
 	/* Cannot write texture to output registers */
-	if (inst->I.Opcode != OPCODE_KIL && inst->I.DstReg.File != PROGRAM_TEMPORARY) {
+	if (inst->U.I.Opcode != RC_OPCODE_KIL && inst->U.I.DstReg.File != RC_FILE_TEMPORARY) {
 		struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst);
 
-		inst_mov->I.Opcode = OPCODE_MOV;
-		inst_mov->I.DstReg = inst->I.DstReg;
-		inst_mov->I.SrcReg[0].File = PROGRAM_TEMPORARY;
-		inst_mov->I.SrcReg[0].Index = rc_find_free_temporary(c);
+		inst_mov->U.I.Opcode = RC_OPCODE_MOV;
+		inst_mov->U.I.DstReg = inst->U.I.DstReg;
+		inst_mov->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
+		inst_mov->U.I.SrcReg[0].Index = rc_find_free_temporary(c);
 
-		inst->I.DstReg.File = PROGRAM_TEMPORARY;
-		inst->I.DstReg.Index = inst_mov->I.SrcReg[0].Index;
-		inst->I.DstReg.WriteMask = WRITEMASK_XYZW;
+		inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
+		inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index;
+		inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
 	}
 
 	/* Cannot read texture coordinate from constants file */
-	if (inst->I.SrcReg[0].File != PROGRAM_TEMPORARY && inst->I.SrcReg[0].File != PROGRAM_INPUT) {
+	if (inst->U.I.SrcReg[0].File != RC_FILE_TEMPORARY && inst->U.I.SrcReg[0].File != RC_FILE_INPUT) {
 		struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
 
-		inst_mov->I.Opcode = OPCODE_MOV;
-		inst_mov->I.DstReg.File = PROGRAM_TEMPORARY;
-		inst_mov->I.DstReg.Index = rc_find_free_temporary(c);
-		inst_mov->I.SrcReg[0] = inst->I.SrcReg[0];
+		inst_mov->U.I.Opcode = RC_OPCODE_MOV;
+		inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
+		inst_mov->U.I.DstReg.Index = rc_find_free_temporary(c);
+		inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[0];
 
-		reset_srcreg(&inst->I.SrcReg[0]);
-		inst->I.SrcReg[0].File = PROGRAM_TEMPORARY;
-		inst->I.SrcReg[0].Index = inst_mov->I.DstReg.Index;
+		reset_srcreg(&inst->U.I.SrcReg[0]);
+		inst->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
+		inst->U.I.SrcReg[0].Index = inst_mov->U.I.DstReg.Index;
 	}
 
-	return GL_TRUE;
+	return 1;
+}
+
+/**
+ * Rewrite IF instructions to use the ALU result special register.
+ */
+int r500_transform_IF(
+	struct radeon_compiler * c,
+	struct rc_instruction * inst,
+	void* data)
+{
+	if (inst->U.I.Opcode != RC_OPCODE_IF)
+		return 0;
+
+	struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
+	inst_mov->U.I.Opcode = RC_OPCODE_MOV;
+	inst_mov->U.I.DstReg.WriteMask = 0;
+	inst_mov->U.I.WriteALUResult = RC_ALURESULT_W;
+	inst_mov->U.I.ALUResultCompare = RC_COMPARE_FUNC_NOTEQUAL;
+	inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[0];
+	inst_mov->U.I.SrcReg[0].Swizzle = combine_swizzles4(inst_mov->U.I.SrcReg[0].Swizzle,
+			RC_SWIZZLE_UNUSED, RC_SWIZZLE_UNUSED, RC_SWIZZLE_UNUSED, RC_SWIZZLE_X);
+
+	inst->U.I.SrcReg[0].File = RC_FILE_SPECIAL;
+	inst->U.I.SrcReg[0].Index = RC_SPECIAL_ALU_RESULT;
+	inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
+	inst->U.I.SrcReg[0].Negate = 0;
+
+	return 1;
 }
 
-GLboolean r500FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg)
+static int r500_swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
 {
-	GLuint relevant;
+	unsigned int relevant;
 	int i;
 
-	if (opcode == OPCODE_TEX ||
-	    opcode == OPCODE_TXB ||
-	    opcode == OPCODE_TXP ||
-	    opcode == OPCODE_KIL) {
+	if (opcode == RC_OPCODE_TEX ||
+	    opcode == RC_OPCODE_TXB ||
+	    opcode == RC_OPCODE_TXP ||
+	    opcode == RC_OPCODE_KIL) {
 		if (reg.Abs)
-			return GL_FALSE;
+			return 0;
 
-		if (opcode == OPCODE_KIL && (reg.Swizzle != SWIZZLE_NOOP || reg.Negate != NEGATE_NONE))
-			return GL_FALSE;
+		if (opcode == RC_OPCODE_KIL && (reg.Swizzle != RC_SWIZZLE_XYZW || reg.Negate != RC_MASK_NONE))
+			return 0;
 
 		if (reg.Negate)
-			reg.Negate ^= NEGATE_XYZW;
+			reg.Negate ^= RC_MASK_XYZW;
 
 		for(i = 0; i < 4; ++i) {
-			GLuint swz = GET_SWZ(reg.Swizzle, i);
-			if (swz == SWIZZLE_NIL) {
+			unsigned int swz = GET_SWZ(reg.Swizzle, i);
+			if (swz == RC_SWIZZLE_UNUSED) {
 				reg.Negate &= ~(1 << i);
 				continue;
 			}
 			if (swz >= 4)
-				return GL_FALSE;
+				return 0;
 		}
 
 		if (reg.Negate)
-			return GL_FALSE;
+			return 0;
 
-		return GL_TRUE;
-	} else if (opcode == OPCODE_DDX || opcode == OPCODE_DDY) {
+		return 1;
+	} else if (opcode == RC_OPCODE_DDX || opcode == RC_OPCODE_DDY) {
 		/* DDX/MDH and DDY/MDV explicitly ignore incoming swizzles;
 		 * if it doesn't fit perfectly into a .xyzw case... */
-		if (reg.Swizzle == SWIZZLE_NOOP && !reg.Abs && !reg.Negate)
-			return GL_TRUE;
+		if (reg.Swizzle == RC_SWIZZLE_XYZW && !reg.Abs && !reg.Negate)
+			return 1;
 
-		return GL_FALSE;
+		return 0;
 	} else {
 		/* ALU instructions support almost everything */
 		if (reg.Abs)
-			return GL_TRUE;
+			return 1;
 
 		relevant = 0;
 		for(i = 0; i < 3; ++i) {
-			GLuint swz = GET_SWZ(reg.Swizzle, i);
-			if (swz != SWIZZLE_NIL && swz != SWIZZLE_ZERO)
+			unsigned int swz = GET_SWZ(reg.Swizzle, i);
+			if (swz != RC_SWIZZLE_UNUSED && swz != RC_SWIZZLE_ZERO)
 				relevant |= 1 << i;
 		}
 		if ((reg.Negate & relevant) && ((reg.Negate & relevant) != relevant))
-			return GL_FALSE;
+			return 0;
 
-		return GL_TRUE;
+		return 1;
 	}
 }
 
 /**
- * Implement a MOV with a potentially non-native swizzle.
+ * Split source register access.
  *
  * The only thing we *cannot* do in an ALU instruction is per-component
- * negation. Therefore, we split the MOV into two instructions when necessary.
+ * negation.
  */
-void r500FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst, struct prog_src_register src)
+static void r500_swizzle_split(struct rc_src_register src, unsigned int usemask,
+		struct rc_swizzle_split * split)
 {
-	GLuint negatebase[2] = { 0, 0 };
+	unsigned int negatebase[2] = { 0, 0 };
 	int i;
 
 	for(i = 0; i < 4; ++i) {
-		GLuint swz = GET_SWZ(src.Swizzle, i);
-		if (swz == SWIZZLE_NIL)
+		unsigned int swz = GET_SWZ(src.Swizzle, i);
+		if (swz == RC_SWIZZLE_UNUSED || !GET_BIT(usemask, i))
 			continue;
 		negatebase[GET_BIT(src.Negate, i)] |= 1 << i;
 	}
 
+	split->NumPhases = 0;
+
 	for(i = 0; i <= 1; ++i) {
 		if (!negatebase[i])
 			continue;
 
-		struct rc_instruction *inst = rc_insert_new_instruction(s->Compiler, s->IP->Prev);
-		inst->I.Opcode = OPCODE_MOV;
-		inst->I.DstReg = dst;
-		inst->I.DstReg.WriteMask = negatebase[i];
-		inst->I.SrcReg[0] = src;
-		inst->I.SrcReg[0].Negate = (i == 0) ? NEGATE_NONE : NEGATE_XYZW;
+		split->Phase[split->NumPhases++] = negatebase[i];
 	}
 }
 
+struct rc_swizzle_caps r500_swizzle_caps = {
+	.IsNative = r500_swizzle_is_native,
+	.Split = r500_swizzle_split
+};
 
 static char *toswiz(int swiz_val) {
   switch(swiz_val) {
diff --git a/r300/compiler/r500_fragprog.h b/r300/compiler/r500_fragprog.h
index 9091f65..0918cdf 100644
--- a/r300/compiler/r500_fragprog.h
+++ b/r300/compiler/r500_fragprog.h
@@ -33,21 +33,21 @@
 #ifndef __R500_FRAGPROG_H_
 #define __R500_FRAGPROG_H_
 
-#include "shader/prog_parameter.h"
-#include "shader/prog_instruction.h"
-
 #include "radeon_compiler.h"
-#include "radeon_nqssadce.h"
+#include "radeon_swizzle.h"
 
 extern void r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler);
 
 extern void r500FragmentProgramDump(struct rX00_fragment_program_code *c);
 
-extern GLboolean r500FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg);
+extern struct rc_swizzle_caps r500_swizzle_caps;
 
-extern void r500FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst, struct prog_src_register src);
+extern int r500_transform_TEX(
+	struct radeon_compiler * c,
+	struct rc_instruction * inst,
+	void* data);
 
-extern GLboolean r500_transform_TEX(
+extern int r500_transform_IF(
 	struct radeon_compiler * c,
 	struct rc_instruction * inst,
 	void* data);
diff --git a/r300/compiler/r500_fragprog_emit.c b/r300/compiler/r500_fragprog_emit.c
index d694725..2942267 100644
--- a/r300/compiler/r500_fragprog_emit.c
+++ b/r300/compiler/r500_fragprog_emit.c
@@ -37,10 +37,6 @@
  *
  * \author Corbin Simpson <MostAwesomeDude@gmail.com>
  *
- * \todo Depth write, WPOS/FOGC inputs
- *
- * \todo FogOption
- *
  */
 
 #include "r500_fragprog.h"
@@ -51,7 +47,6 @@
 
 
 #define PROG_CODE \
-	struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler*)data; \
 	struct r500_fragment_program_code *code = &c->code->code.r500
 
 #define error(fmt, args...) do {			\
@@ -60,63 +55,80 @@
 	} while(0)
 
 
-static GLuint translate_rgb_op(struct r300_fragment_program_compiler *c, GLuint opcode)
+struct branch_info {
+	int If;
+	int Else;
+	int Endif;
+};
+
+struct emit_state {
+	struct radeon_compiler * C;
+	struct r500_fragment_program_code * Code;
+
+	struct branch_info * Branches;
+	unsigned int CurrentBranchDepth;
+	unsigned int BranchesReserved;
+
+	unsigned int MaxBranchDepth;
+};
+
+static unsigned int translate_rgb_op(struct r300_fragment_program_compiler *c, rc_opcode opcode)
 {
 	switch(opcode) {
-	case OPCODE_CMP: return R500_ALU_RGBA_OP_CMP;
-	case OPCODE_DDX: return R500_ALU_RGBA_OP_MDH;
-	case OPCODE_DDY: return R500_ALU_RGBA_OP_MDV;
-	case OPCODE_DP3: return R500_ALU_RGBA_OP_DP3;
-	case OPCODE_DP4: return R500_ALU_RGBA_OP_DP4;
-	case OPCODE_FRC: return R500_ALU_RGBA_OP_FRC;
+	case RC_OPCODE_CMP: return R500_ALU_RGBA_OP_CMP;
+	case RC_OPCODE_DDX: return R500_ALU_RGBA_OP_MDH;
+	case RC_OPCODE_DDY: return R500_ALU_RGBA_OP_MDV;
+	case RC_OPCODE_DP3: return R500_ALU_RGBA_OP_DP3;
+	case RC_OPCODE_DP4: return R500_ALU_RGBA_OP_DP4;
+	case RC_OPCODE_FRC: return R500_ALU_RGBA_OP_FRC;
 	default:
 		error("translate_rgb_op(%d): unknown opcode\n", opcode);
 		/* fall through */
-	case OPCODE_NOP:
+	case RC_OPCODE_NOP:
 		/* fall through */
-	case OPCODE_MAD: return R500_ALU_RGBA_OP_MAD;
-	case OPCODE_MAX: return R500_ALU_RGBA_OP_MAX;
-	case OPCODE_MIN: return R500_ALU_RGBA_OP_MIN;
-	case OPCODE_REPL_ALPHA: return R500_ALU_RGBA_OP_SOP;
+	case RC_OPCODE_MAD: return R500_ALU_RGBA_OP_MAD;
+	case RC_OPCODE_MAX: return R500_ALU_RGBA_OP_MAX;
+	case RC_OPCODE_MIN: return R500_ALU_RGBA_OP_MIN;
+	case RC_OPCODE_REPL_ALPHA: return R500_ALU_RGBA_OP_SOP;
 	}
 }
 
-static GLuint translate_alpha_op(struct r300_fragment_program_compiler *c, GLuint opcode)
+static unsigned int translate_alpha_op(struct r300_fragment_program_compiler *c, rc_opcode opcode)
 {
 	switch(opcode) {
-	case OPCODE_CMP: return R500_ALPHA_OP_CMP;
-	case OPCODE_COS: return R500_ALPHA_OP_COS;
-	case OPCODE_DDX: return R500_ALPHA_OP_MDH;
-	case OPCODE_DDY: return R500_ALPHA_OP_MDV;
-	case OPCODE_DP3: return R500_ALPHA_OP_DP;
-	case OPCODE_DP4: return R500_ALPHA_OP_DP;
-	case OPCODE_EX2: return R500_ALPHA_OP_EX2;
-	case OPCODE_FRC: return R500_ALPHA_OP_FRC;
-	case OPCODE_LG2: return R500_ALPHA_OP_LN2;
+	case RC_OPCODE_CMP: return R500_ALPHA_OP_CMP;
+	case RC_OPCODE_COS: return R500_ALPHA_OP_COS;
+	case RC_OPCODE_DDX: return R500_ALPHA_OP_MDH;
+	case RC_OPCODE_DDY: return R500_ALPHA_OP_MDV;
+	case RC_OPCODE_DP3: return R500_ALPHA_OP_DP;
+	case RC_OPCODE_DP4: return R500_ALPHA_OP_DP;
+	case RC_OPCODE_EX2: return R500_ALPHA_OP_EX2;
+	case RC_OPCODE_FRC: return R500_ALPHA_OP_FRC;
+	case RC_OPCODE_LG2: return R500_ALPHA_OP_LN2;
 	default:
 		error("translate_alpha_op(%d): unknown opcode\n", opcode);
 		/* fall through */
-	case OPCODE_NOP:
+	case RC_OPCODE_NOP:
 		/* fall through */
-	case OPCODE_MAD: return R500_ALPHA_OP_MAD;
-	case OPCODE_MAX: return R500_ALPHA_OP_MAX;
-	case OPCODE_MIN: return R500_ALPHA_OP_MIN;
-	case OPCODE_RCP: return R500_ALPHA_OP_RCP;
-	case OPCODE_RSQ: return R500_ALPHA_OP_RSQ;
-	case OPCODE_SIN: return R500_ALPHA_OP_SIN;
+	case RC_OPCODE_MAD: return R500_ALPHA_OP_MAD;
+	case RC_OPCODE_MAX: return R500_ALPHA_OP_MAX;
+	case RC_OPCODE_MIN: return R500_ALPHA_OP_MIN;
+	case RC_OPCODE_RCP: return R500_ALPHA_OP_RCP;
+	case RC_OPCODE_RSQ: return R500_ALPHA_OP_RSQ;
+	case RC_OPCODE_SIN: return R500_ALPHA_OP_SIN;
 	}
 }
 
-static GLuint fix_hw_swizzle(GLuint swz)
+static unsigned int fix_hw_swizzle(unsigned int swz)
 {
 	if (swz == 5) swz = 6;
-	if (swz == SWIZZLE_NIL) swz = 4;
+	if (swz == RC_SWIZZLE_UNUSED) swz = 4;
 	return swz;
 }
 
-static GLuint translate_arg_rgb(struct radeon_pair_instruction *inst, int arg)
+static unsigned int translate_arg_rgb(struct rc_pair_instruction *inst, int arg)
 {
-	GLuint t = inst->RGB.Arg[arg].Source;
+	unsigned int t = inst->RGB.Arg[arg].Source;
 	int comp;
 	t |= inst->RGB.Arg[arg].Negate << 11;
 	t |= inst->RGB.Arg[arg].Abs << 12;
@@ -127,39 +139,57 @@ static GLuint translate_arg_rgb(struct radeon_pair_instruction *inst, int arg)
 	return t;
 }
 
-static GLuint translate_arg_alpha(struct radeon_pair_instruction *inst, int i)
+static unsigned int translate_arg_alpha(struct rc_pair_instruction *inst, int i)
 {
-	GLuint t = inst->Alpha.Arg[i].Source;
+	unsigned int t = inst->Alpha.Arg[i].Source;
 	t |= fix_hw_swizzle(inst->Alpha.Arg[i].Swizzle) << 2;
 	t |= inst->Alpha.Arg[i].Negate << 5;
 	t |= inst->Alpha.Arg[i].Abs << 6;
 	return t;
 }
 
-static void use_temporary(struct r500_fragment_program_code* code, GLuint index)
+static uint32_t translate_alu_result_op(struct r300_fragment_program_compiler * c, rc_compare_func func)
+{
+	switch(func) {
+	case RC_COMPARE_FUNC_EQUAL: return R500_INST_ALU_RESULT_OP_EQ;
+	case RC_COMPARE_FUNC_LESS: return R500_INST_ALU_RESULT_OP_LT;
+	case RC_COMPARE_FUNC_GEQUAL: return R500_INST_ALU_RESULT_OP_GE;
+	case RC_COMPARE_FUNC_NOTEQUAL: return R500_INST_ALU_RESULT_OP_NE;
+	default:
+		rc_error(&c->Base, "%s: unsupported compare func %i\n", __FUNCTION__, func);
+		return 0;
+	}
+}
+
+static void use_temporary(struct r500_fragment_program_code* code, unsigned int index)
 {
 	if (index > code->max_temp_idx)
 		code->max_temp_idx = index;
 }
 
-static GLuint use_source(struct r500_fragment_program_code* code, struct radeon_pair_instruction_source src)
+static unsigned int use_source(struct r500_fragment_program_code* code, struct radeon_pair_instruction_source src)
 {
-	if (!src.Constant)
+	if (src.File == RC_FILE_CONSTANT) {
+		return src.Index | 0x100;
+	} else if (src.File == RC_FILE_TEMPORARY) {
 		use_temporary(code, src.Index);
-	return src.Index | src.Constant << 8;
+		return src.Index;
+	}
+
+	return 0;
 }
 
 
 /**
  * Emit a paired ALU instruction.
  */
-static GLboolean emit_paired(void *data, struct radeon_pair_instruction *inst)
+static void emit_paired(struct r300_fragment_program_compiler *c, struct rc_pair_instruction *inst)
 {
 	PROG_CODE;
 
 	if (code->inst_end >= 511) {
 		error("emit_alu: Too many instructions");
-		return GL_FALSE;
+		return;
 	}
 
 	int ip = ++code->inst_end;
@@ -167,17 +197,22 @@ static GLboolean emit_paired(void *data, struct radeon_pair_instruction *inst)
 	code->inst[ip].inst5 = translate_rgb_op(c, inst->RGB.Opcode);
 	code->inst[ip].inst4 = translate_alpha_op(c, inst->Alpha.Opcode);
 
-	if (inst->RGB.OutputWriteMask || inst->Alpha.OutputWriteMask || inst->Alpha.DepthWriteMask)
+	if (inst->RGB.OutputWriteMask || inst->Alpha.OutputWriteMask || inst->Alpha.DepthWriteMask) {
 		code->inst[ip].inst0 = R500_INST_TYPE_OUT;
-	else
+		if (inst->WriteALUResult) {
+			error("emit_paired: cannot write output and ALU result at the same time");
+			return;
+		}
+	} else {
 		code->inst[ip].inst0 = R500_INST_TYPE_ALU;
+	}
 	code->inst[ip].inst0 |= R500_INST_TEX_SEM_WAIT;
 
 	code->inst[ip].inst0 |= (inst->RGB.WriteMask << 11) | (inst->Alpha.WriteMask << 14);
 	code->inst[ip].inst0 |= (inst->RGB.OutputWriteMask << 15) | (inst->Alpha.OutputWriteMask << 18);
 	if (inst->Alpha.DepthWriteMask) {
 		code->inst[ip].inst4 |= R500_ALPHA_W_OMASK;
-		c->code->writes_depth = GL_TRUE;
+		c->code->writes_depth = 1;
 	}
 
 	code->inst[ip].inst4 |= R500_ALPHA_ADDRD(inst->Alpha.DestIndex);
@@ -206,12 +241,21 @@ static GLboolean emit_paired(void *data, struct radeon_pair_instruction *inst)
 	code->inst[ip].inst4 |= translate_arg_alpha(inst, 1) << R500_ALPHA_SEL_B_SHIFT;
 	code->inst[ip].inst5 |= translate_arg_alpha(inst, 2) << R500_ALU_RGBA_ALPHA_SEL_C_SHIFT;
 
-	return GL_TRUE;
+	if (inst->WriteALUResult) {
+		code->inst[ip].inst3 |= R500_ALU_RGB_WMASK;
+
+		if (inst->WriteALUResult == RC_ALURESULT_X)
+			code->inst[ip].inst0 |= R500_INST_ALU_RESULT_SEL_RED;
+		else
+			code->inst[ip].inst0 |= R500_INST_ALU_RESULT_SEL_ALPHA;
+
+		code->inst[ip].inst0 |= translate_alu_result_op(c, inst->ALUResultCompare);
+	}
 }
 
-static GLuint translate_strq_swizzle(GLuint swizzle)
+static unsigned int translate_strq_swizzle(unsigned int swizzle)
 {
-	GLuint swiz = 0;
+	unsigned int swiz = 0;
 	int i;
 	for (i = 0; i < 4; i++)
 		swiz |= (GET_SWZ(swizzle, i) & 0x3) << i*2;
@@ -221,67 +265,194 @@ static GLuint translate_strq_swizzle(GLuint swizzle)
 /**
  * Emit a single TEX instruction
  */
-static GLboolean emit_tex(void *data, struct radeon_pair_texture_instruction *inst)
+static int emit_tex(struct r300_fragment_program_compiler *c, struct rc_sub_instruction *inst)
 {
 	PROG_CODE;
 
 	if (code->inst_end >= 511) {
 		error("emit_tex: Too many instructions");
-		return GL_FALSE;
+		return 0;
 	}
 
 	int ip = ++code->inst_end;
 
 	code->inst[ip].inst0 = R500_INST_TYPE_TEX
-		| (inst->WriteMask << 11)
+		| (inst->DstReg.WriteMask << 11)
 		| R500_INST_TEX_SEM_WAIT;
 	code->inst[ip].inst1 = R500_TEX_ID(inst->TexSrcUnit)
 		| R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED;
 
-	if (inst->TexSrcTarget == TEXTURE_RECT_INDEX)
-	        code->inst[ip].inst1 |= R500_TEX_UNSCALED;
+	if (inst->TexSrcTarget == RC_TEXTURE_RECT)
+		code->inst[ip].inst1 |= R500_TEX_UNSCALED;
 
 	switch (inst->Opcode) {
-	case RADEON_OPCODE_KIL:
+	case RC_OPCODE_KIL:
 		code->inst[ip].inst1 |= R500_TEX_INST_TEXKILL;
 		break;
-	case RADEON_OPCODE_TEX:
+	case RC_OPCODE_TEX:
 		code->inst[ip].inst1 |= R500_TEX_INST_LD;
 		break;
-	case RADEON_OPCODE_TXB:
+	case RC_OPCODE_TXB:
 		code->inst[ip].inst1 |= R500_TEX_INST_LODBIAS;
 		break;
-	case RADEON_OPCODE_TXP:
+	case RC_OPCODE_TXP:
 		code->inst[ip].inst1 |= R500_TEX_INST_PROJ;
 		break;
 	default:
 		error("emit_tex can't handle opcode %x\n", inst->Opcode);
 	}
 
-	code->inst[ip].inst2 = R500_TEX_SRC_ADDR(inst->SrcIndex)
-		| (translate_strq_swizzle(inst->SrcSwizzle) << 8)
-		| R500_TEX_DST_ADDR(inst->DestIndex)
+	use_temporary(code, inst->SrcReg[0].Index);
+	if (inst->Opcode != RC_OPCODE_KIL)
+		use_temporary(code, inst->DstReg.Index);
+
+	code->inst[ip].inst2 = R500_TEX_SRC_ADDR(inst->SrcReg[0].Index)
+		| (translate_strq_swizzle(inst->SrcReg[0].Swizzle) << 8)
+		| R500_TEX_DST_ADDR(inst->DstReg.Index)
 		| R500_TEX_DST_R_SWIZ_R | R500_TEX_DST_G_SWIZ_G
 		| R500_TEX_DST_B_SWIZ_B | R500_TEX_DST_A_SWIZ_A;
 
-	return GL_TRUE;
+	return 1;
 }
 
-static const struct radeon_pair_handler pair_handler = {
-	.EmitPaired = emit_paired,
-	.EmitTex = emit_tex,
-	.MaxHwTemps = 128
-};
+static void grow_branches(struct emit_state * s)
+{
+	unsigned int newreserved = s->BranchesReserved * 2;
+	struct branch_info * newbranches;
+
+	if (!newreserved)
+		newreserved = 4;
+
+	newbranches = memory_pool_malloc(&s->C->Pool, newreserved*sizeof(struct branch_info));
+	if (s->CurrentBranchDepth) /* first call: Branches is still NULL, nothing to copy */
+		memcpy(newbranches, s->Branches, s->CurrentBranchDepth*sizeof(struct branch_info));
+	s->Branches = newbranches;
+	s->BranchesReserved = newreserved;
+}
+
+static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst)
+{
+	if (s->Code->inst_end >= 511) {
+		rc_error(s->C, "%s: Too many instructions", __FUNCTION__);
+		return;
+	}
+
+	unsigned int newip = ++s->Code->inst_end;
+
+	s->Code->inst[newip].inst0 = R500_INST_TYPE_FC | R500_INST_ALU_WAIT;
+
+	if (inst->U.I.Opcode == RC_OPCODE_IF) {
+		if (s->CurrentBranchDepth >= 32) {
+			rc_error(s->C, "Branch depth exceeds hardware limit");
+			return;
+		}
+
+		if (s->CurrentBranchDepth >= s->BranchesReserved)
+			grow_branches(s);
+
+		struct branch_info * branch = &s->Branches[s->CurrentBranchDepth++];
+		branch->If = newip;
+		branch->Else = -1;
+		branch->Endif = -1;
+
+		if (s->CurrentBranchDepth > s->MaxBranchDepth)
+			s->MaxBranchDepth = s->CurrentBranchDepth;
+
+		/* actual instruction is filled in at ENDIF time */
+	} else if (inst->U.I.Opcode == RC_OPCODE_ELSE) {
+		if (!s->CurrentBranchDepth) {
+			rc_error(s->C, "%s: got ELSE outside a branch", __FUNCTION__);
+			return;
+		}
+
+		struct branch_info * branch = &s->Branches[s->CurrentBranchDepth - 1];
+		branch->Else = newip;
+
+		/* actual instruction is filled in at ENDIF time */
+	} else if (inst->U.I.Opcode == RC_OPCODE_ENDIF) {
+		if (!s->CurrentBranchDepth) {
+			rc_error(s->C, "%s: got ENDIF outside a branch", __FUNCTION__);
+			return;
+		}
+
+		struct branch_info * branch = &s->Branches[s->CurrentBranchDepth - 1];
+		branch->Endif = newip;
+
+		s->Code->inst[branch->If].inst2 = R500_FC_OP_JUMP
+			| R500_FC_A_OP_NONE /* no address stack */
+			| R500_FC_JUMP_FUNC(0x0f) /* jump if ALU result is false */
+			| R500_FC_B_OP0_INCR /* increment branch counter if stay */
+		;
+
+		if (branch->Else >= 0) {
+			/* increment branch counter also if jump */
+			s->Code->inst[branch->If].inst2 |= R500_FC_B_OP1_INCR;
+			s->Code->inst[branch->If].inst3 = R500_FC_JUMP_ADDR(branch->Else + 1);
+
+			s->Code->inst[branch->Else].inst2 = R500_FC_OP_JUMP
+				| R500_FC_A_OP_NONE /* no address stack */
+				| R500_FC_B_ELSE /* all active pixels want to jump */
+				| R500_FC_B_OP0_NONE /* no counter op if stay */
+				| R500_FC_B_OP1_DECR /* decrement branch counter if jump */
+				| R500_FC_B_POP_CNT(1)
+			;
+			s->Code->inst[branch->Else].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1);
+		} else {
+			/* don't touch branch counter on jump */
+			s->Code->inst[branch->If].inst2 |= R500_FC_B_OP1_NONE;
+			s->Code->inst[branch->If].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1);
+		}
+
+		s->Code->inst[branch->Endif].inst2 = R500_FC_OP_JUMP
+			| R500_FC_A_OP_NONE /* no address stack */
+			| R500_FC_JUMP_ANY /* docs say set this, but I don't understand why */
+			| R500_FC_B_OP0_DECR /* decrement branch counter if stay */
+			| R500_FC_B_OP1_NONE /* no counter op if jump */
+			| R500_FC_B_POP_CNT(1)
+		;
+		s->Code->inst[branch->Endif].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1);
+
+		s->CurrentBranchDepth--;
+	} else {
+		rc_error(s->C, "%s: unknown opcode %i\n", __FUNCTION__, inst->U.I.Opcode);
+	}
+}
 
 void r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler)
 {
+	struct emit_state s;
 	struct r500_fragment_program_code *code = &compiler->code->code.r500;
+	struct rc_instruction * inst;
+
+	memset(&s, 0, sizeof(s));
+	s.C = &compiler->Base;
+	s.Code = code;
 
-	_mesa_bzero(code, sizeof(*code));
+	memset(code, 0, sizeof(*code));
 	code->max_temp_idx = 1;
 	code->inst_end = -1;
 
-	radeonPairProgram(compiler, &pair_handler, compiler);
+	for (inst = compiler->Base.Program.Instructions.Next;
+	    inst != &compiler->Base.Program.Instructions && !compiler->Base.Error;
+	    inst = inst->Next) {
+		if (inst->Type == RC_INSTRUCTION_NORMAL) {
+			const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
+
+			if (opcode->IsFlowControl) {
+				emit_flowcontrol(&s, inst);
+			} else if (inst->U.I.Opcode == RC_OPCODE_BEGIN_TEX) {
+				continue;
+			} else {
+				emit_tex(compiler, &inst->U.I);
+			}
+		} else {
+			emit_paired(compiler, &inst->U.P);
+		}
+	}
+
+	if (code->max_temp_idx >= 128)
+		rc_error(&compiler->Base, "Too many hardware temporaries used");
+
 	if (compiler->Base.Error)
 		return;
 
@@ -296,4 +467,11 @@ void r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compi
 		int ip = ++code->inst_end;
 		code->inst[ip].inst0 = R500_INST_TYPE_OUT | R500_INST_TEX_SEM_WAIT;
 	}
+
+	if (s.MaxBranchDepth >= 4) {
+		if (code->max_temp_idx < 1)
+			code->max_temp_idx = 1;
+
+		code->us_fc_ctrl |= R500_FC_FULL_FC_EN;
+	}
 }
diff --git a/r300/compiler/radeon_code.c b/r300/compiler/radeon_code.c
index a9dedf7..853b2be 100644
--- a/r300/compiler/radeon_code.c
+++ b/r300/compiler/radeon_code.c
@@ -25,11 +25,13 @@
  *
  */
 
-#include "main/mtypes.h"
-#include "shader/prog_instruction.h"
-
 #include "radeon_code.h"
 
+#include <stdlib.h>
+#include <string.h>
+
+#include "radeon_program.h"
+
 void rc_constants_init(struct rc_constant_list * c)
 {
 	memset(c, 0, sizeof(*c));
@@ -138,13 +140,13 @@ unsigned rc_constants_add_immediate_scalar(struct rc_constant_list * c, float da
 	unsigned index;
 	int free_index = -1;
 	struct rc_constant constant;
-	unsigned comp;
 
 	for(index = 0; index < c->Count; ++index) {
 		if (c->Constants[index].Type == RC_CONSTANT_IMMEDIATE) {
+			unsigned comp;
 			for(comp = 0; comp < c->Constants[index].Size; ++comp) {
 				if (c->Constants[index].u.Immediate[comp] == data) {
-					*swizzle = MAKE_SWIZZLE4(comp, comp, comp, comp);
+					*swizzle = RC_MAKE_SWIZZLE(comp, comp, comp, comp);
 					return index;
 				}
 			}
@@ -155,9 +157,9 @@ unsigned rc_constants_add_immediate_scalar(struct rc_constant_list * c, float da
 	}
 
 	if (free_index >= 0) {
-		comp = c->Constants[free_index].Size++;
+		unsigned comp = c->Constants[free_index].Size++;
 		c->Constants[free_index].u.Immediate[comp] = data;
-		*swizzle = MAKE_SWIZZLE4(comp, comp, comp, comp);
+		*swizzle = RC_MAKE_SWIZZLE(comp, comp, comp, comp);
 		return free_index;
 	}
 
@@ -165,7 +167,7 @@ unsigned rc_constants_add_immediate_scalar(struct rc_constant_list * c, float da
 	constant.Type = RC_CONSTANT_IMMEDIATE;
 	constant.Size = 1;
 	constant.u.Immediate[0] = data;
-	*swizzle = SWIZZLE_XXXX;
+	*swizzle = RC_SWIZZLE_XXXX;
 
 	return rc_constants_add(c, &constant);
 }
diff --git a/r300/compiler/radeon_code.h b/r300/compiler/radeon_code.h
index 3e88554..902b7cf 100644
--- a/r300/compiler/radeon_code.h
+++ b/r300/compiler/radeon_code.h
@@ -89,6 +89,23 @@ unsigned rc_constants_add_immediate_vec4(struct rc_constant_list * c, const floa
 unsigned rc_constants_add_immediate_scalar(struct rc_constant_list * c, float data, unsigned * swizzle);
 
 /**
+ * Compare functions.
+ *
+ * \note By design, RC_COMPARE_FUNC_xxx + GL_NEVER gives you
+ * the correct GL compare function.
+ */
+typedef enum {
+	RC_COMPARE_FUNC_NEVER = 0,
+	RC_COMPARE_FUNC_LESS,
+	RC_COMPARE_FUNC_EQUAL,
+	RC_COMPARE_FUNC_LEQUAL,
+	RC_COMPARE_FUNC_GREATER,
+	RC_COMPARE_FUNC_NOTEQUAL,
+	RC_COMPARE_FUNC_GEQUAL,
+	RC_COMPARE_FUNC_ALWAYS
+} rc_compare_func;
+
+/**
  * Stores state that influences the compilation of a fragment program.
  */
 struct r300_fragment_program_external_state {
@@ -105,10 +122,12 @@ struct r300_fragment_program_external_state {
 
 		/**
 		 * If the sampler is used as a shadow sampler,
-		 * this field is (texture_compare_func - GL_NEVER).
-		 * [e.g. if compare function is GL_LEQUAL, this field is 3]
+		 * this field specifies the compare function.
+		 *
+		 * Otherwise, this field is \ref RC_COMPARE_FUNC_NEVER (aka 0).
 		 *
 		 * Otherwise, this field is 0.
+		 * \sa rc_compare_func
 		 */
 		unsigned texture_compare_func : 3;
 	} unit[16];
@@ -163,6 +182,8 @@ struct r500_fragment_program_code {
 	int inst_end; /* Number of instructions - 1; also, last instruction to be executed */
 
 	int max_temp_idx;
+
+	uint32_t us_fc_ctrl;
 };
 
 struct rX00_fragment_program_code {
diff --git a/r300/compiler/radeon_compiler.c b/r300/compiler/radeon_compiler.c
index da950d5..c0e7a7f 100644
--- a/r300/compiler/radeon_compiler.c
+++ b/r300/compiler/radeon_compiler.c
@@ -23,6 +23,8 @@
 #include "radeon_compiler.h"
 
 #include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
 
 #include "radeon_program.h"
 
@@ -34,7 +36,7 @@ void rc_init(struct radeon_compiler * c)
 	memory_pool_init(&c->Pool);
 	c->Program.Instructions.Prev = &c->Program.Instructions;
 	c->Program.Instructions.Next = &c->Program.Instructions;
-	c->Program.Instructions.I.Opcode = OPCODE_END;
+	c->Program.Instructions.U.I.Opcode = RC_OPCODE_ILLEGAL_OPCODE;
 }
 
 void rc_destroy(struct radeon_compiler * c)
@@ -60,7 +62,7 @@ void rc_error(struct radeon_compiler * c, const char * fmt, ...)
 {
 	va_list ap;
 
-	c->Error = GL_TRUE;
+	c->Error = 1;
 
 	if (!c->ErrorMsg) {
 		/* Only remember the first error */
@@ -91,28 +93,63 @@ void rc_error(struct radeon_compiler * c, const char * fmt, ...)
 	}
 }
 
+int rc_if_fail_helper(struct radeon_compiler * c, const char * file, int line, const char * assertion)
+{
+	rc_error(c, "ICE at %s:%i: assertion failed: %s\n", file, line, assertion);
+	return 1;
+}
+
+/**
+ * Recompute c->Program.InputsRead and c->Program.OutputsWritten
+ * based on which inputs and outputs are actually referenced
+ * in program instructions.
+ */
+void rc_calculate_inputs_outputs(struct radeon_compiler * c)
+{
+	struct rc_instruction *inst;
+
+	c->Program.InputsRead = 0;
+	c->Program.OutputsWritten = 0;
+
+	for(inst = c->Program.Instructions.Next; inst != &c->Program.Instructions; inst = inst->Next)
+	{
+		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
+		int i;
+
+		for (i = 0; i < opcode->NumSrcRegs; ++i) {
+			if (inst->U.I.SrcReg[i].File == RC_FILE_INPUT)
+				c->Program.InputsRead |= 1 << inst->U.I.SrcReg[i].Index;
+		}
+
+		if (opcode->HasDstReg) {
+			if (inst->U.I.DstReg.File == RC_FILE_OUTPUT)
+				c->Program.OutputsWritten |= 1 << inst->U.I.DstReg.Index;
+		}
+	}
+}
+
 /**
  * Rewrite the program such that everything that source the given input
  * register will source new_input instead.
  */
-void rc_move_input(struct radeon_compiler * c, unsigned input, struct prog_src_register new_input)
+void rc_move_input(struct radeon_compiler * c, unsigned input, struct rc_src_register new_input)
 {
 	struct rc_instruction * inst;
 
 	c->Program.InputsRead &= ~(1 << input);
 
 	for(inst = c->Program.Instructions.Next; inst != &c->Program.Instructions; inst = inst->Next) {
-		const unsigned numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode);
+		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
 		unsigned i;
 
-		for(i = 0; i < numsrcs; ++i) {
-			if (inst->I.SrcReg[i].File == PROGRAM_INPUT && inst->I.SrcReg[i].Index == input) {
-				inst->I.SrcReg[i].File = new_input.File;
-				inst->I.SrcReg[i].Index = new_input.Index;
-				inst->I.SrcReg[i].Swizzle = combine_swizzles(new_input.Swizzle, inst->I.SrcReg[i].Swizzle);
-				if (!inst->I.SrcReg[i].Abs) {
-					inst->I.SrcReg[i].Negate ^= new_input.Negate;
-					inst->I.SrcReg[i].Abs = new_input.Abs;
+		for(i = 0; i < opcode->NumSrcRegs; ++i) {
+			if (inst->U.I.SrcReg[i].File == RC_FILE_INPUT && inst->U.I.SrcReg[i].Index == input) {
+				inst->U.I.SrcReg[i].File = new_input.File;
+				inst->U.I.SrcReg[i].Index = new_input.Index;
+				inst->U.I.SrcReg[i].Swizzle = combine_swizzles(new_input.Swizzle, inst->U.I.SrcReg[i].Swizzle);
+				if (!inst->U.I.SrcReg[i].Abs) {
+					inst->U.I.SrcReg[i].Negate ^= new_input.Negate;
+					inst->U.I.SrcReg[i].Abs = new_input.Abs;
 				}
 
 				c->Program.InputsRead |= 1 << new_input.Index;
@@ -134,12 +171,12 @@ void rc_move_output(struct radeon_compiler * c, unsigned output, unsigned new_ou
 	c->Program.OutputsWritten &= ~(1 << output);
 
 	for(inst = c->Program.Instructions.Next; inst != &c->Program.Instructions; inst = inst->Next) {
-		const unsigned numdsts = _mesa_num_inst_dst_regs(inst->I.Opcode);
+		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
 
-		if (numdsts) {
-			if (inst->I.DstReg.File == PROGRAM_OUTPUT && inst->I.DstReg.Index == output) {
-				inst->I.DstReg.Index = new_output;
-				inst->I.DstReg.WriteMask &= writemask;
+		if (opcode->HasDstReg) {
+			if (inst->U.I.DstReg.File == RC_FILE_OUTPUT && inst->U.I.DstReg.Index == output) {
+				inst->U.I.DstReg.Index = new_output;
+				inst->U.I.DstReg.WriteMask &= writemask;
 
 				c->Program.OutputsWritten |= 1 << new_output;
 			}
@@ -157,33 +194,33 @@ void rc_copy_output(struct radeon_compiler * c, unsigned output, unsigned dup_ou
 	struct rc_instruction * inst;
 
 	for(inst = c->Program.Instructions.Next; inst != &c->Program.Instructions; inst = inst->Next) {
-		const unsigned numdsts = _mesa_num_inst_dst_regs(inst->I.Opcode);
+		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
 
-		if (numdsts) {
-			if (inst->I.DstReg.File == PROGRAM_OUTPUT && inst->I.DstReg.Index == output) {
-				inst->I.DstReg.File = PROGRAM_TEMPORARY;
-				inst->I.DstReg.Index = tempreg;
+		if (opcode->HasDstReg) {
+			if (inst->U.I.DstReg.File == RC_FILE_OUTPUT && inst->U.I.DstReg.Index == output) {
+				inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
+				inst->U.I.DstReg.Index = tempreg;
 			}
 		}
 	}
 
 	inst = rc_insert_new_instruction(c, c->Program.Instructions.Prev);
-	inst->I.Opcode = OPCODE_MOV;
-	inst->I.DstReg.File = PROGRAM_OUTPUT;
-	inst->I.DstReg.Index = output;
+	inst->U.I.Opcode = RC_OPCODE_MOV;
+	inst->U.I.DstReg.File = RC_FILE_OUTPUT;
+	inst->U.I.DstReg.Index = output;
 
-	inst->I.SrcReg[0].File = PROGRAM_TEMPORARY;
-	inst->I.SrcReg[0].Index = tempreg;
-	inst->I.SrcReg[0].Swizzle = SWIZZLE_XYZW;
+	inst->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
+	inst->U.I.SrcReg[0].Index = tempreg;
+	inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
 
 	inst = rc_insert_new_instruction(c, c->Program.Instructions.Prev);
-	inst->I.Opcode = OPCODE_MOV;
-	inst->I.DstReg.File = PROGRAM_OUTPUT;
-	inst->I.DstReg.Index = dup_output;
+	inst->U.I.Opcode = RC_OPCODE_MOV;
+	inst->U.I.DstReg.File = RC_FILE_OUTPUT;
+	inst->U.I.DstReg.Index = dup_output;
 
-	inst->I.SrcReg[0].File = PROGRAM_TEMPORARY;
-	inst->I.SrcReg[0].Index = tempreg;
-	inst->I.SrcReg[0].Swizzle = SWIZZLE_XYZW;
+	inst->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
+	inst->U.I.SrcReg[0].Index = tempreg;
+	inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
 
 	c->Program.OutputsWritten |= 1 << dup_output;
 }
@@ -201,60 +238,60 @@ void rc_transform_fragment_wpos(struct radeon_compiler * c, unsigned wpos, unsig
 
 	/* perspective divide */
 	struct rc_instruction * inst_rcp = rc_insert_new_instruction(c, &c->Program.Instructions);
-	inst_rcp->I.Opcode = OPCODE_RCP;
+	inst_rcp->U.I.Opcode = RC_OPCODE_RCP;
 
-	inst_rcp->I.DstReg.File = PROGRAM_TEMPORARY;
-	inst_rcp->I.DstReg.Index = tempregi;
-	inst_rcp->I.DstReg.WriteMask = WRITEMASK_W;
+	inst_rcp->U.I.DstReg.File = RC_FILE_TEMPORARY;
+	inst_rcp->U.I.DstReg.Index = tempregi;
+	inst_rcp->U.I.DstReg.WriteMask = RC_MASK_W;
 
-	inst_rcp->I.SrcReg[0].File = PROGRAM_INPUT;
-	inst_rcp->I.SrcReg[0].Index = new_input;
-	inst_rcp->I.SrcReg[0].Swizzle = SWIZZLE_WWWW;
+	inst_rcp->U.I.SrcReg[0].File = RC_FILE_INPUT;
+	inst_rcp->U.I.SrcReg[0].Index = new_input;
+	inst_rcp->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_WWWW;
 
 	struct rc_instruction * inst_mul = rc_insert_new_instruction(c, inst_rcp);
-	inst_mul->I.Opcode = OPCODE_MUL;
+	inst_mul->U.I.Opcode = RC_OPCODE_MUL;
 
-	inst_mul->I.DstReg.File = PROGRAM_TEMPORARY;
-	inst_mul->I.DstReg.Index = tempregi;
-	inst_mul->I.DstReg.WriteMask = WRITEMASK_XYZ;
+	inst_mul->U.I.DstReg.File = RC_FILE_TEMPORARY;
+	inst_mul->U.I.DstReg.Index = tempregi;
+	inst_mul->U.I.DstReg.WriteMask = RC_MASK_XYZ;
 
-	inst_mul->I.SrcReg[0].File = PROGRAM_INPUT;
-	inst_mul->I.SrcReg[0].Index = new_input;
+	inst_mul->U.I.SrcReg[0].File = RC_FILE_INPUT;
+	inst_mul->U.I.SrcReg[0].Index = new_input;
 
-	inst_mul->I.SrcReg[1].File = PROGRAM_TEMPORARY;
-	inst_mul->I.SrcReg[1].Index = tempregi;
-	inst_mul->I.SrcReg[1].Swizzle = SWIZZLE_WWWW;
+	inst_mul->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
+	inst_mul->U.I.SrcReg[1].Index = tempregi;
+	inst_mul->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_WWWW;
 
 	/* viewport transformation */
 	struct rc_instruction * inst_mad = rc_insert_new_instruction(c, inst_mul);
-	inst_mad->I.Opcode = OPCODE_MAD;
+	inst_mad->U.I.Opcode = RC_OPCODE_MAD;
 
-	inst_mad->I.DstReg.File = PROGRAM_TEMPORARY;
-	inst_mad->I.DstReg.Index = tempregi;
-	inst_mad->I.DstReg.WriteMask = WRITEMASK_XYZ;
+	inst_mad->U.I.DstReg.File = RC_FILE_TEMPORARY;
+	inst_mad->U.I.DstReg.Index = tempregi;
+	inst_mad->U.I.DstReg.WriteMask = RC_MASK_XYZ;
 
-	inst_mad->I.SrcReg[0].File = PROGRAM_TEMPORARY;
-	inst_mad->I.SrcReg[0].Index = tempregi;
-	inst_mad->I.SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+	inst_mad->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
+	inst_mad->U.I.SrcReg[0].Index = tempregi;
+	inst_mad->U.I.SrcReg[0].Swizzle = RC_MAKE_SWIZZLE(RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ZERO);
 
-	inst_mad->I.SrcReg[1].File = PROGRAM_STATE_VAR;
-	inst_mad->I.SrcReg[1].Index = rc_constants_add_state(&c->Program.Constants, RC_STATE_R300_WINDOW_DIMENSION, 0);
-	inst_mad->I.SrcReg[1].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+	inst_mad->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
+	inst_mad->U.I.SrcReg[1].Index = rc_constants_add_state(&c->Program.Constants, RC_STATE_R300_WINDOW_DIMENSION, 0);
+	inst_mad->U.I.SrcReg[1].Swizzle = RC_MAKE_SWIZZLE(RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ZERO);
 
-	inst_mad->I.SrcReg[2].File = PROGRAM_STATE_VAR;
-	inst_mad->I.SrcReg[2].Index = inst_mad->I.SrcReg[1].Index;
-	inst_mad->I.SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+	inst_mad->U.I.SrcReg[2].File = RC_FILE_CONSTANT;
+	inst_mad->U.I.SrcReg[2].Index = inst_mad->U.I.SrcReg[1].Index;
+	inst_mad->U.I.SrcReg[2].Swizzle = RC_MAKE_SWIZZLE(RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ZERO);
 
 	struct rc_instruction * inst;
 	for (inst = inst_mad->Next; inst != &c->Program.Instructions; inst = inst->Next) {
-		const unsigned numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode);
+		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
 		unsigned i;
 
-		for(i = 0; i < numsrcs; i++) {
-			if (inst->I.SrcReg[i].File == PROGRAM_INPUT &&
-			    inst->I.SrcReg[i].Index == wpos) {
-				inst->I.SrcReg[i].File = PROGRAM_TEMPORARY;
-				inst->I.SrcReg[i].Index = tempregi;
+		for(i = 0; i < opcode->NumSrcRegs; i++) {
+			if (inst->U.I.SrcReg[i].File == RC_FILE_INPUT &&
+			    inst->U.I.SrcReg[i].Index == wpos) {
+				inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
+				inst->U.I.SrcReg[i].Index = tempregi;
 			}
 		}
 	}
diff --git a/r300/compiler/radeon_compiler.h b/r300/compiler/radeon_compiler.h
index e63ab88..87a732c 100644
--- a/r300/compiler/radeon_compiler.h
+++ b/r300/compiler/radeon_compiler.h
@@ -23,35 +23,11 @@
 #ifndef RADEON_COMPILER_H
 #define RADEON_COMPILER_H
 
-#include "main/mtypes.h"
-#include "shader/prog_instruction.h"
-
 #include "memory_pool.h"
 #include "radeon_code.h"
+#include "radeon_program.h"
 
-
-struct rc_instruction {
-	struct rc_instruction * Prev;
-	struct rc_instruction * Next;
-	struct prog_instruction I;
-};
-
-struct rc_program {
-	/**
-	 * Instructions.Next points to the first instruction,
-	 * Instructions.Prev points to the last instruction.
-	 */
-	struct rc_instruction Instructions;
-
-	/* Long term, we should probably remove InputsRead & OutputsWritten,
-	 * since updating dependent state can be fragile, and they aren't
-	 * actually used very often. */
-	uint32_t InputsRead;
-	uint32_t OutputsWritten;
-	uint32_t ShadowSamplers; /**< Texture units used for shadow sampling. */
-
-	struct rc_constant_list Constants;
-};
+struct rc_swizzle_caps;
 
 struct radeon_compiler {
 	struct memory_pool Pool;
@@ -59,6 +35,14 @@ struct radeon_compiler {
 	unsigned Debug:1;
 	unsigned Error:1;
 	char * ErrorMsg;
+
+	/**
+	 * Variables used internally, not to be touched by callers
+	 * of the compiler
+	 */
+	/*@{*/
+	struct rc_swizzle_caps * SwizzleCaps;
+	/*@}*/
 };
 
 void rc_init(struct radeon_compiler * c);
@@ -67,11 +51,26 @@ void rc_destroy(struct radeon_compiler * c);
 void rc_debug(struct radeon_compiler * c, const char * fmt, ...);
 void rc_error(struct radeon_compiler * c, const char * fmt, ...);
 
-void rc_mesa_to_rc_program(struct radeon_compiler * c, struct gl_program * program);
+int rc_if_fail_helper(struct radeon_compiler * c, const char * file, int line, const char * assertion);
+
+/**
+ * This macro acts like an if-statement that can be used to implement
+ * non-aborting assertions in the compiler.
+ *
+ * It checks whether \p cond is true. If not, an internal compiler error is
+ * flagged and the if-clause is run.
+ *
+ * A typical use-case would be:
+ *
+ *  if (rc_assert(c, condition-that-must-be-true))
+ *  	return;
+ */
+#define rc_assert(c, cond) \
+	(!(cond) && rc_if_fail_helper(c, __FILE__, __LINE__, #cond))
 
 void rc_calculate_inputs_outputs(struct radeon_compiler * c);
 
-void rc_move_input(struct radeon_compiler * c, unsigned input, struct prog_src_register new_input);
+void rc_move_input(struct radeon_compiler * c, unsigned input, struct rc_src_register new_input);
 void rc_move_output(struct radeon_compiler * c, unsigned output, unsigned new_output, unsigned writemask);
 void rc_copy_output(struct radeon_compiler * c, unsigned output, unsigned dup_output);
 void rc_transform_fragment_wpos(struct radeon_compiler * c, unsigned wpos, unsigned new_input);
@@ -97,7 +96,7 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c);
 struct r300_vertex_program_compiler {
 	struct radeon_compiler Base;
 	struct r300_vertex_program_code *code;
-	GLbitfield RequiredOutputs;
+	uint32_t RequiredOutputs;
 
 	void * UserData;
 	void (*SetHwInputOutput)(struct r300_vertex_program_compiler * c);
diff --git a/r300/compiler/radeon_dataflow.c b/r300/compiler/radeon_dataflow.c
new file mode 100644
index 0000000..a003e77
--- /dev/null
+++ b/r300/compiler/radeon_dataflow.c
@@ -0,0 +1,170 @@
+/*
+ * Copyright (C) 2009 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_dataflow.h"
+
+#include "radeon_program.h"
+
+/* Report every register channel read by a normal (non-pair) instruction to cb. */
+static void reads_normal(struct rc_instruction * fullinst, rc_read_write_fn cb, void * userdata)
+{
+	struct rc_sub_instruction * inst = &fullinst->U.I;
+	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->Opcode);
+	unsigned int src;
+
+	for(src = 0; src < opcode->NumSrcRegs; ++src) {
+		unsigned int refmask = 0, chan;
+		/* NOTE(review): an RC_FILE_NONE operand ends the scan of all remaining operands (return, not continue) - confirm intended */
+		if (inst->SrcReg[src].File == RC_FILE_NONE)
+			return;
+
+		for(chan = 0; chan < 4; ++chan)
+			refmask |= 1 << GET_SWZ(inst->SrcReg[src].Swizzle, chan);
+
+		refmask &= RC_MASK_XYZW; /* drop the bits produced by selectors other than X, Y, Z, W */
+
+		for(chan = 0; chan < 4; ++chan) {
+			if (GET_BIT(refmask, chan)) {
+				cb(userdata, fullinst, inst->SrcReg[src].File, inst->SrcReg[src].Index, chan);
+			}
+		}
+
+		if (refmask && inst->SrcReg[src].RelAddr)
+			cb(userdata, fullinst, RC_FILE_ADDRESS, 0, 0); /* cb takes a channel index, not a mask: 0 is X */
+	}
+}
+/* Report every register channel read by a pair (RGB + Alpha) instruction to cb. */
+static void reads_pair(struct rc_instruction * fullinst,  rc_read_write_fn cb, void * userdata)
+{
+	struct rc_pair_instruction * inst = &fullinst->U.P;
+	unsigned int refmasks[3] = { 0, 0, 0 }; /* per source slot: channels referenced by any argument */
+	unsigned int arg, src;
+
+	if (inst->RGB.Opcode != RC_OPCODE_NOP) {
+		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->RGB.Opcode);
+
+		for(arg = 0; arg < opcode->NumSrcRegs; ++arg) {
+			unsigned int chan;
+
+			for(chan = 0; chan < 3; ++chan) {
+				unsigned int swz = GET_SWZ(inst->RGB.Arg[arg].Swizzle, chan);
+				if (swz < 4) /* selectors 4 and above are not counted as register reads */
+					refmasks[inst->RGB.Arg[arg].Source] |= 1 << swz;
+			}
+		}
+	}
+
+	if (inst->Alpha.Opcode != RC_OPCODE_NOP) {
+		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->Alpha.Opcode);
+
+		for(arg = 0; arg < opcode->NumSrcRegs; ++arg) {
+			if (inst->Alpha.Arg[arg].Swizzle < 4) /* an alpha argument carries one selector only */
+				refmasks[inst->Alpha.Arg[arg].Source] |= 1 << inst->Alpha.Arg[arg].Swizzle;
+		}
+	}
+
+	for(src = 0; src < 3; ++src) {
+		if (inst->RGB.Src[src].Used) {
+			unsigned int chan;
+			for(chan = 0; chan < 3; ++chan) { /* channels 0..2 are reported against the RGB source */
+				if (GET_BIT(refmasks[src], chan))
+					cb(userdata, fullinst, inst->RGB.Src[src].File, inst->RGB.Src[src].Index, chan);
+			}
+		}
+
+		if (inst->Alpha.Src[src].Used) {
+			if (GET_BIT(refmasks[src], 3)) /* channel 3 is reported against the Alpha source */
+				cb(userdata, fullinst, inst->Alpha.Src[src].File, inst->Alpha.Src[src].Index, 3);
+		}
+	}
+}
+
+/**
+ * Invoke \p cb once for each register channel that \p inst sources.
+ *
+ * The result is conservative: the same channel may be reported more than
+ * once, and the instruction's writemask is not used to prune the reads.
+ */
+void rc_for_all_reads(struct rc_instruction * inst, rc_read_write_fn cb, void * userdata)
+{
+	if (inst->Type != RC_INSTRUCTION_NORMAL) {
+		reads_pair(inst, cb, userdata);
+		return;
+	}
+	reads_normal(inst, cb, userdata);
+}
+
+
+
+/* Report the destination channels (and ALU result) written by a normal instruction. */
+static void writes_normal(struct rc_instruction * fullinst, rc_read_write_fn cb, void * userdata)
+{
+	struct rc_sub_instruction * sub = &fullinst->U.I;
+	const struct rc_opcode_info * info = rc_get_opcode_info(sub->Opcode);
+	unsigned int ch;
+
+	/* Opcodes without a destination register contribute no register writes. */
+	for(ch = 0; info->HasDstReg && ch < 4; ++ch) {
+		if (!GET_BIT(sub->DstReg.WriteMask, ch))
+			continue;
+		cb(userdata, fullinst, sub->DstReg.File, sub->DstReg.Index, ch);
+	}
+
+	if (sub->WriteALUResult)
+		cb(userdata, fullinst, RC_FILE_SPECIAL, RC_SPECIAL_ALU_RESULT, 0);
+}
+
+/* Report the temporary channels (and ALU result) written by a pair instruction. */
+static void writes_pair(struct rc_instruction * fullinst, rc_read_write_fn cb, void * userdata)
+{
+	struct rc_pair_instruction * pair = &fullinst->U.P;
+	unsigned int rgb_chan = 0;
+
+	while (rgb_chan < 3) {
+		if (GET_BIT(pair->RGB.WriteMask, rgb_chan))
+			cb(userdata, fullinst, RC_FILE_TEMPORARY, pair->RGB.DestIndex, rgb_chan);
+		++rgb_chan;
+	}
+	if (pair->Alpha.WriteMask)
+		cb(userdata, fullinst, RC_FILE_TEMPORARY, pair->Alpha.DestIndex, 3);
+	if (pair->WriteALUResult)
+		cb(userdata, fullinst, RC_FILE_SPECIAL, RC_SPECIAL_ALU_RESULT, 0);
+}
+
+/**
+ * Invoke \p cb once for each register channel that \p inst writes.
+ *
+ * \warning Writes to output registers by pair instructions are not reported!
+ */
+void rc_for_all_writes(struct rc_instruction * inst, rc_read_write_fn cb, void * userdata)
+{
+	if (inst->Type != RC_INSTRUCTION_NORMAL) {
+		writes_pair(inst, cb, userdata);
+		return;
+	}
+	writes_normal(inst, cb, userdata);
+}
diff --git a/r300/compiler/radeon_dataflow.h b/r300/compiler/radeon_dataflow.h
new file mode 100644
index 0000000..5aa4cb6
--- /dev/null
+++ b/r300/compiler/radeon_dataflow.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2009 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef RADEON_DATAFLOW_H
+#define RADEON_DATAFLOW_H
+
+#include "radeon_program_constants.h"
+
+struct radeon_compiler;
+struct rc_instruction;
+struct rc_swizzle_caps;
+
+
+/**
+ * Register access analysis: the callback is invoked once per accessed (file, index, chan).
+ */
+/*@{*/
+typedef void (*rc_read_write_fn)(void * userdata, struct rc_instruction * inst,
+			rc_register_file file, unsigned int index, unsigned int chan);
+void rc_for_all_reads(struct rc_instruction * inst, rc_read_write_fn cb, void * userdata);
+void rc_for_all_writes(struct rc_instruction * inst, rc_read_write_fn cb, void * userdata);
+/*@}*/
+
+
+/**
+ * Compiler passes based on dataflow analysis; the dce callback reports needed outputs via mark_fn(data, index, mask).
+ */
+/*@{*/
+typedef void (*rc_dataflow_mark_outputs_fn)(void * userdata, void * data,
+			void (*mark_fn)(void * data, unsigned int index, unsigned int mask));
+void rc_dataflow_deadcode(struct radeon_compiler * c, rc_dataflow_mark_outputs_fn dce, void * userdata);
+void rc_dataflow_swizzles(struct radeon_compiler * c);
+/*@}*/
+
+#endif /* RADEON_DATAFLOW_H */
diff --git a/r300/compiler/radeon_dataflow_deadcode.c b/r300/compiler/radeon_dataflow_deadcode.c
new file mode 100644
index 0000000..d78efa1
--- /dev/null
+++ b/r300/compiler/radeon_dataflow_deadcode.c
@@ -0,0 +1,299 @@
+/*
+ * Copyright (C) 2009 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_dataflow.h"
+
+#include "radeon_compiler.h"
+
+
+struct updatemask_state { /* per-register masks of the channels that are marked as used */
+	unsigned char Output[RC_REGISTER_MAX_INDEX]; /* indexed by output register index */
+	unsigned char Temporary[RC_REGISTER_MAX_INDEX]; /* indexed by temporary register index */
+	unsigned char Address; /* single mask; get_used_ptr ignores the index for RC_FILE_ADDRESS */
+	unsigned char Special[RC_NUM_SPECIAL_REGISTERS]; /* indexed by special register, e.g. RC_SPECIAL_ALU_RESULT */
+};
+
+struct instruction_state { /* per-instruction result of the backward pass, indexed by instruction IP */
+	unsigned char WriteMask:4; /* destination channels that were found to be used */
+	unsigned char WriteALUResult:1; /* set when the ALU result written by the instruction is used */
+	unsigned char SrcReg[3]; /* per source operand: channels already marked as read */
+};
+
+struct branchinfo { /* one stack entry per ENDIF seen during the backward walk, popped at the IF */
+	unsigned int HaveElse:1; /* set once an ELSE has been seen for this entry */
+
+	struct updatemask_state StoreEndif; /* copy of the use masks taken at the ENDIF */
+	struct updatemask_state StoreElse; /* copy of the use masks taken when the ELSE was reached */
+};
+
+struct deadcode_state { /* working state of rc_dataflow_deadcode */
+	struct radeon_compiler * C;
+	struct instruction_state * Instructions; /* pool-allocated array, one entry per instruction */
+
+	struct updatemask_state R; /* use masks at the current position of the backward walk */
+
+	struct branchinfo * BranchStack; /* pool-allocated, grown by push_branch */
+	unsigned int BranchStackSize; /* number of entries in use */
+	unsigned int BranchStackReserved; /* number of entries allocated */
+};
+
+
+/* Store the channel-wise union of the use masks in a and b into dst.
+ * Each element is read before it is written, so dst may be the same object as a or b. */
+static void or_updatemasks(struct updatemask_state * dst,
+	struct updatemask_state * a, struct updatemask_state * b)
+{
+	unsigned int reg, special;
+
+	dst->Address = a->Address | b->Address;
+
+	for(special = 0; special < RC_NUM_SPECIAL_REGISTERS; ++special)
+		dst->Special[special] = a->Special[special] | b->Special[special];
+
+	for(reg = 0; reg < RC_REGISTER_MAX_INDEX; ++reg) {
+		dst->Temporary[reg] = a->Temporary[reg] | b->Temporary[reg];
+		dst->Output[reg] = a->Output[reg] | b->Output[reg];
+	}
+}
+
+/* Push a branch record (used for ENDIF) and save the current use masks in its StoreEndif. */
+static void push_branch(struct deadcode_state * s)
+{
+	if (s->BranchStackSize >= s->BranchStackReserved) {
+		unsigned int new_reserve = s->BranchStackReserved ? 2 * s->BranchStackReserved : 4;
+		struct branchinfo * new_stack;
+
+		new_stack = memory_pool_malloc(&s->C->Pool, new_reserve * sizeof(struct branchinfo));
+		/* The stack starts out as a null pointer; memcpy() must not be given one, even for 0 bytes. */
+		if (s->BranchStackSize)
+			memcpy(new_stack, s->BranchStack, s->BranchStackSize * sizeof(struct branchinfo));
+
+		s->BranchStack = new_stack;
+		s->BranchStackReserved = new_reserve;
+	}
+
+	struct branchinfo * branch = &s->BranchStack[s->BranchStackSize++];
+	branch->HaveElse = 0;
+	memcpy(&branch->StoreEndif, &s->R, sizeof(s->R));
+}
+
+/* Map (file, index) to its use mask in s->R; returns 0 for untracked files and rejected indices. */
+static unsigned char * get_used_ptr(struct deadcode_state *s, rc_register_file file, unsigned int index)
+{
+	if (file == RC_FILE_ADDRESS)
+		return &s->R.Address;
+
+	if (file == RC_FILE_SPECIAL) {
+		if (index < RC_NUM_SPECIAL_REGISTERS)
+			return &s->R.Special[index];
+		rc_error(s->C, "%s: special file index %i out of bounds\n", __FUNCTION__, index);
+		return 0;
+	}
+
+	/* Every other register file except outputs and temporaries is untracked. */
+	if (file != RC_FILE_OUTPUT && file != RC_FILE_TEMPORARY)
+		return 0;
+
+	/* Outputs and temporaries share the same index bound. */
+	if (index >= RC_REGISTER_MAX_INDEX) {
+		rc_error(s->C, "%s: index %i is out of bounds for file %i\n", __FUNCTION__, index, file);
+		return 0;
+	}
+
+	return (file == RC_FILE_OUTPUT) ? &s->R.Output[index] : &s->R.Temporary[index];
+}
+
+static void mark_used(struct deadcode_state * s, rc_register_file file, unsigned int index, unsigned int mask)
+{
+	unsigned char * const tracked = get_used_ptr(s, file, index);
+	if (tracked != 0) /* untracked register files and rejected indices are skipped */
+		*tracked = (unsigned char)(*tracked | mask);
+}
+/* Backward step for one instruction: consume the uses its writes satisfy, then mark the sources it reads. */
+static void update_instruction(struct deadcode_state * s, struct rc_instruction * inst)
+{
+	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
+	struct instruction_state * insts = &s->Instructions[inst->IP];
+	unsigned int usedmask = 0, src;
+
+	if (opcode->HasDstReg) {
+		unsigned char * pused = get_used_ptr(s, inst->U.I.DstReg.File, inst->U.I.DstReg.Index);
+		if (pused) {
+			usedmask = *pused & inst->U.I.DstReg.WriteMask; /* written channels that are marked as used */
+			*pused &= ~usedmask; /* this write satisfies those uses */
+		}
+	}
+
+	insts->WriteMask |= usedmask; /* accumulates; the forward pass copies it into the instruction */
+
+	if (inst->U.I.WriteALUResult) {
+		unsigned char * pused = get_used_ptr(s, RC_FILE_SPECIAL, RC_SPECIAL_ALU_RESULT);
+		if (pused && *pused) {
+			if (inst->U.I.WriteALUResult == RC_ALURESULT_X) /* the channel feeding the ALU result must be computed */
+				usedmask |= RC_MASK_X;
+			else if (inst->U.I.WriteALUResult == RC_ALURESULT_W)
+				usedmask |= RC_MASK_W;
+
+			*pused = 0;
+			insts->WriteALUResult = 1;
+		}
+	}
+
+	unsigned int srcmasks[3]; /* presumably filled with the source channels needed to produce usedmask - TODO confirm */
+	rc_compute_sources_for_writemask(opcode, usedmask, srcmasks);
+
+	for(src = 0; src < opcode->NumSrcRegs; ++src) {
+		unsigned int refmask = 0, chan;
+		unsigned int newsrcmask = srcmasks[src] & ~insts->SrcReg[src]; /* channels not yet recorded for this source */
+		insts->SrcReg[src] |= newsrcmask;
+
+		for(chan = 0; chan < 4; ++chan) { /* translate operand channels into register channels via the swizzle */
+			if (GET_BIT(newsrcmask, chan))
+				refmask |= 1 << GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan);
+		}
+
+		/* get rid of spurious bits from ZERO, ONE, etc. swizzles */
+		refmask &= RC_MASK_XYZW;
+
+		if (!refmask)
+			continue;
+
+		mark_used(s, inst->U.I.SrcReg[src].File, inst->U.I.SrcReg[src].Index, refmask);
+
+		if (inst->U.I.SrcReg[src].RelAddr) /* relative addressing also reads the address register */
+			mark_used(s, RC_FILE_ADDRESS, 0, RC_MASK_X);
+	}
+}
+
+/* mark_fn handed to the dce callback: flags the channels in mask of output register index as used. */
+static void mark_output_use(void * data, unsigned int index, unsigned int mask)
+{
+	/* data is the deadcode_state that rc_dataflow_deadcode passes to the dce callback */
+	mark_used((struct deadcode_state *)data, RC_FILE_OUTPUT, index, mask);
+}
+/** Dead code elimination; \p dce is called once with a mark function to flag the outputs that must be kept. */
+void rc_dataflow_deadcode(struct radeon_compiler * c, rc_dataflow_mark_outputs_fn dce, void * userdata)
+{
+	struct deadcode_state s;
+	unsigned int nr_instructions;
+	struct rc_instruction * inst;
+
+	memset(&s, 0, sizeof(s));
+	s.C = c;
+
+	nr_instructions = rc_recompute_ips(c);
+	s.Instructions = memory_pool_malloc(&c->Pool, sizeof(struct instruction_state)*nr_instructions);
+	memset(s.Instructions, 0, sizeof(struct instruction_state)*nr_instructions);
+	/* Let the caller mark the output channels that are needed after the program. */
+	dce(userdata, &s, &mark_output_use);
+	/* Backward pass: walk from the last instruction to the first, propagating use masks. */
+	for(inst = c->Program.Instructions.Prev;
+	    inst != &c->Program.Instructions;
+	    inst = inst->Prev) {
+		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
+
+		if (opcode->IsFlowControl) {
+			if (opcode->Opcode == RC_OPCODE_ENDIF) {
+				push_branch(&s);
+			} else {
+				if (s.BranchStackSize) {
+					struct branchinfo * branch = &s.BranchStack[s.BranchStackSize-1];
+
+					if (opcode->Opcode == RC_OPCODE_IF) {
+						or_updatemasks(&s.R,
+								&s.R,
+								branch->HaveElse ? &branch->StoreElse : &branch->StoreEndif);
+
+						s.BranchStackSize--;
+					} else if (opcode->Opcode == RC_OPCODE_ELSE) {
+						if (branch->HaveElse) {
+							rc_error(c, "%s: Multiple ELSE for one IF/ENDIF\n", __FUNCTION__);
+						} else {
+							memcpy(&branch->StoreElse, &s.R, sizeof(s.R));
+							memcpy(&s.R, &branch->StoreEndif, sizeof(s.R)); /* restart from the ENDIF state for the other branch */
+							branch->HaveElse = 1;
+						}
+					} else {
+						rc_error(c, "%s: Unhandled control flow instruction %s\n", __FUNCTION__, opcode->Name);
+					}
+				} else {
+					rc_error(c, "%s: Unexpected control flow instruction\n", __FUNCTION__);
+				}
+			}
+		}
+
+		update_instruction(&s, inst);
+	}
+	/* Forward pass: trim writemasks, remove dead instructions, mark unread source channels as unused. */
+	unsigned int ip = 0; /* advances for removed instructions too, so it keeps indexing s.Instructions in program order */
+	for(inst = c->Program.Instructions.Next;
+	    inst != &c->Program.Instructions;
+	    inst = inst->Next, ++ip) {
+		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
+		int dead = 1;
+		unsigned int src, chan;
+
+		if (!opcode->HasDstReg) {
+			dead = 0;
+		} else {
+			inst->U.I.DstReg.WriteMask = s.Instructions[ip].WriteMask;
+			if (s.Instructions[ip].WriteMask)
+				dead = 0;
+
+			if (s.Instructions[ip].WriteALUResult)
+				dead = 0;
+			else
+				inst->U.I.WriteALUResult = RC_ALURESULT_NONE;
+		}
+
+		if (dead) {
+			struct rc_instruction * todelete = inst;
+			inst = inst->Prev; /* step back so the loop increment lands on the successor of the removed instruction */
+			rc_remove_instruction(todelete);
+			continue;
+		}
+
+		unsigned int srcmasks[3];
+		unsigned int usemask = s.Instructions[ip].WriteMask;
+
+		if (inst->U.I.WriteALUResult == RC_ALURESULT_X)
+			usemask |= RC_MASK_X;
+		else if (inst->U.I.WriteALUResult == RC_ALURESULT_W)
+			usemask |= RC_MASK_W;
+
+		rc_compute_sources_for_writemask(opcode, usemask, srcmasks);
+
+		for(src = 0; src < 3; ++src) {
+			for(chan = 0; chan < 4; ++chan) {
+				if (!GET_BIT(srcmasks[src], chan))
+					SET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan, RC_SWIZZLE_UNUSED);
+			}
+		}
+	}
+
+	rc_calculate_inputs_outputs(c);
+}
diff --git a/r300/compiler/radeon_dataflow_swizzles.c b/r300/compiler/radeon_dataflow_swizzles.c
new file mode 100644
index 0000000..d4ccd35
--- /dev/null
+++ b/r300/compiler/radeon_dataflow_swizzles.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (C) 2009 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_dataflow.h"
+
+#include "radeon_compiler.h"
+#include "radeon_swizzle.h"
+
+
+/**
+ * Replace source \p src of \p inst by a fresh temporary, filled by one MOV
+ * per phase of the SwizzleCaps->Split result, inserted after inst->Prev.
+ */
+static void rewrite_source(struct radeon_compiler * c,
+		struct rc_instruction * inst, unsigned src)
+{
+	struct rc_swizzle_split split;
+	unsigned int tempreg = rc_find_free_temporary(c);
+	unsigned int usemask, chan, phase;
+
+	usemask = 0;
+	for(chan = 0; chan < 4; ++chan) {
+		if (GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan) != RC_SWIZZLE_UNUSED)
+			usemask |= 1 << chan;
+	}
+
+	c->SwizzleCaps->Split(inst->U.I.SrcReg[src], usemask, &split);
+
+	for(phase = 0; phase < split.NumPhases; ++phase) {
+		struct rc_instruction * mov = rc_insert_new_instruction(c, inst->Prev);
+		unsigned int masked_negate;
+
+		mov->U.I.Opcode = RC_OPCODE_MOV;
+		mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
+		mov->U.I.DstReg.Index = tempreg;
+		mov->U.I.DstReg.WriteMask = split.Phase[phase];
+		mov->U.I.SrcReg[0] = inst->U.I.SrcReg[src];
+
+		/* Channels that are not part of this phase are not read by its MOV. */
+		for(chan = 0; chan < 4; ++chan) {
+			if (!GET_BIT(split.Phase[phase], chan))
+				SET_SWZ(mov->U.I.SrcReg[0].Swizzle, chan, RC_SWIZZLE_UNUSED);
+		}
+
+		/* Normalize the negate mask when all channels of the phase agree; mixed masks are kept as they are. */
+		masked_negate = split.Phase[phase] & mov->U.I.SrcReg[0].Negate;
+		if (masked_negate == 0)
+			mov->U.I.SrcReg[0].Negate = 0;
+		else if (masked_negate == split.Phase[phase])
+			mov->U.I.SrcReg[0].Negate = RC_MASK_XYZW;
+	}
+
+	/* The instruction now reads the temporary: used channels select themselves, the rest are unused. */
+	inst->U.I.SrcReg[src].File = RC_FILE_TEMPORARY;
+	inst->U.I.SrcReg[src].Index = tempreg;
+	inst->U.I.SrcReg[src].Swizzle = 0;
+	inst->U.I.SrcReg[src].Negate = RC_MASK_NONE;
+	inst->U.I.SrcReg[src].Abs = 0;
+	for(chan = 0; chan < 4; ++chan) {
+		SET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan,
+				GET_BIT(usemask, chan) ? chan : RC_SWIZZLE_UNUSED);
+	}
+}
+
+/* Rewrite every source operand whose swizzle is rejected by SwizzleCaps->IsNative. */
+void rc_dataflow_swizzles(struct radeon_compiler * c)
+{
+	struct rc_instruction * cur;
+
+	for(cur = c->Program.Instructions.Next; cur != &c->Program.Instructions; cur = cur->Next) {
+		const struct rc_opcode_info * info = rc_get_opcode_info(cur->U.I.Opcode);
+		unsigned int operand;
+		for(operand = 0; operand < info->NumSrcRegs; ++operand) {
+			if (!c->SwizzleCaps->IsNative(cur->U.I.Opcode, cur->U.I.SrcReg[operand]))
+				rewrite_source(c, cur, operand);
+		}
+	}
+}
diff --git a/r300/compiler/radeon_nqssadce.c b/r300/compiler/radeon_nqssadce.c
deleted file mode 100644
index aaaa50a..0000000
--- a/r300/compiler/radeon_nqssadce.c
+++ /dev/null
@@ -1,294 +0,0 @@
-/*
- * Copyright (C) 2008 Nicolai Haehnle.
- *
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial
- * portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-/**
- * @file
- *
- * "Not-quite SSA" and Dead-Code Elimination.
- *
- * @note This code uses SWIZZLE_NIL in a source register to indicate that
- * the corresponding component is ignored by the corresponding instruction.
- */
-
-#include "radeon_nqssadce.h"
-
-#include "radeon_compiler.h"
-
-
-/**
- * Return the @ref register_state for the given register (or 0 for untracked
- * registers, i.e. constants).
- */
-static struct register_state *get_reg_state(struct nqssadce_state* s, GLuint file, GLuint index)
-{
-	switch(file) {
-	case PROGRAM_TEMPORARY: return &s->Temps[index];
-	case PROGRAM_OUTPUT: return &s->Outputs[index];
-	case PROGRAM_ADDRESS: return &s->Address;
-	default: return 0;
-	}
-}
-
-
-/**
- * Left multiplication of a register with a swizzle
- *
- * @note Works correctly only for X, Y, Z, W swizzles, not for constant swizzles.
- */
-struct prog_src_register lmul_swizzle(GLuint swizzle, struct prog_src_register srcreg)
-{
-	struct prog_src_register tmp = srcreg;
-	int i;
-	tmp.Swizzle = 0;
-	tmp.Negate = NEGATE_NONE;
-	for(i = 0; i < 4; ++i) {
-		GLuint swz = GET_SWZ(swizzle, i);
-		if (swz < 4) {
-			tmp.Swizzle |= GET_SWZ(srcreg.Swizzle, swz) << (i*3);
-			tmp.Negate |= GET_BIT(srcreg.Negate, swz) << i;
-		} else {
-			tmp.Swizzle |= swz << (i*3);
-		}
-	}
-	return tmp;
-}
-
-
-static void track_used_srcreg(struct nqssadce_state* s,
-	GLint src, GLuint sourced)
-{
-	struct prog_instruction * inst = &s->IP->I;
-	int i;
-	GLuint deswz_source = 0;
-
-	for(i = 0; i < 4; ++i) {
-		if (GET_BIT(sourced, i)) {
-			GLuint swz = GET_SWZ(inst->SrcReg[src].Swizzle, i);
-			deswz_source |= 1 << swz;
-		} else {
-			inst->SrcReg[src].Swizzle &= ~(7 << (3*i));
-			inst->SrcReg[src].Swizzle |= SWIZZLE_NIL << (3*i);
-		}
-	}
-
-	if (!s->Descr->IsNativeSwizzle(inst->Opcode, inst->SrcReg[src])) {
-		struct prog_dst_register dstreg = inst->DstReg;
-		dstreg.File = PROGRAM_TEMPORARY;
-		dstreg.Index = rc_find_free_temporary(s->Compiler);
-		dstreg.WriteMask = sourced;
-
-		s->Descr->BuildSwizzle(s, dstreg, inst->SrcReg[src]);
-
-		inst->SrcReg[src].File = PROGRAM_TEMPORARY;
-		inst->SrcReg[src].Index = dstreg.Index;
-		inst->SrcReg[src].Swizzle = 0;
-		inst->SrcReg[src].Negate = NEGATE_NONE;
-		inst->SrcReg[src].Abs = 0;
-		for(i = 0; i < 4; ++i) {
-			if (GET_BIT(sourced, i))
-				inst->SrcReg[src].Swizzle |= i << (3*i);
-			else
-				inst->SrcReg[src].Swizzle |= SWIZZLE_NIL << (3*i);
-		}
-		deswz_source = sourced;
-	}
-
-	struct register_state *regstate;
-
-	if (inst->SrcReg[src].RelAddr) {
-		regstate = get_reg_state(s, PROGRAM_ADDRESS, 0);
-		if (regstate)
-			regstate->Sourced |= WRITEMASK_X;
-	} else {
-		regstate = get_reg_state(s, inst->SrcReg[src].File, inst->SrcReg[src].Index);
-		if (regstate)
-			regstate->Sourced |= deswz_source & 0xf;
-	}
-}
-
-static void unalias_srcregs(struct rc_instruction *inst, GLuint oldindex, GLuint newindex)
-{
-	int nsrc = _mesa_num_inst_src_regs(inst->I.Opcode);
-	int i;
-	for(i = 0; i < nsrc; ++i)
-		if (inst->I.SrcReg[i].File == PROGRAM_TEMPORARY && inst->I.SrcReg[i].Index == oldindex)
-			inst->I.SrcReg[i].Index = newindex;
-}
-
-static void unalias_temporary(struct nqssadce_state* s, GLuint oldindex)
-{
-	GLuint newindex = rc_find_free_temporary(s->Compiler);
-	struct rc_instruction * inst;
-	for(inst = s->Compiler->Program.Instructions.Next; inst != s->IP; inst = inst->Next) {
-		if (inst->I.DstReg.File == PROGRAM_TEMPORARY && inst->I.DstReg.Index == oldindex)
-			inst->I.DstReg.Index = newindex;
-		unalias_srcregs(inst, oldindex, newindex);
-	}
-	unalias_srcregs(s->IP, oldindex, newindex);
-}
-
-
-/**
- * Handle one instruction.
- */
-static void process_instruction(struct nqssadce_state* s)
-{
-	struct prog_instruction *inst = &s->IP->I;
-	GLuint WriteMask;
-
-	if (inst->Opcode == OPCODE_END)
-		return;
-
-	if (inst->Opcode != OPCODE_KIL) {
-		struct register_state *regstate = get_reg_state(s, inst->DstReg.File, inst->DstReg.Index);
-		if (!regstate) {
-			rc_error(s->Compiler, "NqssaDce: bad destination register (%i[%i])\n",
-				inst->DstReg.File, inst->DstReg.Index);
-			return;
-		}
-
-		inst->DstReg.WriteMask &= regstate->Sourced;
-		regstate->Sourced &= ~inst->DstReg.WriteMask;
-
-		if (inst->DstReg.WriteMask == 0) {
-			struct rc_instruction * inst_remove = s->IP;
-			s->IP = s->IP->Prev;
-			rc_remove_instruction(inst_remove);
-			return;
-		}
-
-		if (inst->DstReg.File == PROGRAM_TEMPORARY && !regstate->Sourced)
-			unalias_temporary(s, inst->DstReg.Index);
-	}
-
-	WriteMask = inst->DstReg.WriteMask;
-
-	switch (inst->Opcode) {
-	case OPCODE_ARL:
-	case OPCODE_DDX:
-	case OPCODE_DDY:
-	case OPCODE_FRC:
-	case OPCODE_MOV:
-		track_used_srcreg(s, 0, WriteMask);
-		break;
-	case OPCODE_ADD:
-	case OPCODE_MAX:
-	case OPCODE_MIN:
-	case OPCODE_MUL:
-	case OPCODE_SGE:
-	case OPCODE_SLT:
-		track_used_srcreg(s, 0, WriteMask);
-		track_used_srcreg(s, 1, WriteMask);
-		break;
-	case OPCODE_CMP:
-	case OPCODE_MAD:
-		track_used_srcreg(s, 0, WriteMask);
-		track_used_srcreg(s, 1, WriteMask);
-		track_used_srcreg(s, 2, WriteMask);
-		break;
-	case OPCODE_COS:
-	case OPCODE_EX2:
-	case OPCODE_LG2:
-	case OPCODE_RCP:
-	case OPCODE_RSQ:
-	case OPCODE_SIN:
-		track_used_srcreg(s, 0, 0x1);
-		break;
-	case OPCODE_DP3:
-		track_used_srcreg(s, 0, 0x7);
-		track_used_srcreg(s, 1, 0x7);
-		break;
-	case OPCODE_DP4:
-		track_used_srcreg(s, 0, 0xf);
-		track_used_srcreg(s, 1, 0xf);
-		break;
-	case OPCODE_KIL:
-	case OPCODE_TEX:
-	case OPCODE_TXB:
-	case OPCODE_TXP:
-		track_used_srcreg(s, 0, 0xf);
-		break;
-	case OPCODE_DST:
-		track_used_srcreg(s, 0, 0x6);
-		track_used_srcreg(s, 1, 0xa);
-		break;
-	case OPCODE_EXP:
-	case OPCODE_LOG:
-	case OPCODE_POW:
-		track_used_srcreg(s, 0, 0x3);
-		break;
-	case OPCODE_LIT:
-		track_used_srcreg(s, 0, 0xb);
-		break;
-	default:
-		rc_error(s->Compiler, "NqssaDce: Unknown opcode %d\n", inst->Opcode);
-		return;
-	}
-
-	s->IP = s->IP->Prev;
-}
-
-void rc_calculate_inputs_outputs(struct radeon_compiler * c)
-{
-	struct rc_instruction *inst;
-
-	c->Program.InputsRead = 0;
-	c->Program.OutputsWritten = 0;
-
-	for(inst = c->Program.Instructions.Next; inst != &c->Program.Instructions; inst = inst->Next)
-	{
-		int i;
-		int num_src_regs = _mesa_num_inst_src_regs(inst->I.Opcode);
-
-		for (i = 0; i < num_src_regs; ++i) {
-			if (inst->I.SrcReg[i].File == PROGRAM_INPUT)
-				c->Program.InputsRead |= 1 << inst->I.SrcReg[i].Index;
-		}
-
-		if (_mesa_num_inst_dst_regs(inst->I.Opcode)) {
-			if (inst->I.DstReg.File == PROGRAM_OUTPUT)
-				c->Program.OutputsWritten |= 1 << inst->I.DstReg.Index;
-		}
-	}
-}
-
-void radeonNqssaDce(struct radeon_compiler * c, struct radeon_nqssadce_descr* descr, void * data)
-{
-	struct nqssadce_state s;
-
-	_mesa_bzero(&s, sizeof(s));
-	s.Compiler = c;
-	s.Descr = descr;
-	s.UserData = data;
-	s.Descr->Init(&s);
-	s.IP = c->Program.Instructions.Prev;
-
-	while(s.IP != &c->Program.Instructions && !c->Error)
-		process_instruction(&s);
-
-	rc_calculate_inputs_outputs(c);
-}
diff --git a/r300/compiler/radeon_nqssadce.h b/r300/compiler/radeon_nqssadce.h
deleted file mode 100644
index b3fc77a..0000000
--- a/r300/compiler/radeon_nqssadce.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (C) 2008 Nicolai Haehnle.
- *
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial
- * portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-#ifndef __RADEON_PROGRAM_NQSSADCE_H_
-#define __RADEON_PROGRAM_NQSSADCE_H_
-
-#include "radeon_program.h"
-
-struct register_state {
-	/**
-	 * Bitmask indicating which components of the register are sourced
-	 * by later instructions.
-	 */
-	GLuint Sourced : 4;
-};
-
-/**
- * Maintain state such as which registers are used, which registers are
- * read from, etc.
- */
-struct nqssadce_state {
-	struct radeon_compiler *Compiler;
-	struct radeon_nqssadce_descr *Descr;
-
-	/**
-	 * All instructions after this instruction pointer have been dealt with.
-	 */
-	struct rc_instruction * IP;
-
-	/**
-	 * Which registers are read by subsequent instructions?
-	 */
-	struct register_state Temps[MAX_PROGRAM_TEMPS];
-	struct register_state Outputs[VERT_RESULT_MAX];
-	struct register_state Address;
-
-	void * UserData;
-};
-
-
-/**
- * This structure contains a description of the hardware in-so-far as
- * it is required for the NqSSA-DCE pass.
- */
-struct radeon_nqssadce_descr {
-	/**
-	 * Fill in which outputs
-	 */
-	void (*Init)(struct nqssadce_state *);
-
-	/**
-	 * Check whether the given swizzle, absolute and negate combination
-	 * can be implemented natively by the hardware for this opcode.
-	 */
-	GLboolean (*IsNativeSwizzle)(GLuint opcode, struct prog_src_register reg);
-
-	/**
-	 * Emit (at the current IP) the instruction MOV dst, src;
-	 * The transformation will work recursively on the emitted instruction(s).
-	 */
-	void (*BuildSwizzle)(struct nqssadce_state*, struct prog_dst_register dst, struct prog_src_register src);
-};
-
-void radeonNqssaDce(struct radeon_compiler * c, struct radeon_nqssadce_descr* descr, void * data);
-struct prog_src_register lmul_swizzle(GLuint swizzle, struct prog_src_register srcreg);
-
-#endif /* __RADEON_PROGRAM_NQSSADCE_H_ */
diff --git a/r300/compiler/radeon_opcodes.c b/r300/compiler/radeon_opcodes.c
new file mode 100644
index 0000000..9285748
--- /dev/null
+++ b/r300/compiler/radeon_opcodes.c
@@ -0,0 +1,431 @@
+/*
+ * Copyright (C) 2009 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_opcodes.h"
+
+#include "radeon_program_constants.h"
+
+struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = {
+	{
+		.Opcode = RC_OPCODE_NOP,
+		.Name = "NOP"
+	},
+	{
+		.Opcode = RC_OPCODE_ILLEGAL_OPCODE,
+		.Name = "ILLEGAL OPCODE"
+	},
+	{
+		.Opcode = RC_OPCODE_ABS,
+		.Name = "ABS",
+		.NumSrcRegs = 1,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_ADD,
+		.Name = "ADD",
+		.NumSrcRegs = 2,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_ARL,
+		.Name = "ARL",
+		.NumSrcRegs = 1,
+		.HasDstReg = 1
+	},
+	{
+		.Opcode = RC_OPCODE_CMP,
+		.Name = "CMP",
+		.NumSrcRegs = 3,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_COS,
+		.Name = "COS",
+		.NumSrcRegs = 1,
+		.HasDstReg = 1,
+		.IsStandardScalar = 1
+	},
+	{
+		.Opcode = RC_OPCODE_DDX,
+		.Name = "DDX",
+		.NumSrcRegs = 1,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_DDY,
+		.Name = "DDY",
+		.NumSrcRegs = 1,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_DP3,
+		.Name = "DP3",
+		.NumSrcRegs = 2,
+		.HasDstReg = 1
+	},
+	{
+		.Opcode = RC_OPCODE_DP4,
+		.Name = "DP4",
+		.NumSrcRegs = 2,
+		.HasDstReg = 1
+	},
+	{
+		.Opcode = RC_OPCODE_DPH,
+		.Name = "DPH",
+		.NumSrcRegs = 2,
+		.HasDstReg = 1
+	},
+	{
+		.Opcode = RC_OPCODE_DST,
+		.Name = "DST",
+		.NumSrcRegs = 2,
+		.HasDstReg = 1
+	},
+	{
+		.Opcode = RC_OPCODE_EX2,
+		.Name = "EX2",
+		.NumSrcRegs = 1,
+		.HasDstReg = 1,
+		.IsStandardScalar = 1
+	},
+	{
+		.Opcode = RC_OPCODE_EXP,
+		.Name = "EXP",
+		.NumSrcRegs = 1,
+		.HasDstReg = 1
+	},
+	{
+		.Opcode = RC_OPCODE_FLR,
+		.Name = "FLR",
+		.NumSrcRegs = 1,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_FRC,
+		.Name = "FRC",
+		.NumSrcRegs = 1,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_KIL,
+		.Name = "KIL",
+		.NumSrcRegs = 1
+	},
+	{
+		.Opcode = RC_OPCODE_LG2,
+		.Name = "LG2",
+		.NumSrcRegs = 1,
+		.HasDstReg = 1,
+		.IsStandardScalar = 1
+	},
+	{
+		.Opcode = RC_OPCODE_LIT,
+		.Name = "LIT",
+		.NumSrcRegs = 1,
+		.HasDstReg = 1
+	},
+	{
+		.Opcode = RC_OPCODE_LOG,
+		.Name = "LOG",
+		.NumSrcRegs = 1,
+		.HasDstReg = 1
+	},
+	{
+		.Opcode = RC_OPCODE_LRP,
+		.Name = "LRP",
+		.NumSrcRegs = 3,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_MAD,
+		.Name = "MAD",
+		.NumSrcRegs = 3,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_MAX,
+		.Name = "MAX",
+		.NumSrcRegs = 2,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_MIN,
+		.Name = "MIN",
+		.NumSrcRegs = 2,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_MOV,
+		.Name = "MOV",
+		.NumSrcRegs = 1,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_MUL,
+		.Name = "MUL",
+		.NumSrcRegs = 2,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_POW,
+		.Name = "POW",
+		.NumSrcRegs = 2,
+		.HasDstReg = 1,
+		.IsStandardScalar = 1
+	},
+	{
+		.Opcode = RC_OPCODE_RCP,
+		.Name = "RCP",
+		.NumSrcRegs = 1,
+		.HasDstReg = 1,
+		.IsStandardScalar = 1
+	},
+	{
+		.Opcode = RC_OPCODE_RSQ,
+		.Name = "RSQ",
+		.NumSrcRegs = 1,
+		.HasDstReg = 1,
+		.IsStandardScalar = 1
+	},
+	{
+		.Opcode = RC_OPCODE_SCS,
+		.Name = "SCS",
+		.NumSrcRegs = 1,
+		.HasDstReg = 1
+	},
+	{
+		.Opcode = RC_OPCODE_SEQ,
+		.Name = "SEQ",
+		.NumSrcRegs = 2,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_SFL,
+		.Name = "SFL",
+		.NumSrcRegs = 0,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_SGE,
+		.Name = "SGE",
+		.NumSrcRegs = 2,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_SGT,
+		.Name = "SGT",
+		.NumSrcRegs = 2,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_SIN,
+		.Name = "SIN",
+		.NumSrcRegs = 1,
+		.HasDstReg = 1,
+		.IsStandardScalar = 1
+	},
+	{
+		.Opcode = RC_OPCODE_SLE,
+		.Name = "SLE",
+		.NumSrcRegs = 2,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_SLT,
+		.Name = "SLT",
+		.NumSrcRegs = 2,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_SNE,
+		.Name = "SNE",
+		.NumSrcRegs = 2,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_SUB,
+		.Name = "SUB",
+		.NumSrcRegs = 2,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_SWZ,
+		.Name = "SWZ",
+		.NumSrcRegs = 1,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
+		.Opcode = RC_OPCODE_XPD,
+		.Name = "XPD",
+		.NumSrcRegs = 2,
+		.HasDstReg = 1
+	},
+	{
+		.Opcode = RC_OPCODE_TEX,
+		.Name = "TEX",
+		.HasTexture = 1,
+		.NumSrcRegs = 1,
+		.HasDstReg = 1
+	},
+	{
+		.Opcode = RC_OPCODE_TXB,
+		.Name = "TXB",
+		.HasTexture = 1,
+		.NumSrcRegs = 1,
+		.HasDstReg = 1
+	},
+	{
+		.Opcode = RC_OPCODE_TXD,
+		.Name = "TXD",
+		.HasTexture = 1,
+		.NumSrcRegs = 3,
+		.HasDstReg = 1
+	},
+	{
+		.Opcode = RC_OPCODE_TXL,
+		.Name = "TXL",
+		.HasTexture = 1,
+		.NumSrcRegs = 1,
+		.HasDstReg = 1
+	},
+	{
+		.Opcode = RC_OPCODE_TXP,
+		.Name = "TXP",
+		.HasTexture = 1,
+		.NumSrcRegs = 1,
+		.HasDstReg = 1
+	},
+	{
+		.Opcode = RC_OPCODE_IF,
+		.Name = "IF",
+		.IsFlowControl = 1,
+		.NumSrcRegs = 1
+	},
+	{
+		.Opcode = RC_OPCODE_ELSE,
+		.Name = "ELSE",
+		.IsFlowControl = 1,
+		.NumSrcRegs = 0
+	},
+	{
+		.Opcode = RC_OPCODE_ENDIF,
+		.Name = "ENDIF",
+		.IsFlowControl = 1,
+		.NumSrcRegs = 0
+	},
+	{
+		.Opcode = RC_OPCODE_REPL_ALPHA,
+		.Name = "REPL_ALPHA",
+		.HasDstReg = 1
+	},
+	{
+		.Opcode = RC_OPCODE_BEGIN_TEX,
+		.Name = "BEGIN_TEX"
+	}
+};
+
+void rc_compute_sources_for_writemask(
+		const struct rc_opcode_info * opcode,
+		unsigned int writemask,
+		unsigned int *srcmasks)
+{
+	unsigned int i;
+
+	/* For each of the 3 source slots, report which channels opcode reads to produce writemask. */
+	srcmasks[0] = srcmasks[1] = srcmasks[2] = 0;
+
+	/* KIL and IF read their operand even when nothing is written. */
+	if (opcode->Opcode == RC_OPCODE_KIL)
+		srcmasks[0] = RC_MASK_XYZW;
+	else if (opcode->Opcode == RC_OPCODE_IF)
+		srcmasks[0] = RC_MASK_X;
+
+	if (writemask == 0)
+		return;
+
+	if (opcode->IsComponentwise || opcode->IsStandardScalar) {
+		/* componentwise ops read what they write; standard scalars read only X */
+		const unsigned int need = opcode->IsComponentwise ? writemask : RC_MASK_X;
+		for(i = 0; i < opcode->NumSrcRegs; ++i)
+			srcmasks[i] |= need;
+		return;
+	}
+
+	switch(opcode->Opcode) {
+	case RC_OPCODE_ARL:
+		srcmasks[0] |= RC_MASK_X;
+		break;
+	case RC_OPCODE_EXP:
+	case RC_OPCODE_LOG:
+		srcmasks[0] |= RC_MASK_XY;
+		break;
+	case RC_OPCODE_LIT:
+		srcmasks[0] |= 0xb; /* X, Y and W */
+		break;
+	case RC_OPCODE_TEX:
+	case RC_OPCODE_TXB:
+	case RC_OPCODE_TXP:
+		srcmasks[0] |= RC_MASK_XYZW;
+		break;
+	case RC_OPCODE_DP3:
+		srcmasks[0] |= RC_MASK_XYZ;
+		srcmasks[1] |= RC_MASK_XYZ;
+		break;
+	case RC_OPCODE_DP4:
+		srcmasks[0] |= RC_MASK_XYZW;
+		srcmasks[1] |= RC_MASK_XYZW;
+		break;
+	case RC_OPCODE_DST:
+		srcmasks[0] |= 0x6; /* Y and Z */
+		srcmasks[1] |= 0xa; /* Y and W */
+		break;
+	default:
+		break; /* NOTE(review): DPH, SCS, XPD, TXD and TXL land here and report no reads */
+	}
+}
diff --git a/r300/compiler/radeon_opcodes.h b/r300/compiler/radeon_opcodes.h
new file mode 100644
index 0000000..a3c5b86
--- /dev/null
+++ b/r300/compiler/radeon_opcodes.h
@@ -0,0 +1,235 @@
+/*
+ * Copyright (C) 2009 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef RADEON_OPCODES_H
+#define RADEON_OPCODES_H
+
+#include <assert.h>
+
+/**
+ * Opcodes understood by the Radeon compiler.
+ */
+typedef enum {
+	RC_OPCODE_NOP = 0,
+	RC_OPCODE_ILLEGAL_OPCODE,
+
+	/** vec4 instruction: dst.c = abs(src0.c); */
+	RC_OPCODE_ABS,
+
+	/** vec4 instruction: dst.c = src0.c + src1.c; */
+	RC_OPCODE_ADD,
+
+	/** special instruction: load address register
+	 * dst.x = floor(src.x), where dst must be an address register */
+	RC_OPCODE_ARL,
+
+	/** vec4 instruction: dst.c = src0.c < 0.0 ? src1.c : src2.c */
+	RC_OPCODE_CMP,
+
+	/** scalar instruction: dst = cos(src0.x) */
+	RC_OPCODE_COS,
+
+	/** special instruction: take vec4 partial derivative in X direction
+	 * dst.c = d src0.c / dx */
+	RC_OPCODE_DDX,
+
+	/** special instruction: take vec4 partial derivative in Y direction
+	 * dst.c = d src0.c / dy */
+	RC_OPCODE_DDY,
+
+	/** scalar instruction: dst = src0.x*src1.x + src0.y*src1.y + src0.z*src1.z */
+	RC_OPCODE_DP3,
+
+	/** scalar instruction: dst = src0.x*src1.x + src0.y*src1.y + src0.z*src1.z + src0.w*src1.w */
+	RC_OPCODE_DP4,
+
+	/** scalar instruction: dst = src0.x*src1.x + src0.y*src1.y + src0.z*src1.z + src1.w */
+	RC_OPCODE_DPH,
+
+	/** special instruction, see ARB_fragment_program */
+	RC_OPCODE_DST,
+
+	/** scalar instruction: dst = 2**src0.x */
+	RC_OPCODE_EX2,
+
+	/** special instruction, see ARB_vertex_program */
+	RC_OPCODE_EXP,
+
+	/** vec4 instruction: dst.c = floor(src0.c) */
+	RC_OPCODE_FLR,
+
+	/** vec4 instruction: dst.c = src0.c - floor(src0.c) */
+	RC_OPCODE_FRC,
+
+	/** special instruction: stop execution if any component of src0 is negative */
+	RC_OPCODE_KIL,
+
+	/** scalar instruction: dst = log_2(src0.x) */
+	RC_OPCODE_LG2,
+
+	/** special instruction, see ARB_vertex_program */
+	RC_OPCODE_LIT,
+
+	/** special instruction, see ARB_vertex_program */
+	RC_OPCODE_LOG,
+
+	/** vec4 instruction: dst.c = src0.c*src1.c + (1 - src0.c)*src2.c */
+	RC_OPCODE_LRP,
+
+	/** vec4 instruction: dst.c = src0.c*src1.c + src2.c */
+	RC_OPCODE_MAD,
+
+	/** vec4 instruction: dst.c = max(src0.c, src1.c) */
+	RC_OPCODE_MAX,
+
+	/** vec4 instruction: dst.c = min(src0.c, src1.c) */
+	RC_OPCODE_MIN,
+
+	/** vec4 instruction: dst.c = src0.c */
+	RC_OPCODE_MOV,
+
+	/** vec4 instruction: dst.c = src0.c*src1.c */
+	RC_OPCODE_MUL,
+
+	/** scalar instruction: dst = src0.x ** src1.x */
+	RC_OPCODE_POW,
+
+	/** scalar instruction: dst = 1 / src0.x */
+	RC_OPCODE_RCP,
+
+	/** scalar instruction: dst = 1 / sqrt(src0.x) */
+	RC_OPCODE_RSQ,
+
+	/** special instruction, see ARB_fragment_program */
+	RC_OPCODE_SCS,
+
+	/** vec4 instruction: dst.c = (src0.c == src1.c) ? 1.0 : 0.0 */
+	RC_OPCODE_SEQ,
+
+	/** vec4 instruction: dst.c = 0.0 */
+	RC_OPCODE_SFL,
+
+	/** vec4 instruction: dst.c = (src0.c >= src1.c) ? 1.0 : 0.0 */
+	RC_OPCODE_SGE,
+
+	/** vec4 instruction: dst.c = (src0.c > src1.c) ? 1.0 : 0.0 */
+	RC_OPCODE_SGT,
+
+	/** scalar instruction: dst = sin(src0.x) */
+	RC_OPCODE_SIN,
+
+	/** vec4 instruction: dst.c = (src0.c <= src1.c) ? 1.0 : 0.0 */
+	RC_OPCODE_SLE,
+
+	/** vec4 instruction: dst.c = (src0.c < src1.c) ? 1.0 : 0.0 */
+	RC_OPCODE_SLT,
+
+	/** vec4 instruction: dst.c = (src0.c != src1.c) ? 1.0 : 0.0 */
+	RC_OPCODE_SNE,
+
+	/** vec4 instruction: dst.c = src0.c - src1.c */
+	RC_OPCODE_SUB,
+
+	/** vec4 instruction: dst.c = src0.c */
+	RC_OPCODE_SWZ,
+
+	/** special instruction, see ARB_fragment_program */
+	RC_OPCODE_XPD,
+
+	RC_OPCODE_TEX,
+	RC_OPCODE_TXB,
+	RC_OPCODE_TXD,
+	RC_OPCODE_TXL,
+	RC_OPCODE_TXP,
+
+	/** branch instruction:
+	 * If src0.x != 0.0, continue with the next instruction;
+	 * otherwise, jump to matching RC_OPCODE_ELSE or RC_OPCODE_ENDIF.
+	 */
+	RC_OPCODE_IF,
+
+	/** branch instruction: jump to matching RC_OPCODE_ENDIF */
+	RC_OPCODE_ELSE,
+
+	/** branch instruction: has no effect */
+	RC_OPCODE_ENDIF,
+
+	/** special instruction, used in R300-R500 fragment program pair instructions
+	 * indicates that the result of the alpha operation shall be replicated
+	 * across all other channels */
+	RC_OPCODE_REPL_ALPHA,
+
+	/** special instruction, used in R300-R500 fragment programs
+	 * to indicate the start of a block of texture instructions that
+	 * can run simultaneously. */
+	RC_OPCODE_BEGIN_TEX,
+
+	MAX_RC_OPCODE
+} rc_opcode;
+
+
+struct rc_opcode_info {
+	rc_opcode Opcode; /**< must equal this entry's index in rc_opcodes; rc_get_opcode_info asserts it */
+	const char * Name; /**< mnemonic string, e.g. "MAD" */
+
+	/** true if the instruction reads from a texture.
+	 *
+	 * \note This is false for the KIL instruction, even though KIL is
+	 * a texture instruction from a hardware point of view. */
+	unsigned int HasTexture:1;
+
+	unsigned int NumSrcRegs:2; /**< number of source operands (0 to 3) */
+	unsigned int HasDstReg:1; /**< true if the instruction has a destination register */
+
+	/** true if this instruction affects control flow */
+	unsigned int IsFlowControl:1;
+
+	/** true if this is a vector instruction that operates on components in parallel
+	 * without any cross-component interaction */
+	unsigned int IsComponentwise:1;
+
+	/** true if this instruction sources only its operands' X components
+	 * to compute one result which is smeared across all output channels */
+	unsigned int IsStandardScalar:1;
+};
+
+extern struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE];
+
+static inline const struct rc_opcode_info * rc_get_opcode_info(rc_opcode opcode)
+{
+	assert((unsigned int)opcode < MAX_RC_OPCODE); /* the unsigned cast also rejects negative values */
+	assert(rc_opcodes[opcode].Opcode == opcode); /* table order must match the enum order */
+	/* Both checks compile away under NDEBUG, so callers must pass a valid opcode. */
+	return &rc_opcodes[opcode];
+}
+
+void rc_compute_sources_for_writemask(
+		const struct rc_opcode_info * opcode,
+		unsigned int writemask,
+		unsigned int *srcmasks);
+
+#endif /* RADEON_OPCODES_H */
diff --git a/r300/compiler/radeon_pair_regalloc.c b/r300/compiler/radeon_pair_regalloc.c
new file mode 100644
index 0000000..23c2a5e
--- /dev/null
+++ b/r300/compiler/radeon_pair_regalloc.c
@@ -0,0 +1,359 @@
+/*
+ * Copyright (C) 2009 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_program_pair.h"
+
+#include <stdio.h>
+
+#include "radeon_compiler.h"
+#include "radeon_dataflow.h"
+
+
+#define VERBOSE 0
+
+#define DBG(...) do { if (VERBOSE) fprintf(stderr, __VA_ARGS__); } while(0)
+
+
+struct live_intervals {
+	int Start; /* lower bound as an instruction IP; scan_callback stores -1 for inputs */
+	int End; /* upper bound as an instruction IP */
+	struct live_intervals * Next; /* next interval in the list; a null pointer ends it */
+};
+
+struct register_info {
+	struct live_intervals Live; /* one [Start, End] range, maintained by scan_callback */
+
+	unsigned int Used:1; /* set once any instruction reads or writes the register */
+	unsigned int Allocated:1; /* set when File/Index below hold the assigned location */
+	rc_register_file File:3; /* register file after allocation */
+	unsigned int Index:RC_REGISTER_INDEX_BITS; /* register index after allocation */
+};
+
+struct hardware_register {
+	struct live_intervals * Used; /* intervals already claimed on this register; null while unclaimed */
+};
+
+struct regalloc_state {
+	struct radeon_compiler * C;
+	/* Per-register bookkeeping, indexed by the register's index within its file. */
+	struct register_info Input[RC_REGISTER_MAX_INDEX];
+	struct register_info Temporary[RC_REGISTER_MAX_INDEX];
+	/* One interval list per hardware temporary: HwTemporary[0 .. NumHwTemporaries-1]. */
+	struct hardware_register * HwTemporary;
+	unsigned int NumHwTemporaries;
+};
+
+static void print_live_intervals(struct live_intervals * src)
+{
+	const struct live_intervals * it;
+
+	/* Debug dump of a list as "(start,end)" pairs; an empty list prints a placeholder. */
+	if (!src) {
+		DBG("(null)");
+		return;
+	}
+	for(it = src; it; it = it->Next)
+		DBG("(%i,%i)", it->Start, it->End);
+}
+
+static void add_live_intervals(struct regalloc_state * s,
+		struct live_intervals ** dst, struct live_intervals * src)
+{
+	struct live_intervals ** dst_backup = dst; /* list head, kept for the debug dump below */
+	/* Fold every interval of src into the list *dst; src itself is only read. */
+	if (VERBOSE) {
+		DBG("add_live_intervals: ");
+		print_live_intervals(*dst);
+		DBG(" to ");
+		print_live_intervals(src);
+		DBG("\n");
+	}
+	/* dst always points at the link under consideration, so an insert is just *dst = li. */
+	while(src) {
+		if (*dst && (*dst)->End < src->Start) { /* *dst lies wholly before src: walk on */
+			dst = &(*dst)->Next;
+		} else if (!*dst || (*dst)->Start > src->End) { /* end of list, or *dst wholly after src */
+			struct live_intervals * li = memory_pool_malloc(&s->C->Pool, sizeof(*li)); /* pool-allocated copy of src */
+			li->Start = src->Start;
+			li->End = src->End;
+			li->Next = *dst;
+			*dst = li;
+			src = src->Next;
+		} else { /* ranges touch or overlap: widen *dst so that it covers src as well */
+			if (src->End > (*dst)->End)
+				(*dst)->End = src->End;
+			if (src->Start < (*dst)->Start)
+				(*dst)->Start = src->Start;
+			src = src->Next;
+		}
+	}
+	/* NOTE(review): a widened *dst is not re-checked against (*dst)->Next, so neighbours may end up overlapping. */
+	if (VERBOSE) {
+		DBG("    result: ");
+		print_live_intervals(*dst_backup);
+		DBG("\n");
+	}
+}
+
+static int overlap_live_intervals(struct live_intervals * dst, struct live_intervals * src)
+{
+	/* Walks both lists in step: 1 at the first intersecting pair, 0 once either list runs out. */
+	if (VERBOSE) {
+		DBG("overlap_live_intervals: ");
+		print_live_intervals(dst);
+		DBG(" to ");
+		print_live_intervals(src);
+		DBG("\n");
+	}
+
+	while(dst && src) {
+		/* dst ends no later than src begins: move on to the next dst interval */
+		if (dst->End <= src->Start) {
+			dst = dst->Next;
+			continue;
+		}
+		/* dst ends inside src, or begins before src ends: the two intersect */
+		if (dst->End <= src->End || dst->Start < src->End) {
+			DBG("    overlap\n");
+			return 1;
+		}
+		src = src->Next; /* dst lies wholly after src */
+	}
+
+	DBG("    no overlap\n");
+	return 0;
+}
+
+static int try_add_live_intervals(struct regalloc_state * s,
+		struct live_intervals ** dst, struct live_intervals * src)
+{
+	/* Claim src on *dst only when it is conflict-free; 1 = added, 0 = refused. */
+	const int fits = !overlap_live_intervals(*dst, src);
+	if (fits)
+		add_live_intervals(s, dst, src);
+	return fits;
+}
+
+static void scan_callback(void * data, struct rc_instruction * inst,
+		rc_register_file file, unsigned int index, unsigned int chan)
+{
+	struct regalloc_state * state = data;
+	struct register_info * info;
+
+	/* Only temporaries and inputs are tracked; chan is not looked at. */
+	switch(file) {
+	case RC_FILE_TEMPORARY: info = &state->Temporary[index]; break;
+	case RC_FILE_INPUT: info = &state->Input[index]; break;
+	default: return;
+	}
+	if (info->Used) {
+		/* Seen before: the range can only grow towards later instructions. */
+		if (inst->IP > info->Live.End)
+			info->Live.End = inst->IP;
+		return;
+	}
+	/* First access opens the range; inputs get Start = -1 instead of this IP. */
+	info->Used = 1;
+	info->Live.End = inst->IP;
+	info->Live.Start = inst->IP;
+	if (file == RC_FILE_INPUT)
+		info->Live.Start = -1;
+}
+
+static void compute_live_intervals(struct regalloc_state * s)
+{
+	struct rc_instruction * const end = &s->C->Program.Instructions;
+	struct rc_instruction * cur;
+
+	/* scan_callback records inst->IP, so the IPs are recomputed first. */
+	rc_recompute_ips(s->C);
+
+	for(cur = end->Next; cur != end; cur = cur->Next) {
+		rc_for_all_reads(cur, scan_callback, s);
+		rc_for_all_writes(cur, scan_callback, s);
+	}
+}
+
+static void rewrite_register(struct regalloc_state * s,
+		rc_register_file * file, unsigned int * index)
+{
+	const struct register_info * table;
+
+	/* Pick the table for this file; any other file passes through untouched. */
+	switch(*file) {
+	case RC_FILE_TEMPORARY: table = s->Temporary; break;
+	case RC_FILE_INPUT: table = s->Input; break;
+	default: return;
+	}
+	if (!table[*index].Allocated)
+		return;
+
+	*file = table[*index].File;
+	*index = table[*index].Index;
+}
+
+static void rewrite_normal_instruction(struct regalloc_state * s, struct rc_sub_instruction * inst)
+{
+	const struct rc_opcode_info * info = rc_get_opcode_info(inst->Opcode);
+	rc_register_file file;
+	unsigned int index;
+	unsigned int i;
+
+	/* rewrite_register works through pointers, so each File/Index pair is
+	 * copied into locals, translated, and stored back. */
+	if (info->HasDstReg) {
+		file = inst->DstReg.File;
+		index = inst->DstReg.Index;
+		rewrite_register(s, &file, &index);
+		inst->DstReg.File = file;
+		inst->DstReg.Index = index;
+	}
+
+	for(i = 0; i < info->NumSrcRegs; ++i) {
+		file = inst->SrcReg[i].File;
+		index = inst->SrcReg[i].Index;
+		rewrite_register(s, &file, &index);
+		inst->SrcReg[i].File = file;
+		inst->SrcReg[i].Index = index;
+	}
+}
+
+static void rewrite_pair_instruction(struct regalloc_state * s, struct rc_pair_instruction * inst)
+{
+	unsigned int src;
+	/* Apply the register assignment to both the RGB and the Alpha half of inst. */
+	if (inst->RGB.WriteMask) {
+		rc_register_file file = RC_FILE_TEMPORARY;
+		unsigned int index = inst->RGB.DestIndex;
+		/* destinations are looked up as temporaries; only the index is stored back */
+		rewrite_register(s, &file, &index);
+
+		inst->RGB.DestIndex = index;
+	}
+
+	if (inst->Alpha.WriteMask) {
+		rc_register_file file = RC_FILE_TEMPORARY;
+		unsigned int index = inst->Alpha.DestIndex;
+
+		rewrite_register(s, &file, &index);
+
+		inst->Alpha.DestIndex = index;
+	}
+	/* Sources carry both file and index; slots not marked Used are left alone. */
+	for(src = 0; src < 3; ++src) {
+		if (inst->RGB.Src[src].Used) {
+			rc_register_file file = inst->RGB.Src[src].File;
+			unsigned int index = inst->RGB.Src[src].Index;
+
+			rewrite_register(s, &file, &index);
+
+			inst->RGB.Src[src].File = file;
+			inst->RGB.Src[src].Index = index;
+		}
+
+		if (inst->Alpha.Src[src].Used) {
+			rc_register_file file = inst->Alpha.Src[src].File;
+			unsigned int index = inst->Alpha.Src[src].Index;
+
+			rewrite_register(s, &file, &index);
+
+			inst->Alpha.Src[src].File = file;
+			inst->Alpha.Src[src].Index = index;
+		}
+	}
+}
+
+static void do_regalloc(struct regalloc_state * s)
+{
+	struct rc_instruction * const end = &s->C->Program.Instructions;
+	struct rc_instruction * inst;
+	unsigned int index;
+
+	/* Greedy first fit: each used temporary takes the lowest-numbered
+	 * hardware register whose claimed intervals it does not overlap. */
+	for(index = 0; index < RC_REGISTER_MAX_INDEX; ++index) {
+		struct register_info * reg = &s->Temporary[index];
+		unsigned int hwreg = 0;
+
+		if (!reg->Used)
+			continue;
+
+		while(hwreg < s->NumHwTemporaries &&
+		      !try_add_live_intervals(s, &s->HwTemporary[hwreg].Used, &reg->Live))
+			++hwreg;
+
+		if (hwreg == s->NumHwTemporaries) {
+			/* Nothing fits: report and leave before any instruction is rewritten. */
+			rc_error(s->C, "Ran out of hardware temporaries\n");
+			return;
+		}
+
+		reg->Allocated = 1;
+		reg->File = RC_FILE_TEMPORARY;
+		reg->Index = hwreg;
+	}
+
+	/* Apply the assignments recorded above to every instruction. */
+	for(inst = end->Next; inst != end; inst = inst->Next) {
+		if (inst->Type == RC_INSTRUCTION_NORMAL)
+			rewrite_normal_instruction(s, &inst->U.I);
+		else
+			rewrite_pair_instruction(s, &inst->U.P);
+	}
+}
+
+static void alloc_input(void * data, unsigned int input, unsigned int hwreg)
+{
+	struct regalloc_state * state = data;
+	struct register_info * in = &state->Input[input];
+
+	/* Inputs that no instruction touches do not claim a hardware register. */
+	if (in->Used) {
+		/* Unconditional add: no overlap test against hwreg's current users. */
+		add_live_intervals(state, &state->HwTemporary[hwreg].Used, &in->Live);
+		in->Allocated = 1;
+		in->File = RC_FILE_TEMPORARY;
+		in->Index = hwreg;
+	}
+}
+
+void rc_pair_regalloc(struct r300_fragment_program_compiler *c, unsigned maxtemps)
+{
+	struct regalloc_state state;
+
+	/* Zeroed state: no register is Used or Allocated and every interval list is empty. */
+	memset(&state, 0, sizeof(state));
+	state.C = &c->Base;
+	state.NumHwTemporaries = maxtemps;
+	state.HwTemporary = memory_pool_malloc(&c->Base.Pool, maxtemps*sizeof(*state.HwTemporary));
+	memset(state.HwTemporary, 0, maxtemps*sizeof(*state.HwTemporary));
+
+	/* Measure lifetimes, let the AllocateHwInputs hook place inputs, then place temporaries. */
+	compute_live_intervals(&state);
+	c->AllocateHwInputs(c, &alloc_input, &state);
+	do_regalloc(&state);
+}
diff --git a/r300/compiler/radeon_pair_schedule.c b/r300/compiler/radeon_pair_schedule.c
new file mode 100644
index 0000000..2890c00
--- /dev/null
+++ b/r300/compiler/radeon_pair_schedule.c
@@ -0,0 +1,508 @@
+/*
+ * Copyright (C) 2009 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_program_pair.h"
+
+#include <stdio.h>
+
+#include "radeon_compiler.h"
+#include "radeon_dataflow.h"
+
+
+#define VERBOSE 0
+
+#define DBG(...) do { if (VERBOSE) fprintf(stderr, __VA_ARGS__); } while(0)
+
+struct schedule_instruction { /* per-instruction bookkeeping used while scheduling one block */
+	struct rc_instruction * Instruction;
+
+	/** Next instruction in the linked list of ready instructions. */
+	struct schedule_instruction *NextReady;
+
+	/** Values that this instruction reads and writes */
+	struct reg_value * WriteValues[4];
+	struct reg_value * ReadValues[12];
+	unsigned int NumWriteValues:3; /* number of valid WriteValues entries (0..4) */
+	unsigned int NumReadValues:4; /* number of valid ReadValues entries (0..12) */
+
+	/**
+	 * Number of (read and write) dependencies that must be resolved before
+	 * this instruction can be scheduled.
+	 */
+	unsigned int NumDependencies:5;
+};
+
+
+/**
+ * Node of a singly linked list recording which instructions read a value.
+ */
+struct reg_value_reader {
+	struct schedule_instruction *Reader; /* instruction that reads the value */
+	struct reg_value_reader *Next; /* next node of the same value's reader list, or 0 */
+};
+
+/**
+ * Used to keep track of which values are stored in each component of a
+ * RC_FILE_TEMPORARY.
+ */
+struct reg_value {
+	struct schedule_instruction * Writer;
+
+	/**
+	 * Unordered linked list of instructions that read from this value.
+	 * When this value becomes available, we decrease all readers'
+	 * dependency count.
+	 */
+	struct reg_value_reader *Readers;
+
+	/**
+	 * Number of readers of this value. This is decremented each time
+	 * a reader of the value is committed.
+	 * When the reader count reaches zero, the dependency count
+	 * of the instruction writing \ref Next is decremented.
+	 */
+	unsigned int NumReaders;
+
+	struct reg_value *Next; /**< Pointer to the next value to be written to the same register */
+};
+
+struct register_state { /* scan-time state of one temporary register */
+	struct reg_value * Values[4]; /* most recently scanned value per channel, 0 if none yet */
+};
+
+struct schedule_state { /* working state for scheduling one block (see schedule_block) */
+	struct radeon_compiler * C;
+	struct schedule_instruction * Current; /* instruction being scanned; read by scan_read/scan_write */
+
+	struct register_state Temporary[RC_REGISTER_MAX_INDEX];
+
+	/**
+	 * Linked lists of instructions that can be scheduled right now,
+	 * based on which ALU/TEX resources they require.
+	 */
+	/*@{*/
+	struct schedule_instruction *ReadyFullALU;
+	struct schedule_instruction *ReadyRGB;
+	struct schedule_instruction *ReadyAlpha;
+	struct schedule_instruction *ReadyTEX;
+	/*@}*/
+};
+
+static struct reg_value ** get_reg_valuep(struct schedule_state * s,
+		rc_register_file file, unsigned int index, unsigned int chan)
+{	/* Return the address of the tracking slot for file[index].chan, or 0 if it is not tracked. */
+	if (file != RC_FILE_TEMPORARY)
+		return 0; /* only temporaries are tracked */
+
+	if (index >= RC_REGISTER_MAX_INDEX) {
+		rc_error(s->C, "%s: index %i out of bounds\n", __FUNCTION__, index);
+		return 0;
+	}
+
+	return &s->Temporary[index].Values[chan]; /* NOTE(review): chan is not range-checked; assumes chan < 4 - confirm with callers */
+}
+
+static struct reg_value * get_reg_value(struct schedule_state * s,
+		rc_register_file file, unsigned int index, unsigned int chan)
+{	/* Return the value currently tracked for file[index].chan, or 0 if there is none. */
+	struct reg_value ** pv = get_reg_valuep(s, file, index, chan);
+	if (!pv)
+		return 0; /* untracked register file or index out of bounds */
+	return *pv; /* may itself be 0 when nothing has been written to the slot yet */
+}
+
+static void add_inst_to_list(struct schedule_instruction ** list, struct schedule_instruction * inst)
+{	/* Push inst onto the front of the NextReady-linked list *list. */
+	inst->NextReady = *list;
+	*list = inst;
+}
+
+static void instruction_ready(struct schedule_state * s, struct schedule_instruction * sinst)
+{	/* File sinst into the ready list matching the resources it needs. */
+	DBG("%i is now ready\n", sinst->Instruction->IP);
+
+	if (sinst->Instruction->Type == RC_INSTRUCTION_NORMAL) /* non-pair instructions all go to the TEX list */
+		add_inst_to_list(&s->ReadyTEX, sinst);
+	else if (sinst->Instruction->U.P.Alpha.Opcode == RC_OPCODE_NOP) /* alpha half idle: RGB-only */
+		add_inst_to_list(&s->ReadyRGB, sinst);
+	else if (sinst->Instruction->U.P.RGB.Opcode == RC_OPCODE_NOP) /* RGB half idle: alpha-only */
+		add_inst_to_list(&s->ReadyAlpha, sinst);
+	else
+		add_inst_to_list(&s->ReadyFullALU, sinst);
+}
+
+static void decrease_dependencies(struct schedule_state * s, struct schedule_instruction * sinst)
+{	/* Resolve one dependency of sinst; mark it ready once none remain. */
+	assert(sinst->NumDependencies > 0);
+	sinst->NumDependencies--;
+	if (!sinst->NumDependencies)
+		instruction_ready(s, sinst);
+}
+
+static void commit_instruction(struct schedule_state * s, struct schedule_instruction * sinst)
+{	/* Account for sinst having been emitted: release everything that was waiting on it. */
+	unsigned int i;
+
+	DBG("%i: commit\n", sinst->Instruction->IP);
+
+	for(i = 0; i < sinst->NumReadValues; ++i) { /* sinst no longer needs the values it read */
+		struct reg_value * v = sinst->ReadValues[i];
+		assert(v->NumReaders > 0);
+		v->NumReaders--;
+		if (!v->NumReaders) {
+			if (v->Next) /* last reader gone: the next write to this slot may proceed */
+				decrease_dependencies(s, v->Next->Writer);
+		}
+	}
+
+	for(i = 0; i < sinst->NumWriteValues; ++i) { /* values written by sinst are now available */
+		struct reg_value * v = sinst->WriteValues[i];
+		struct reg_value_reader * r;
+
+		if (v->NumReaders) {
+			for(r = v->Readers; r; r = r->Next) {
+				decrease_dependencies(s, r->Reader);
+			}
+		} else {
+			/* This happens in instruction sequences of the type
+			 *  OP r.x, ...;
+			 *  OP r.x, r.x, ...;
+			 * See also the subtlety in how instructions that both
+			 * read and write the same register are scanned.
+			 */
+			if (v->Next)
+				decrease_dependencies(s, v->Next->Writer);
+		}
+	}
+}
+
+/**
+ * Emit all ready texture instructions in a single block directly in front
+ * of \p before, preceded by a BEGIN_TEX marker instruction.
+ * Emit as a single block to (hopefully) sample many textures in parallel,
+ * and to avoid hardware indirections on R300.
+ */
+static void emit_all_tex(struct schedule_state * s, struct rc_instruction * before)
+{
+	struct schedule_instruction *readytex;
+
+	assert(s->ReadyTEX);
+
+	/* Don't let the ready list change under us: committing may push newly ready instructions onto s->ReadyTEX. */
+	readytex = s->ReadyTEX;
+	s->ReadyTEX = 0;
+
+	/* Node marker for R300 */
+	struct rc_instruction * inst_begin = rc_insert_new_instruction(s->C, before->Prev);
+	inst_begin->U.I.Opcode = RC_OPCODE_BEGIN_TEX;
+
+	/* Link texture instructions back in */
+	while(readytex) {
+		struct schedule_instruction * tex = readytex;
+		readytex = readytex->NextReady; /* advance before tex is handed to commit_instruction */
+
+		rc_insert_instruction(before->Prev, tex->Instruction);
+		commit_instruction(s, tex);
+	}
+}
+
+
+static int destructive_merge_instructions( /* merge alpha's alpha half into rgb, in place */
+		struct rc_pair_instruction * rgb,
+		struct rc_pair_instruction * alpha)
+{	/* Returns 1 on success, 0 on failure; on failure rgb may already be partially modified. */
+	unsigned int arg;
+
+	assert(rgb->Alpha.Opcode == RC_OPCODE_NOP);
+	assert(alpha->RGB.Opcode == RC_OPCODE_NOP);
+
+	/* Copy alpha args into rgb */
+	const struct rc_opcode_info * opcode = rc_get_opcode_info(alpha->Alpha.Opcode);
+
+	for(arg = 0; arg < opcode->NumSrcRegs; ++arg) {
+		unsigned int srcrgb = 0;
+		unsigned int srcalpha = 0;
+		unsigned int oldsrc = alpha->Alpha.Arg[arg].Source;
+		rc_register_file file = 0;
+		unsigned int index = 0;
+
+		if (alpha->Alpha.Arg[arg].Swizzle < 3) { /* argument selects an RGB channel: look it up in the RGB sources */
+			srcrgb = 1;
+			file = alpha->RGB.Src[oldsrc].File;
+			index = alpha->RGB.Src[oldsrc].Index;
+		} else if (alpha->Alpha.Arg[arg].Swizzle < 4) { /* argument selects the alpha channel */
+			srcalpha = 1;
+			file = alpha->Alpha.Src[oldsrc].File;
+			index = alpha->Alpha.Src[oldsrc].Index;
+		} /* otherwise neither flag is set; presumably a constant swizzle that needs no register - TODO confirm */
+
+		int source = rc_pair_alloc_source(rgb, srcrgb, srcalpha, file, index);
+		if (source < 0)
+			return 0; /* no source slot could be allocated in rgb */
+
+		rgb->Alpha.Arg[arg].Source = source;
+		rgb->Alpha.Arg[arg].Swizzle = alpha->Alpha.Arg[arg].Swizzle;
+		rgb->Alpha.Arg[arg].Abs = alpha->Alpha.Arg[arg].Abs;
+		rgb->Alpha.Arg[arg].Negate = alpha->Alpha.Arg[arg].Negate;
+	}
+
+	/* Copy alpha opcode into rgb */
+	rgb->Alpha.Opcode = alpha->Alpha.Opcode;
+	rgb->Alpha.DestIndex = alpha->Alpha.DestIndex;
+	rgb->Alpha.WriteMask = alpha->Alpha.WriteMask;
+	rgb->Alpha.OutputWriteMask = alpha->Alpha.OutputWriteMask;
+	rgb->Alpha.DepthWriteMask = alpha->Alpha.DepthWriteMask;
+	rgb->Alpha.Saturate = alpha->Alpha.Saturate;
+
+	/* Merge ALU result writing */
+	if (alpha->WriteALUResult) {
+		if (rgb->WriteALUResult)
+			return 0; /* both halves want to write the ALU result: cannot merge */
+
+		rgb->WriteALUResult = alpha->WriteALUResult;
+		rgb->ALUResultCompare = alpha->ALUResultCompare;
+	}
+
+	return 1;
+}
+
+/**
+ * Try to merge the given alpha instruction into the rgb instruction.
+ *
+ * Return true on success; on failure, return false, and keep
+ * both instructions untouched.
+ */
+static int merge_instructions(struct rc_pair_instruction * rgb, struct rc_pair_instruction * alpha)
+{
+	struct rc_pair_instruction backup;
+
+	memcpy(&backup, rgb, sizeof(struct rc_pair_instruction)); /* snapshot: the merge below modifies rgb even when it fails */
+
+	if (destructive_merge_instructions(rgb, alpha))
+		return 1;
+
+	memcpy(rgb, &backup, sizeof(struct rc_pair_instruction)); /* roll back the partial merge */
+	return 0;
+}
+
+
+/**
+ * Find a good ALU instruction or pair of ALU instructions and emit it.
+ *
+ * Prefer emitting full ALU instructions, so that when we reach a point
+ * where no full ALU instruction can be emitted, we have more candidates
+ * for RGB/Alpha pairing.
+ */
+static void emit_one_alu(struct schedule_state *s, struct rc_instruction * before)
+{
+	struct schedule_instruction * sinst;
+
+	if (s->ReadyFullALU || !(s->ReadyRGB && s->ReadyAlpha)) { /* no pairing possible or wanted: emit a single instruction */
+		if (s->ReadyFullALU) {
+			sinst = s->ReadyFullALU;
+			s->ReadyFullALU = s->ReadyFullALU->NextReady;
+		} else if (s->ReadyRGB) {
+			sinst = s->ReadyRGB;
+			s->ReadyRGB = s->ReadyRGB->NextReady;
+		} else { /* the caller only invokes us with a non-empty ALU list, so this is ReadyAlpha */
+			sinst = s->ReadyAlpha;
+			s->ReadyAlpha = s->ReadyAlpha->NextReady;
+		}
+
+		rc_insert_instruction(before->Prev, sinst->Instruction);
+		commit_instruction(s, sinst);
+	} else {
+		struct schedule_instruction **prgb;
+		struct schedule_instruction **palpha;
+
+		/* Some pairings might fail because they require too
+		 * many source slots; try all possible pairings if necessary */
+		for(prgb = &s->ReadyRGB; *prgb; prgb = &(*prgb)->NextReady) {
+			for(palpha = &s->ReadyAlpha; *palpha; palpha = &(*palpha)->NextReady) {
+				struct schedule_instruction * psirgb = *prgb;
+				struct schedule_instruction * psialpha = *palpha;
+
+				if (!merge_instructions(&psirgb->Instruction->U.P, &psialpha->Instruction->U.P))
+					continue;
+
+				*prgb = (*prgb)->NextReady; /* unlink both from their ready lists */
+				*palpha = (*palpha)->NextReady;
+				rc_insert_instruction(before->Prev, psirgb->Instruction); /* only the merged (rgb) instruction is emitted */
+				commit_instruction(s, psirgb);
+				commit_instruction(s, psialpha);
+				goto success;
+			}
+		}
+
+		/* No success in pairing; just take the first RGB instruction */
+		sinst = s->ReadyRGB;
+		s->ReadyRGB = s->ReadyRGB->NextReady;
+
+		rc_insert_instruction(before->Prev, sinst->Instruction);
+		commit_instruction(s, sinst);
+	success: ;
+	}
+}
+
+static void scan_read(void * data, struct rc_instruction * inst,
+		rc_register_file file, unsigned int index, unsigned int chan)
+{	/* rc_for_all_reads callback: record s->Current as a reader of the value in file[index].chan. */
+	struct schedule_state * s = data;
+	struct reg_value * v = get_reg_value(s, file, index, chan);
+
+	if (!v)
+		return; /* untracked register, or no write to this slot scanned so far */
+
+	if (v->Writer == s->Current) {
+		/* The instruction reads and writes to a register component.
+		 * In this case, we only want to increment dependencies by one.
+		 */
+		return;
+	}
+
+	DBG("%i: read %i[%i] chan %i\n", s->Current->Instruction->IP, file, index, chan);
+
+	struct reg_value_reader * reader = memory_pool_malloc(&s->C->Pool, sizeof(*reader));
+	reader->Reader = s->Current;
+	reader->Next = v->Readers;
+	v->Readers = reader;
+	v->NumReaders++;
+
+	s->Current->NumDependencies++; /* resolved when the writer of v is committed */
+
+	if (s->Current->NumReadValues >= 12) { /* 12 == capacity of ReadValues[] */
+		rc_error(s->C, "%s: NumReadValues overflow\n", __FUNCTION__);
+	} else {
+		s->Current->ReadValues[s->Current->NumReadValues++] = v;
+	}
+}
+
+static void scan_write(void * data, struct rc_instruction * inst,
+		rc_register_file file, unsigned int index, unsigned int chan)
+{	/* rc_for_all_writes callback: start a new value for file[index].chan, written by s->Current. */
+	struct schedule_state * s = data;
+	struct reg_value ** pv = get_reg_valuep(s, file, index, chan);
+
+	if (!pv)
+		return;
+
+	DBG("%i: write %i[%i] chan %i\n", s->Current->Instruction->IP, file, index, chan);
+
+	struct reg_value * newv = memory_pool_malloc(&s->C->Pool, sizeof(*newv));
+	memset(newv, 0, sizeof(*newv));
+
+	newv->Writer = s->Current;
+
+	if (*pv) { /* an earlier value occupies this slot: chain to it and wait for it to be released */
+		(*pv)->Next = newv;
+		s->Current->NumDependencies++;
+	}
+
+	*pv = newv; /* later reads of this slot now refer to the new value */
+
+	if (s->Current->NumWriteValues >= 4) { /* 4 == capacity of WriteValues[] */
+		rc_error(s->C, "%s: NumWriteValues overflow\n", __FUNCTION__);
+	} else {
+		s->Current->WriteValues[s->Current->NumWriteValues++] = newv;
+	}
+}
+
+static void schedule_block(struct r300_fragment_program_compiler * c,
+		struct rc_instruction * begin, struct rc_instruction * end)
+{	/* Reorder the instructions in [begin, end) according to their data dependencies. */
+	struct schedule_state s;
+	struct rc_instruction * inst;
+
+	memset(&s, 0, sizeof(s));
+	s.C = &c->Base;
+
+	/* Scan instructions for data dependencies */
+	unsigned int ip = 0;
+	for(inst = begin; inst != end; inst = inst->Next) {
+		s.Current = memory_pool_malloc(&c->Base.Pool, sizeof(*s.Current));
+		memset(s.Current, 0, sizeof(struct schedule_instruction));
+
+		s.Current->Instruction = inst;
+		inst->IP = ip++; /* block-local numbering, used by the DBG output */
+
+		DBG("%i: Scanning\n", inst->IP);
+
+		/* The order of things here is subtle and maybe slightly
+		 * counter-intuitive, to account for the case where an
+		 * instruction writes to the same register as it reads
+		 * from. */
+		rc_for_all_writes(inst, &scan_write, &s);
+		rc_for_all_reads(inst, &scan_read, &s);
+
+		DBG("%i: Has %i dependencies\n", inst->IP, s.Current->NumDependencies);
+
+		if (!s.Current->NumDependencies)
+			instruction_ready(&s, s.Current);
+	}
+
+	/* Temporarily unlink all instructions */
+	begin->Prev->Next = end;
+	end->Prev = begin->Prev;
+
+	/* Schedule instructions back */
+	while(!s.C->Error && /* stop as soon as any step reported an error */
+	      (s.ReadyTEX || s.ReadyRGB || s.ReadyAlpha || s.ReadyFullALU)) {
+		if (s.ReadyTEX)
+			emit_all_tex(&s, end);
+
+		while(!s.C->Error && (s.ReadyFullALU || s.ReadyRGB || s.ReadyAlpha)) /* drain ALU work before the next TEX block */
+			emit_one_alu(&s, end);
+	}
+}
+
+static int is_controlflow(struct rc_instruction * inst)
+{	/* Non-zero iff inst is a normal instruction whose opcode is marked IsFlowControl. */
+	if (inst->Type == RC_INSTRUCTION_NORMAL) {
+		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
+		return opcode->IsFlowControl;
+	}
+	return 0; /* pair instructions are never treated as control flow */
+}
+
+void rc_pair_schedule(struct r300_fragment_program_compiler *c)
+{	/* Entry point: schedule each maximal run of non-control-flow instructions independently. */
+	struct rc_instruction * inst = c->Base.Program.Instructions.Next;
+	while(inst != &c->Base.Program.Instructions) {
+		if (is_controlflow(inst)) { /* control-flow instructions stay where they are */
+			inst = inst->Next;
+			continue;
+		}
+
+		struct rc_instruction * first = inst;
+
+		while(inst != &c->Base.Program.Instructions && !is_controlflow(inst))
+			inst = inst->Next;
+
+		DBG("Schedule one block\n");
+		schedule_block(c, first, inst); /* inst is the exclusive end of the block */
+	}
+}
diff --git a/r300/compiler/radeon_pair_translate.c b/r300/compiler/radeon_pair_translate.c
new file mode 100644
index 0000000..933cf13
--- /dev/null
+++ b/r300/compiler/radeon_pair_translate.c
@@ -0,0 +1,255 @@
+/*
+ * Copyright (C) 2009 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_program_pair.h"
+
+#include "radeon_compiler.h"
+
+
+/**
+ * Finally rewrite ADD, MOV, MUL as the appropriate native instruction (MAD)
+ * and reverse the order of arguments for CMP. Other opcodes are left alone.
+ */
+static void final_rewrite(struct rc_sub_instruction *inst)
+{
+	struct rc_src_register tmp;
+
+	switch(inst->Opcode) {
+	case RC_OPCODE_ADD: /* ADD a, b  ->  MAD a, 1, b */
+		inst->SrcReg[2] = inst->SrcReg[1];
+		inst->SrcReg[1].File = RC_FILE_NONE;
+		inst->SrcReg[1].Swizzle = RC_SWIZZLE_1111;
+		inst->SrcReg[1].Negate = RC_MASK_NONE;
+		inst->Opcode = RC_OPCODE_MAD;
+		break;
+	case RC_OPCODE_CMP: /* swap operands 0 and 2 */
+		tmp = inst->SrcReg[2];
+		inst->SrcReg[2] = inst->SrcReg[0];
+		inst->SrcReg[0] = tmp;
+		break;
+	case RC_OPCODE_MOV: /* MOV a  ->  MAD a, 1, 0 */
+		/* AMD say we should use CMP.
+		 * However, when we transform
+		 *  KIL -r0;
+		 * into
+		 *  CMP tmp, -r0, -r0, 0;
+		 *  KIL tmp;
+		 * we get incorrect behaviour on R500 when r0 == 0.0.
+		 * It appears that the R500 KIL hardware treats -0.0 as less
+		 * than zero.
+		 */
+		inst->SrcReg[1].File = RC_FILE_NONE;
+		inst->SrcReg[1].Swizzle = RC_SWIZZLE_1111;
+		inst->SrcReg[2].File = RC_FILE_NONE;
+		inst->SrcReg[2].Swizzle = RC_SWIZZLE_0000;
+		inst->Opcode = RC_OPCODE_MAD;
+		break;
+	case RC_OPCODE_MUL: /* MUL a, b  ->  MAD a, b, 0 */
+		inst->SrcReg[2].File = RC_FILE_NONE;
+		inst->SrcReg[2].Swizzle = RC_SWIZZLE_0000;
+		inst->Opcode = RC_OPCODE_MAD;
+		break;
+	default:
+		/* nothing to do */
+		break;
+	}
+}
+
+
+/**
+ * Classify an instruction according to which ALUs etc. it needs; the three out-parameters are always written (0 or 1).
+ */
+static void classify_instruction(struct rc_sub_instruction * inst,
+	int * needrgb, int * needalpha, int * istranscendent)
+{
+	*needrgb = (inst->DstReg.WriteMask & RC_MASK_XYZ) ? 1 : 0; /* start from the destination write mask */
+	*needalpha = (inst->DstReg.WriteMask & RC_MASK_W) ? 1 : 0;
+	*istranscendent = 0;
+
+	if (inst->WriteALUResult == RC_ALURESULT_X) /* writing the ALU result also occupies the matching unit */
+		*needrgb = 1;
+	else if (inst->WriteALUResult == RC_ALURESULT_W)
+		*needalpha = 1;
+
+	switch(inst->Opcode) {
+	case RC_OPCODE_ADD:
+	case RC_OPCODE_CMP:
+	case RC_OPCODE_DDX:
+	case RC_OPCODE_DDY:
+	case RC_OPCODE_FRC:
+	case RC_OPCODE_MAD:
+	case RC_OPCODE_MAX:
+	case RC_OPCODE_MIN:
+	case RC_OPCODE_MOV:
+	case RC_OPCODE_MUL:
+		break; /* component-wise opcodes: the masks computed above are final */
+	case RC_OPCODE_COS:
+	case RC_OPCODE_EX2:
+	case RC_OPCODE_LG2:
+	case RC_OPCODE_RCP:
+	case RC_OPCODE_RSQ:
+	case RC_OPCODE_SIN:
+		*istranscendent = 1;
+		*needalpha = 1; /* these always use the alpha unit, whatever the write mask */
+		break;
+	case RC_OPCODE_DP4:
+		*needalpha = 1;
+		/* fall through */
+	case RC_OPCODE_DP3:
+		*needrgb = 1;
+		break;
+	default:
+		break;
+	}
+}
+
+
+/**
+ * Fill the given ALU instruction's opcodes and source operands into the given pair,
+ * if possible. \p pair is cleared first; \p inst is only read.
+ */
+static void set_pair_instruction(struct r300_fragment_program_compiler *c,
+	struct rc_pair_instruction * pair,
+	struct rc_sub_instruction * inst)
+{
+	memset(pair, 0, sizeof(struct rc_pair_instruction));
+
+	int needrgb, needalpha, istranscendent;
+	classify_instruction(inst, &needrgb, &needalpha, &istranscendent);
+
+	if (needrgb) {
+		if (istranscendent) /* RGB half just replicates the alpha result */
+			pair->RGB.Opcode = RC_OPCODE_REPL_ALPHA;
+		else
+			pair->RGB.Opcode = inst->Opcode;
+		if (inst->SaturateMode == RC_SATURATE_ZERO_ONE)
+			pair->RGB.Saturate = 1;
+	}
+	if (needalpha) {
+		pair->Alpha.Opcode = inst->Opcode;
+		if (inst->SaturateMode == RC_SATURATE_ZERO_ONE)
+			pair->Alpha.Saturate = 1;
+	}
+
+	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->Opcode);
+	int nargs = opcode->NumSrcRegs; /* NOTE(review): nargs is computed (and bumped below) but never read afterwards */
+	int i;
+
+	/* Special case for DDX/DDY (MDH/MDV). */
+	if (inst->Opcode == RC_OPCODE_DDX || inst->Opcode == RC_OPCODE_DDY) {
+		nargs++;
+	}
+
+	for(i = 0; i < opcode->NumSrcRegs; ++i) {
+		int source;
+		if (needrgb && !istranscendent) {
+			unsigned int srcrgb = 0;
+			unsigned int srcalpha = 0;
+			int j;
+			for(j = 0; j < 3; ++j) { /* which source channels do the X, Y, Z selectors touch? */
+				unsigned int swz = GET_SWZ(inst->SrcReg[i].Swizzle, j);
+				if (swz < 3)
+					srcrgb = 1;
+				else if (swz < 4)
+					srcalpha = 1;
+			}
+			source = rc_pair_alloc_source(pair, srcrgb, srcalpha,
+							inst->SrcReg[i].File, inst->SrcReg[i].Index); /* NOTE(review): result is not checked; the scheduler's merge code treats a negative result as failure */
+			pair->RGB.Arg[i].Source = source;
+			pair->RGB.Arg[i].Swizzle = inst->SrcReg[i].Swizzle & 0x1ff; /* keep the X, Y, Z selectors (low 9 bits) */
+			pair->RGB.Arg[i].Abs = inst->SrcReg[i].Abs;
+			pair->RGB.Arg[i].Negate = !!(inst->SrcReg[i].Negate & (RC_MASK_X | RC_MASK_Y | RC_MASK_Z));
+		}
+		if (needalpha) {
+			unsigned int srcrgb = 0;
+			unsigned int srcalpha = 0;
+			unsigned int swz = GET_SWZ(inst->SrcReg[i].Swizzle, istranscendent ? 0 : 3); /* transcendentals take the X selector, everything else the W selector */
+			if (swz < 3)
+				srcrgb = 1;
+			else if (swz < 4)
+				srcalpha = 1;
+			source = rc_pair_alloc_source(pair, srcrgb, srcalpha,
+							inst->SrcReg[i].File, inst->SrcReg[i].Index); /* NOTE(review): result is not checked here either */
+			pair->Alpha.Arg[i].Source = source;
+			pair->Alpha.Arg[i].Swizzle = swz;
+			pair->Alpha.Arg[i].Abs = inst->SrcReg[i].Abs;
+			pair->Alpha.Arg[i].Negate = !!(inst->SrcReg[i].Negate & RC_MASK_W);
+		}
+	}
+
+	/* Destination handling */
+	if (inst->DstReg.File == RC_FILE_OUTPUT) {
+		if (inst->DstReg.Index == c->OutputColor) {
+			pair->RGB.OutputWriteMask |= inst->DstReg.WriteMask & RC_MASK_XYZ;
+			pair->Alpha.OutputWriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
+		} else if (inst->DstReg.Index == c->OutputDepth) {
+			pair->Alpha.DepthWriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
+		} /* writes to any other output index set no mask at all */
+	} else {
+		if (needrgb) {
+			pair->RGB.DestIndex = inst->DstReg.Index;
+			pair->RGB.WriteMask |= inst->DstReg.WriteMask & RC_MASK_XYZ;
+		}
+		if (needalpha) {
+			pair->Alpha.DestIndex = inst->DstReg.Index;
+			pair->Alpha.WriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
+		}
+	}
+
+	if (inst->WriteALUResult) {
+		pair->WriteALUResult = inst->WriteALUResult;
+		pair->ALUResultCompare = inst->ALUResultCompare;
+	}
+}
+
+
+/**
+ * Translate all ALU instructions into corresponding pair instructions,
+ * performing no other changes. Texture, flow-control and KIL instructions are skipped.
+ */
+void rc_pair_translate(struct r300_fragment_program_compiler *c)
+{
+	struct rc_instruction *inst;
+
+	for(inst = c->Base.Program.Instructions.Next;
+	    inst != &c->Base.Program.Instructions;
+	    inst = inst->Next) {
+		if (inst->Type != RC_INSTRUCTION_NORMAL)
+			continue; /* already a pair instruction */
+
+		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
+
+		if (opcode->HasTexture || opcode->IsFlowControl || opcode->Opcode == RC_OPCODE_KIL)
+			continue;
+
+		struct rc_sub_instruction copy = inst->U.I; /* U.I and U.P share storage, so work from a copy */
+
+		final_rewrite(&copy);
+		inst->Type = RC_INSTRUCTION_PAIR;
+		set_pair_instruction(c, &inst->U.P, &copy);
+	}
+}
diff --git a/r300/compiler/radeon_program.c b/r300/compiler/radeon_program.c
index 605edb6..fb4752f 100644
--- a/r300/compiler/radeon_program.c
+++ b/r300/compiler/radeon_program.c
@@ -27,9 +27,9 @@
 
 #include "radeon_program.h"
 
+#include <stdio.h>
+
 #include "radeon_compiler.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
 
 
 /**
@@ -69,38 +69,58 @@ void radeonLocalTransform(
 	}
 }
 
+/**
+ * Left multiplication of a register with a swizzle: result channel i takes srcreg's selector and negate bit of channel swizzle[i].
+ */
+struct rc_src_register lmul_swizzle(unsigned int swizzle, struct rc_src_register srcreg)
+{
+	struct rc_src_register tmp = srcreg; /* every field except Swizzle and Negate is carried over */
+	int i;
+	tmp.Swizzle = 0;
+	tmp.Negate = 0;
+	for(i = 0; i < 4; ++i) {
+		rc_swizzle swz = GET_SWZ(swizzle, i);
+		if (swz < 4) { /* selector names a channel of srcreg: compose through it */
+			tmp.Swizzle |= GET_SWZ(srcreg.Swizzle, swz) << (i*3);
+			tmp.Negate |= GET_BIT(srcreg.Negate, swz) << i;
+		} else { /* any other selector is copied through, with the negate bit left clear */
+			tmp.Swizzle |= swz << (i*3);
+		}
+	}
+	return tmp;
+}
 
-GLint rc_find_free_temporary(struct radeon_compiler * c)
+unsigned int rc_find_free_temporary(struct radeon_compiler * c)
 {
 	struct rc_instruction * rcinst;
-	GLboolean used[MAX_PROGRAM_TEMPS];
-	GLuint i;
+	char used[RC_REGISTER_MAX_INDEX];
+	unsigned int i;
 
 	memset(used, 0, sizeof(used));
 
 	for (rcinst = c->Program.Instructions.Next; rcinst != &c->Program.Instructions; rcinst = rcinst->Next) {
-		const struct prog_instruction *inst = &rcinst->I;
-		const GLuint nsrc = _mesa_num_inst_src_regs(inst->Opcode);
-		const GLuint ndst = _mesa_num_inst_dst_regs(inst->Opcode);
-		GLuint k;
-
-		for (k = 0; k < nsrc; k++) {
-			if (inst->SrcReg[k].File == PROGRAM_TEMPORARY)
-				used[inst->SrcReg[k].Index] = GL_TRUE;
+		const struct rc_sub_instruction *inst = &rcinst->U.I;
+		const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->Opcode);
+		unsigned int k;
+
+		for (k = 0; k < opcode->NumSrcRegs; k++) {
+			if (inst->SrcReg[k].File == RC_FILE_TEMPORARY)
+				used[inst->SrcReg[k].Index] = 1;
 		}
 
-		if (ndst) {
-			if (inst->DstReg.File == PROGRAM_TEMPORARY)
-				used[inst->DstReg.Index] = GL_TRUE;
+		if (opcode->HasDstReg) {
+			if (inst->DstReg.File == RC_FILE_TEMPORARY)
+				used[inst->DstReg.Index] = 1;
 		}
 	}
 
-	for (i = 0; i < MAX_PROGRAM_TEMPS; i++) {
+	for (i = 0; i < RC_REGISTER_MAX_INDEX; i++) {
 		if (!used[i])
 			return i;
 	}
 
-	return -1;
+	rc_error(c, "Ran out of temporary registers\n");
+	return 0;
 }
 
 
@@ -108,24 +128,31 @@ struct rc_instruction *rc_alloc_instruction(struct radeon_compiler * c)
 {
 	struct rc_instruction * inst = memory_pool_malloc(&c->Pool, sizeof(struct rc_instruction));
 
-	inst->Prev = 0;
-	inst->Next = 0;
+	memset(inst, 0, sizeof(struct rc_instruction));
 
-	_mesa_init_instructions(&inst->I, 1);
+	inst->U.I.Opcode = RC_OPCODE_ILLEGAL_OPCODE;
+	inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
+	inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
+	inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_XYZW;
+	inst->U.I.SrcReg[2].Swizzle = RC_SWIZZLE_XYZW;
 
 	return inst;
 }
 
-
-struct rc_instruction *rc_insert_new_instruction(struct radeon_compiler * c, struct rc_instruction * after)
+void rc_insert_instruction(struct rc_instruction * after, struct rc_instruction * inst)
 {
-	struct rc_instruction * inst = rc_alloc_instruction(c);
-
 	inst->Prev = after;
 	inst->Next = after->Next;
 
 	inst->Prev->Next = inst;
 	inst->Next->Prev = inst;
+}
+
+struct rc_instruction *rc_insert_new_instruction(struct radeon_compiler * c, struct rc_instruction * after)
+{
+	struct rc_instruction * inst = rc_alloc_instruction(c);
+
+	rc_insert_instruction(after, inst);
 
 	return inst;
 }
@@ -136,76 +163,21 @@ void rc_remove_instruction(struct rc_instruction * inst)
 	inst->Next->Prev = inst->Prev;
 }
 
-
-void rc_mesa_to_rc_program(struct radeon_compiler * c, struct gl_program * program)
-{
-	struct prog_instruction *source;
-	unsigned int i;
-
-	for(source = program->Instructions; source->Opcode != OPCODE_END; ++source) {
-		struct rc_instruction * dest = rc_insert_new_instruction(c, c->Program.Instructions.Prev);
-		dest->I = *source;
-	}
-
-	c->Program.ShadowSamplers = program->ShadowSamplers;
-	c->Program.InputsRead = program->InputsRead;
-	c->Program.OutputsWritten = program->OutputsWritten;
-
-	int isNVProgram = 0;
-
-	if (program->Target == GL_VERTEX_PROGRAM_ARB) {
-		struct gl_vertex_program * vp = (struct gl_vertex_program *) program;
-		isNVProgram = vp->IsNVProgram;
-	}
-
-	if (isNVProgram) {
-		/* NV_vertex_program has a fixed-sized constant environment.
-		 * This could be handled more efficiently for programs that
-		 * do not use relative addressing.
-		 */
-		for(i = 0; i < 96; ++i) {
-			struct rc_constant constant;
-
-			constant.Type = RC_CONSTANT_EXTERNAL;
-			constant.Size = 4;
-			constant.u.External = i;
-
-			rc_constants_add(&c->Program.Constants, &constant);
-		}
-	} else {
-		for(i = 0; i < program->Parameters->NumParameters; ++i) {
-			struct rc_constant constant;
-
-			constant.Type = RC_CONSTANT_EXTERNAL;
-			constant.Size = 4;
-			constant.u.External = i;
-
-			rc_constants_add(&c->Program.Constants, &constant);
-		}
-	}
-}
-
-
 /**
- * Print program to stderr, default options.
+ * Return the number of instructions in the program.
  */
-void rc_print_program(const struct rc_program *prog)
+unsigned int rc_recompute_ips(struct radeon_compiler * c)
 {
-	GLuint indent = 0;
-	GLuint linenum = 1;
-	struct rc_instruction *inst;
-
-	fprintf(stderr, "# Radeon Compiler Program\n");
+	unsigned int ip = 0;
+	struct rc_instruction * inst;
 
-	for(inst = prog->Instructions.Next; inst != &prog->Instructions; inst = inst->Next) {
-		fprintf(stderr, "%3d: ", linenum);
+	for(inst = c->Program.Instructions.Next;
+	    inst != &c->Program.Instructions;
+	    inst = inst->Next) {
+		inst->IP = ip++;
+	}
 
-		/* Massive hack: We rely on the fact that the printers do not actually
-		 * use the gl_program argument (last argument) in debug mode */
-		indent = _mesa_fprint_instruction_opt(
-				stderr, &inst->I,
-				indent, PROG_PRINT_DEBUG, 0);
+	c->Program.Instructions.IP = 0xcafedead;
 
-		linenum++;
-	}
+	return ip;
 }
diff --git a/r300/compiler/radeon_program.h b/r300/compiler/radeon_program.h
index 5619586..0359288 100644
--- a/r300/compiler/radeon_program.h
+++ b/r300/compiler/radeon_program.h
@@ -28,37 +28,144 @@
 #ifndef __RADEON_PROGRAM_H_
 #define __RADEON_PROGRAM_H_
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/program.h"
-#include "shader/prog_instruction.h"
+#include <stdint.h>
+#include <string.h>
+
+#include "radeon_opcodes.h"
+#include "radeon_code.h"
+#include "radeon_program_constants.h"
+#include "radeon_program_pair.h"
 
 struct radeon_compiler;
-struct rc_instruction;
-struct rc_program;
 
-enum {
-	PROGRAM_BUILTIN = PROGRAM_FILE_MAX /**< not a real register, but a special swizzle constant */
+struct rc_src_register { /* one source operand of an rc_sub_instruction */
+	rc_register_file File:3;
+
+	/** Negative values may be used for relative addressing. */
+	signed int Index:(RC_REGISTER_INDEX_BITS+1);
+	unsigned int RelAddr:1;
+
+	unsigned int Swizzle:12; /* four 3-bit selectors; the selector for channel i sits at bit 3*i */
+
+	/** Take the component-wise absolute value */
+	unsigned int Abs:1;
+
+	/** Post-Abs negation, one bit per channel. */
+	unsigned int Negate:4;
+};
+
+struct rc_dst_register { /* destination operand of an rc_sub_instruction */
+	rc_register_file File:3;
+
+	/** Negative values may be used for relative addressing. */
+	signed int Index:(RC_REGISTER_INDEX_BITS+1);
+	unsigned int RelAddr:1;
+
+	unsigned int WriteMask:4; /* one bit per channel; tested against the RC_MASK_* constants */
+};
+
+/**
+ * Instructions are maintained by the compiler in a doubly linked list
+ * of these structures.
+ *
+ * This instruction format is intended to be expanded for hardware-specific
+ * trickery. At different stages of compilation, a different set of
+ * instruction types may be valid.
+ */
+struct rc_sub_instruction {
+	struct rc_src_register SrcReg[3]; /* how many entries are meaningful is given by the opcode's NumSrcRegs */
+	struct rc_dst_register DstReg; /* meaningful when the opcode's HasDstReg is set */
+
+	/**
+	 * Opcode of this instruction, according to \ref rc_opcode enums.
+	 */
+	rc_opcode Opcode:8;
+
+	/**
+	 * Saturate each value of the result to the range [0,1] or [-1,1],
+	 * according to \ref rc_saturate_mode enums.
+	 */
+	rc_saturate_mode SaturateMode:2;
+
+	/**
+	 * Writing to the special register RC_SPECIAL_ALU_RESULT
+	 */
+	/*@{*/
+	rc_write_aluresult WriteALUResult:2;
+	rc_compare_func ALUResultCompare:3;
+	/*@}*/
+
+	/**
+	 * \name Extra fields for TEX, TXB, TXD, TXL, TXP instructions.
+	 */
+	/*@{*/
+	/** Source texture unit. */
+	unsigned int TexSrcUnit:5;
+
+	/** Source texture target, one of the \ref rc_texture_target enums */
+	rc_texture_target TexSrcTarget:3;
+
+	/** True if tex instruction should do shadow comparison */
+	unsigned int TexShadow:1;
+	/*@}*/
+};
+
+typedef enum { /* discriminator for the union in struct rc_instruction */
+	RC_INSTRUCTION_NORMAL = 0, /* the rc_sub_instruction member (U.I) is in use */
+	RC_INSTRUCTION_PAIR /* the rc_pair_instruction member (U.P) is in use */
+} rc_instruction_type;
+
+struct rc_instruction { /* node of the doubly linked instruction list */
+	struct rc_instruction * Prev;
+	struct rc_instruction * Next;
+
+	rc_instruction_type Type; /* selects which member of U is in use */
+	union {
+		struct rc_sub_instruction I;
+		struct rc_pair_instruction P;
+	} U;
+
+	/**
+	 * Warning: IPs are not stable. If you want to use them,
+	 * you need to recompute them at the beginning of each pass
+	 * using \ref rc_recompute_ips
+	 */
+	unsigned int IP;
+};
+
+struct rc_program {
+	/**
+	 * Instructions.Next points to the first instruction,
+	 * Instructions.Prev points to the last instruction.
+	 */
+	struct rc_instruction Instructions;
+
+	/* Long term, we should probably remove InputsRead & OutputsWritten,
+	 * since updating dependent state can be fragile, and they aren't
+	 * actually used very often. */
+	uint32_t InputsRead;
+	uint32_t OutputsWritten;
+	uint32_t ShadowSamplers; /**< Texture units used for shadow sampling. */
+
+	struct rc_constant_list Constants;
 };
 
 enum {
-	OPCODE_REPL_ALPHA = MAX_OPCODE /**< used in paired instructions */
+	OPCODE_REPL_ALPHA = MAX_RC_OPCODE /**< used in paired instructions */
 };
 
-#define SWIZZLE_0000 MAKE_SWIZZLE4(SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO)
-#define SWIZZLE_1111 MAKE_SWIZZLE4(SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE)
 
-static inline GLuint get_swz(GLuint swz, GLuint idx)
+static inline rc_swizzle get_swz(unsigned int swz, rc_swizzle idx)
 {
 	if (idx & 0x4)
 		return idx;
 	return GET_SWZ(swz, idx);
 }
 
-static inline GLuint combine_swizzles4(GLuint src, GLuint swz_x, GLuint swz_y, GLuint swz_z, GLuint swz_w)
+static inline unsigned int combine_swizzles4(unsigned int src,
+		rc_swizzle swz_x, rc_swizzle swz_y, rc_swizzle swz_z, rc_swizzle swz_w)
 {
-	GLuint ret = 0;
+	unsigned int ret = 0;
 
 	ret |= get_swz(src, swz_x);
 	ret |= get_swz(src, swz_y) << 3;
@@ -68,22 +175,24 @@ static inline GLuint combine_swizzles4(GLuint src, GLuint swz_x, GLuint swz_y, G
 	return ret;
 }
 
-static inline GLuint combine_swizzles(GLuint src, GLuint swz)
+static inline unsigned int combine_swizzles(unsigned int src, unsigned int swz)
 {
-	GLuint ret = 0;
+	unsigned int ret = 0;
 
-	ret |= get_swz(src, GET_SWZ(swz, SWIZZLE_X));
-	ret |= get_swz(src, GET_SWZ(swz, SWIZZLE_Y)) << 3;
-	ret |= get_swz(src, GET_SWZ(swz, SWIZZLE_Z)) << 6;
-	ret |= get_swz(src, GET_SWZ(swz, SWIZZLE_W)) << 9;
+	ret |= get_swz(src, GET_SWZ(swz, RC_SWIZZLE_X));
+	ret |= get_swz(src, GET_SWZ(swz, RC_SWIZZLE_Y)) << 3;
+	ret |= get_swz(src, GET_SWZ(swz, RC_SWIZZLE_Z)) << 6;
+	ret |= get_swz(src, GET_SWZ(swz, RC_SWIZZLE_W)) << 9;
 
 	return ret;
 }
 
-static INLINE void reset_srcreg(struct prog_src_register* reg)
+struct rc_src_register lmul_swizzle(unsigned int swizzle, struct rc_src_register srcreg);
+
+static inline void reset_srcreg(struct rc_src_register* reg)
 {
-	_mesa_bzero(reg, sizeof(*reg));
-	reg->Swizzle = SWIZZLE_NOOP;
+	memset(reg, 0, sizeof(struct rc_src_register));
+	reg->Swizzle = RC_SWIZZLE_XYZW;
 }
 
 
@@ -92,13 +201,13 @@ static INLINE void reset_srcreg(struct prog_src_register* reg)
  *
  * The function will be called once for each instruction.
  * It has to either emit the appropriate transformed code for the instruction
- * and return GL_TRUE, or return GL_FALSE if it doesn't understand the
+ * and return true, or return false if it doesn't understand the
  * instruction.
  *
  * The function gets passed the userData as last parameter.
  */
 struct radeon_program_transformation {
-	GLboolean (*function)(
+	int (*function)(
 		struct radeon_compiler*,
 		struct rc_instruction*,
 		void*);
@@ -110,12 +219,15 @@ void radeonLocalTransform(
 	int num_transformations,
 	struct radeon_program_transformation* transformations);
 
-GLint rc_find_free_temporary(struct radeon_compiler * c);
+unsigned int rc_find_free_temporary(struct radeon_compiler * c);
 
 struct rc_instruction *rc_alloc_instruction(struct radeon_compiler * c);
 struct rc_instruction *rc_insert_new_instruction(struct radeon_compiler * c, struct rc_instruction * after);
+void rc_insert_instruction(struct rc_instruction * after, struct rc_instruction * inst);
 void rc_remove_instruction(struct rc_instruction * inst);
 
+unsigned int rc_recompute_ips(struct radeon_compiler * c);
+
 void rc_print_program(const struct rc_program *prog);
 
 #endif
diff --git a/r300/compiler/radeon_program_alu.c b/r300/compiler/radeon_program_alu.c
index f23ce30..ced66af 100644
--- a/r300/compiler/radeon_program_alu.c
+++ b/r300/compiler/radeon_program_alu.c
@@ -40,175 +40,164 @@
 
 static struct rc_instruction *emit1(
 	struct radeon_compiler * c, struct rc_instruction * after,
-	gl_inst_opcode Opcode, GLuint Saturate, struct prog_dst_register DstReg,
-	struct prog_src_register SrcReg)
+	rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
+	struct rc_src_register SrcReg)
 {
 	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
 
-	fpi->I.Opcode = Opcode;
-	fpi->I.SaturateMode = Saturate;
-	fpi->I.DstReg = DstReg;
-	fpi->I.SrcReg[0] = SrcReg;
+	fpi->U.I.Opcode = Opcode;
+	fpi->U.I.SaturateMode = Saturate;
+	fpi->U.I.DstReg = DstReg;
+	fpi->U.I.SrcReg[0] = SrcReg;
 	return fpi;
 }
 
 static struct rc_instruction *emit2(
 	struct radeon_compiler * c, struct rc_instruction * after,
-	gl_inst_opcode Opcode, GLuint Saturate, struct prog_dst_register DstReg,
-	struct prog_src_register SrcReg0, struct prog_src_register SrcReg1)
+	rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
+	struct rc_src_register SrcReg0, struct rc_src_register SrcReg1)
 {
 	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
 
-	fpi->I.Opcode = Opcode;
-	fpi->I.SaturateMode = Saturate;
-	fpi->I.DstReg = DstReg;
-	fpi->I.SrcReg[0] = SrcReg0;
-	fpi->I.SrcReg[1] = SrcReg1;
+	fpi->U.I.Opcode = Opcode;
+	fpi->U.I.SaturateMode = Saturate;
+	fpi->U.I.DstReg = DstReg;
+	fpi->U.I.SrcReg[0] = SrcReg0;
+	fpi->U.I.SrcReg[1] = SrcReg1;
 	return fpi;
 }
 
 static struct rc_instruction *emit3(
 	struct radeon_compiler * c, struct rc_instruction * after,
-	gl_inst_opcode Opcode, GLuint Saturate, struct prog_dst_register DstReg,
-	struct prog_src_register SrcReg0, struct prog_src_register SrcReg1,
-	struct prog_src_register SrcReg2)
+	rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
+	struct rc_src_register SrcReg0, struct rc_src_register SrcReg1,
+	struct rc_src_register SrcReg2)
 {
 	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
 
-	fpi->I.Opcode = Opcode;
-	fpi->I.SaturateMode = Saturate;
-	fpi->I.DstReg = DstReg;
-	fpi->I.SrcReg[0] = SrcReg0;
-	fpi->I.SrcReg[1] = SrcReg1;
-	fpi->I.SrcReg[2] = SrcReg2;
+	fpi->U.I.Opcode = Opcode;
+	fpi->U.I.SaturateMode = Saturate;
+	fpi->U.I.DstReg = DstReg;
+	fpi->U.I.SrcReg[0] = SrcReg0;
+	fpi->U.I.SrcReg[1] = SrcReg1;
+	fpi->U.I.SrcReg[2] = SrcReg2;
 	return fpi;
 }
 
-static struct prog_dst_register dstreg(int file, int index)
+static struct rc_dst_register dstreg(int file, int index)
 {
-	struct prog_dst_register dst;
+	struct rc_dst_register dst;
 	dst.File = file;
 	dst.Index = index;
-	dst.WriteMask = WRITEMASK_XYZW;
-	dst.CondMask = COND_TR;
+	dst.WriteMask = RC_MASK_XYZW;
 	dst.RelAddr = 0;
-	dst.CondSwizzle = SWIZZLE_NOOP;
-	dst.CondSrc = 0;
-	dst.pad = 0;
 	return dst;
 }
 
-static struct prog_dst_register dstregtmpmask(int index, int mask)
+static struct rc_dst_register dstregtmpmask(int index, int mask)
 {
-	struct prog_dst_register dst = {0};
-	dst.File = PROGRAM_TEMPORARY;
+	struct rc_dst_register dst = {0};
+	dst.File = RC_FILE_TEMPORARY;
 	dst.Index = index;
 	dst.WriteMask = mask;
 	dst.RelAddr = 0;
-	dst.CondMask = COND_TR;
-	dst.CondSwizzle = SWIZZLE_NOOP;
-	dst.CondSrc = 0;
-	dst.pad = 0;
 	return dst;
 }
 
-static const struct prog_src_register builtin_zero = {
-	.File = PROGRAM_BUILTIN,
+static const struct rc_src_register builtin_zero = {
+	.File = RC_FILE_NONE,
 	.Index = 0,
-	.Swizzle = SWIZZLE_0000
+	.Swizzle = RC_SWIZZLE_0000
 };
-static const struct prog_src_register builtin_one = {
-	.File = PROGRAM_BUILTIN,
+static const struct rc_src_register builtin_one = {
+	.File = RC_FILE_NONE,
 	.Index = 0,
-	.Swizzle = SWIZZLE_1111
+	.Swizzle = RC_SWIZZLE_1111
 };
-static const struct prog_src_register srcreg_undefined = {
-	.File = PROGRAM_UNDEFINED,
+static const struct rc_src_register srcreg_undefined = {
+	.File = RC_FILE_NONE,
 	.Index = 0,
-	.Swizzle = SWIZZLE_NOOP
+	.Swizzle = RC_SWIZZLE_XYZW
 };
 
-static struct prog_src_register srcreg(int file, int index)
+static struct rc_src_register srcreg(int file, int index)
 {
-	struct prog_src_register src = srcreg_undefined;
+	struct rc_src_register src = srcreg_undefined;
 	src.File = file;
 	src.Index = index;
 	return src;
 }
 
-static struct prog_src_register srcregswz(int file, int index, int swz)
+static struct rc_src_register srcregswz(int file, int index, int swz)
 {
-	struct prog_src_register src = srcreg_undefined;
+	struct rc_src_register src = srcreg_undefined;
 	src.File = file;
 	src.Index = index;
 	src.Swizzle = swz;
 	return src;
 }
 
-static struct prog_src_register absolute(struct prog_src_register reg)
+static struct rc_src_register absolute(struct rc_src_register reg)
 {
-	struct prog_src_register newreg = reg;
+	struct rc_src_register newreg = reg;
 	newreg.Abs = 1;
-	newreg.Negate = NEGATE_NONE;
+	newreg.Negate = RC_MASK_NONE;
 	return newreg;
 }
 
-static struct prog_src_register negate(struct prog_src_register reg)
+static struct rc_src_register negate(struct rc_src_register reg)
 {
-	struct prog_src_register newreg = reg;
-	newreg.Negate = newreg.Negate ^ NEGATE_XYZW;
+	struct rc_src_register newreg = reg;
+	newreg.Negate = newreg.Negate ^ RC_MASK_XYZW;
 	return newreg;
 }
 
-static struct prog_src_register swizzle(struct prog_src_register reg, GLuint x, GLuint y, GLuint z, GLuint w)
+static struct rc_src_register swizzle(struct rc_src_register reg,
+		rc_swizzle x, rc_swizzle y, rc_swizzle z, rc_swizzle w)
 {
-	struct prog_src_register swizzled = reg;
-	swizzled.Swizzle = MAKE_SWIZZLE4(
-		x >= 4 ? x : GET_SWZ(reg.Swizzle, x),
-		y >= 4 ? y : GET_SWZ(reg.Swizzle, y),
-		z >= 4 ? z : GET_SWZ(reg.Swizzle, z),
-		w >= 4 ? w : GET_SWZ(reg.Swizzle, w));
+	struct rc_src_register swizzled = reg;
+	swizzled.Swizzle = combine_swizzles4(reg.Swizzle, x, y, z, w);
 	return swizzled;
 }
 
-static struct prog_src_register scalar(struct prog_src_register reg)
+static struct rc_src_register scalar(struct rc_src_register reg)
 {
-	return swizzle(reg, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X);
+	return swizzle(reg, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X);
 }
 
 static void transform_ABS(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	struct prog_src_register src = inst->I.SrcReg[0];
+	struct rc_src_register src = inst->U.I.SrcReg[0];
 	src.Abs = 1;
-	src.Negate = NEGATE_NONE;
-	emit1(c, inst->Prev, OPCODE_MOV, inst->I.SaturateMode, inst->I.DstReg, src);
+	src.Negate = RC_MASK_NONE;
+	emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode, inst->U.I.DstReg, src);
 	rc_remove_instruction(inst);
 }
 
 static void transform_DP3(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	struct prog_src_register src0 = inst->I.SrcReg[0];
-	struct prog_src_register src1 = inst->I.SrcReg[1];
-	src0.Negate &= ~NEGATE_W;
+	struct rc_src_register src0 = inst->U.I.SrcReg[0];
+	struct rc_src_register src1 = inst->U.I.SrcReg[1];
+	src0.Negate &= ~RC_MASK_W;
 	src0.Swizzle &= ~(7 << (3 * 3));
-	src0.Swizzle |= SWIZZLE_ZERO << (3 * 3);
-	src1.Negate &= ~NEGATE_W;
+	src0.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
+	src1.Negate &= ~RC_MASK_W;
 	src1.Swizzle &= ~(7 << (3 * 3));
-	src1.Swizzle |= SWIZZLE_ZERO << (3 * 3);
-	emit2(c, inst->Prev, OPCODE_DP4, inst->I.SaturateMode, inst->I.DstReg, src0, src1);
+	src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
+	emit2(c, inst->Prev, RC_OPCODE_DP4, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, src1);
 	rc_remove_instruction(inst);
 }
 
 static void transform_DPH(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	struct prog_src_register src0 = inst->I.SrcReg[0];
-	src0.Negate &= ~NEGATE_W;
+	struct rc_src_register src0 = inst->U.I.SrcReg[0];
+	src0.Negate &= ~RC_MASK_W;
 	src0.Swizzle &= ~(7 << (3 * 3));
-	src0.Swizzle |= SWIZZLE_ONE << (3 * 3);
-	emit2(c, inst->Prev, OPCODE_DP4, inst->I.SaturateMode, inst->I.DstReg, src0, inst->I.SrcReg[1]);
+	src0.Swizzle |= RC_SWIZZLE_ONE << (3 * 3);
+	emit2(c, inst->Prev, RC_OPCODE_DP4, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]);
 	rc_remove_instruction(inst);
 }
 
@@ -219,9 +208,9 @@ static void transform_DPH(struct radeon_compiler* c,
 static void transform_DST(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	emit2(c, inst->Prev, OPCODE_MUL, inst->I.SaturateMode, inst->I.DstReg,
-		swizzle(inst->I.SrcReg[0], SWIZZLE_ONE, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE),
-		swizzle(inst->I.SrcReg[1], SWIZZLE_ONE, SWIZZLE_Y, SWIZZLE_ONE, SWIZZLE_W));
+	emit2(c, inst->Prev, RC_OPCODE_MUL, inst->U.I.SaturateMode, inst->U.I.DstReg,
+		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE),
+		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W));
 	rc_remove_instruction(inst);
 }
 
@@ -229,9 +218,9 @@ static void transform_FLR(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
 	int tempreg = rc_find_free_temporary(c);
-	emit1(c, inst->Prev, OPCODE_FRC, 0, dstreg(PROGRAM_TEMPORARY, tempreg), inst->I.SrcReg[0]);
-	emit2(c, inst->Prev, OPCODE_ADD, inst->I.SaturateMode, inst->I.DstReg,
-		inst->I.SrcReg[0], negate(srcreg(PROGRAM_TEMPORARY, tempreg)));
+	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0]);
+	emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
+		inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, tempreg)));
 	rc_remove_instruction(inst);
 }
 
@@ -256,64 +245,64 @@ static void transform_FLR(struct radeon_compiler* c,
 static void transform_LIT(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	GLuint constant;
-	GLuint constant_swizzle;
-	GLuint temp;
-	struct prog_src_register srctemp;
+	unsigned int constant;
+	unsigned int constant_swizzle;
+	unsigned int temp;
+	struct rc_src_register srctemp;
 
 	constant = rc_constants_add_immediate_scalar(&c->Program.Constants, -127.999999, &constant_swizzle);
 
-	if (inst->I.DstReg.WriteMask != WRITEMASK_XYZW || inst->I.DstReg.File != PROGRAM_TEMPORARY) {
+	if (inst->U.I.DstReg.WriteMask != RC_MASK_XYZW || inst->U.I.DstReg.File != RC_FILE_TEMPORARY) {
 		struct rc_instruction * inst_mov;
 
 		inst_mov = emit1(c, inst,
-			OPCODE_MOV, 0, inst->I.DstReg,
-			srcreg(PROGRAM_TEMPORARY, rc_find_free_temporary(c)));
+			RC_OPCODE_MOV, 0, inst->U.I.DstReg,
+			srcreg(RC_FILE_TEMPORARY, rc_find_free_temporary(c)));
 
-		inst->I.DstReg.File = PROGRAM_TEMPORARY;
-		inst->I.DstReg.Index = inst_mov->I.SrcReg[0].Index;
-		inst->I.DstReg.WriteMask = WRITEMASK_XYZW;
+		inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
+		inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index;
+		inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
 	}
 
-	temp = inst->I.DstReg.Index;
-	srctemp = srcreg(PROGRAM_TEMPORARY, temp);
+	temp = inst->U.I.DstReg.Index;
+	srctemp = srcreg(RC_FILE_TEMPORARY, temp);
 
 	// tmp.x = max(0.0, Src.x);
 	// tmp.y = max(0.0, Src.y);
 	// tmp.w = clamp(Src.z, -128+eps, 128-eps);
-	emit2(c, inst->Prev, OPCODE_MAX, 0,
-		dstregtmpmask(temp, WRITEMASK_XYW),
-		inst->I.SrcReg[0],
-		swizzle(srcreg(PROGRAM_CONSTANT, constant),
-			SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO, constant_swizzle&3));
-	emit2(c, inst->Prev, OPCODE_MIN, 0,
-		dstregtmpmask(temp, WRITEMASK_Z),
-		swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
-		negate(srcregswz(PROGRAM_CONSTANT, constant, constant_swizzle)));
+	emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
+		dstregtmpmask(temp, RC_MASK_XYW),
+		inst->U.I.SrcReg[0],
+		swizzle(srcreg(RC_FILE_CONSTANT, constant),
+			RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, constant_swizzle&3));
+	emit2(c, inst->Prev, RC_OPCODE_MIN, 0,
+		dstregtmpmask(temp, RC_MASK_Z),
+		swizzle(srctemp, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W),
+		negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)));
 
 	// tmp.w = Pow(tmp.y, tmp.w)
-	emit1(c, inst->Prev, OPCODE_LG2, 0,
-		dstregtmpmask(temp, WRITEMASK_W),
-		swizzle(srctemp, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y));
-	emit2(c, inst->Prev, OPCODE_MUL, 0,
-		dstregtmpmask(temp, WRITEMASK_W),
-		swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
-		swizzle(srctemp, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z));
-	emit1(c, inst->Prev, OPCODE_EX2, 0,
-		dstregtmpmask(temp, WRITEMASK_W),
-		swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W));
+	emit1(c, inst->Prev, RC_OPCODE_LG2, 0,
+		dstregtmpmask(temp, RC_MASK_W),
+		swizzle(srctemp, RC_SWIZZLE_Y, RC_SWIZZLE_Y, RC_SWIZZLE_Y, RC_SWIZZLE_Y));
+	emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
+		dstregtmpmask(temp, RC_MASK_W),
+		swizzle(srctemp, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W),
+		swizzle(srctemp, RC_SWIZZLE_Z, RC_SWIZZLE_Z, RC_SWIZZLE_Z, RC_SWIZZLE_Z));
+	emit1(c, inst->Prev, RC_OPCODE_EX2, 0,
+		dstregtmpmask(temp, RC_MASK_W),
+		swizzle(srctemp, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W));
 
 	// tmp.z = (tmp.x > 0) ? tmp.w : 0.0
-	emit3(c, inst->Prev, OPCODE_CMP, inst->I.SaturateMode,
-		dstregtmpmask(temp, WRITEMASK_Z),
-		negate(swizzle(srctemp, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)),
-		swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode,
+		dstregtmpmask(temp, RC_MASK_Z),
+		negate(swizzle(srctemp, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X)),
+		swizzle(srctemp, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W),
 		builtin_zero);
 
 	// tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0
-	emit1(c, inst->Prev, OPCODE_MOV, inst->I.SaturateMode,
-		dstregtmpmask(temp, WRITEMASK_XYW),
-		swizzle(srctemp, SWIZZLE_ONE, SWIZZLE_X, SWIZZLE_ONE, SWIZZLE_ONE));
+	emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode,
+		dstregtmpmask(temp, RC_MASK_XYW),
+		swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));
 
 	rc_remove_instruction(inst);
 }
@@ -323,12 +312,12 @@ static void transform_LRP(struct radeon_compiler* c,
 {
 	int tempreg = rc_find_free_temporary(c);
 
-	emit2(c, inst->Prev, OPCODE_ADD, 0,
-		dstreg(PROGRAM_TEMPORARY, tempreg),
-		inst->I.SrcReg[1], negate(inst->I.SrcReg[2]));
-	emit3(c, inst->Prev, OPCODE_MAD, inst->I.SaturateMode,
-		inst->I.DstReg,
-		inst->I.SrcReg[0], srcreg(PROGRAM_TEMPORARY, tempreg), inst->I.SrcReg[2]);
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
+		dstreg(RC_FILE_TEMPORARY, tempreg),
+		inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));
+	emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode,
+		inst->U.I.DstReg,
+		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[2]);
 
 	rc_remove_instruction(inst);
 }
@@ -337,14 +326,14 @@ static void transform_POW(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
 	int tempreg = rc_find_free_temporary(c);
-	struct prog_dst_register tempdst = dstreg(PROGRAM_TEMPORARY, tempreg);
-	struct prog_src_register tempsrc = srcreg(PROGRAM_TEMPORARY, tempreg);
-	tempdst.WriteMask = WRITEMASK_W;
-	tempsrc.Swizzle = SWIZZLE_WWWW;
+	struct rc_dst_register tempdst = dstreg(RC_FILE_TEMPORARY, tempreg);
+	struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempreg);
+	tempdst.WriteMask = RC_MASK_W;
+	tempsrc.Swizzle = RC_SWIZZLE_WWWW;
 
-	emit1(c, inst->Prev, OPCODE_LG2, 0, tempdst, scalar(inst->I.SrcReg[0]));
-	emit2(c, inst->Prev, OPCODE_MUL, 0, tempdst, tempsrc, scalar(inst->I.SrcReg[1]));
-	emit1(c, inst->Prev, OPCODE_EX2, inst->I.SaturateMode, inst->I.DstReg, tempsrc);
+	emit1(c, inst->Prev, RC_OPCODE_LG2, 0, tempdst, scalar(inst->U.I.SrcReg[0]));
+	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, tempdst, tempsrc, scalar(inst->U.I.SrcReg[1]));
+	emit1(c, inst->Prev, RC_OPCODE_EX2, inst->U.I.SaturateMode, inst->U.I.DstReg, tempsrc);
 
 	rc_remove_instruction(inst);
 }
@@ -352,7 +341,26 @@ static void transform_POW(struct radeon_compiler* c,
 static void transform_RSQ(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	inst->I.SrcReg[0] = absolute(inst->I.SrcReg[0]);
+	inst->U.I.SrcReg[0] = absolute(inst->U.I.SrcReg[0]);
+}
+
+static void transform_SEQ(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	int tempreg = rc_find_free_temporary(c);
+
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
+	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+		negate(absolute(srcreg(RC_FILE_TEMPORARY, tempreg))), builtin_zero, builtin_one);
+
+	rc_remove_instruction(inst);
+}
+
+static void transform_SFL(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode, inst->U.I.DstReg, builtin_zero);
+	rc_remove_instruction(inst);
 }
 
 static void transform_SGE(struct radeon_compiler* c,
@@ -360,9 +368,33 @@ static void transform_SGE(struct radeon_compiler* c,
 {
 	int tempreg = rc_find_free_temporary(c);
 
-	emit2(c, inst->Prev, OPCODE_ADD, 0, dstreg(PROGRAM_TEMPORARY, tempreg), inst->I.SrcReg[0], negate(inst->I.SrcReg[1]));
-	emit3(c, inst->Prev, OPCODE_CMP, inst->I.SaturateMode, inst->I.DstReg,
-		srcreg(PROGRAM_TEMPORARY, tempreg), builtin_zero, builtin_one);
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
+	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+		srcreg(RC_FILE_TEMPORARY, tempreg), builtin_zero, builtin_one);
+
+	rc_remove_instruction(inst);
+}
+
+static void transform_SGT(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	int tempreg = rc_find_free_temporary(c);
+
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
+	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+		srcreg(RC_FILE_TEMPORARY, tempreg), builtin_one, builtin_zero);
+
+	rc_remove_instruction(inst);
+}
+
+static void transform_SLE(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	int tempreg = rc_find_free_temporary(c);
+
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
+	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+		srcreg(RC_FILE_TEMPORARY, tempreg), builtin_zero, builtin_one);
 
 	rc_remove_instruction(inst);
 }
@@ -372,9 +404,21 @@ static void transform_SLT(struct radeon_compiler* c,
 {
 	int tempreg = rc_find_free_temporary(c);
 
-	emit2(c, inst->Prev, OPCODE_ADD, 0, dstreg(PROGRAM_TEMPORARY, tempreg), inst->I.SrcReg[0], negate(inst->I.SrcReg[1]));
-	emit3(c, inst->Prev, OPCODE_CMP, inst->I.SaturateMode, inst->I.DstReg,
-		srcreg(PROGRAM_TEMPORARY, tempreg), builtin_one, builtin_zero);
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
+	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+		srcreg(RC_FILE_TEMPORARY, tempreg), builtin_one, builtin_zero);
+
+	rc_remove_instruction(inst);
+}
+
+static void transform_SNE(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	int tempreg = rc_find_free_temporary(c);
+
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
+	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
+		negate(absolute(srcreg(RC_FILE_TEMPORARY, tempreg))), builtin_one, builtin_zero);
 
 	rc_remove_instruction(inst);
 }
@@ -382,14 +426,14 @@ static void transform_SLT(struct radeon_compiler* c,
 static void transform_SUB(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	inst->I.Opcode = OPCODE_ADD;
-	inst->I.SrcReg[1] = negate(inst->I.SrcReg[1]);
+	inst->U.I.Opcode = RC_OPCODE_ADD;
+	inst->U.I.SrcReg[1] = negate(inst->U.I.SrcReg[1]);
 }
 
 static void transform_SWZ(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	inst->I.Opcode = OPCODE_MOV;
+	inst->U.I.Opcode = RC_OPCODE_MOV;
 }
 
 static void transform_XPD(struct radeon_compiler* c,
@@ -397,13 +441,13 @@ static void transform_XPD(struct radeon_compiler* c,
 {
 	int tempreg = rc_find_free_temporary(c);
 
-	emit2(c, inst->Prev, OPCODE_MUL, 0, dstreg(PROGRAM_TEMPORARY, tempreg),
-		swizzle(inst->I.SrcReg[0], SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W),
-		swizzle(inst->I.SrcReg[1], SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W));
-	emit3(c, inst->Prev, OPCODE_MAD, inst->I.SaturateMode, inst->I.DstReg,
-		swizzle(inst->I.SrcReg[0], SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W),
-		swizzle(inst->I.SrcReg[1], SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W),
-		negate(srcreg(PROGRAM_TEMPORARY, tempreg)));
+	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstreg(RC_FILE_TEMPORARY, tempreg),
+		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
+		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));
+	emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode, inst->U.I.DstReg,
+		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),
+		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
+		negate(srcreg(RC_FILE_TEMPORARY, tempreg)));
 
 	rc_remove_instruction(inst);
 }
@@ -414,7 +458,7 @@ static void transform_XPD(struct radeon_compiler* c,
  * no userData necessary.
  *
  * Eliminates the following ALU instructions:
- *  ABS, DPH, DST, FLR, LIT, LRP, POW, SGE, SLT, SUB, SWZ, XPD
+ *  ABS, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD
  * using:
  *  MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
  *
@@ -423,27 +467,32 @@ static void transform_XPD(struct radeon_compiler* c,
  *
  * @note should be applicable to R300 and R500 fragment programs.
  */
-GLboolean radeonTransformALU(
+int radeonTransformALU(
 	struct radeon_compiler * c,
 	struct rc_instruction* inst,
 	void* unused)
 {
-	switch(inst->I.Opcode) {
-	case OPCODE_ABS: transform_ABS(c, inst); return GL_TRUE;
-	case OPCODE_DPH: transform_DPH(c, inst); return GL_TRUE;
-	case OPCODE_DST: transform_DST(c, inst); return GL_TRUE;
-	case OPCODE_FLR: transform_FLR(c, inst); return GL_TRUE;
-	case OPCODE_LIT: transform_LIT(c, inst); return GL_TRUE;
-	case OPCODE_LRP: transform_LRP(c, inst); return GL_TRUE;
-	case OPCODE_POW: transform_POW(c, inst); return GL_TRUE;
-	case OPCODE_RSQ: transform_RSQ(c, inst); return GL_TRUE;
-	case OPCODE_SGE: transform_SGE(c, inst); return GL_TRUE;
-	case OPCODE_SLT: transform_SLT(c, inst); return GL_TRUE;
-	case OPCODE_SUB: transform_SUB(c, inst); return GL_TRUE;
-	case OPCODE_SWZ: transform_SWZ(c, inst); return GL_TRUE;
-	case OPCODE_XPD: transform_XPD(c, inst); return GL_TRUE;
+	switch(inst->U.I.Opcode) {
+	case RC_OPCODE_ABS: transform_ABS(c, inst); return 1;
+	case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
+	case RC_OPCODE_DST: transform_DST(c, inst); return 1;
+	case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
+	case RC_OPCODE_LIT: transform_LIT(c, inst); return 1;
+	case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
+	case RC_OPCODE_POW: transform_POW(c, inst); return 1;
+	case RC_OPCODE_RSQ: transform_RSQ(c, inst); return 1;
+	case RC_OPCODE_SEQ: transform_SEQ(c, inst); return 1;
+	case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
+	case RC_OPCODE_SGE: transform_SGE(c, inst); return 1;
+	case RC_OPCODE_SGT: transform_SGT(c, inst); return 1;
+	case RC_OPCODE_SLE: transform_SLE(c, inst); return 1;
+	case RC_OPCODE_SLT: transform_SLT(c, inst); return 1;
+	case RC_OPCODE_SNE: transform_SNE(c, inst); return 1;
+	case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
+	case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
+	case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
 	default:
-		return GL_FALSE;
+		return 0;
 	}
 }
 
@@ -452,37 +501,37 @@ static void transform_r300_vertex_ABS(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
 	/* Note: r500 can take absolute values, but r300 cannot. */
-	inst->I.Opcode = OPCODE_MAX;
-	inst->I.SrcReg[1] = inst->I.SrcReg[0];
-	inst->I.SrcReg[1].Negate ^= NEGATE_XYZW;
+	inst->U.I.Opcode = RC_OPCODE_MAX;
+	inst->U.I.SrcReg[1] = inst->U.I.SrcReg[0];
+	inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
 }
 
 /**
  * For use with radeonLocalTransform, this transforms non-native ALU
  * instructions of the r300 up to r500 vertex engine.
  */
-GLboolean r300_transform_vertex_alu(
+int r300_transform_vertex_alu(
 	struct radeon_compiler * c,
 	struct rc_instruction* inst,
 	void* unused)
 {
-	switch(inst->I.Opcode) {
-	case OPCODE_ABS: transform_r300_vertex_ABS(c, inst); return GL_TRUE;
-	case OPCODE_DP3: transform_DP3(c, inst); return GL_TRUE;
-	case OPCODE_DPH: transform_DPH(c, inst); return GL_TRUE;
-	case OPCODE_FLR: transform_FLR(c, inst); return GL_TRUE;
-	case OPCODE_LRP: transform_LRP(c, inst); return GL_TRUE;
-	case OPCODE_SUB: transform_SUB(c, inst); return GL_TRUE;
-	case OPCODE_SWZ: transform_SWZ(c, inst); return GL_TRUE;
-	case OPCODE_XPD: transform_XPD(c, inst); return GL_TRUE;
+	switch(inst->U.I.Opcode) {
+	case RC_OPCODE_ABS: transform_r300_vertex_ABS(c, inst); return 1;
+	case RC_OPCODE_DP3: transform_DP3(c, inst); return 1;
+	case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
+	case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
+	case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
+	case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
+	case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
+	case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
 	default:
-		return GL_FALSE;
+		return 0;
 	}
 }
 
-static void sincos_constants(struct radeon_compiler* c, GLuint *constants)
+static void sincos_constants(struct radeon_compiler* c, unsigned int *constants)
 {
-	static const GLfloat SinCosConsts[2][4] = {
+	static const float SinCosConsts[2][4] = {
 		{
 			1.273239545,		// 4/PI
 			-0.405284735,		// -4/(PI*PI)
@@ -511,26 +560,26 @@ static void sincos_constants(struct radeon_compiler* c, GLuint *constants)
  * MAD dest, tmp.y, weight, tmp.x
  */
 static void sin_approx(
-	struct radeon_compiler* c, struct rc_instruction * before,
-	struct prog_dst_register dst, struct prog_src_register src, const GLuint* constants)
-{
-	GLuint tempreg = rc_find_free_temporary(c);
-
-	emit2(c, before->Prev, OPCODE_MUL, 0, dstregtmpmask(tempreg, WRITEMASK_XY),
-		swizzle(src, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
-		srcreg(PROGRAM_CONSTANT, constants[0]));
-	emit3(c, before->Prev, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_X),
-		swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
-		absolute(swizzle(src, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)),
-		swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X));
-	emit3(c, before->Prev, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_Y),
-		swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
-		absolute(swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)),
-		negate(swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)));
-	emit3(c, before->Prev, OPCODE_MAD, 0, dst,
-		swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
-		swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
-		swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X));
+	struct radeon_compiler* c, struct rc_instruction * inst,
+	struct rc_dst_register dst, struct rc_src_register src, const unsigned int* constants)
+{
+	unsigned int tempreg = rc_find_free_temporary(c);
+
+	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(tempreg, RC_MASK_XY),
+		swizzle(src, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X),
+		srcreg(RC_FILE_CONSTANT, constants[0]));
+	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_X),
+		swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_Y, RC_SWIZZLE_Y, RC_SWIZZLE_Y, RC_SWIZZLE_Y),
+		absolute(swizzle(src, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X)),
+		swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X));
+	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_Y),
+		swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X),
+		absolute(swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X)),
+		negate(swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X)));
+	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dst,
+		swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_Y, RC_SWIZZLE_Y, RC_SWIZZLE_Y, RC_SWIZZLE_Y),
+		swizzle(srcreg(RC_FILE_CONSTANT, constants[0]), RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W),
+		swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X));
 }
 
 /**
@@ -538,81 +587,81 @@ static void sin_approx(
  * using only the basic instructions
  *  MOV, ADD, MUL, MAD, FRC
  */
-GLboolean radeonTransformTrigSimple(struct radeon_compiler* c,
+int radeonTransformTrigSimple(struct radeon_compiler* c,
 	struct rc_instruction* inst,
 	void* unused)
 {
-	if (inst->I.Opcode != OPCODE_COS &&
-	    inst->I.Opcode != OPCODE_SIN &&
-	    inst->I.Opcode != OPCODE_SCS)
-		return GL_FALSE;
+	if (inst->U.I.Opcode != RC_OPCODE_COS &&
+	    inst->U.I.Opcode != RC_OPCODE_SIN &&
+	    inst->U.I.Opcode != RC_OPCODE_SCS)
+		return 0;
 
-	GLuint constants[2];
-	GLuint tempreg = rc_find_free_temporary(c);
+	unsigned int constants[2];
+	unsigned int tempreg = rc_find_free_temporary(c);
 
 	sincos_constants(c, constants);
 
-	if (inst->I.Opcode == OPCODE_COS) {
+	if (inst->U.I.Opcode == RC_OPCODE_COS) {
 		// MAD tmp.x, src, 1/(2*PI), 0.75
 		// FRC tmp.x, tmp.x
 		// MAD tmp.z, tmp.x, 2*PI, -PI
-		emit3(c, inst->Prev, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_W),
-			swizzle(inst->I.SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
-			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z),
-			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X));
-		emit1(c, inst->Prev, OPCODE_FRC, 0, dstregtmpmask(tempreg, WRITEMASK_W),
-			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W));
-		emit3(c, inst->Prev, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_W),
-			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
-			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
-			negate(swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z)));
-
-		sin_approx(c, inst, inst->I.DstReg,
-			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
+			swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X),
+			swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_Z, RC_SWIZZLE_Z, RC_SWIZZLE_Z, RC_SWIZZLE_Z),
+			swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X));
+		emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
+			swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W));
+		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
+			swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W),
+			swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W),
+			negate(swizzle(srcreg(RC_FILE_CONSTANT, constants[0]), RC_SWIZZLE_Z, RC_SWIZZLE_Z, RC_SWIZZLE_Z, RC_SWIZZLE_Z)));
+
+		sin_approx(c, inst, inst->U.I.DstReg,
+			swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W),
 			constants);
-	} else if (inst->I.Opcode == OPCODE_SIN) {
-		emit3(c, inst->Prev, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_W),
-			swizzle(inst->I.SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
-			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z),
-			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y));
-		emit1(c, inst->Prev, OPCODE_FRC, 0, dstregtmpmask(tempreg, WRITEMASK_W),
-			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W));
-		emit3(c, inst->Prev, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_W),
-			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
-			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
-			negate(swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z)));
-
-		sin_approx(c, inst, inst->I.DstReg,
-			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+	} else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
+		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
+			swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X),
+			swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_Z, RC_SWIZZLE_Z, RC_SWIZZLE_Z, RC_SWIZZLE_Z),
+			swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_Y, RC_SWIZZLE_Y, RC_SWIZZLE_Y, RC_SWIZZLE_Y));
+		emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
+			swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W));
+		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
+			swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W),
+			swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W),
+			negate(swizzle(srcreg(RC_FILE_CONSTANT, constants[0]), RC_SWIZZLE_Z, RC_SWIZZLE_Z, RC_SWIZZLE_Z, RC_SWIZZLE_Z)));
+
+		sin_approx(c, inst, inst->U.I.DstReg,
+			swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W),
 			constants);
 	} else {
-		emit3(c, inst->Prev, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_XY),
-			swizzle(inst->I.SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
-			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z),
-			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W));
-		emit1(c, inst->Prev, OPCODE_FRC, 0, dstregtmpmask(tempreg, WRITEMASK_XY),
-			srcreg(PROGRAM_TEMPORARY, tempreg));
-		emit3(c, inst->Prev, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_XY),
-			srcreg(PROGRAM_TEMPORARY, tempreg),
-			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
-			negate(swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z)));
-
-		struct prog_dst_register dst = inst->I.DstReg;
-
-		dst.WriteMask = inst->I.DstReg.WriteMask & WRITEMASK_X;
+		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
+			swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X),
+			swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_Z, RC_SWIZZLE_Z, RC_SWIZZLE_Z, RC_SWIZZLE_Z),
+			swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_W));
+		emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_XY),
+			srcreg(RC_FILE_TEMPORARY, tempreg));
+		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
+			srcreg(RC_FILE_TEMPORARY, tempreg),
+			swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W),
+			negate(swizzle(srcreg(RC_FILE_CONSTANT, constants[0]), RC_SWIZZLE_Z, RC_SWIZZLE_Z, RC_SWIZZLE_Z, RC_SWIZZLE_Z)));
+
+		struct rc_dst_register dst = inst->U.I.DstReg;
+
+		dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_X;
 		sin_approx(c, inst, dst,
-			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
+			swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X),
 			constants);
 
-		dst.WriteMask = inst->I.DstReg.WriteMask & WRITEMASK_Y;
+		dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_Y;
 		sin_approx(c, inst, dst,
-			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
+			swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_Y, RC_SWIZZLE_Y, RC_SWIZZLE_Y, RC_SWIZZLE_Y),
 			constants);
 	}
 
 	rc_remove_instruction(inst);
 
-	return GL_TRUE;
+	return 1;
 }
 
 
@@ -624,53 +673,53 @@ GLboolean radeonTransformTrigSimple(struct radeon_compiler* c,
  *
  * @warning This transformation implicitly changes the semantics of SIN and COS!
  */
-GLboolean radeonTransformTrigScale(struct radeon_compiler* c,
+int radeonTransformTrigScale(struct radeon_compiler* c,
 	struct rc_instruction* inst,
 	void* unused)
 {
-	if (inst->I.Opcode != OPCODE_COS &&
-	    inst->I.Opcode != OPCODE_SIN &&
-	    inst->I.Opcode != OPCODE_SCS)
-		return GL_FALSE;
+	if (inst->U.I.Opcode != RC_OPCODE_COS &&
+	    inst->U.I.Opcode != RC_OPCODE_SIN &&
+	    inst->U.I.Opcode != RC_OPCODE_SCS)
+		return 0;
 
-	static const GLfloat RCP_2PI = 0.15915494309189535;
-	GLuint temp;
-	GLuint constant;
-	GLuint constant_swizzle;
+	static const float RCP_2PI = 0.15915494309189535;
+	unsigned int temp;
+	unsigned int constant;
+	unsigned int constant_swizzle;
 
 	temp = rc_find_free_temporary(c);
 	constant = rc_constants_add_immediate_scalar(&c->Program.Constants, RCP_2PI, &constant_swizzle);
 
-	emit2(c, inst->Prev, OPCODE_MUL, 0, dstregtmpmask(temp, WRITEMASK_W),
-		swizzle(inst->I.SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
-		srcregswz(PROGRAM_CONSTANT, constant, constant_swizzle));
-	emit1(c, inst->Prev, OPCODE_FRC, 0, dstregtmpmask(temp, WRITEMASK_W),
-		srcreg(PROGRAM_TEMPORARY, temp));
-
-	if (inst->I.Opcode == OPCODE_COS) {
-		emit1(c, inst->Prev, OPCODE_COS, inst->I.SaturateMode, inst->I.DstReg,
-			srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW));
-	} else if (inst->I.Opcode == OPCODE_SIN) {
-		emit1(c, inst->Prev, OPCODE_SIN, inst->I.SaturateMode,
-			inst->I.DstReg, srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW));
-	} else if (inst->I.Opcode == OPCODE_SCS) {
-		struct prog_dst_register moddst = inst->I.DstReg;
-
-		if (inst->I.DstReg.WriteMask & WRITEMASK_X) {
-			moddst.WriteMask = WRITEMASK_X;
-			emit1(c, inst->Prev, OPCODE_COS, inst->I.SaturateMode, moddst,
-				srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW));
+	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(temp, RC_MASK_W),
+		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X),
+		srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
+	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
+		srcreg(RC_FILE_TEMPORARY, temp));
+
+	if (inst->U.I.Opcode == RC_OPCODE_COS) {
+		emit1(c, inst->Prev, RC_OPCODE_COS, inst->U.I.SaturateMode, inst->U.I.DstReg,
+			srcregswz(RC_FILE_TEMPORARY, temp, RC_SWIZZLE_WWWW));
+	} else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
+		emit1(c, inst->Prev, RC_OPCODE_SIN, inst->U.I.SaturateMode,
+			inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, temp, RC_SWIZZLE_WWWW));
+	} else if (inst->U.I.Opcode == RC_OPCODE_SCS) {
+		struct rc_dst_register moddst = inst->U.I.DstReg;
+
+		if (inst->U.I.DstReg.WriteMask & RC_MASK_X) {
+			moddst.WriteMask = RC_MASK_X;
+			emit1(c, inst->Prev, RC_OPCODE_COS, inst->U.I.SaturateMode, moddst,
+				srcregswz(RC_FILE_TEMPORARY, temp, RC_SWIZZLE_WWWW));
 		}
-		if (inst->I.DstReg.WriteMask & WRITEMASK_Y) {
-			moddst.WriteMask = WRITEMASK_Y;
-			emit1(c, inst->Prev, OPCODE_SIN, inst->I.SaturateMode, moddst,
-				srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW));
+		if (inst->U.I.DstReg.WriteMask & RC_MASK_Y) {
+			moddst.WriteMask = RC_MASK_Y;
+			emit1(c, inst->Prev, RC_OPCODE_SIN, inst->U.I.SaturateMode, moddst,
+				srcregswz(RC_FILE_TEMPORARY, temp, RC_SWIZZLE_WWWW));
 		}
 	}
 
 	rc_remove_instruction(inst);
 
-	return GL_TRUE;
+	return 1;
 }
 
 /**
@@ -681,15 +730,15 @@ GLboolean radeonTransformTrigScale(struct radeon_compiler* c,
  * @warning This explicitly changes the form of DDX and DDY!
  */
 
-GLboolean radeonTransformDeriv(struct radeon_compiler* c,
+int radeonTransformDeriv(struct radeon_compiler* c,
 	struct rc_instruction* inst,
 	void* unused)
 {
-	if (inst->I.Opcode != OPCODE_DDX && inst->I.Opcode != OPCODE_DDY)
-		return GL_FALSE;
+	if (inst->U.I.Opcode != RC_OPCODE_DDX && inst->U.I.Opcode != RC_OPCODE_DDY)
+		return 0;
 
-	inst->I.SrcReg[1].Swizzle = MAKE_SWIZZLE4(SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE);
-	inst->I.SrcReg[1].Negate = NEGATE_XYZW;
+	inst->U.I.SrcReg[1].Swizzle = RC_MAKE_SWIZZLE(RC_SWIZZLE_ONE, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE);
+	inst->U.I.SrcReg[1].Negate = RC_MASK_XYZW;
 
-	return GL_TRUE;
+	return 1;
 }
diff --git a/r300/compiler/radeon_program_alu.h b/r300/compiler/radeon_program_alu.h
index 147efec..7cb5f84 100644
--- a/r300/compiler/radeon_program_alu.h
+++ b/r300/compiler/radeon_program_alu.h
@@ -30,27 +30,27 @@
 
 #include "radeon_program.h"
 
-GLboolean radeonTransformALU(
+int radeonTransformALU(
 	struct radeon_compiler * c,
 	struct rc_instruction * inst,
 	void*);
 
-GLboolean r300_transform_vertex_alu(
+int r300_transform_vertex_alu(
 	struct radeon_compiler * c,
 	struct rc_instruction * inst,
 	void*);
 
-GLboolean radeonTransformTrigSimple(
+int radeonTransformTrigSimple(
 	struct radeon_compiler * c,
 	struct rc_instruction * inst,
 	void*);
 
-GLboolean radeonTransformTrigScale(
+int radeonTransformTrigScale(
 	struct radeon_compiler * c,
 	struct rc_instruction * inst,
 	void*);
 
-GLboolean radeonTransformDeriv(
+int radeonTransformDeriv(
 	struct radeon_compiler * c,
 	struct rc_instruction * inst,
 	void*);
diff --git a/r300/compiler/radeon_program_constants.h b/r300/compiler/radeon_program_constants.h
new file mode 100644
index 0000000..7c0d672
--- /dev/null
+++ b/r300/compiler/radeon_program_constants.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2009 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef RADEON_PROGRAM_CONSTANTS_H
+#define RADEON_PROGRAM_CONSTANTS_H
+
+typedef enum {
+	RC_SATURATE_NONE = 0,
+	RC_SATURATE_ZERO_ONE,
+	RC_SATURATE_MINUS_PLUS_ONE
+} rc_saturate_mode;
+
+typedef enum {
+	RC_TEXTURE_2D_ARRAY,
+	RC_TEXTURE_1D_ARRAY,
+	RC_TEXTURE_CUBE,
+	RC_TEXTURE_3D,
+	RC_TEXTURE_RECT,
+	RC_TEXTURE_2D,
+	RC_TEXTURE_1D
+} rc_texture_target;
+
+typedef enum {
+	/**
+	 * Used to indicate unused register descriptions and
+	 * source registers that use a constant swizzle.
+	 */
+	RC_FILE_NONE = 0,
+	RC_FILE_TEMPORARY,
+
+	/**
+	 * Input register.
+	 *
+	 * \note The compiler attaches no implicit semantics to input registers.
+	 * Fragment/vertex program specific semantics must be defined explicitly
+	 * using the appropriate compiler interfaces.
+	 */
+	RC_FILE_INPUT,
+
+	/**
+	 * Output register.
+	 *
+	 * \note The compiler attaches no implicit semantics to output registers.
+	 * Fragment/vertex program specific semantics must be defined explicitly
+	 * using the appropriate compiler interfaces.
+	 */
+	RC_FILE_OUTPUT,
+	RC_FILE_ADDRESS,
+
+	/**
+	 * Indicates a constant from the \ref rc_constant_list .
+	 */
+	RC_FILE_CONSTANT,
+
+	/**
+	 * Indicates a special register, see RC_SPECIAL_xxx.
+	 */
+	RC_FILE_SPECIAL
+} rc_register_file;
+
+enum {
+	/** R500 fragment program ALU result "register" */
+	RC_SPECIAL_ALU_RESULT = 0,
+
+	/** Must be last */
+	RC_NUM_SPECIAL_REGISTERS
+};
+
+#define RC_REGISTER_INDEX_BITS 10
+#define RC_REGISTER_MAX_INDEX (1 << RC_REGISTER_INDEX_BITS)
+
+typedef enum {
+	RC_SWIZZLE_X = 0,
+	RC_SWIZZLE_Y,
+	RC_SWIZZLE_Z,
+	RC_SWIZZLE_W,
+	RC_SWIZZLE_ZERO,
+	RC_SWIZZLE_ONE,
+	RC_SWIZZLE_HALF,
+	RC_SWIZZLE_UNUSED
+} rc_swizzle;
+
+#define RC_MAKE_SWIZZLE(a,b,c,d) (((a)<<0) | ((b)<<3) | ((c)<<6) | ((d)<<9))
+#define RC_MAKE_SWIZZLE_SMEAR(a) RC_MAKE_SWIZZLE((a),(a),(a),(a))
+#define GET_SWZ(swz, idx)      (((swz) >> ((idx)*3)) & 0x7)
+#define GET_BIT(msk, idx)      (((msk) >> (idx)) & 0x1)
+#define SET_SWZ(swz, idx, newv) \
+	do { \
+		(swz) = ((swz) & ~(7 << ((idx)*3))) | ((newv) << ((idx)*3)); \
+	} while(0)
+
+#define RC_SWIZZLE_XYZW RC_MAKE_SWIZZLE(RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_W)
+#define RC_SWIZZLE_XXXX RC_MAKE_SWIZZLE_SMEAR(RC_SWIZZLE_X)
+#define RC_SWIZZLE_YYYY RC_MAKE_SWIZZLE_SMEAR(RC_SWIZZLE_Y)
+#define RC_SWIZZLE_ZZZZ RC_MAKE_SWIZZLE_SMEAR(RC_SWIZZLE_Z)
+#define RC_SWIZZLE_WWWW RC_MAKE_SWIZZLE_SMEAR(RC_SWIZZLE_W)
+#define RC_SWIZZLE_0000 RC_MAKE_SWIZZLE_SMEAR(RC_SWIZZLE_ZERO)
+#define RC_SWIZZLE_1111 RC_MAKE_SWIZZLE_SMEAR(RC_SWIZZLE_ONE)
+
+/**
+ * \name Bitmasks for components of vectors.
+ *
+ * Used for write masks, negation masks, etc.
+ */
+/*@{*/
+#define RC_MASK_NONE 0
+#define RC_MASK_X 1
+#define RC_MASK_Y 2
+#define RC_MASK_Z 4
+#define RC_MASK_W 8
+#define RC_MASK_XY (RC_MASK_X|RC_MASK_Y)
+#define RC_MASK_XYZ (RC_MASK_X|RC_MASK_Y|RC_MASK_Z)
+#define RC_MASK_XYW (RC_MASK_X|RC_MASK_Y|RC_MASK_W)
+#define RC_MASK_XYZW (RC_MASK_X|RC_MASK_Y|RC_MASK_Z|RC_MASK_W)
+/*@}*/
+
+typedef enum {
+	RC_ALURESULT_NONE = 0,
+	RC_ALURESULT_X,
+	RC_ALURESULT_W
+} rc_write_aluresult;
+
+#endif /* RADEON_PROGRAM_CONSTANTS_H */
diff --git a/r300/compiler/radeon_program_pair.c b/r300/compiler/radeon_program_pair.c
index 4c26db5..ee83959 100644
--- a/r300/compiler/radeon_program_pair.c
+++ b/r300/compiler/radeon_program_pair.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2008 Nicolai Haehnle.
+ * Copyright (C) 2008-2009 Nicolai Haehnle.
  *
  * All Rights Reserved.
  *
@@ -25,584 +25,29 @@
  *
  */
 
-/**
- * @file
- *
- * Perform temporary register allocation and attempt to pair off instructions
- * in RGB and Alpha pairs. Also attempts to optimize the TEX instruction
- * vs. ALU instruction scheduling.
- */
-
 #include "radeon_program_pair.h"
 
-#include "memory_pool.h"
-#include "radeon_compiler.h"
-#include "shader/prog_print.h"
-
-#define error(fmt, args...) do { \
-	rc_error(&s->Compiler->Base, "%s::%s(): " fmt "\n",	\
-		__FILE__, __FUNCTION__, ##args);	\
-} while(0)
-
-struct pair_state_instruction {
-	struct prog_instruction Instruction;
-	GLuint IP; /**< Position of this instruction in original program */
-
-	GLuint IsTex:1; /**< Is a texture instruction */
-	GLuint NeedRGB:1; /**< Needs the RGB ALU */
-	GLuint NeedAlpha:1; /**< Needs the Alpha ALU */
-	GLuint IsTranscendent:1; /**< Is a special transcendent instruction */
-
-	/**
-	 * Number of (read and write) dependencies that must be resolved before
-	 * this instruction can be scheduled.
-	 */
-	GLuint NumDependencies:5;
-
-	/**
-	 * Next instruction in the linked list of ready instructions.
-	 */
-	struct pair_state_instruction *NextReady;
-
-	/**
-	 * Values that this instruction writes
-	 */
-	struct reg_value *Values[4];
-};
-
-
-/**
- * Used to keep track of which instructions read a value.
- */
-struct reg_value_reader {
-	struct pair_state_instruction *Reader;
-	struct reg_value_reader *Next;
-};
-
-/**
- * Used to keep track which values are stored in each component of a
- * PROGRAM_TEMPORARY.
- */
-struct reg_value {
-	struct pair_state_instruction *Writer;
-	struct reg_value *Next; /**< Pointer to the next value to be written to the same PROGRAM_TEMPORARY component */
-
-	/**
-	 * Unordered linked list of instructions that read from this value.
-	 */
-	struct reg_value_reader *Readers;
-
-	/**
-	 * Number of readers of this value. This is calculated during @ref scan_instructions
-	 * and continually decremented during code emission.
-	 * When this count reaches zero, the instruction that writes the @ref Next value
-	 * can be scheduled.
-	 */
-	GLuint NumReaders;
-};
-
-/**
- * Used to translate a PROGRAM_INPUT or PROGRAM_TEMPORARY Mesa register
- * to the proper hardware temporary.
- */
-struct pair_register_translation {
-	GLuint Allocated:1;
-	GLuint HwIndex:8;
-	GLuint RefCount:23; /**< # of times this occurs in an unscheduled instruction SrcReg or DstReg */
-
-	/**
-	 * Notes the value that is currently contained in each component
-	 * (only used for PROGRAM_TEMPORARY registers).
-	 */
-	struct reg_value *Value[4];
-};
-
-struct pair_state {
-	struct r300_fragment_program_compiler * Compiler;
-	const struct radeon_pair_handler *Handler;
-	GLboolean Verbose;
-	void *UserData;
-
-	/**
-	 * Translate Mesa registers to hardware registers
-	 */
-	struct pair_register_translation Inputs[FRAG_ATTRIB_MAX];
-	struct pair_register_translation Temps[MAX_PROGRAM_TEMPS];
-
-	struct {
-		GLuint RefCount; /**< # of times this occurs in an unscheduled SrcReg or DstReg */
-	} HwTemps[128];
-
-	/**
-	 * Linked list of instructions that can be scheduled right now,
-	 * based on which ALU/TEX resources they require.
-	 */
-	struct pair_state_instruction *ReadyFullALU;
-	struct pair_state_instruction *ReadyRGB;
-	struct pair_state_instruction *ReadyAlpha;
-	struct pair_state_instruction *ReadyTEX;
-};
-
-
-static struct pair_register_translation *get_register(struct pair_state *s, GLuint file, GLuint index)
-{
-	switch(file) {
-	case PROGRAM_TEMPORARY: return &s->Temps[index];
-	case PROGRAM_INPUT: return &s->Inputs[index];
-	default: return 0;
-	}
-}
-
-static void alloc_hw_reg(struct pair_state *s, GLuint file, GLuint index, GLuint hwindex)
-{
-	struct pair_register_translation *t = get_register(s, file, index);
-	ASSERT(!s->HwTemps[hwindex].RefCount);
-	ASSERT(!t->Allocated);
-	s->HwTemps[hwindex].RefCount = t->RefCount;
-	t->Allocated = 1;
-	t->HwIndex = hwindex;
-}
-
-static GLuint get_hw_reg(struct pair_state *s, GLuint file, GLuint index)
-{
-	GLuint hwindex;
-
-	struct pair_register_translation *t = get_register(s, file, index);
-	if (!t) {
-		error("get_hw_reg: %i[%i]\n", file, index);
-		return 0;
-	}
-
-	if (t->Allocated)
-		return t->HwIndex;
-
-	for(hwindex = 0; hwindex < s->Handler->MaxHwTemps; ++hwindex)
-		if (!s->HwTemps[hwindex].RefCount)
-			break;
-
-	if (hwindex >= s->Handler->MaxHwTemps) {
-		error("Ran out of hardware temporaries");
-		return 0;
-	}
-
-	alloc_hw_reg(s, file, index, hwindex);
-	return hwindex;
-}
-
-
-static void deref_hw_reg(struct pair_state *s, GLuint hwindex)
-{
-	if (!s->HwTemps[hwindex].RefCount) {
-		error("Hwindex %i refcount error", hwindex);
-		return;
-	}
-
-	s->HwTemps[hwindex].RefCount--;
-}
-
-static void add_pairinst_to_list(struct pair_state_instruction **list, struct pair_state_instruction *pairinst)
-{
-	pairinst->NextReady = *list;
-	*list = pairinst;
-}
-
-/**
- * The given instruction has become ready. Link it into the ready
- * instructions.
- */
-static void instruction_ready(struct pair_state *s, struct pair_state_instruction *pairinst)
-{
-	if (s->Verbose)
-		_mesa_printf("instruction_ready(%i)\n", pairinst->IP);
-
-	if (pairinst->IsTex)
-		add_pairinst_to_list(&s->ReadyTEX, pairinst);
-	else if (!pairinst->NeedAlpha)
-		add_pairinst_to_list(&s->ReadyRGB, pairinst);
-	else if (!pairinst->NeedRGB)
-		add_pairinst_to_list(&s->ReadyAlpha, pairinst);
-	else
-		add_pairinst_to_list(&s->ReadyFullALU, pairinst);
-}
-
-
-/**
- * Finally rewrite ADD, MOV, MUL as the appropriate native instruction
- * and reverse the order of arguments for CMP.
- */
-static void final_rewrite(struct pair_state *s, struct prog_instruction *inst)
-{
-	struct prog_src_register tmp;
-
-	switch(inst->Opcode) {
-	case OPCODE_ADD:
-		inst->SrcReg[2] = inst->SrcReg[1];
-		inst->SrcReg[1].File = PROGRAM_BUILTIN;
-		inst->SrcReg[1].Swizzle = SWIZZLE_1111;
-		inst->SrcReg[1].Negate = NEGATE_NONE;
-		inst->Opcode = OPCODE_MAD;
-		break;
-	case OPCODE_CMP:
-		tmp = inst->SrcReg[2];
-		inst->SrcReg[2] = inst->SrcReg[0];
-		inst->SrcReg[0] = tmp;
-		break;
-	case OPCODE_MOV:
-		/* AMD say we should use CMP.
-		 * However, when we transform
-		 *  KIL -r0;
-		 * into
-		 *  CMP tmp, -r0, -r0, 0;
-		 *  KIL tmp;
-		 * we get incorrect behaviour on R500 when r0 == 0.0.
-		 * It appears that the R500 KIL hardware treats -0.0 as less
-		 * than zero.
-		 */
-		inst->SrcReg[1].File = PROGRAM_BUILTIN;
-		inst->SrcReg[1].Swizzle = SWIZZLE_1111;
-		inst->SrcReg[2].File = PROGRAM_BUILTIN;
-		inst->SrcReg[2].Swizzle = SWIZZLE_0000;
-		inst->Opcode = OPCODE_MAD;
-		break;
-	case OPCODE_MUL:
-		inst->SrcReg[2].File = PROGRAM_BUILTIN;
-		inst->SrcReg[2].Swizzle = SWIZZLE_0000;
-		inst->Opcode = OPCODE_MAD;
-		break;
-	default:
-		/* nothing to do */
-		break;
-	}
-}
-
-
-/**
- * Classify an instruction according to which ALUs etc. it needs
- */
-static void classify_instruction(struct pair_state *s,
-	struct pair_state_instruction *psi)
-{
-	psi->NeedRGB = (psi->Instruction.DstReg.WriteMask & WRITEMASK_XYZ) ? 1 : 0;
-	psi->NeedAlpha = (psi->Instruction.DstReg.WriteMask & WRITEMASK_W) ? 1 : 0;
-
-	switch(psi->Instruction.Opcode) {
-	case OPCODE_ADD:
-	case OPCODE_CMP:
-	case OPCODE_DDX:
-	case OPCODE_DDY:
-	case OPCODE_FRC:
-	case OPCODE_MAD:
-	case OPCODE_MAX:
-	case OPCODE_MIN:
-	case OPCODE_MOV:
-	case OPCODE_MUL:
-		break;
-	case OPCODE_COS:
-	case OPCODE_EX2:
-	case OPCODE_LG2:
-	case OPCODE_RCP:
-	case OPCODE_RSQ:
-	case OPCODE_SIN:
-		psi->IsTranscendent = 1;
-		psi->NeedAlpha = 1;
-		break;
-	case OPCODE_DP4:
-		psi->NeedAlpha = 1;
-		/* fall through */
-	case OPCODE_DP3:
-		psi->NeedRGB = 1;
-		break;
-	case OPCODE_KIL:
-	case OPCODE_TEX:
-	case OPCODE_TXB:
-	case OPCODE_TXP:
-	case OPCODE_END:
-		psi->IsTex = 1;
-		break;
-	default:
-		error("Unknown opcode %d\n", psi->Instruction.Opcode);
-		break;
-	}
-}
-
-
-/**
- * Count which (input, temporary) register is read and written how often,
- * and scan the instruction stream to find dependencies.
- */
-static void scan_instructions(struct pair_state *s)
-{
-	struct rc_instruction *source;
-	GLuint ip;
-
-	for(source = s->Compiler->Base.Program.Instructions.Next, ip = 0;
-	    source != &s->Compiler->Base.Program.Instructions;
-	    source = source->Next, ++ip) {
-		struct pair_state_instruction *pairinst = memory_pool_malloc(&s->Compiler->Base.Pool, sizeof(*pairinst));
-		memset(pairinst, 0, sizeof(struct pair_state_instruction));
-
-		pairinst->Instruction = source->I;
-		pairinst->IP = ip;
-		final_rewrite(s, &pairinst->Instruction);
-		classify_instruction(s, pairinst);
-
-		int nsrc = _mesa_num_inst_src_regs(pairinst->Instruction.Opcode);
-		int j;
-		for(j = 0; j < nsrc; j++) {
-			struct pair_register_translation *t =
-				get_register(s, pairinst->Instruction.SrcReg[j].File, pairinst->Instruction.SrcReg[j].Index);
-			if (!t)
-				continue;
-
-			t->RefCount++;
-
-			if (pairinst->Instruction.SrcReg[j].File == PROGRAM_TEMPORARY) {
-				int i;
-				for(i = 0; i < 4; ++i) {
-					GLuint swz = GET_SWZ(pairinst->Instruction.SrcReg[j].Swizzle, i);
-					if (swz >= 4)
-						continue; /* constant or NIL swizzle */
-					if (!t->Value[swz])
-						continue; /* this is an undefined read */
-
-					/* Do not add a dependency if this instruction
-					 * also rewrites the value. The code below adds
-					 * a dependency for the DstReg, which is a superset
-					 * of the SrcReg dependency. */
-					if (pairinst->Instruction.DstReg.File == PROGRAM_TEMPORARY &&
-					    pairinst->Instruction.DstReg.Index == pairinst->Instruction.SrcReg[j].Index &&
-					    GET_BIT(pairinst->Instruction.DstReg.WriteMask, swz))
-						continue;
-
-					struct reg_value_reader* r = memory_pool_malloc(&s->Compiler->Base.Pool, sizeof(*r));
-					pairinst->NumDependencies++;
-					t->Value[swz]->NumReaders++;
-					r->Reader = pairinst;
-					r->Next = t->Value[swz]->Readers;
-					t->Value[swz]->Readers = r;
-				}
-			}
-		}
-
-		int ndst = _mesa_num_inst_dst_regs(pairinst->Instruction.Opcode);
-		if (ndst) {
-			struct pair_register_translation *t =
-				get_register(s, pairinst->Instruction.DstReg.File, pairinst->Instruction.DstReg.Index);
-			if (t) {
-				t->RefCount++;
-
-				if (pairinst->Instruction.DstReg.File == PROGRAM_TEMPORARY) {
-					int j;
-					for(j = 0; j < 4; ++j) {
-						if (!GET_BIT(pairinst->Instruction.DstReg.WriteMask, j))
-							continue;
-
-						struct reg_value* v = memory_pool_malloc(&s->Compiler->Base.Pool, sizeof(*v));
-						memset(v, 0, sizeof(struct reg_value));
-						v->Writer = pairinst;
-						if (t->Value[j]) {
-							pairinst->NumDependencies++;
-							t->Value[j]->Next = v;
-						}
-						t->Value[j] = v;
-						pairinst->Values[j] = v;
-					}
-				}
-			}
-		}
-
-		if (s->Verbose)
-			_mesa_printf("scan(%i): NumDeps = %i\n", ip, pairinst->NumDependencies);
-
-		if (!pairinst->NumDependencies)
-			instruction_ready(s, pairinst);
-	}
-
-	/* Clear the PROGRAM_TEMPORARY state */
-	int i, j;
-	for(i = 0; i < MAX_PROGRAM_TEMPS; ++i) {
-		for(j = 0; j < 4; ++j)
-			s->Temps[i].Value[j] = 0;
-	}
-}
-
-
-static void decrement_dependencies(struct pair_state *s, struct pair_state_instruction *pairinst)
-{
-	ASSERT(pairinst->NumDependencies > 0);
-	if (!--pairinst->NumDependencies)
-		instruction_ready(s, pairinst);
-}
 
 /**
- * Update the dependency tracking state based on what the instruction
- * at the given IP does.
+ * Return the source slot where we installed the given register access (0 when
+ * no slot is needed: no rgb/alpha use or RC_FILE_NONE), or -1 if none was free.
  */
-static void commit_instruction(struct pair_state *s, struct pair_state_instruction *pairinst)
-{
-	struct prog_instruction *inst = &pairinst->Instruction;
-
-	if (s->Verbose)
-		_mesa_printf("commit_instruction(%i)\n", pairinst->IP);
-
-	if (inst->DstReg.File == PROGRAM_TEMPORARY) {
-		struct pair_register_translation *t = &s->Temps[inst->DstReg.Index];
-		deref_hw_reg(s, t->HwIndex);
-
-		int i;
-		for(i = 0; i < 4; ++i) {
-			if (!GET_BIT(inst->DstReg.WriteMask, i))
-				continue;
-
-			t->Value[i] = pairinst->Values[i];
-			if (t->Value[i]->NumReaders) {
-				struct reg_value_reader *r;
-				for(r = pairinst->Values[i]->Readers; r; r = r->Next)
-					decrement_dependencies(s, r->Reader);
-			} else if (t->Value[i]->Next) {
-				/* This happens when the only reader writes
-				 * the register at the same time */
-				decrement_dependencies(s, t->Value[i]->Next->Writer);
-			}
-		}
-	}
-
-	int nsrc = _mesa_num_inst_src_regs(inst->Opcode);
-	int i;
-	for(i = 0; i < nsrc; i++) {
-		struct pair_register_translation *t = get_register(s, inst->SrcReg[i].File, inst->SrcReg[i].Index);
-		if (!t)
-			continue;
-
-		deref_hw_reg(s, get_hw_reg(s, inst->SrcReg[i].File, inst->SrcReg[i].Index));
-
-		if (inst->SrcReg[i].File != PROGRAM_TEMPORARY)
-			continue;
-
-		int j;
-		for(j = 0; j < 4; ++j) {
-			GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, j);
-			if (swz >= 4)
-				continue;
-			if (!t->Value[swz])
-				continue;
-
-			/* Do not free a dependency if this instruction
-			 * also rewrites the value. See scan_instructions. */
-			if (inst->DstReg.File == PROGRAM_TEMPORARY &&
-			    inst->DstReg.Index == inst->SrcReg[i].Index &&
-			    GET_BIT(inst->DstReg.WriteMask, swz))
-				continue;
-
-			if (!--t->Value[swz]->NumReaders) {
-				if (t->Value[swz]->Next)
-					decrement_dependencies(s, t->Value[swz]->Next->Writer);
-			}
-		}
-	}
-}
-
-
-/**
- * Emit all ready texture instructions in a single block.
- *
- * Emit as a single block to (hopefully) sample many textures in parallel,
- * and to avoid hardware indirections on R300.
- *
- * In R500, we don't really know when the result of a texture instruction
- * arrives. So allocate all destinations first, to make sure they do not
- * arrive early and overwrite a texture coordinate we're going to use later
- * in the block.
- */
-static void emit_all_tex(struct pair_state *s)
-{
-	struct pair_state_instruction *readytex;
-	struct pair_state_instruction *pairinst;
-
-	ASSERT(s->ReadyTEX);
-
-	// Don't let the ready list change under us!
-	readytex = s->ReadyTEX;
-	s->ReadyTEX = 0;
-
-	// Allocate destination hardware registers in one block to avoid conflicts.
-	for(pairinst = readytex; pairinst; pairinst = pairinst->NextReady) {
-		struct prog_instruction *inst = &pairinst->Instruction;
-		if (inst->Opcode != OPCODE_KIL)
-			get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index);
-	}
-
-	if (s->Compiler->Base.Debug)
-		_mesa_printf(" BEGIN_TEX\n");
-
-	if (s->Handler->BeginTexBlock)
-		s->Compiler->Base.Error = s->Compiler->Base.Error || !s->Handler->BeginTexBlock(s->UserData);
-
-	for(pairinst = readytex; pairinst; pairinst = pairinst->NextReady) {
-		struct prog_instruction *inst = &pairinst->Instruction;
-		commit_instruction(s, pairinst);
-
-		if (inst->Opcode != OPCODE_KIL)
-			inst->DstReg.Index = get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index);
-		inst->SrcReg[0].Index = get_hw_reg(s, inst->SrcReg[0].File, inst->SrcReg[0].Index);
-
-		if (s->Compiler->Base.Debug) {
-			_mesa_printf("   ");
-			_mesa_print_instruction(inst);
-			fflush(stderr);
-		}
-
-		struct radeon_pair_texture_instruction rpti;
-
-		switch(inst->Opcode) {
-		case OPCODE_TEX: rpti.Opcode = RADEON_OPCODE_TEX; break;
-		case OPCODE_TXB: rpti.Opcode = RADEON_OPCODE_TXB; break;
-		case OPCODE_TXP: rpti.Opcode = RADEON_OPCODE_TXP; break;
-		default:
-		case OPCODE_KIL: rpti.Opcode = RADEON_OPCODE_KIL; break;
-		}
-
-		rpti.DestIndex = inst->DstReg.Index;
-		rpti.WriteMask = inst->DstReg.WriteMask;
-		rpti.TexSrcUnit = inst->TexSrcUnit;
-		rpti.TexSrcTarget = inst->TexSrcTarget;
-		rpti.SrcIndex = inst->SrcReg[0].Index;
-		rpti.SrcSwizzle = inst->SrcReg[0].Swizzle;
-
-		s->Compiler->Base.Error = s->Compiler->Base.Error || !s->Handler->EmitTex(s->UserData, &rpti);
-	}
-
-	if (s->Compiler->Base.Debug)
-		_mesa_printf(" END_TEX\n");
-}
-
-
-static int alloc_pair_source(struct pair_state *s, struct radeon_pair_instruction *pair,
-	struct prog_src_register src, GLboolean rgb, GLboolean alpha)
+int rc_pair_alloc_source(struct rc_pair_instruction *pair,
+	unsigned int rgb, unsigned int alpha,
+	rc_register_file file, unsigned int index)
 {
 	int candidate = -1;
 	int candidate_quality = -1;
 	int i;
 
-	if (!rgb && !alpha)
+	if ((!rgb && !alpha) || file == RC_FILE_NONE)
 		return 0;
 
-	GLuint constant;
-	GLuint index;
-
-	if (src.File == PROGRAM_TEMPORARY || src.File == PROGRAM_INPUT) {
-		constant = 0;
-		index = get_hw_reg(s, src.File, src.Index);
-	} else {
-		constant = 1;
-		index = src.Index;
-	}
-
 	for(i = 0; i < 3; ++i) {
 		int q = 0;
 		if (rgb) {
 			if (pair->RGB.Src[i].Used) {
-				if (pair->RGB.Src[i].Constant != constant ||
+				if (pair->RGB.Src[i].File != file ||
 				    pair->RGB.Src[i].Index != index)
 					continue;
 				q++;
@@ -610,7 +55,7 @@ static int alloc_pair_source(struct pair_state *s, struct radeon_pair_instructio
 		}
 		if (alpha) {
 			if (pair->Alpha.Src[i].Used) {
-				if (pair->Alpha.Src[i].Constant != constant ||
+				if (pair->Alpha.Src[i].File != file ||
 				    pair->Alpha.Src[i].Index != index)
 					continue;
 				q++;
@@ -625,334 +70,15 @@ static int alloc_pair_source(struct pair_state *s, struct radeon_pair_instructio
 	if (candidate >= 0) {
 		if (rgb) {
 			pair->RGB.Src[candidate].Used = 1;
-			pair->RGB.Src[candidate].Constant = constant;
+			pair->RGB.Src[candidate].File = file;
 			pair->RGB.Src[candidate].Index = index;
 		}
 		if (alpha) {
 			pair->Alpha.Src[candidate].Used = 1;
-			pair->Alpha.Src[candidate].Constant = constant;
+			pair->Alpha.Src[candidate].File = file;
 			pair->Alpha.Src[candidate].Index = index;
 		}
 	}
 
 	return candidate;
 }
-
-/**
- * Fill the given ALU instruction's opcodes and source operands into the given pair,
- * if possible.
- */
-static GLboolean fill_instruction_into_pair(
-	struct pair_state *s,
-	struct radeon_pair_instruction *pair,
-	struct pair_state_instruction *pairinst)
-{
-	struct prog_instruction *inst = &pairinst->Instruction;
-
-	ASSERT(!pairinst->NeedRGB || pair->RGB.Opcode == OPCODE_NOP);
-	ASSERT(!pairinst->NeedAlpha || pair->Alpha.Opcode == OPCODE_NOP);
-
-	if (pairinst->NeedRGB) {
-		if (pairinst->IsTranscendent)
-			pair->RGB.Opcode = OPCODE_REPL_ALPHA;
-		else
-			pair->RGB.Opcode = inst->Opcode;
-		if (inst->SaturateMode == SATURATE_ZERO_ONE)
-			pair->RGB.Saturate = 1;
-	}
-	if (pairinst->NeedAlpha) {
-		pair->Alpha.Opcode = inst->Opcode;
-		if (inst->SaturateMode == SATURATE_ZERO_ONE)
-			pair->Alpha.Saturate = 1;
-	}
-
-	int nargs = _mesa_num_inst_src_regs(inst->Opcode);
-	int i;
-
-	/* Special case for DDX/DDY (MDH/MDV). */
-	if (inst->Opcode == OPCODE_DDX || inst->Opcode == OPCODE_DDY) {
-		if (pair->RGB.Src[0].Used || pair->Alpha.Src[0].Used)
-			return GL_FALSE;
-		else
-			nargs++;
-	}
-
-	for(i = 0; i < nargs; ++i) {
-		int source;
-		if (pairinst->NeedRGB && !pairinst->IsTranscendent) {
-			GLboolean srcrgb = GL_FALSE;
-			GLboolean srcalpha = GL_FALSE;
-			int j;
-			for(j = 0; j < 3; ++j) {
-				GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, j);
-				if (swz < 3)
-					srcrgb = GL_TRUE;
-				else if (swz < 4)
-					srcalpha = GL_TRUE;
-			}
-			source = alloc_pair_source(s, pair, inst->SrcReg[i], srcrgb, srcalpha);
-			if (source < 0)
-				return GL_FALSE;
-			pair->RGB.Arg[i].Source = source;
-			pair->RGB.Arg[i].Swizzle = inst->SrcReg[i].Swizzle & 0x1ff;
-			pair->RGB.Arg[i].Abs = inst->SrcReg[i].Abs;
-			pair->RGB.Arg[i].Negate = !!(inst->SrcReg[i].Negate & (NEGATE_X | NEGATE_Y | NEGATE_Z));
-		}
-		if (pairinst->NeedAlpha) {
-			GLboolean srcrgb = GL_FALSE;
-			GLboolean srcalpha = GL_FALSE;
-			GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, pairinst->IsTranscendent ? 0 : 3);
-			if (swz < 3)
-				srcrgb = GL_TRUE;
-			else if (swz < 4)
-				srcalpha = GL_TRUE;
-			source = alloc_pair_source(s, pair, inst->SrcReg[i], srcrgb, srcalpha);
-			if (source < 0)
-				return GL_FALSE;
-			pair->Alpha.Arg[i].Source = source;
-			pair->Alpha.Arg[i].Swizzle = swz;
-			pair->Alpha.Arg[i].Abs = inst->SrcReg[i].Abs;
-			pair->Alpha.Arg[i].Negate = !!(inst->SrcReg[i].Negate & NEGATE_W);
-		}
-	}
-
-	return GL_TRUE;
-}
-
-
-/**
- * Fill in the destination register information.
- *
- * This is split from filling in source registers because we want
- * to avoid allocating hardware temporaries for destinations until
- * we are absolutely certain that we're going to emit a certain
- * instruction pairing.
- */
-static void fill_dest_into_pair(
-	struct pair_state *s,
-	struct radeon_pair_instruction *pair,
-	struct pair_state_instruction *pairinst)
-{
-	struct prog_instruction *inst = &pairinst->Instruction;
-
-	if (inst->DstReg.File == PROGRAM_OUTPUT) {
-		if (inst->DstReg.Index == s->Compiler->OutputColor) {
-			pair->RGB.OutputWriteMask |= inst->DstReg.WriteMask & WRITEMASK_XYZ;
-			pair->Alpha.OutputWriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
-		} else if (inst->DstReg.Index == s->Compiler->OutputDepth) {
-			pair->Alpha.DepthWriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
-		}
-	} else {
-		GLuint hwindex = get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index);
-		if (pairinst->NeedRGB) {
-			pair->RGB.DestIndex = hwindex;
-			pair->RGB.WriteMask |= inst->DstReg.WriteMask & WRITEMASK_XYZ;
-		}
-		if (pairinst->NeedAlpha) {
-			pair->Alpha.DestIndex = hwindex;
-			pair->Alpha.WriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
-		}
-	}
-}
-
-
-/**
- * Find a good ALU instruction or pair of ALU instruction and emit it.
- *
- * Prefer emitting full ALU instructions, so that when we reach a point
- * where no full ALU instruction can be emitted, we have more candidates
- * for RGB/Alpha pairing.
- */
-static void emit_alu(struct pair_state *s)
-{
-	struct radeon_pair_instruction pair;
-	struct pair_state_instruction *psi;
-
-	if (s->ReadyFullALU || !(s->ReadyRGB && s->ReadyAlpha)) {
-		if (s->ReadyFullALU) {
-			psi = s->ReadyFullALU;
-			s->ReadyFullALU = s->ReadyFullALU->NextReady;
-		} else if (s->ReadyRGB) {
-			psi = s->ReadyRGB;
-			s->ReadyRGB = s->ReadyRGB->NextReady;
-		} else {
-			psi = s->ReadyAlpha;
-			s->ReadyAlpha = s->ReadyAlpha->NextReady;
-		}
-
-		_mesa_bzero(&pair, sizeof(pair));
-		fill_instruction_into_pair(s, &pair, psi);
-		fill_dest_into_pair(s, &pair, psi);
-		commit_instruction(s, psi);
-	} else {
-		struct pair_state_instruction **prgb;
-		struct pair_state_instruction **palpha;
-
-		/* Some pairings might fail because they require too
-		 * many source slots; try all possible pairings if necessary */
-		for(prgb = &s->ReadyRGB; *prgb; prgb = &(*prgb)->NextReady) {
-			for(palpha = &s->ReadyAlpha; *palpha; palpha = &(*palpha)->NextReady) {
-				struct pair_state_instruction * psirgb = *prgb;
-				struct pair_state_instruction * psialpha = *palpha;
-				_mesa_bzero(&pair, sizeof(pair));
-				fill_instruction_into_pair(s, &pair, psirgb);
-				if (!fill_instruction_into_pair(s, &pair, psialpha))
-					continue;
-				*prgb = (*prgb)->NextReady;
-				*palpha = (*palpha)->NextReady;
-				fill_dest_into_pair(s, &pair, psirgb);
-				fill_dest_into_pair(s, &pair, psialpha);
-				commit_instruction(s, psirgb);
-				commit_instruction(s, psialpha);
-				goto success;
-			}
-		}
-
-		/* No success in pairing; just take the first RGB instruction */
-		psi = s->ReadyRGB;
-		s->ReadyRGB = s->ReadyRGB->NextReady;
-
-		_mesa_bzero(&pair, sizeof(pair));
-		fill_instruction_into_pair(s, &pair, psi);
-		fill_dest_into_pair(s, &pair, psi);
-		commit_instruction(s, psi);
-	success: ;
-	}
-
-	if (s->Compiler->Base.Debug)
-		radeonPrintPairInstruction(&pair);
-
-	s->Compiler->Base.Error = s->Compiler->Base.Error || !s->Handler->EmitPaired(s->UserData, &pair);
-}
-
-/* Callback function for assigning input registers to hardware registers */
-static void alloc_helper(void * data, unsigned input, unsigned hwreg)
-{
-	struct pair_state * s = data;
-	alloc_hw_reg(s, PROGRAM_INPUT, input, hwreg);
-}
-
-void radeonPairProgram(
-	struct r300_fragment_program_compiler * compiler,
-	const struct radeon_pair_handler* handler, void *userdata)
-{
-	struct pair_state s;
-
-	_mesa_bzero(&s, sizeof(s));
-	s.Compiler = compiler;
-	s.Handler = handler;
-	s.UserData = userdata;
-	s.Verbose = GL_FALSE && s.Compiler->Base.Debug;
-
-	if (s.Compiler->Base.Debug)
-		_mesa_printf("Emit paired program\n");
-
-	scan_instructions(&s);
-	s.Compiler->AllocateHwInputs(s.Compiler, &alloc_helper, &s);
-
-	while(!s.Compiler->Base.Error &&
-	      (s.ReadyTEX || s.ReadyRGB || s.ReadyAlpha || s.ReadyFullALU)) {
-		if (s.ReadyTEX)
-			emit_all_tex(&s);
-
-		while(s.ReadyFullALU || s.ReadyRGB || s.ReadyAlpha)
-			emit_alu(&s);
-	}
-
-	if (s.Compiler->Base.Debug)
-		_mesa_printf(" END\n");
-}
-
-
-static void print_pair_src(int i, struct radeon_pair_instruction_source* src)
-{
-	_mesa_printf("  Src%i = %s[%i]", i, src->Constant ? "CNST" : "TEMP", src->Index);
-}
-
-static const char* opcode_string(GLuint opcode)
-{
-	if (opcode == OPCODE_REPL_ALPHA)
-		return "SOP";
-	else
-		return _mesa_opcode_string(opcode);
-}
-
-static int num_pairinst_args(GLuint opcode)
-{
-	if (opcode == OPCODE_REPL_ALPHA)
-		return 0;
-	else
-		return _mesa_num_inst_src_regs(opcode);
-}
-
-static char swizzle_char(GLuint swz)
-{
-	switch(swz) {
-	case SWIZZLE_X: return 'x';
-	case SWIZZLE_Y: return 'y';
-	case SWIZZLE_Z: return 'z';
-	case SWIZZLE_W: return 'w';
-	case SWIZZLE_ZERO: return '0';
-	case SWIZZLE_ONE: return '1';
-	case SWIZZLE_NIL: return '_';
-	default: return '?';
-	}
-}
-
-void radeonPrintPairInstruction(struct radeon_pair_instruction *inst)
-{
-	int nargs;
-	int i;
-
-	_mesa_printf("       RGB:  ");
-	for(i = 0; i < 3; ++i) {
-		if (inst->RGB.Src[i].Used)
-			print_pair_src(i, inst->RGB.Src + i);
-	}
-	_mesa_printf("\n");
-	_mesa_printf("       Alpha:");
-	for(i = 0; i < 3; ++i) {
-		if (inst->Alpha.Src[i].Used)
-			print_pair_src(i, inst->Alpha.Src + i);
-	}
-	_mesa_printf("\n");
-
-	_mesa_printf("  %s%s", opcode_string(inst->RGB.Opcode), inst->RGB.Saturate ? "_SAT" : "");
-	if (inst->RGB.WriteMask)
-		_mesa_printf(" TEMP[%i].%s%s%s", inst->RGB.DestIndex,
-			(inst->RGB.WriteMask & 1) ? "x" : "",
-			(inst->RGB.WriteMask & 2) ? "y" : "",
-			(inst->RGB.WriteMask & 4) ? "z" : "");
-	if (inst->RGB.OutputWriteMask)
-		_mesa_printf(" COLOR.%s%s%s",
-			(inst->RGB.OutputWriteMask & 1) ? "x" : "",
-			(inst->RGB.OutputWriteMask & 2) ? "y" : "",
-			(inst->RGB.OutputWriteMask & 4) ? "z" : "");
-	nargs = num_pairinst_args(inst->RGB.Opcode);
-	for(i = 0; i < nargs; ++i) {
-		const char* abs = inst->RGB.Arg[i].Abs ? "|" : "";
-		const char* neg = inst->RGB.Arg[i].Negate ? "-" : "";
-		_mesa_printf(", %s%sSrc%i.%c%c%c%s", neg, abs, inst->RGB.Arg[i].Source,
-			swizzle_char(GET_SWZ(inst->RGB.Arg[i].Swizzle, 0)),
-			swizzle_char(GET_SWZ(inst->RGB.Arg[i].Swizzle, 1)),
-			swizzle_char(GET_SWZ(inst->RGB.Arg[i].Swizzle, 2)),
-			abs);
-	}
-	_mesa_printf("\n");
-
-	_mesa_printf("  %s%s", opcode_string(inst->Alpha.Opcode), inst->Alpha.Saturate ? "_SAT" : "");
-	if (inst->Alpha.WriteMask)
-		_mesa_printf(" TEMP[%i].w", inst->Alpha.DestIndex);
-	if (inst->Alpha.OutputWriteMask)
-		_mesa_printf(" COLOR.w");
-	if (inst->Alpha.DepthWriteMask)
-		_mesa_printf(" DEPTH.w");
-	nargs = num_pairinst_args(inst->Alpha.Opcode);
-	for(i = 0; i < nargs; ++i) {
-		const char* abs = inst->Alpha.Arg[i].Abs ? "|" : "";
-		const char* neg = inst->Alpha.Arg[i].Negate ? "-" : "";
-		_mesa_printf(", %s%sSrc%i.%c%s", neg, abs, inst->Alpha.Arg[i].Source,
-			swizzle_char(inst->Alpha.Arg[i].Swizzle), abs);
-	}
-	_mesa_printf("\n");
-}
diff --git a/r300/compiler/radeon_program_pair.h b/r300/compiler/radeon_program_pair.h
index ff76178..1600598 100644
--- a/r300/compiler/radeon_program_pair.h
+++ b/r300/compiler/radeon_program_pair.h
@@ -28,116 +28,97 @@
 #ifndef __RADEON_PROGRAM_PAIR_H_
 #define __RADEON_PROGRAM_PAIR_H_
 
-#include "radeon_program.h"
+#include "radeon_code.h"
+#include "radeon_opcodes.h"
+#include "radeon_program_constants.h"
 
 struct r300_fragment_program_compiler;
 
 
 /**
- * Represents a paired instruction, as found in R300 and R500
+ * \file
+ * Represents a paired ALU instruction, as found in R300 and R500
  * fragment programs.
+ *
+ * Note that this representation is taking some liberties as far
+ * as register files are concerned, to allow separate register
+ * allocation.
+ *
+ * Also note that there are some subtleties in that the semantics
+ * of certain opcodes are implicitly changed in this representation;
+ * see \ref rc_pair_translate
  */
+
+
 struct radeon_pair_instruction_source {
-	GLuint Index:8;
-	GLuint Constant:1;
-	GLuint Used:1;
+	unsigned int Used:1;
+	rc_register_file File:3;
+	unsigned int Index:RC_REGISTER_INDEX_BITS;
 };
 
 struct radeon_pair_instruction_rgb {
-	GLuint Opcode:8;
-	GLuint DestIndex:8;
-	GLuint WriteMask:3;
-	GLuint OutputWriteMask:3;
-	GLuint Saturate:1;
+	rc_opcode Opcode:8;
+	unsigned int DestIndex:RC_REGISTER_INDEX_BITS;
+	unsigned int WriteMask:3;
+	unsigned int OutputWriteMask:3;
+	unsigned int Saturate:1;
 
 	struct radeon_pair_instruction_source Src[3];
 
 	struct {
-		GLuint Source:2;
-		GLuint Swizzle:9;
-		GLuint Abs:1;
-		GLuint Negate:1;
+		unsigned int Source:2;
+		unsigned int Swizzle:9;
+		unsigned int Abs:1;
+		unsigned int Negate:1;
 	} Arg[3];
 };
 
 struct radeon_pair_instruction_alpha {
-	GLuint Opcode:8;
-	GLuint DestIndex:8;
-	GLuint WriteMask:1;
-	GLuint OutputWriteMask:1;
-	GLuint DepthWriteMask:1;
-	GLuint Saturate:1;
+	rc_opcode Opcode:8;
+	unsigned int DestIndex:RC_REGISTER_INDEX_BITS;
+	unsigned int WriteMask:1;
+	unsigned int OutputWriteMask:1;
+	unsigned int DepthWriteMask:1;
+	unsigned int Saturate:1;
 
 	struct radeon_pair_instruction_source Src[3];
 
 	struct {
-		GLuint Source:2;
-		GLuint Swizzle:3;
-		GLuint Abs:1;
-		GLuint Negate:1;
+		unsigned int Source:2;
+		unsigned int Swizzle:3;
+		unsigned int Abs:1;
+		unsigned int Negate:1;
 	} Arg[3];
 };
 
-struct radeon_pair_instruction {
+struct rc_pair_instruction {
 	struct radeon_pair_instruction_rgb RGB;
 	struct radeon_pair_instruction_alpha Alpha;
-};
-
 
-enum {
-	RADEON_OPCODE_TEX = 0,
-	RADEON_OPCODE_TXB,
-	RADEON_OPCODE_TXP,
-	RADEON_OPCODE_KIL
+	rc_write_aluresult WriteALUResult:2;
+	rc_compare_func ALUResultCompare:3;
 };
 
-struct radeon_pair_texture_instruction {
-	GLuint Opcode:2; /**< one of RADEON_OPCODE_xxx */
-
-	GLuint DestIndex:8;
-	GLuint WriteMask:4;
 
-	GLuint TexSrcUnit:5;
-	GLuint TexSrcTarget:3;
-
-	GLuint SrcIndex:8;
-	GLuint SrcSwizzle:12;
-};
+/**
+ * General helper functions for dealing with the paired instruction format.
+ */
+/*@{*/
+int rc_pair_alloc_source(struct rc_pair_instruction *pair,
+	unsigned int rgb, unsigned int alpha,
+	rc_register_file file, unsigned int index);
+/*@}*/
 
 
 /**
- *
+ * Compiler passes that operate with the paired format.
  */
-struct radeon_pair_handler {
-	/**
-	 * Write a paired instruction to the hardware.
-	 *
-	 * @return GL_FALSE on error.
-	 */
-	GLboolean (*EmitPaired)(void*, struct radeon_pair_instruction*);
-
-	/**
-	 * Write a texture instruction to the hardware.
-	 * Register indices have already been rewritten to the allocated
-	 * hardware register numbers.
-	 *
-	 * @return GL_FALSE on error.
-	 */
-	GLboolean (*EmitTex)(void*, struct radeon_pair_texture_instruction*);
-
-	/**
-	 * Called before a block of contiguous, independent texture
-	 * instructions is emitted.
-	 */
-	GLboolean (*BeginTexBlock)(void*);
-
-	unsigned MaxHwTemps;
-};
-
-void radeonPairProgram(
-	struct r300_fragment_program_compiler * compiler,
-	const struct radeon_pair_handler*, void *userdata);
+/*@{*/
+struct radeon_pair_handler;
 
-void radeonPrintPairInstruction(struct radeon_pair_instruction *inst);
+void rc_pair_translate(struct r300_fragment_program_compiler *c);
+void rc_pair_schedule(struct r300_fragment_program_compiler *c);
+void rc_pair_regalloc(struct r300_fragment_program_compiler *c, unsigned maxtemps);
+/*@}*/
 
 #endif /* __RADEON_PROGRAM_PAIR_H_ */
diff --git a/r300/compiler/radeon_program_print.c b/r300/compiler/radeon_program_print.c
new file mode 100644
index 0000000..c980f5c
--- /dev/null
+++ b/r300/compiler/radeon_program_print.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "radeon_program.h"
+
+#include <stdio.h>
+/* Return the printable name of a texture target; values not listed yield "BAD_TEXTURE_TARGET". */
+static const char * textarget_to_string(rc_texture_target target)
+{
+	switch(target) {
+	case RC_TEXTURE_2D_ARRAY: return "2D_ARRAY";
+	case RC_TEXTURE_1D_ARRAY: return "1D_ARRAY";
+	case RC_TEXTURE_CUBE: return "CUBE";
+	case RC_TEXTURE_3D: return "3D";
+	case RC_TEXTURE_RECT: return "RECT";
+	case RC_TEXTURE_2D: return "2D";
+	case RC_TEXTURE_1D: return "1D";
+	default: return "BAD_TEXTURE_TARGET";
+	}
+}
+/* Print the comparison "lhs op rhs" selected by func to f; NEVER and ALWAYS print "false" / "true" instead. */
+static void rc_print_comparefunc(FILE * f, const char * lhs, rc_compare_func func, const char * rhs)
+{
+	if (func == RC_COMPARE_FUNC_NEVER) {
+		fprintf(f, "false");
+	} else if (func == RC_COMPARE_FUNC_ALWAYS) {
+		fprintf(f, "true");
+	} else {
+		const char * op;
+		switch(func) {
+		case RC_COMPARE_FUNC_LESS: op = "<"; break;
+		case RC_COMPARE_FUNC_EQUAL: op = "=="; break;
+		case RC_COMPARE_FUNC_LEQUAL: op = "<="; break;
+		case RC_COMPARE_FUNC_GREATER: op = ">"; break;
+		case RC_COMPARE_FUNC_NOTEQUAL: op = "!="; break;
+		case RC_COMPARE_FUNC_GEQUAL: op = ">="; break;
+		default: op = "???"; break; /* unrecognized function: keep printing, but make it visible */
+		}
+		fprintf(f, "%s %s %s", lhs, op, rhs);
+	}
+}
+/* Print a register reference to f as "none", a special-register name, or "file[index]" (plus " + addr[0]" if reladdr). */
+static void rc_print_register(FILE * f, rc_register_file file, int index, unsigned int reladdr)
+{
+	if (file == RC_FILE_NONE) {
+		fprintf(f, "none");
+	} else if (file == RC_FILE_SPECIAL) {
+		/* special registers are identified by index alone; reladdr is not printed for them */
+		switch(index) {
+		case RC_SPECIAL_ALU_RESULT: fprintf(f, "aluresult"); break;
+		default: fprintf(f, "special[%i]", index); break;
+		}
+	} else {
+		const char * filename;
+		switch(file) {
+		case RC_FILE_TEMPORARY: filename = "temp"; break;
+		case RC_FILE_INPUT: filename = "input"; break;
+		case RC_FILE_OUTPUT: filename = "output"; break;
+		case RC_FILE_ADDRESS: filename = "addr"; break;
+		case RC_FILE_CONSTANT: filename = "const"; break;
+		default: filename = "BAD FILE"; break;
+		}
+		fprintf(f, "%s[%i%s]", filename, index, reladdr ? " + addr[0]" : "");
+	}
+}
+/* Print one letter (x, y, z, w, in that order) for every RC_MASK_* bit set in mask; an empty mask prints nothing. */
+static void rc_print_mask(FILE * f, unsigned int mask)
+{
+	if (mask & RC_MASK_X) fprintf(f, "x");
+	if (mask & RC_MASK_Y) fprintf(f, "y");
+	if (mask & RC_MASK_Z) fprintf(f, "z");
+	if (mask & RC_MASK_W) fprintf(f, "w");
+}
+/* Print a destination register to f; the ".mask" suffix is omitted when all four channels are written. */
+static void rc_print_dst_register(FILE * f, struct rc_dst_register dst)
+{
+	rc_print_register(f, dst.File, dst.Index, dst.RelAddr);
+	if (dst.WriteMask != RC_MASK_XYZW) {
+		fprintf(f, ".");
+		rc_print_mask(f, dst.WriteMask);
+	}
+}
+/* Map a single swizzle selector to its display character; returns '?' for any value not listed below. */
+static char rc_swizzle_char(unsigned int swz)
+{
+	switch(swz) {
+	case RC_SWIZZLE_X: return 'x';
+	case RC_SWIZZLE_Y: return 'y';
+	case RC_SWIZZLE_Z: return 'z';
+	case RC_SWIZZLE_W: return 'w';
+	case RC_SWIZZLE_ZERO: return '0';
+	case RC_SWIZZLE_ONE: return '1';
+	case RC_SWIZZLE_HALF: return 'H';
+	case RC_SWIZZLE_UNUSED: return '_';
+	}
+	return '?';
+}
+/* Print the four components of a packed swizzle to f, each preceded by "-" when its bit in negate is set. */
+static void rc_print_swizzle(FILE * f, unsigned int swizzle, unsigned int negate)
+{
+	unsigned int comp;
+	for(comp = 0; comp < 4; ++comp) {
+		rc_swizzle swz = GET_SWZ(swizzle, comp);
+		if (GET_BIT(negate, comp))
+			fprintf(f, "-");
+		fprintf(f, "%c", rc_swizzle_char(swz));
+	}
+}
+/* Print a source operand to f: "-" / "|" decorations, the register, and ".swizzle" when it carries information. */
+static void rc_print_src_register(FILE * f, struct rc_src_register src)
+{
+	int trivial_negate = (src.Negate == RC_MASK_NONE || src.Negate == RC_MASK_XYZW); /* all-or-nothing negate */
+
+	if (src.Negate == RC_MASK_XYZW)
+		fprintf(f, "-"); /* whole-operand negate is shown once, as a prefix */
+	if (src.Abs)
+		fprintf(f, "|");
+
+	rc_print_register(f, src.File, src.Index, src.RelAddr);
+
+	if (src.Abs && !trivial_negate)
+		fprintf(f, "|"); /* close before the swizzle so the per-component "-" signs end up outside the bars */
+
+	if (src.Swizzle != RC_SWIZZLE_XYZW || !trivial_negate) {
+		fprintf(f, ".");
+		rc_print_swizzle(f, src.Swizzle, trivial_negate ? 0 : src.Negate);
+	}
+
+	if (src.Abs && trivial_negate)
+		fprintf(f, "|"); /* with a trivial negate the bars enclose register and swizzle together */
+}
+/* Print one non-paired instruction to f on a single line: opcode, saturate suffix, operands, texture info, aluresult. */
+static void rc_print_normal_instruction(FILE * f, struct rc_instruction * inst)
+{
+	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
+	unsigned int reg;
+
+	fprintf(f, "%s", opcode->Name);
+
+	switch(inst->U.I.SaturateMode) {
+	case RC_SATURATE_NONE: break;
+	case RC_SATURATE_ZERO_ONE: fprintf(f, "_SAT"); break;
+	case RC_SATURATE_MINUS_PLUS_ONE: fprintf(f, "_SAT2"); break;
+	default: fprintf(f, "_BAD_SAT"); break;
+	}
+
+	if (opcode->HasDstReg) {
+		fprintf(f, " ");
+		rc_print_dst_register(f, inst->U.I.DstReg);
+		if (opcode->NumSrcRegs)
+			fprintf(f, ","); /* separator between the destination and the first source */
+	}
+
+	for(reg = 0; reg < opcode->NumSrcRegs; ++reg) {
+		if (reg > 0)
+			fprintf(f, ",");
+		fprintf(f, " ");
+		rc_print_src_register(f, inst->U.I.SrcReg[reg]);
+	}
+
+	if (opcode->HasTexture) {
+		fprintf(f, ", %s%s[%u]",
+			textarget_to_string(inst->U.I.TexSrcTarget),
+			inst->U.I.TexShadow ? "SHADOW" : "",
+			inst->U.I.TexSrcUnit);
+	}
+
+	fprintf(f, ";");
+
+	if (inst->U.I.WriteALUResult) {
+		fprintf(f, " [aluresult = (");
+		rc_print_comparefunc(f,
+			(inst->U.I.WriteALUResult == RC_ALURESULT_X) ? "x" : "w", /* any nonzero value other than X prints as "w" */
+			inst->U.I.ALUResultCompare, "0");
+		fprintf(f, ")]");
+	}
+
+	fprintf(f, "\n");
+}
+/* Print a paired ALU instruction to f: one line of source slots, then one line per non-NOP RGB / Alpha operation. */
+static void rc_print_pair_instruction(FILE * f, struct rc_instruction * fullinst)
+{
+	struct rc_pair_instruction * inst = &fullinst->U.P;
+	int printedsrc = 0; /* set once a slot has been printed, so later slots get a ", " separator */
+	unsigned int src, arg;
+
+	for(src = 0; src < 3; ++src) {
+		if (inst->RGB.Src[src].Used) {
+			if (printedsrc)
+				fprintf(f, ", ");
+			fprintf(f, "src%u.xyz = ", src); /* %u: src is unsigned int */
+			rc_print_register(f, inst->RGB.Src[src].File, inst->RGB.Src[src].Index, 0);
+			printedsrc = 1;
+		}
+		if (inst->Alpha.Src[src].Used) {
+			if (printedsrc)
+				fprintf(f, ", ");
+			fprintf(f, "src%u.w = ", src); /* %u: src is unsigned int */
+			rc_print_register(f, inst->Alpha.Src[src].File, inst->Alpha.Src[src].Index, 0);
+			printedsrc = 1;
+		}
+	}
+	fprintf(f, "\n"); /* the source line is terminated even when no slot is in use */
+
+	if (inst->RGB.Opcode != RC_OPCODE_NOP) {
+		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->RGB.Opcode);
+
+		fprintf(f, "     %s%s", opcode->Name, inst->RGB.Saturate ? "_SAT" : "");
+		if (inst->RGB.WriteMask)
+			fprintf(f, " temp[%i].%s%s%s", inst->RGB.DestIndex,
+				(inst->RGB.WriteMask & 1) ? "x" : "",
+				(inst->RGB.WriteMask & 2) ? "y" : "",
+				(inst->RGB.WriteMask & 4) ? "z" : "");
+		if (inst->RGB.OutputWriteMask)
+			fprintf(f, " color.%s%s%s",
+				(inst->RGB.OutputWriteMask & 1) ? "x" : "",
+				(inst->RGB.OutputWriteMask & 2) ? "y" : "",
+				(inst->RGB.OutputWriteMask & 4) ? "z" : "");
+		if (inst->WriteALUResult == RC_ALURESULT_X)
+			fprintf(f, " aluresult"); /* tag the RGB line when WriteALUResult selects X */
+
+		for (arg = 0; arg < opcode->NumSrcRegs; ++arg) {
+			const char* abs = inst->RGB.Arg[arg].Abs ? "|" : "";
+			const char* neg = inst->RGB.Arg[arg].Negate ? "-" : "";
+			fprintf(f, ", %s%ssrc%i.%c%c%c%s", neg, abs, inst->RGB.Arg[arg].Source,
+				rc_swizzle_char(GET_SWZ(inst->RGB.Arg[arg].Swizzle, 0)),
+				rc_swizzle_char(GET_SWZ(inst->RGB.Arg[arg].Swizzle, 1)),
+				rc_swizzle_char(GET_SWZ(inst->RGB.Arg[arg].Swizzle, 2)),
+				abs);
+		}
+		fprintf(f, "\n");
+	}
+
+	if (inst->Alpha.Opcode != RC_OPCODE_NOP) {
+		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->Alpha.Opcode);
+
+		fprintf(f, "     %s%s", opcode->Name, inst->Alpha.Saturate ? "_SAT" : "");
+		if (inst->Alpha.WriteMask)
+			fprintf(f, " temp[%i].w", inst->Alpha.DestIndex);
+		if (inst->Alpha.OutputWriteMask)
+			fprintf(f, " color.w");
+		if (inst->Alpha.DepthWriteMask)
+			fprintf(f, " depth.w");
+		if (inst->WriteALUResult == RC_ALURESULT_W)
+			fprintf(f, " aluresult"); /* tag the Alpha line when WriteALUResult selects W */
+
+		for(arg = 0; arg < opcode->NumSrcRegs; ++arg) {
+			const char* abs = inst->Alpha.Arg[arg].Abs ? "|" : "";
+			const char* neg = inst->Alpha.Arg[arg].Negate ? "-" : "";
+			fprintf(f, ", %s%ssrc%i.%c%s", neg, abs, inst->Alpha.Arg[arg].Source,
+				rc_swizzle_char(inst->Alpha.Arg[arg].Swizzle), abs); /* Alpha swizzle is one selector, not a packed triple */
+		}
+		fprintf(f, "\n");
+	}
+
+	if (inst->WriteALUResult) {
+		fprintf(f, "      [aluresult = (");
+		rc_print_comparefunc(f, "result", inst->ALUResultCompare, "0");
+		fprintf(f, ")]\n");
+	}
+}
+
+/**
+ * Print every instruction of \p prog to stderr, numbered from 0; the walk ends on return to the list head.
+ */
+void rc_print_program(const struct rc_program *prog)
+{
+	unsigned int linenum = 0;
+	struct rc_instruction *inst;
+
+	fprintf(stderr, "# Radeon Compiler Program\n");
+
+	for(inst = prog->Instructions.Next; inst != &prog->Instructions; inst = inst->Next) {
+		fprintf(stderr, "%3u: ", linenum); /* %u: linenum is unsigned int */
+
+		if (inst->Type == RC_INSTRUCTION_PAIR)
+			rc_print_pair_instruction(stderr, inst);
+		else
+			rc_print_normal_instruction(stderr, inst);
+
+		linenum++;
+	}
+}
diff --git a/r300/compiler/radeon_swizzle.h b/r300/compiler/radeon_swizzle.h
new file mode 100644
index 0000000..c81d5f7
--- /dev/null
+++ b/r300/compiler/radeon_swizzle.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2009 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef RADEON_SWIZZLE_H
+#define RADEON_SWIZZLE_H
+
+#include "radeon_program.h"
+/* Output of rc_swizzle_caps::Split; presumably NumPhases entries of Phase[] are valid -- confirm in the Split implementations. */
+struct rc_swizzle_split {
+	unsigned char NumPhases;
+	unsigned char Phase[4];
+};
+
+/**
+ * Describe the swizzling capability of target hardware.
+ */
+struct rc_swizzle_caps {
+	/**
+	 * Check whether the given swizzle, absolute and negate combination
+	 * can be implemented natively by the hardware for this opcode.
+	 *
+	 * \return 1 if the swizzle is native for the given opcode
+	 */
+	int (*IsNative)(rc_opcode opcode, struct rc_src_register reg);
+
+	/**
+	 * Determine how to split access to the channels of \p reg selected by \p mask
+	 * to obtain ALU-native swizzles; the result is returned through \p split.
+	 */
+	void (*Split)(struct rc_src_register reg, unsigned int mask, struct rc_swizzle_split * split);
+};
+
+#endif /* RADEON_SWIZZLE_H */
diff --git a/r300/r300_cmdbuf.c b/r300/r300_cmdbuf.c
index 0fe32a5..ad8db6e 100644
--- a/r300/r300_cmdbuf.c
+++ b/r300/r300_cmdbuf.c
@@ -46,14 +46,12 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "r300_context.h"
 #include "r300_ioctl.h"
-#include "radeon_reg.h"
 #include "r300_reg.h"
 #include "r300_cmdbuf.h"
 #include "r300_emit.h"
 #include "radeon_bocs_wrapper.h"
 #include "radeon_mipmap_tree.h"
 #include "r300_state.h"
-#include "radeon_reg.h"
 #include "radeon_queryobj.h"
 
 /** # of dwords reserved for additional instructions that may need to be written
@@ -171,7 +169,7 @@ static void emit_tex_offsets(GLcontext *ctx, struct radeon_state_atom * atom)
 		if (t && !t->image_override) {
 			BEGIN_BATCH_NO_AUTOSTATE(4);
 			OUT_BATCH_REGSEQ(R300_TX_OFFSET_0 + (i * 4), 1);
-			OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
+			OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, get_base_teximage_offset(t),
 					RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
 			END_BATCH();
 		} else if (!t) {
@@ -279,16 +277,33 @@ static void emit_cb_offset(GLcontext *ctx, struct radeon_state_atom * atom)
 	cbpitch = (rrb->pitch / rrb->cpp);
 	if (rrb->cpp == 4)
 		cbpitch |= R300_COLOR_FORMAT_ARGB8888;
-	else switch (rrb->base._ActualFormat) {
-	case GL_RGB5:
+	else switch (rrb->base.Format) {
+        case MESA_FORMAT_RGB565:
+		assert(_mesa_little_endian());
 		cbpitch |= R300_COLOR_FORMAT_RGB565;
 		break;
-	case GL_RGBA4:
+        case MESA_FORMAT_RGB565_REV:
+		assert(!_mesa_little_endian());
+		cbpitch |= R300_COLOR_FORMAT_RGB565;
+		break;
+        case MESA_FORMAT_ARGB4444:
+		assert(_mesa_little_endian());
 		cbpitch |= R300_COLOR_FORMAT_ARGB4444;
 		break;
-	case GL_RGB5_A1:
+        case MESA_FORMAT_ARGB4444_REV:
+		assert(!_mesa_little_endian());
+		cbpitch |= R300_COLOR_FORMAT_ARGB4444;
+		break;
+	case MESA_FORMAT_ARGB1555:
+		assert(_mesa_little_endian());
 		cbpitch |= R300_COLOR_FORMAT_ARGB1555;
 		break;
+	case MESA_FORMAT_ARGB1555_REV:
+		assert(!_mesa_little_endian());
+		cbpitch |= R300_COLOR_FORMAT_ARGB1555;
+		break;
+	default:
+		_mesa_problem(ctx, "unexpected format in emit_cb_offset()");
 	}
 
 	if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
@@ -684,11 +699,7 @@ void r300InitCmdBuf(r300ContextPtr r300)
 	r300->hw.rb3d_dither_ctl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_DITHER_CTL, 9);
 	ALLOC_STATE(rb3d_aaresolve_ctl, always, 2, 0);
 	r300->hw.rb3d_aaresolve_ctl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_AARESOLVE_CTL, 1);
-	if ((r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) ||
-	      ( !r300->radeon.radeonScreen->kernel_mm && (
-	    (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RS400) ||
-	    (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV410) ||
-	    (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R420) ) ) ) {
+	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV350) {
 		ALLOC_STATE(rb3d_discard_src_pixel_lte_threshold, always, 3, 0);
 	} else {
 		ALLOC_STATE(rb3d_discard_src_pixel_lte_threshold, never, 3, 0);
@@ -697,6 +708,14 @@ void r300InitCmdBuf(r300ContextPtr r300)
 	ALLOC_STATE(zs, always, R300_ZS_CMDSIZE, 0);
 	r300->hw.zs.cmd[R300_ZS_CMD_0] =
 	    cmdpacket0(r300->radeon.radeonScreen, R300_ZB_CNTL, 3);
+	if (is_r500) {
+		if (r300->radeon.radeonScreen->kernel_mm)
+			ALLOC_STATE(zsb, always, R300_ZSB_CMDSIZE, 0);
+		else
+			ALLOC_STATE(zsb, never, R300_ZSB_CMDSIZE, 0);
+		r300->hw.zsb.cmd[R300_ZSB_CMD_0] =
+			cmdpacket0(r300->radeon.radeonScreen, R500_ZB_STENCILREFMASK_BF, 1);
+	}
 
 	ALLOC_STATE(zstencil_format, always, 5, 0);
 	r300->hw.zstencil_format.cmd[0] =
diff --git a/r300/r300_context.c b/r300/r300_context.c
index 2ea1b82..5f07b95 100644
--- a/r300/r300_context.c
+++ b/r300/r300_context.c
@@ -84,15 +84,16 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define need_GL_EXT_framebuffer_object
 #define need_GL_EXT_fog_coord
 #define need_GL_EXT_gpu_program_parameters
+#define need_GL_EXT_provoking_vertex
 #define need_GL_EXT_secondary_color
 #define need_GL_EXT_stencil_two_side
 #define need_GL_ATI_separate_stencil
 #define need_GL_NV_vertex_program
 
-#include "extension_helper.h"
+#include "main/remap_helper.h"
 
 
-const struct dri_extension card_extensions[] = {
+static const struct dri_extension card_extensions[] = {
   /* *INDENT-OFF* */
   {"GL_ARB_depth_texture",		NULL},
   {"GL_ARB_fragment_program",		NULL},
@@ -116,6 +117,7 @@ const struct dri_extension card_extensions[] = {
   {"GL_EXT_packed_depth_stencil",	NULL},
   {"GL_EXT_fog_coord",			GL_EXT_fog_coord_functions },
   {"GL_EXT_gpu_program_parameters",     GL_EXT_gpu_program_parameters_functions},
+  {"GL_EXT_provoking_vertex",           GL_EXT_provoking_vertex_functions },
   {"GL_EXT_secondary_color", 		GL_EXT_secondary_color_functions},
   {"GL_EXT_shadow_funcs",		NULL},
   {"GL_EXT_stencil_two_side",		GL_EXT_stencil_two_side_functions},
@@ -143,7 +145,7 @@ const struct dri_extension card_extensions[] = {
 };
 
 
-const struct dri_extension mm_extensions[] = {
+static const struct dri_extension mm_extensions[] = {
   { "GL_EXT_framebuffer_blit",	GL_EXT_framebuffer_blit_functions },
   { "GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions },
   { NULL, NULL }
@@ -153,7 +155,7 @@ const struct dri_extension mm_extensions[] = {
  * The GL 2.0 functions are needed to make display lists work with
  * functions added by GL_ATI_separate_stencil.
  */
-const struct dri_extension gl_20_extension[] = {
+static const struct dri_extension gl_20_extension[] = {
   {"GL_VERSION_2_0",			GL_VERSION_2_0_functions },
 };
 
@@ -374,11 +376,21 @@ static void r300InitConstValues(GLcontext *ctx, radeonScreenPtr screen)
 	if (screen->chip_family >= CHIP_FAMILY_RV515) {
 		ctx->Const.FragmentProgram.MaxNativeTemps = R500_PFS_NUM_TEMP_REGS;
 		ctx->Const.FragmentProgram.MaxNativeAttribs = 11;	/* copy i915... */
-		ctx->Const.FragmentProgram.MaxNativeParameters = R500_PFS_NUM_CONST_REGS;
-		ctx->Const.FragmentProgram.MaxNativeAluInstructions = R500_PFS_MAX_INST;
-		ctx->Const.FragmentProgram.MaxNativeTexInstructions = R500_PFS_MAX_INST;
-		ctx->Const.FragmentProgram.MaxNativeInstructions = R500_PFS_MAX_INST;
-		ctx->Const.FragmentProgram.MaxNativeTexIndirections = R500_PFS_MAX_INST;
+
+		/* The hardware limits are higher than this,
+		 * but the non-KMS DRM interface artificially limits us
+		 * to this many instructions.
+		 *
+		 * We could of course work around it in the KMS path,
+		 * but it would be a mess, so it seems wiser
+		 * to leave it as is. Going forward, the Gallium driver
+		 * will not be subject to these limitations.
+		 */
+		ctx->Const.FragmentProgram.MaxNativeParameters = 255;
+		ctx->Const.FragmentProgram.MaxNativeAluInstructions = 255;
+		ctx->Const.FragmentProgram.MaxNativeTexInstructions = 255;
+		ctx->Const.FragmentProgram.MaxNativeInstructions = 255;
+		ctx->Const.FragmentProgram.MaxNativeTexIndirections = 255;
 		ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;
 	} else {
 		ctx->Const.FragmentProgram.MaxNativeTemps = R300_PFS_NUM_TEMP_REGS;
@@ -427,11 +439,11 @@ static void r300InitGLExtensions(GLcontext *ctx)
 	if (r300->options.stencil_two_side_disabled)
 		_mesa_disable_extension(ctx, "GL_EXT_stencil_two_side");
 
-	if (r300->options.s3tc_force_enabled) {
+	if (r300->options.s3tc_force_disabled) {
+		_mesa_disable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+	} else if (ctx->Mesa_DXTn || r300->options.s3tc_force_enabled) {
 		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
 		_mesa_enable_extension(ctx, "GL_S3_s3tc");
-	} else if (r300->options.s3tc_force_disabled) {
-		_mesa_disable_extension(ctx, "GL_EXT_texture_compression_s3tc");
 	}
 
 	if (!r300->radeon.radeonScreen->drmSupportsOcclusionQueries) {
diff --git a/r300/r300_context.h b/r300/r300_context.h
index 1dadcc0..518d5cd 100644
--- a/r300/r300_context.h
+++ b/r300/r300_context.h
@@ -234,6 +234,10 @@ typedef struct r300_context *r300ContextPtr;
 #define R300_ZS_CNTL_2		3
 #define R300_ZS_CMDSIZE		4
 
+#define R300_ZSB_CMD_0		0
+#define R300_ZSB_CNTL_0		1
+#define R300_ZSB_CMDSIZE	2
+
 #define R300_ZB_CMD_0		0
 #define R300_ZB_OFFSET		1
 #define R300_ZB_PITCH		2
@@ -343,6 +347,7 @@ struct r300_hw_state {
 	struct radeon_state_atom rb3d_aaresolve_ctl;	/* (4E88) */
 	struct radeon_state_atom rb3d_discard_src_pixel_lte_threshold;	/* (4E88) I saw it only written on RV350 hardware..  */
 	struct radeon_state_atom zs;	/* zstencil control (4F00) */
+	struct radeon_state_atom zsb;	/* zstencil bf */
 	struct radeon_state_atom zstencil_format;
 	struct radeon_state_atom zb;	/* z buffer (4F20) */
 	struct radeon_state_atom zb_depthclearvalue;	/* (4F28) */
diff --git a/r300/r300_draw.c b/r300/r300_draw.c
index e9968f9..06a0490 100644
--- a/r300/r300_draw.c
+++ b/r300/r300_draw.c
@@ -29,7 +29,7 @@
 #include "main/glheader.h"
 #include "main/context.h"
 #include "main/state.h"
-#include "main/api_validate.h"
+/* #include "main/api_validate.h" */
 #include "main/enums.h"
 #include "main/simple_list.h"
 
diff --git a/r300/r300_emit.h b/r300/r300_emit.h
index 8e57e35..a456d88 100644
--- a/r300/r300_emit.h
+++ b/r300/r300_emit.h
@@ -42,7 +42,6 @@
 #include "main/glheader.h"
 #include "r300_context.h"
 #include "r300_cmdbuf.h"
-#include "radeon_reg.h"
 
 static INLINE uint32_t cmdpacket0(struct radeon_screen *rscrn,
                                   int reg, int count)
diff --git a/r300/r300_fragprog_common.c b/r300/r300_fragprog_common.c
index 0bdc90b..267ee81 100644
--- a/r300/r300_fragprog_common.c
+++ b/r300/r300_fragprog_common.c
@@ -44,6 +44,7 @@
 
 #include "compiler/radeon_compiler.h"
 
+#include "radeon_mesa_to_rc.h"
 #include "r300_state.h"
 
 
@@ -131,7 +132,7 @@ static void insert_WPOS_trailer(struct r300_fragment_program_compiler *compiler,
  */
 static void rewriteFog(struct r300_fragment_program_compiler *compiler, struct r300_fragment_program * fp)
 {
-	struct prog_src_register src;
+	struct rc_src_register src;
 	int i;
 
 	fp->fog_attr = FRAG_ATTRIB_MAX;
@@ -155,7 +156,7 @@ static void rewriteFog(struct r300_fragment_program_compiler *compiler, struct r
 	}
 
 	memset(&src, 0, sizeof(src));
-	src.File = PROGRAM_INPUT;
+	src.File = RC_FILE_INPUT;
 	src.Index = fp->fog_attr;
 	src.Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ONE);
 	rc_move_input(&compiler->Base, FRAG_ATTRIB_FOGC, src);
@@ -232,13 +233,26 @@ static void translate_fragment_program(GLcontext *ctx, struct r300_fragment_prog
 		fflush(stderr);
 	}
 
-	rc_mesa_to_rc_program(&compiler.Base, &cont->Base.Base);
+	radeon_mesa_to_rc_program(&compiler.Base, &cont->Base.Base);
 
 	insert_WPOS_trailer(&compiler, fp);
 
 	rewriteFog(&compiler, fp);
 
 	r3xx_compile_fragment_program(&compiler);
+
+	if (compiler.is_r500) {
+		/* We need to support the non-KMS DRM interface, which
+		 * artificially limits the number of instructions and
+		 * constants which are available to us.
+		 *
+		 * See also the comment in r300_context.c where we
+		 * set the MAX_NATIVE_xxx values.
+		 */
+		if (fp->code.code.r500.inst_end >= 255 || fp->code.constants.Count > 255)
+			rc_error(&compiler.Base, "Program is too big (upgrade to r300g to avoid this limitation).\n");
+	}
+
 	fp->error = compiler.Base.Error;
 
 	fp->InputsRead = compiler.Base.Program.InputsRead;
diff --git a/r300/r300_reg.h b/r300/r300_reg.h
index 39b4b61..ea684e7 100644
--- a/r300/r300_reg.h
+++ b/r300/r300_reg.h
@@ -1022,15 +1022,13 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 	R300_GA_COLOR_CONTROL_RGB0_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA0_SHADING_GOURAUD | \
 	R300_GA_COLOR_CONTROL_RGB1_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA1_SHADING_GOURAUD | \
 	R300_GA_COLOR_CONTROL_RGB2_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA2_SHADING_GOURAUD | \
-	R300_GA_COLOR_CONTROL_RGB3_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA3_SHADING_GOURAUD | \
-	R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST )
+	R300_GA_COLOR_CONTROL_RGB3_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA3_SHADING_GOURAUD)
 /** TODO: might be candidate for removal, the GOURAUD stuff also looks buggy to me */
 #	define R300_RE_SHADE_MODEL_FLAT     ( \
 	R300_GA_COLOR_CONTROL_RGB0_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA0_SHADING_FLAT | \
 	R300_GA_COLOR_CONTROL_RGB1_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA1_SHADING_GOURAUD | \
 	R300_GA_COLOR_CONTROL_RGB2_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA2_SHADING_FLAT | \
-	R300_GA_COLOR_CONTROL_RGB3_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA3_SHADING_GOURAUD | \
-	R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST )
+	R300_GA_COLOR_CONTROL_RGB3_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA3_SHADING_GOURAUD)
 
 /* Specifies red & green components of fill color -- S312 format -- Backwards comp. */
 #define R300_GA_SOLID_RG                         0x427c
@@ -1791,6 +1789,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_ALU_DSTC_OUTPUT_X           (1 << 26)
 #       define R300_ALU_DSTC_OUTPUT_Y           (1 << 27)
 #       define R300_ALU_DSTC_OUTPUT_Z           (1 << 28)
+#       define R300_RGB_TARGET(x)               ((x) << 29)
 
 #define R300_US_ALU_ALPHA_ADDR_0                 0x47C0
 #       define R300_ALU_SRC0A_SHIFT             0
@@ -1808,6 +1807,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_ALU_DSTA_REG                (1 << 23)
 #       define R300_ALU_DSTA_OUTPUT             (1 << 24)
 #		define R300_ALU_DSTA_DEPTH              (1 << 27)
+#		define R300_ALPHA_TARGET(x)             ((x) << 25)
 
 #define R300_US_ALU_RGB_INST_0                   0x48C0
 #       define R300_ALU_ARGC_SRC0C_XYZ          0
@@ -2315,6 +2315,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #	define R300_Z_WRITE_ENABLE		 (1 << 2)
 #	define R300_Z_SIGNED_COMPARE		 (1 << 3)
 #	define R300_STENCIL_FRONT_BACK		 (1 << 4)
+#	define R400_ZSIGNED_MAGNITUDE		 (1 << 5)
+#	define R500_STENCIL_REFMASK_FRONT_BACK	 (1 << 6)
 
 #define R300_ZB_ZSTENCILCNTL                   0x4f04
 	/* functions */
@@ -3002,6 +3004,8 @@ enum {
 #   define R500_INST_RGB_CLAMP				(1 << 19)
 #   define R500_INST_ALPHA_CLAMP			(1 << 20)
 #   define R500_INST_ALU_RESULT_SEL			(1 << 21)
+#   define R500_INST_ALU_RESULT_SEL_RED		(0 << 21)
+#   define R500_INST_ALU_RESULT_SEL_ALPHA		(1 << 21)
 #   define R500_INST_ALPHA_PRED_INV			(1 << 22)
 #   define R500_INST_ALU_RESULT_OP_EQ			(0 << 23)
 #   define R500_INST_ALU_RESULT_OP_LT			(1 << 23)
diff --git a/r300/r300_render.c b/r300/r300_render.c
index 3cd3875..4ae593c 100644
--- a/r300/r300_render.c
+++ b/r300/r300_render.c
@@ -67,8 +67,6 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "vbo/vbo_split.h"
 #include "tnl/tnl.h"
 #include "tnl/t_vp_build.h"
-#include "radeon_reg.h"
-#include "radeon_macros.h"
 #include "r300_context.h"
 #include "r300_ioctl.h"
 #include "r300_state.h"
diff --git a/r300/r300_state.c b/r300/r300_state.c
index 9301543..ac20c08 100644
--- a/r300/r300_state.c
+++ b/r300/r300_state.c
@@ -45,7 +45,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/framebuffer.h"
 #include "main/simple_list.h"
 #include "main/api_arrayelt.h"
-#include "main/texformat.h"
 
 #include "swrast/swrast.h"
 #include "swrast_setup/swrast_setup.h"
@@ -590,7 +589,9 @@ static void r300SetDepthState(GLcontext * ctx)
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
 
 	R300_STATECHANGE(r300, zs);
-	r300->hw.zs.cmd[R300_ZS_CNTL_0] &= R300_STENCIL_ENABLE|R300_STENCIL_FRONT_BACK;
+	r300->hw.zs.cmd[R300_ZS_CNTL_0] &= (R300_STENCIL_ENABLE |
+					    R300_STENCIL_FRONT_BACK |
+					    R500_STENCIL_REFMASK_FRONT_BACK);
 	r300->hw.zs.cmd[R300_ZS_CNTL_1] &= ~(R300_ZS_MASK << R300_Z_FUNC_SHIFT);
 
 	if (ctx->Depth.Test) {
@@ -604,11 +605,16 @@ static void r300SetDepthState(GLcontext * ctx)
 
 static void r300CatchStencilFallback(GLcontext *ctx)
 {
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	const unsigned back = ctx->Stencil._BackFace;
 
-	if (ctx->Stencil._Enabled && (ctx->Stencil.Ref[0] != ctx->Stencil.Ref[back]
-		|| ctx->Stencil.ValueMask[0] != ctx->Stencil.ValueMask[back]
-		|| ctx->Stencil.WriteMask[0] != ctx->Stencil.WriteMask[back])) {
+	if (rmesa->radeon.radeonScreen->kernel_mm &&
+	    (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)) {
+		r300SwitchFallback(ctx, R300_FALLBACK_STENCIL_TWOSIDE, GL_FALSE);
+	} else if (ctx->Stencil._Enabled &&
+		   (ctx->Stencil.Ref[0] != ctx->Stencil.Ref[back]
+		    || ctx->Stencil.ValueMask[0] != ctx->Stencil.ValueMask[back]
+		    || ctx->Stencil.WriteMask[0] != ctx->Stencil.WriteMask[back])) {
 		r300SwitchFallback(ctx, R300_FALLBACK_STENCIL_TWOSIDE, GL_TRUE);
 	} else {
 		r300SwitchFallback(ctx, R300_FALLBACK_STENCIL_TWOSIDE, GL_FALSE);
@@ -915,11 +921,24 @@ static void r300StencilFuncSeparate(GLcontext * ctx, GLenum face,
 	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
 	    (flag << R300_S_BACK_FUNC_SHIFT);
 	rmesa->hw.zs.cmd[R300_ZS_CNTL_2] |= refmask;
+
+	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+		rmesa->hw.zs.cmd[R300_ZS_CNTL_0] |= R500_STENCIL_REFMASK_FRONT_BACK;
+		R300_STATECHANGE(rmesa, zsb);
+		refmask = ((ctx->Stencil.Ref[back] & 0xff) << R300_STENCILREF_SHIFT)
+			| ((ctx->Stencil.ValueMask[back] & 0xff) << R300_STENCILMASK_SHIFT);
+
+		rmesa->hw.zsb.cmd[R300_ZSB_CNTL_0] &=
+			~((R300_STENCILREF_MASK << R300_STENCILREF_SHIFT) |
+			  (R300_STENCILREF_MASK << R300_STENCILMASK_SHIFT));
+		rmesa->hw.zsb.cmd[R300_ZSB_CNTL_0] |= refmask;
+	}
 }
 
 static void r300StencilMaskSeparate(GLcontext * ctx, GLenum face, GLuint mask)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	const unsigned back = ctx->Stencil._BackFace;
 
 	r300CatchStencilFallback(ctx);
 
@@ -931,6 +950,13 @@ static void r300StencilMaskSeparate(GLcontext * ctx, GLenum face, GLuint mask)
 	    (ctx->Stencil.
 	     WriteMask[0] & R300_STENCILREF_MASK) <<
 	     R300_STENCILWRITEMASK_SHIFT;
+	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+		R300_STATECHANGE(rmesa, zsb);
+		rmesa->hw.zsb.cmd[R300_ZSB_CNTL_0] |=
+			(ctx->Stencil.
+			 WriteMask[back] & R300_STENCILREF_MASK) <<
+			R300_STENCILWRITEMASK_SHIFT;
+	}
 }
 
 static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
@@ -2253,6 +2279,14 @@ static void r300InvalidateState(GLcontext * ctx, GLuint new_state)
 		R300_STATECHANGE(r300, zb);
 	}
 
+	if (new_state & (_NEW_LIGHT)) {
+		R300_STATECHANGE(r300, shade2);
+		if (ctx->Light.ProvokingVertex == GL_LAST_VERTEX_CONVENTION)
+			r300->hw.shade2.cmd[1] |= R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST;
+		else
+			r300->hw.shade2.cmd[1] &= ~R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST;
+	}
+
 	r300->radeon.NewGLState |= new_state;
 }
 
diff --git a/r300/r300_tex.c b/r300/r300_tex.c
index 433e5a8..726b3ff 100644
--- a/r300/r300_tex.c
+++ b/r300/r300_tex.c
@@ -40,7 +40,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/image.h"
 #include "main/mipmap.h"
 #include "main/simple_list.h"
-#include "main/texformat.h"
 #include "main/texstore.h"
 #include "main/teximage.h"
 #include "main/texobj.h"
@@ -196,6 +195,7 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
 			     GLenum pname, const GLfloat * params)
 {
 	radeonTexObj* t = radeon_tex_obj(texObj);
+	GLenum texBaseFormat;
 
 	if (RADEON_DEBUG & (RADEON_STATE | RADEON_TEXTURE)) {
 		fprintf(stderr, "%s( %s )\n", __FUNCTION__,
@@ -223,23 +223,16 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
 	case GL_TEXTURE_MAX_LEVEL:
 	case GL_TEXTURE_MIN_LOD:
 	case GL_TEXTURE_MAX_LOD:
-		/* This isn't the most efficient solution but there doesn't appear to
-		 * be a nice alternative.  Since there's no LOD clamping,
-		 * we just have to rely on loading the right subset of mipmap levels
-		 * to simulate a clamped LOD.
-		 */
-		if (t->mt) {
-			radeon_miptree_unreference(t->mt);
-			t->mt = 0;
-			t->validated = GL_FALSE;
-		}
+		t->validated = GL_FALSE;
 		break;
 
 	case GL_DEPTH_TEXTURE_MODE:
 		if (!texObj->Image[0][texObj->BaseLevel])
 			return;
-		if (texObj->Image[0][texObj->BaseLevel]->TexFormat->BaseFormat
-		    == GL_DEPTH_COMPONENT) {
+		texBaseFormat = texObj->Image[0][texObj->BaseLevel]->_BaseFormat;
+
+		if (texBaseFormat == GL_DEPTH_COMPONENT ||
+			texBaseFormat == GL_DEPTH_STENCIL) {
 			r300SetDepthTexMode(texObj);
 			break;
 		} else {
@@ -268,7 +261,11 @@ static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
 
 	if (rmesa) {
 		int i;
-		radeon_firevertices(&rmesa->radeon);
+		struct radeon_bo *bo;
+		bo = !t->mt ? t->bo : t->mt->bo;
+		if (bo && radeon_bo_is_referenced_by_cs(bo, rmesa->radeon.cmdbuf.cs)) {
+			radeon_firevertices(&rmesa->radeon);
+		}
 
 		for(i = 0; i < R300_MAX_TEXTURE_UNITS; ++i)
 			if (rmesa->hw.textures[i] == t)
@@ -280,10 +277,8 @@ static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
 		t->bo = NULL;
 	}
 
-	if (t->mt) {
-		radeon_miptree_unreference(t->mt);
-		t->mt = 0;
-	}
+	radeon_miptree_unreference(&t->mt);
+
 	_mesa_delete_texture_object(ctx, texObj);
 }
 
diff --git a/r300/r300_texstate.c b/r300/r300_texstate.c
index f030451..bbe8b1e 100644
--- a/r300/r300_texstate.c
+++ b/r300/r300_texstate.c
@@ -39,7 +39,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/imports.h"
 #include "main/context.h"
 #include "main/macros.h"
-#include "main/texformat.h"
 #include "main/teximage.h"
 #include "main/texobj.h"
 #include "main/enums.h"
@@ -84,6 +83,7 @@ static const struct tx_table {
 	_ASSIGN(ARGB8888, R300_EASY_TX_FORMAT(W, Z, Y, X, W8Z8Y8X8)),
 	_ASSIGN(ARGB8888_REV, R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8)),
 #endif
+	_ASSIGN(XRGB8888, R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8)),
 	_ASSIGN(RGB888, R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8)),
 	_ASSIGN(RGB565, R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5)),
 	_ASSIGN(RGB565_REV, R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5)),
@@ -138,9 +138,9 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
 			R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X16),
 		},
 		{
-			R300_EASY_TX_FORMAT(X, X, X, ONE, X24_Y8),
-			R300_EASY_TX_FORMAT(X, X, X, X, X24_Y8),
-			R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X24_Y8),
+			R300_EASY_TX_FORMAT(Y, Y, Y, ONE, X24_Y8),
+			R300_EASY_TX_FORMAT(Y, Y, Y, Y, X24_Y8),
+			R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, Y, X24_Y8),
 		},
 		{
 			R300_EASY_TX_FORMAT(X, X, X, ONE, X32),
@@ -156,11 +156,11 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
 
 	t = radeon_tex_obj(tObj);
 
-	switch (tObj->Image[0][tObj->BaseLevel]->TexFormat->MesaFormat) {
+	switch (tObj->Image[0][tObj->BaseLevel]->TexFormat) {
 	case MESA_FORMAT_Z16:
 		format = formats[0];
 		break;
-	case MESA_FORMAT_Z24_S8:
+	case MESA_FORMAT_S8_Z24:
 		format = formats[1];
 		break;
 	case MESA_FORMAT_Z32:
@@ -203,19 +203,17 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
 static void setup_hardware_state(r300ContextPtr rmesa, radeonTexObj *t)
 {
 	const struct gl_texture_image *firstImage;
-	int firstlevel = t->mt ? t->mt->firstLevel : 0;
-	    
-	firstImage = t->base.Image[0][firstlevel];
+	firstImage = t->base.Image[0][t->minLod];
 
 	if (!t->image_override
-	    && VALID_FORMAT(firstImage->TexFormat->MesaFormat)) {
-		if (firstImage->TexFormat->BaseFormat == GL_DEPTH_COMPONENT) {
+	    && VALID_FORMAT(firstImage->TexFormat)) {
+		if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) {
 			r300SetDepthTexMode(&t->base);
 		} else {
-			t->pp_txformat = tx_table[firstImage->TexFormat->MesaFormat].format;
+			t->pp_txformat = tx_table[firstImage->TexFormat].format;
 		}
 
-		t->pp_txfilter |= tx_table[firstImage->TexFormat->MesaFormat].filter;
+		t->pp_txfilter |= tx_table[firstImage->TexFormat].filter;
 	} else if (!t->image_override) {
 		_mesa_problem(NULL, "unexpected texture format in %s",
 			      __FUNCTION__);
@@ -225,10 +223,10 @@ static void setup_hardware_state(r300ContextPtr rmesa, radeonTexObj *t)
 	if (t->image_override && t->bo)
 		return;
 
-	t->pp_txsize = (((firstImage->Width - 1) << R300_TX_WIDTHMASK_SHIFT)
-			| ((firstImage->Height - 1) << R300_TX_HEIGHTMASK_SHIFT)
-			| ((firstImage->DepthLog2) << R300_TX_DEPTHMASK_SHIFT)
-			| ((t->mt->lastLevel - t->mt->firstLevel) << R300_TX_MAX_MIP_LEVEL_SHIFT));
+	t->pp_txsize = (((R300_TX_WIDTHMASK_MASK & ((firstImage->Width - 1) << R300_TX_WIDTHMASK_SHIFT)))
+			| ((R300_TX_HEIGHTMASK_MASK & ((firstImage->Height - 1) << R300_TX_HEIGHTMASK_SHIFT)))
+			| ((R300_TX_DEPTHMASK_MASK & ((firstImage->DepthLog2) << R300_TX_DEPTHMASK_SHIFT)))
+			| ((R300_TX_MAX_MIP_LEVEL_MASK & ((t->maxLod - t->minLod) << R300_TX_MAX_MIP_LEVEL_SHIFT))));
 
 	t->tile_bits = 0;
 
@@ -239,7 +237,7 @@ static void setup_hardware_state(r300ContextPtr rmesa, radeonTexObj *t)
 
 
 	if (t->base.Target == GL_TEXTURE_RECTANGLE_NV) {
-		unsigned int align = (64 / t->mt->bpp) - 1;
+		unsigned int align = (64 / _mesa_get_format_bytes(firstImage->TexFormat)) - 1;
 		t->pp_txsize |= R300_TX_SIZE_TXPITCH_EN;
 		if (!t->image_override)
 			t->pp_txpitch = ((firstImage->Width + align) & ~align) - 1;
@@ -248,8 +246,12 @@ static void setup_hardware_state(r300ContextPtr rmesa, radeonTexObj *t)
 	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
 	    if (firstImage->Width > 2048)
 		t->pp_txpitch |= R500_TXWIDTH_BIT11;
+            else
+		t->pp_txpitch &= ~R500_TXWIDTH_BIT11;
 	    if (firstImage->Height > 2048)
 		t->pp_txpitch |= R500_TXHEIGHT_BIT11;
+            else
+		t->pp_txpitch &= ~R500_TXHEIGHT_BIT11;
 	}
 }
 
@@ -434,20 +436,13 @@ void r300SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_fo
 		radeon_bo_unref(rImage->bo);
 		rImage->bo = NULL;
 	}
-	if (t->mt) {
-		radeon_miptree_unreference(t->mt);
-		t->mt = NULL;
-	}
-	if (rImage->mt) {
-		radeon_miptree_unreference(rImage->mt);
-		rImage->mt = NULL;
-	}
+
+	radeon_miptree_unreference(&t->mt);
+	radeon_miptree_unreference(&rImage->mt);
+
 	_mesa_init_teximage_fields(radeon->glCtx, target, texImage,
 				   rb->base.Width, rb->base.Height, 1, 0, rb->cpp);
 	texImage->RowStride = rb->pitch / rb->cpp;
-	texImage->TexFormat = radeonChooseTextureFormat(radeon->glCtx,
-							internalFormat,
-							type, format, 0);
 	rImage->bo = rb->bo;
 	radeon_bo_ref(rImage->bo);
 	t->bo = rb->bo;
@@ -479,16 +474,20 @@ void r300SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_fo
 		break;
 	}
 	pitch_val--;
-	t->pp_txsize = ((rb->base.Width - 1) << R300_TX_WIDTHMASK_SHIFT) |
-              ((rb->base.Height - 1) << R300_TX_HEIGHTMASK_SHIFT);
+	t->pp_txsize = (((R300_TX_WIDTHMASK_MASK & ((rb->base.Width - 1) << R300_TX_WIDTHMASK_SHIFT)))
+			| ((R300_TX_HEIGHTMASK_MASK & ((rb->base.Height - 1) << R300_TX_HEIGHTMASK_SHIFT))));
 	t->pp_txsize |= R300_TX_SIZE_TXPITCH_EN;
 	t->pp_txpitch |= pitch_val;
 
 	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
 	    if (rb->base.Width > 2048)
 		t->pp_txpitch |= R500_TXWIDTH_BIT11;
+            else
+		t->pp_txpitch &= ~R500_TXWIDTH_BIT11;
 	    if (rb->base.Height > 2048)
 		t->pp_txpitch |= R500_TXHEIGHT_BIT11;
+            else
+		t->pp_txpitch &= ~R500_TXHEIGHT_BIT11;
 	}
 	t->validated = GL_TRUE;
 	_mesa_unlock_texture(radeon->glCtx, texObj);
diff --git a/r300/r300_vertprog.c b/r300/r300_vertprog.c
index 2f7b67c..c2f96af 100644
--- a/r300/r300_vertprog.c
+++ b/r300/r300_vertprog.c
@@ -41,7 +41,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "tnl/tnl.h"
 
 #include "compiler/radeon_compiler.h"
-#include "compiler/radeon_nqssadce.h"
+#include "radeon_mesa_to_rc.h"
 #include "r300_context.h"
 #include "r300_fragprog_common.h"
 #include "r300_state.h"
@@ -62,12 +62,6 @@ static int r300VertexProgUpdateParams(GLcontext * ctx, struct r300_vertex_progra
 		}
 	}
 
-	if (vp->code.constants.Count * 4 > VSF_MAX_FRAGMENT_LENGTH) {
-		/* Should have checked this earlier... */
-		fprintf(stderr, "%s:Params exhausted\n", __FUNCTION__);
-		_mesa_exit(-1);
-	}
-
 	for(i = 0; i < vp->code.constants.Count; ++i) {
 		const float * src = 0;
 		const struct rc_constant * constant = &vp->code.constants.Constants[i];
@@ -217,20 +211,20 @@ static void initialize_NV_registers(struct radeon_compiler * compiler)
 
 	for(reg = 0; reg < 12; ++reg) {
 		inst = rc_insert_new_instruction(compiler, &compiler->Program.Instructions);
-		inst->I.Opcode = OPCODE_MOV;
-		inst->I.DstReg.File = PROGRAM_TEMPORARY;
-		inst->I.DstReg.Index = reg;
-		inst->I.SrcReg[0].File = PROGRAM_BUILTIN;
-		inst->I.SrcReg[0].Swizzle = SWIZZLE_0000;
+		inst->U.I.Opcode = RC_OPCODE_MOV;
+		inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
+		inst->U.I.DstReg.Index = reg;
+		inst->U.I.SrcReg[0].File = RC_FILE_NONE;
+		inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000;
 	}
 
 	inst = rc_insert_new_instruction(compiler, &compiler->Program.Instructions);
-	inst->I.Opcode = OPCODE_ARL;
-	inst->I.DstReg.File = PROGRAM_ADDRESS;
-	inst->I.DstReg.Index = 0;
-	inst->I.DstReg.WriteMask = WRITEMASK_X;
-	inst->I.SrcReg[0].File = PROGRAM_BUILTIN;
-	inst->I.SrcReg[0].Swizzle = SWIZZLE_0000;
+	inst->U.I.Opcode = RC_OPCODE_ARL;
+	inst->U.I.DstReg.File = RC_FILE_ADDRESS;
+	inst->U.I.DstReg.Index = 0;
+	inst->U.I.DstReg.WriteMask = WRITEMASK_X;
+	inst->U.I.SrcReg[0].File = RC_FILE_NONE;
+	inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000;
 }
 
 static struct r300_vertex_program *build_program(GLcontext *ctx,
@@ -261,7 +255,7 @@ static struct r300_vertex_program *build_program(GLcontext *ctx,
 		_mesa_insert_mvp_code(ctx, vp->Base);
 	}
 
-	rc_mesa_to_rc_program(&compiler.Base, &vp->Base->Base);
+	radeon_mesa_to_rc_program(&compiler.Base, &vp->Base->Base);
 
 	if (mesa_vp->IsNVProgram)
 		initialize_NV_registers(&compiler.Base);
@@ -281,6 +275,11 @@ static struct r300_vertex_program *build_program(GLcontext *ctx,
 	}
 
 	r3xx_compile_vertex_program(&compiler);
+
+	if (vp->code.constants.Count > ctx->Const.VertexProgram.MaxParameters) {
+		rc_error(&compiler.Base, "Program exceeds constant buffer size limit\n");
+	}
+
 	vp->error = compiler.Base.Error;
 
 	vp->Base->Base.InputsRead = vp->code.InputsRead;
@@ -334,7 +333,6 @@ struct r300_vertex_program * r300SelectAndTranslateVertexShader(GLcontext *ctx)
 #define bump_vpu_count(ptr, new_count)   do { \
 		drm_r300_cmd_header_t* _p=((drm_r300_cmd_header_t*)(ptr)); \
 		int _nc=(new_count)/4; \
-		assert(_nc < 256); \
 		if(_nc>_p->vpu.count)_p->vpu.count=_nc; \
 	} while(0)
 
diff --git a/r300/radeon_context.h b/r300/radeon_context.h
index 250570f..da4812d 100644
--- a/r300/radeon_context.h
+++ b/r300/radeon_context.h
@@ -51,26 +51,12 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "radeon_screen.h"
 
-#if R200_MERGED
-extern void radeonFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
-
-#define FALLBACK( radeon, bit, mode ) do {			\
-   if ( 0 ) fprintf( stderr, "FALLBACK in %s: #%d=%d\n",	\
-		     __FUNCTION__, bit, mode );			\
-   radeonFallback( (radeon)->glCtx, bit, mode );		\
-} while (0)
-#else
 #define FALLBACK( radeon, bit, mode ) fprintf(stderr, "%s:%s\n", __LINE__, __FILE__);
-#endif
 
 /* TCL fallbacks */
 extern void radeonTclFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
 
-#if R200_MERGED
-#define TCL_FALLBACK( ctx, bit, mode )	radeonTclFallback( ctx, bit, mode )
-#else
 #define TCL_FALLBACK( ctx, bit, mode )	;
-#endif
 
 
 #endif				/* __RADEON_CONTEXT_H__ */
diff --git a/r300/radeon_mesa_to_rc.c b/r300/radeon_mesa_to_rc.c
new file mode 100644
index 0000000..9f9dec8
--- /dev/null
+++ b/r300/radeon_mesa_to_rc.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (C) 2009 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_mesa_to_rc.h"
+
+#include "main/mtypes.h"
+#include "shader/prog_instruction.h"
+#include "shader/prog_parameter.h"
+
+#include "compiler/radeon_compiler.h"
+#include "compiler/radeon_program.h"
+
+/* Map a Mesa gl_inst_opcode to its rc_opcode; anything not listed becomes RC_OPCODE_ILLEGAL_OPCODE. */
+static rc_opcode translate_opcode(gl_inst_opcode opcode)
+{
+	switch(opcode) {
+	case OPCODE_NOP: return RC_OPCODE_NOP;
+	case OPCODE_ABS: return RC_OPCODE_ABS;
+	case OPCODE_ADD: return RC_OPCODE_ADD;
+	case OPCODE_ARL: return RC_OPCODE_ARL;
+	case OPCODE_CMP: return RC_OPCODE_CMP;
+	case OPCODE_COS: return RC_OPCODE_COS;
+	case OPCODE_DDX: return RC_OPCODE_DDX;
+	case OPCODE_DDY: return RC_OPCODE_DDY;
+	case OPCODE_DP3: return RC_OPCODE_DP3;
+	case OPCODE_DP4: return RC_OPCODE_DP4;
+	case OPCODE_DPH: return RC_OPCODE_DPH;
+	case OPCODE_DST: return RC_OPCODE_DST;
+	case OPCODE_EX2: return RC_OPCODE_EX2;
+	case OPCODE_EXP: return RC_OPCODE_EXP;
+	case OPCODE_FLR: return RC_OPCODE_FLR;
+	case OPCODE_FRC: return RC_OPCODE_FRC;
+	case OPCODE_KIL: return RC_OPCODE_KIL;
+	case OPCODE_LG2: return RC_OPCODE_LG2;
+	case OPCODE_LIT: return RC_OPCODE_LIT;
+	case OPCODE_LOG: return RC_OPCODE_LOG;
+	case OPCODE_LRP: return RC_OPCODE_LRP;
+	case OPCODE_MAD: return RC_OPCODE_MAD;
+	case OPCODE_MAX: return RC_OPCODE_MAX;
+	case OPCODE_MIN: return RC_OPCODE_MIN;
+	case OPCODE_MOV: return RC_OPCODE_MOV;
+	case OPCODE_MUL: return RC_OPCODE_MUL;
+	case OPCODE_POW: return RC_OPCODE_POW;
+	case OPCODE_RCP: return RC_OPCODE_RCP;
+	case OPCODE_RSQ: return RC_OPCODE_RSQ;
+	case OPCODE_SCS: return RC_OPCODE_SCS;
+	case OPCODE_SEQ: return RC_OPCODE_SEQ;
+	case OPCODE_SFL: return RC_OPCODE_SFL;
+	case OPCODE_SGE: return RC_OPCODE_SGE;
+	case OPCODE_SGT: return RC_OPCODE_SGT;
+	case OPCODE_SIN: return RC_OPCODE_SIN;
+	case OPCODE_SLE: return RC_OPCODE_SLE;
+	case OPCODE_SLT: return RC_OPCODE_SLT;
+	case OPCODE_SNE: return RC_OPCODE_SNE;
+	case OPCODE_SUB: return RC_OPCODE_SUB;
+	case OPCODE_SWZ: return RC_OPCODE_SWZ;
+	case OPCODE_TEX: return RC_OPCODE_TEX;
+	case OPCODE_TXB: return RC_OPCODE_TXB;
+	case OPCODE_TXD: return RC_OPCODE_TXD;
+	case OPCODE_TXL: return RC_OPCODE_TXL;
+	case OPCODE_TXP: return RC_OPCODE_TXP;
+	case OPCODE_XPD: return RC_OPCODE_XPD;
+	default: return RC_OPCODE_ILLEGAL_OPCODE;
+	}
+}
+
+/* Saturate mode: only SATURATE_ZERO_ONE clamps; every other value means none. */
+static rc_saturate_mode translate_saturate(unsigned int saturate)
+{
+	if (saturate == SATURATE_ZERO_ONE)
+		return RC_SATURATE_ZERO_ONE;
+
+	return RC_SATURATE_NONE;
+}
+
+/* Map a Mesa PROGRAM_* register file to the compiler's RC_FILE_* equivalent. */
+static rc_register_file translate_register_file(unsigned int file)
+{
+	/* All parameter-like Mesa files collapse into the one constant file. */
+	if (file == PROGRAM_LOCAL_PARAM || file == PROGRAM_ENV_PARAM ||
+	    file == PROGRAM_STATE_VAR || file == PROGRAM_NAMED_PARAM ||
+	    file == PROGRAM_CONSTANT || file == PROGRAM_UNIFORM)
+		return RC_FILE_CONSTANT;
+
+	if (file == PROGRAM_TEMPORARY) return RC_FILE_TEMPORARY;
+	if (file == PROGRAM_INPUT) return RC_FILE_INPUT;
+	if (file == PROGRAM_OUTPUT) return RC_FILE_OUTPUT;
+	if (file == PROGRAM_ADDRESS) return RC_FILE_ADDRESS;
+
+	return RC_FILE_NONE;
+}
+
+static void translate_srcreg(struct rc_src_register * dest, struct prog_src_register * src)
+{
+	dest->Negate = src->Negate;
+	dest->Abs = src->Abs;
+	dest->Swizzle = src->Swizzle;
+	dest->RelAddr = src->RelAddr;
+	dest->Index = src->Index;
+	dest->File = translate_register_file(src->File);	/* the only remapped field */
+}
+
+static void translate_dstreg(struct rc_dst_register * dest, struct prog_dst_register * src)
+{
+	dest->WriteMask = src->WriteMask;
+	dest->RelAddr = src->RelAddr;
+	dest->Index = src->Index;
+	dest->File = translate_register_file(src->File);	/* the only remapped field */
+}
+
+/* Map a Mesa texture index to an rc texture target; unknown indices become 2D. */
+static rc_texture_target translate_tex_target(gl_texture_index target)
+{
+	switch(target) {
+	case TEXTURE_1D_INDEX: return RC_TEXTURE_1D;
+	case TEXTURE_1D_ARRAY_INDEX: return RC_TEXTURE_1D_ARRAY;
+	case TEXTURE_2D_ARRAY_INDEX: return RC_TEXTURE_2D_ARRAY;
+	case TEXTURE_3D_INDEX: return RC_TEXTURE_3D;
+	case TEXTURE_CUBE_INDEX: return RC_TEXTURE_CUBE;
+	case TEXTURE_RECT_INDEX: return RC_TEXTURE_RECT;
+	default: return RC_TEXTURE_2D;
+	}
+}
+
+/* Convert one Mesa instruction into *dest; an unknown opcode reports rc_error and stops. */
+static void translate_instruction(struct radeon_compiler * c,
+		struct rc_instruction * dest, struct prog_instruction * src)
+{
+	const struct rc_opcode_info * info;
+	unsigned int n;
+
+	dest->U.I.Opcode = translate_opcode(src->Opcode);
+	if (dest->U.I.Opcode == RC_OPCODE_ILLEGAL_OPCODE) {
+		rc_error(c, "Unsupported opcode %i\n", src->Opcode);
+		return;
+	}
+	dest->U.I.SaturateMode = translate_saturate(src->SaturateMode);
+	info = rc_get_opcode_info(dest->U.I.Opcode);
+
+	/* Only the operands and fields the opcode info declares are translated. */
+	for(n = 0; n < info->NumSrcRegs; ++n)
+		translate_srcreg(&dest->U.I.SrcReg[n], &src->SrcReg[n]);
+	if (info->HasDstReg)
+		translate_dstreg(&dest->U.I.DstReg, &src->DstReg);
+
+	if (info->HasTexture) {
+		dest->U.I.TexShadow = src->TexShadow;
+		dest->U.I.TexSrcTarget = translate_tex_target(src->TexSrcTarget);
+		dest->U.I.TexSrcUnit = src->TexSrcUnit;
+	}
+}
+
+/*
+ * Translate a complete Mesa program into the radeon compiler's program:
+ * one rc instruction per Mesa instruction up to OPCODE_END, the sampler/
+ * input/output masks, and one external constant per parameter slot.
+ */
+void radeon_mesa_to_rc_program(struct radeon_compiler * c, struct gl_program * program)
+{
+	struct prog_instruction *source = program->Instructions;
+	unsigned int num_constants;
+	unsigned int i;
+	int isNVProgram = 0;
+
+	while (source->Opcode != OPCODE_END) {
+		/* Inserted relative to Instructions.Prev (presumably the list tail). */
+		struct rc_instruction * dest =
+			rc_insert_new_instruction(c, c->Program.Instructions.Prev);
+
+		translate_instruction(c, dest, source);
+		++source;
+	}
+
+	c->Program.ShadowSamplers = program->ShadowSamplers;
+	c->Program.InputsRead = program->InputsRead;
+	c->Program.OutputsWritten = program->OutputsWritten;
+
+	if (program->Target == GL_VERTEX_PROGRAM_ARB) {
+		struct gl_vertex_program * vp = (struct gl_vertex_program *) program;
+		isNVProgram = vp->IsNVProgram;
+	}
+
+	/* NV_vertex_program has a fixed-sized constant environment.
+	 * This could be handled more efficiently for programs that
+	 * do not use relative addressing.
+	 * All other programs get one constant per entry of their parameter list.
+	 */
+	num_constants = isNVProgram ? 96 : program->Parameters->NumParameters;
+
+	for(i = 0; i < num_constants; ++i) {
+		struct rc_constant constant;
+
+		constant.Type = RC_CONSTANT_EXTERNAL;
+		constant.Size = 4;
+		constant.u.External = i;
+
+		rc_constants_add(&c->Program.Constants, &constant);
+	}
+}
diff --git a/r300/radeon_mesa_to_rc.h b/r300/radeon_mesa_to_rc.h
new file mode 100644
index 0000000..9511a04
--- /dev/null
+++ b/r300/radeon_mesa_to_rc.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2009 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef RADEON_MESA_TO_RC_H
+#define RADEON_MESA_TO_RC_H
+
+struct gl_program;      /* forward declaration; used only through a pointer here */
+struct radeon_compiler; /* forward declaration; used only through a pointer here */
+
+void radeon_mesa_to_rc_program(struct radeon_compiler * c, struct gl_program * program); /* fills c->Program from the given Mesa program */
+
+#endif /* RADEON_MESA_TO_RC_H */
diff --git a/r600/Makefile.am b/r600/Makefile.am
index 6b2091e..c2ce9c8 100644
--- a/r600/Makefile.am
+++ b/r600/Makefile.am
@@ -1,7 +1,6 @@
 AM_CFLAGS = -DIN_DRI_DRIVER -DGLX_DIRECT_RENDERING -DGLX_INDIRECT_RENDERING
 
-R600_CFLAGS = -DCOMPILE_R600 -DR200_MERGED=0 -DRADEON_COMMON=1 -DRADEON_COMMON_FOR_R600
-R600_CFLAGS += -I../radeon -I../radeon/server
+R600_CFLAGS = -DRADEON_R600 -I../radeon -I../radeon/server
 
 r600_dri_la_LTLIBRARIES = r600_dri.la
 r600_dri_la_CFLAGS = $(AM_CFLAGS) $(DRM_CFLAGS) $(DRI_CFLAGS) $(R600_CFLAGS)
@@ -11,6 +10,7 @@ r600_dri_ladir = @libdir@/dri
 r600_dri_la_SOURCES = \
 	../radeon/radeon_bo_legacy.c \
 	../radeon/radeon_common_context.c \
+	../radeon/radeon_buffer_objects.c \
 	../radeon/radeon_common.c \
 	../radeon/radeon_cs_legacy.c \
 	../radeon/radeon_dma.c \
@@ -44,5 +44,7 @@ if HAVE_LIBDRM_RADEON
 r600_dri_la_CFLAGS += -DHAVE_LIBDRM_RADEON=1 $(LIBDRM_RADEON_CFLAGS)
 r600_dri_la_LDFLAGS += $(LIBDRM_RADEON_LIBS)
 r600_dri_la_SOURCES += \
-	../radeon/radeon_cs_space_drm.c
+	../radeon/radeon_cs_space_drm.c \
+	../radeon/radeon_bo.c \
+	../radeon/radeon_cs.c
 endif
diff --git a/r600/r600_cmdbuf.c b/r600/r600_cmdbuf.c
index 3cfe03a..370bb04 100644
--- a/r600/r600_cmdbuf.c
+++ b/r600/r600_cmdbuf.c
@@ -52,29 +52,49 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_mipmap_tree.h"
 #include "radeon_reg.h"
 
+#ifdef HAVE_LIBDRM_RADEON
+#include "radeon_cs_int.h"
+#else
+#include "radeon_cs_int_drm.h"
+#endif
 
+struct r600_cs_manager_legacy
+{
+    struct radeon_cs_manager    base; /* generic manager; the cs's csm pointer is cast to this struct */
+    struct radeon_context       *ctx; /* presumably the context handed to the legacy ctor - confirm */
+    /* hack for scratch stuff */
+    uint32_t                    pending_age; /* set from cs_cmd.cs_id in r600_cs_emit(); passed to radeon_bo_legacy_pending() */
+    uint32_t                    pending_count; /* set to 1 by r600_cs_emit() */
+};
+
+struct r600_cs_reloc_legacy {
+    struct radeon_cs_reloc  base; /* bo, read_domain, write_domain and flags of the relocated buffer */
+    uint32_t                cindices; /* number of valid entries in indices[] and reloc_indices[] */
+    uint32_t                *indices; /* cdw value recorded for each use of the bo */
+    uint32_t                *reloc_indices; /* packet offsets patched with the NOP header + reloc offset at emit time */
+};
 
-static struct radeon_cs * r600_cs_create(struct radeon_cs_manager *csm,
-                                   uint32_t ndw)
+static struct radeon_cs_int *r600_cs_create(struct radeon_cs_manager *csm,
+					    uint32_t ndw)
 {
-    struct radeon_cs *cs;
+    struct radeon_cs_int *csi;
 
-    cs = (struct radeon_cs*)calloc(1, sizeof(struct radeon_cs));
-    if (cs == NULL) {
+    csi = (struct radeon_cs_int*)calloc(1, sizeof(struct radeon_cs_int));
+    if (csi == NULL) {
         return NULL;
     }
-    cs->csm = csm;
-    cs->ndw = (ndw + 0x3FF) & (~0x3FF);
-    cs->packets = (uint32_t*)malloc(4*cs->ndw);
-    if (cs->packets == NULL) {
-        free(cs);
+    csi->csm = csm;
+    csi->ndw = (ndw + 0x3FF) & (~0x3FF);
+    csi->packets = (uint32_t*)malloc(4*csi->ndw);
+    if (csi->packets == NULL) {
+        free(csi);
         return NULL;
     }
-    cs->relocs_total_size = 0;
-    return cs;
+    csi->relocs_total_size = 0;
+    return csi;
 }
 
-static int r600_cs_write_reloc(struct radeon_cs *cs,
+static int r600_cs_write_reloc(struct radeon_cs_int *csi,
 			       struct radeon_bo *bo,
 			       uint32_t read_domain,
 			       uint32_t write_domain,
@@ -83,7 +103,7 @@ static int r600_cs_write_reloc(struct radeon_cs *cs,
     struct r600_cs_reloc_legacy *relocs;
     int i;
 
-    relocs = (struct r600_cs_reloc_legacy *)cs->relocs;
+    relocs = (struct r600_cs_reloc_legacy *)csi->relocs;
     /* check domains */
     if ((read_domain && write_domain) || (!read_domain && !write_domain)) {
         /* in one CS a bo can only be in read or write domain but not
@@ -98,7 +118,7 @@ static int r600_cs_write_reloc(struct radeon_cs *cs,
         return -EINVAL;
     }
     /* check if bo is already referenced */
-    for(i = 0; i < cs->crelocs; i++) {
+    for(i = 0; i < csi->crelocs; i++) {
         uint32_t *indices;
         uint32_t *reloc_indices;
 
@@ -129,109 +149,108 @@ static int r600_cs_write_reloc(struct radeon_cs *cs,
             }
             relocs[i].indices = indices;
             relocs[i].reloc_indices = reloc_indices;
-            relocs[i].indices[relocs[i].cindices - 1] = cs->cdw;
-            relocs[i].reloc_indices[relocs[i].cindices - 1] = cs->cdw;
-            cs->section_cdw += 2;
-	    cs->cdw += 2;
+            relocs[i].indices[relocs[i].cindices - 1] = csi->cdw;
+            relocs[i].reloc_indices[relocs[i].cindices - 1] = csi->cdw;
+            csi->section_cdw += 2;
+	    csi->cdw += 2;
 
             return 0;
         }
     }
     /* add bo to reloc */
     relocs = (struct r600_cs_reloc_legacy*)
-             realloc(cs->relocs,
-                     sizeof(struct r600_cs_reloc_legacy) * (cs->crelocs + 1));
+             realloc(csi->relocs,
+                     sizeof(struct r600_cs_reloc_legacy) * (csi->crelocs + 1));
     if (relocs == NULL) {
         return -ENOMEM;
     }
-    cs->relocs = relocs;
-    relocs[cs->crelocs].base.bo = bo;
-    relocs[cs->crelocs].base.read_domain = read_domain;
-    relocs[cs->crelocs].base.write_domain = write_domain;
-    relocs[cs->crelocs].base.flags = flags;
-    relocs[cs->crelocs].indices = (uint32_t*)malloc(4);
-    relocs[cs->crelocs].reloc_indices = (uint32_t*)malloc(4);
-    if ( (relocs[cs->crelocs].indices == NULL) || (relocs[cs->crelocs].reloc_indices == NULL) )
+    csi->relocs = relocs;
+    relocs[csi->crelocs].base.bo = bo;
+    relocs[csi->crelocs].base.read_domain = read_domain;
+    relocs[csi->crelocs].base.write_domain = write_domain;
+    relocs[csi->crelocs].base.flags = flags;
+    relocs[csi->crelocs].indices = (uint32_t*)malloc(4);
+    relocs[csi->crelocs].reloc_indices = (uint32_t*)malloc(4);
+    if ( (relocs[csi->crelocs].indices == NULL) || (relocs[csi->crelocs].reloc_indices == NULL) )
     {
         return -ENOMEM;
     }
 
-    relocs[cs->crelocs].indices[0] = cs->cdw;
-    relocs[cs->crelocs].reloc_indices[0] = cs->cdw;
-    cs->section_cdw += 2;
-    cs->cdw += 2;
-    relocs[cs->crelocs].cindices = 1;
-    cs->relocs_total_size += radeon_bo_legacy_relocs_size(bo);
-    cs->crelocs++;
+    relocs[csi->crelocs].indices[0] = csi->cdw;
+    relocs[csi->crelocs].reloc_indices[0] = csi->cdw;
+    csi->section_cdw += 2;
+    csi->cdw += 2;
+    relocs[csi->crelocs].cindices = 1;
+    csi->relocs_total_size += radeon_bo_legacy_relocs_size(bo);
+    csi->crelocs++;
 
     radeon_bo_ref(bo);
 
     return 0;
 }
 
-static int r600_cs_begin(struct radeon_cs *cs,
+static int r600_cs_begin(struct radeon_cs_int *csi,
                     uint32_t ndw,
                     const char *file,
                     const char *func,
                     int line)
 {
-    if (cs->section) {
+    if (csi->section_ndw) {
         fprintf(stderr, "CS already in a section(%s,%s,%d)\n",
-                cs->section_file, cs->section_func, cs->section_line);
+                csi->section_file, csi->section_func, csi->section_line);
         fprintf(stderr, "CS can't start section(%s,%s,%d)\n",
                 file, func, line);
         return -EPIPE;
     }
 
-    cs->section = 1;
-    cs->section_ndw = ndw;
-    cs->section_cdw = 0;
-    cs->section_file = file;
-    cs->section_func = func;
-    cs->section_line = line;
+    csi->section_ndw = ndw;
+    csi->section_cdw = 0;
+    csi->section_file = file;
+    csi->section_func = func;
+    csi->section_line = line;
 
-    if (cs->cdw + ndw > cs->ndw) {
+    if (csi->cdw + ndw > csi->ndw) {
         uint32_t tmp, *ptr;
 	int num = (ndw > 0x400) ? ndw : 0x400;
 
-        tmp = (cs->cdw + num + 0x3FF) & (~0x3FF);
-        ptr = (uint32_t*)realloc(cs->packets, 4 * tmp);
+        tmp = (csi->cdw + num + 0x3FF) & (~0x3FF);
+        ptr = (uint32_t*)realloc(csi->packets, 4 * tmp);
         if (ptr == NULL) {
             return -ENOMEM;
         }
-        cs->packets = ptr;
-        cs->ndw = tmp;
+        csi->packets = ptr;
+        csi->ndw = tmp;
     }
 
     return 0;
 }
 
-static int r600_cs_end(struct radeon_cs *cs,
+static int r600_cs_end(struct radeon_cs_int *csi,
                   const char *file,
                   const char *func,
                   int line)
 
 {
-    if (!cs->section) {
+    if (!csi->section_ndw) {
         fprintf(stderr, "CS no section to end at (%s,%s,%d)\n",
                 file, func, line);
         return -EPIPE;
     }
-    cs->section = 0;
 
-    if ( cs->section_ndw != cs->section_cdw ) {
+    if ( csi->section_ndw != csi->section_cdw ) {
         fprintf(stderr, "CS section size missmatch start at (%s,%s,%d) %d vs %d\n",
-                cs->section_file, cs->section_func, cs->section_line, cs->section_ndw, cs->section_cdw);
-        fprintf(stderr, "cs->section_ndw = %d, cs->cdw = %d, cs->section_cdw = %d \n",
-                cs->section_ndw, cs->cdw, cs->section_cdw);
+                csi->section_file, csi->section_func, csi->section_line, csi->section_ndw, csi->section_cdw);
+        fprintf(stderr, "csi->section_ndw = %d, csi->cdw = %d, csi->section_cdw = %d \n",
+                csi->section_ndw, csi->cdw, csi->section_cdw);
         fprintf(stderr, "CS section end at (%s,%s,%d)\n",
                 file, func, line);
         return -EPIPE;
     }
+    csi->section_ndw = 0;
 
-    if (cs->cdw > cs->ndw) {
+    if (csi->cdw > csi->ndw) {
 	    fprintf(stderr, "CS section overflow at (%s,%s,%d) cdw %d ndw %d\n",
-		    cs->section_file, cs->section_func, cs->section_line,cs->cdw,cs->ndw);
+		    csi->section_file, csi->section_func, csi->section_line,csi->cdw,csi->ndw);
 	    fprintf(stderr, "CS section end at (%s,%s,%d)\n",
 		    file, func, line);
 	    assert(0);
@@ -240,21 +259,21 @@ static int r600_cs_end(struct radeon_cs *cs,
     return 0;
 }
 
-static int r600_cs_process_relocs(struct radeon_cs *cs, 
+static int r600_cs_process_relocs(struct radeon_cs_int *csi, 
                                   uint32_t * reloc_chunk,
                                   uint32_t * length_dw_reloc_chunk) 
 {
-    struct r600_cs_manager_legacy *csm = (struct r600_cs_manager_legacy*)cs->csm;
+    struct r600_cs_manager_legacy *csm = (struct r600_cs_manager_legacy*)csi->csm;
     struct r600_cs_reloc_legacy *relocs;
     int i, j, r;
 
     uint32_t offset_dw = 0;
 
-    csm = (struct r600_cs_manager_legacy*)cs->csm;
-    relocs = (struct r600_cs_reloc_legacy *)cs->relocs;
+    csm = (struct r600_cs_manager_legacy*)csi->csm;
+    relocs = (struct r600_cs_reloc_legacy *)csi->relocs;
 restart:
-    for (i = 0; i < cs->crelocs; i++) {
-            uint32_t soffset, eoffset, asicoffset;
+    for (i = 0; i < csi->crelocs; i++) {
+            uint32_t soffset, eoffset;
 
             r = radeon_bo_legacy_validate(relocs[i].base.bo,
 					  &soffset, &eoffset);
@@ -262,32 +281,20 @@ restart:
 		    goto restart;
             }
             if (r) {
-		    fprintf(stderr, "validated %p [0x%08X, 0x%08X]\n",
+		    fprintf(stderr, "invalid bo(%p) [0x%08X, 0x%08X]\n",
 			    relocs[i].base.bo, soffset, eoffset);
 		    return r;
             }
-            asicoffset = soffset;
 
 	    for (j = 0; j < relocs[i].cindices; j++) {
-		    if (asicoffset >= eoffset) {
-			    /*                radeon_bo_debug(relocs[i].base.bo, 12); */
-			    fprintf(stderr, "validated %p [0x%08X, 0x%08X]\n",
-				    relocs[i].base.bo, soffset, eoffset);
-			    fprintf(stderr, "above end: %p 0x%08X 0x%08X\n",
-				    relocs[i].base.bo,
-				    cs->packets[relocs[i].indices[j]],
-				    eoffset);
-			    exit(0);
-			    return -EINVAL;
-		    }
 		    /* pkt3 nop header in ib chunk */
-		    cs->packets[relocs[i].reloc_indices[j]] = 0xC0001000;
+		    csi->packets[relocs[i].reloc_indices[j]] = 0xC0001000;
 		    /* reloc index in ib chunk */
-		    cs->packets[relocs[i].reloc_indices[j] + 1] = offset_dw;
+		    csi->packets[relocs[i].reloc_indices[j] + 1] = offset_dw;
 	    }
 
 	    /* asic offset in reloc chunk */ /* see alex drm r600_nomm_relocate */
-	    reloc_chunk[offset_dw] = asicoffset;
+	    reloc_chunk[offset_dw] = soffset;
 	    reloc_chunk[offset_dw + 3] = 0;
 
 	    offset_dw += 4;
@@ -298,14 +305,14 @@ restart:
     return 0;
 }
 
-static int r600_cs_set_age(struct radeon_cs *cs) /* -------------- */
+static int r600_cs_set_age(struct radeon_cs_int *csi) /* -------------- */
 {
-    struct r600_cs_manager_legacy *csm = (struct r600_cs_manager_legacy*)cs->csm;
+    struct r600_cs_manager_legacy *csm = (struct r600_cs_manager_legacy*)csi->csm;
     struct r600_cs_reloc_legacy *relocs;
     int i;
 
-    relocs = (struct r600_cs_reloc_legacy *)cs->relocs;
-    for (i = 0; i < cs->crelocs; i++) {
+    relocs = (struct r600_cs_reloc_legacy *)csi->relocs;
+    for (i = 0; i < csi->crelocs; i++) {
         radeon_bo_legacy_pending(relocs[i].base.bo, csm->pending_age);
         radeon_bo_unref(relocs[i].base.bo);
     }
@@ -313,21 +320,21 @@ static int r600_cs_set_age(struct radeon_cs *cs) /* -------------- */
 }
 
 #if 0
-static void dump_cmdbuf(struct radeon_cs *cs)
+static void dump_cmdbuf(struct radeon_cs_int *csi)
 {
 	int i;
 	fprintf(stderr,"--start--\n");
-	for (i = 0; i < cs->cdw; i++){
-		fprintf(stderr,"0x%08x\n", cs->packets[i]);
+	for (i = 0; i < csi->cdw; i++){
+		fprintf(stderr,"0x%08x\n", csi->packets[i]);
 	}
 	fprintf(stderr,"--end--\n");
 
 }
 #endif
 
-static int r600_cs_emit(struct radeon_cs *cs)
+static int r600_cs_emit(struct radeon_cs_int *csi)
 {
-    struct r600_cs_manager_legacy *csm = (struct r600_cs_manager_legacy*)cs->csm;
+    struct r600_cs_manager_legacy *csm = (struct r600_cs_manager_legacy*)csi->csm;
     struct drm_radeon_cs       cs_cmd;
     struct drm_radeon_cs_chunk cs_chunk[2];
     uint32_t length_dw_reloc_chunk;
@@ -341,9 +348,9 @@ static int r600_cs_emit(struct radeon_cs *cs)
 
     csm->pending_count = 1;
 
-    reloc_chunk = (uint32_t*)calloc(1, cs->crelocs * 4 * 4);
+    reloc_chunk = (uint32_t*)calloc(1, csi->crelocs * 4 * 4);
 
-    r = r600_cs_process_relocs(cs, reloc_chunk, &length_dw_reloc_chunk);
+    r = r600_cs_process_relocs(csi, reloc_chunk, &length_dw_reloc_chunk);
     if (r) {
 	free(reloc_chunk);
         return 0;
@@ -351,8 +358,8 @@ static int r600_cs_emit(struct radeon_cs *cs)
 
     /* raw ib chunk */
     cs_chunk[0].chunk_id   = RADEON_CHUNK_ID_IB;
-    cs_chunk[0].length_dw  = cs->cdw;
-    cs_chunk[0].chunk_data = (unsigned long)(cs->packets);
+    cs_chunk[0].length_dw  = csi->cdw;
+    cs_chunk[0].chunk_data = (unsigned long)(csi->packets);
 
     /* reloc chaunk */
     cs_chunk[1].chunk_id   = RADEON_CHUNK_ID_RELOCS;
@@ -370,7 +377,7 @@ static int r600_cs_emit(struct radeon_cs *cs)
 
     do 
     {
-        r = drmCommandWriteRead(cs->csm->fd, DRM_RADEON_CS, &cs_cmd, sizeof(cs_cmd));
+        r = drmCommandWriteRead(csi->csm->fd, DRM_RADEON_CS, &cs_cmd, sizeof(cs_cmd));
         retry++;
     } while (r == -EAGAIN && retry < 1000);
 
@@ -381,11 +388,11 @@ static int r600_cs_emit(struct radeon_cs *cs)
 
     csm->pending_age = cs_cmd.cs_id;
 
-    r600_cs_set_age(cs);
+    r600_cs_set_age(csi);
 
-    cs->csm->read_used = 0;
-    cs->csm->vram_write_used = 0;
-    cs->csm->gart_write_used = 0;
+    csi->csm->read_used = 0;
+    csi->csm->vram_write_used = 0;
+    csi->csm->gart_write_used = 0;
 
     free(reloc_chunk);
 
@@ -405,35 +412,34 @@ static void inline r600_cs_free_reloc(void *relocs_p, int crelocs)
     }
 }
 
-static int r600_cs_destroy(struct radeon_cs *cs)
+static int r600_cs_destroy(struct radeon_cs_int *csi)
 {
-    r600_cs_free_reloc(cs->relocs, cs->crelocs);
-    free(cs->relocs);
-    free(cs->packets);
-    free(cs);
+    r600_cs_free_reloc(csi->relocs, csi->crelocs);
+    free(csi->relocs);
+    free(csi->packets);
+    free(csi);
     return 0;
 }
 
-static int r600_cs_erase(struct radeon_cs *cs)
+static int r600_cs_erase(struct radeon_cs_int *csi)
 {
-    r600_cs_free_reloc(cs->relocs, cs->crelocs);
-    free(cs->relocs);
-    cs->relocs_total_size = 0;
-    cs->relocs = NULL;
-    cs->crelocs = 0;
-    cs->cdw = 0;
-    cs->section = 0;
+    r600_cs_free_reloc(csi->relocs, csi->crelocs);
+    free(csi->relocs);
+    csi->relocs_total_size = 0;
+    csi->relocs = NULL;
+    csi->crelocs = 0;
+    csi->cdw = 0;
     return 0;
 }
 
-static int r600_cs_need_flush(struct radeon_cs *cs)
+static int r600_cs_need_flush(struct radeon_cs_int *csi)
 {
     /* this function used to flush when the BO usage got to
      * a certain size, now the higher levels handle this better */
     return 0;
 }
 
-static void r600_cs_print(struct radeon_cs *cs, FILE *file)
+static void r600_cs_print(struct radeon_cs_int *csi, FILE *file)
 {
 }
 
diff --git a/r600/r600_cmdbuf.h b/r600/r600_cmdbuf.h
index eba43d3..dff0009 100644
--- a/r600/r600_cmdbuf.h
+++ b/r600/r600_cmdbuf.h
@@ -118,22 +118,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R600_IT_SET_CTL_CONST                     0x00006F00
 #define R600_IT_SURFACE_BASE_UPDATE               0x00007300
 
-struct r600_cs_manager_legacy
-{
-    struct radeon_cs_manager    base;
-    struct radeon_context       *ctx;
-    /* hack for scratch stuff */
-    uint32_t                    pending_age;
-    uint32_t                    pending_count;
-};
-
-struct r600_cs_reloc_legacy {
-    struct radeon_cs_reloc  base;
-    uint32_t                cindices;
-    uint32_t                *indices;
-    uint32_t                *reloc_indices;
-};
-
 struct radeon_cs_manager * r600_radeon_cs_manager_legacy_ctor(struct radeon_context *ctx);
 
 /**
diff --git a/r600/r600_context.c b/r600/r600_context.c
index e0b77d4..dbd2337 100644
--- a/r600/r600_context.c
+++ b/r600/r600_context.c
@@ -59,10 +59,12 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_debug.h"
 #include "r600_context.h"
 #include "radeon_common_context.h"
+#include "radeon_buffer_objects.h"
 #include "radeon_span.h"
 #include "r600_cmdbuf.h"
 #include "r600_emit.h"
 #include "radeon_bocs_wrapper.h"
+#include "radeon_queryobj.h"
 
 #include "r700_state.h"
 #include "r700_ioctl.h"
@@ -72,11 +74,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "utils.h"
 #include "xmlpool.h"		/* for symbolic values of enum-type options */
 
-/* hw_tcl_on derives from future_hw_tcl_on when its safe to change it. */
-int future_hw_tcl_on = 1;
-int hw_tcl_on = 1;
-
 #define need_GL_VERSION_2_0
+#define need_GL_ARB_occlusion_query
 #define need_GL_ARB_point_parameters
 #define need_GL_ARB_vertex_program
 #define need_GL_EXT_blend_equation_separate
@@ -85,19 +84,20 @@ int hw_tcl_on = 1;
 #define need_GL_EXT_framebuffer_object
 #define need_GL_EXT_fog_coord
 #define need_GL_EXT_gpu_program_parameters
+#define need_GL_EXT_provoking_vertex
 #define need_GL_EXT_secondary_color
 #define need_GL_EXT_stencil_two_side
 #define need_GL_ATI_separate_stencil
 #define need_GL_NV_vertex_program
 
-#include "extension_helper.h"
-
-extern const struct tnl_pipeline_stage *r700_pipeline[];
+#include "main/remap_helper.h"
 
-const struct dri_extension card_extensions[] = {
+static const struct dri_extension card_extensions[] = {
   /* *INDENT-OFF* */
+  {"GL_ARB_depth_clamp",                NULL},
   {"GL_ARB_depth_texture",		NULL},
   {"GL_ARB_fragment_program",		NULL},
+  {"GL_ARB_occlusion_query",            GL_ARB_occlusion_query_functions},
   {"GL_ARB_multitexture",		NULL},
   {"GL_ARB_point_parameters",		GL_ARB_point_parameters_functions},
   {"GL_ARB_shadow",			NULL},
@@ -117,6 +117,7 @@ const struct dri_extension card_extensions[] = {
   {"GL_EXT_packed_depth_stencil",	NULL},
   {"GL_EXT_fog_coord",			GL_EXT_fog_coord_functions },
   {"GL_EXT_gpu_program_parameters",     GL_EXT_gpu_program_parameters_functions},
+  {"GL_EXT_provoking_vertex",           GL_EXT_provoking_vertex_functions },
   {"GL_EXT_secondary_color", 		GL_EXT_secondary_color_functions},
   {"GL_EXT_shadow_funcs",		NULL},
   {"GL_EXT_stencil_two_side",		GL_EXT_stencil_two_side_functions},
@@ -128,6 +129,8 @@ const struct dri_extension card_extensions[] = {
   {"GL_EXT_texture_lod_bias",		NULL},
   {"GL_EXT_texture_mirror_clamp",	NULL},
   {"GL_EXT_texture_rectangle",		NULL},
+  {"GL_EXT_vertex_array_bgra",          NULL},
+  {"GL_EXT_texture_sRGB",               NULL},
   {"GL_ATI_separate_stencil",		GL_ATI_separate_stencil_functions},
   {"GL_ATI_texture_env_combine3",	NULL},
   {"GL_ATI_texture_mirror_once",	NULL},
@@ -142,7 +145,7 @@ const struct dri_extension card_extensions[] = {
 };
 
 
-const struct dri_extension mm_extensions[] = {
+static const struct dri_extension mm_extensions[] = {
   { "GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions },
   { NULL, NULL }
 };
@@ -151,21 +154,24 @@ const struct dri_extension mm_extensions[] = {
  * The GL 2.0 functions are needed to make display lists work with
  * functions added by GL_ATI_separate_stencil.
  */
-const struct dri_extension gl_20_extension[] = {
+static const struct dri_extension gl_20_extension[] = {
   {"GL_VERSION_2_0",			GL_VERSION_2_0_functions },
 };
 
-
-static void r600RunPipeline(GLcontext * ctx)
-{
-    _mesa_lock_context_textures(ctx);
-
-    if (ctx->NewState)
-        _mesa_update_state_locked(ctx);
-    
-    _tnl_run_pipeline(ctx);
-    _mesa_unlock_context_textures(ctx);
-}
+static const struct tnl_pipeline_stage *r600_pipeline[] = { /* installed by r600CreateContext() via _tnl_install_pipeline() */
+	/* Catch any t&l fallbacks
+	 */
+	&_tnl_vertex_transform_stage,
+	&_tnl_normal_transform_stage,
+	&_tnl_lighting_stage,
+	&_tnl_fog_coordinate_stage,
+	&_tnl_texgen_stage,
+	&_tnl_texture_transform_stage,
+	&_tnl_point_attenuation_stage,
+	&_tnl_vertex_program_stage,
+	&_tnl_render_stage,
+	0, /* null entry closes the list - presumably the terminator the TNL module expects */
+};
 
 static void r600_get_lock(radeonContextPtr rmesa)
 {
@@ -176,7 +182,7 @@ static void r600_get_lock(radeonContextPtr rmesa)
 		if (!rmesa->radeonScreen->kernel_mm)
 			radeon_bo_legacy_texture_age(rmesa->radeonScreen->bom);
 	}
-}		  
+}
 
 static void r600_vtbl_emit_cs_header(struct radeon_cs *cs, radeonContextPtr rmesa)
 {
@@ -198,6 +204,24 @@ static void r600_fallback(GLcontext *ctx, GLuint bit, GLboolean mode)
 		context->radeon.Fallback &= ~bit;
 }
 
+static void r600_emit_query_finish(radeonContextPtr radeon)
+{ /* vtbl.emit_query_finish hook: emits a ZPASS_DONE event write for radeon->query.current. */
+	context_t *context = (context_t*) radeon;
+	BATCH_LOCALS(&context->radeon);
+
+	struct radeon_query_object *query = radeon->query.current; /* NOTE(review): dereferenced unchecked below - assumes a current query exists */
+
+	BEGIN_BATCH_NO_AUTOSTATE(4 + 2); /* 4 dwords written below + presumably 2 for the relocation - confirm */
+	R600_OUT_BATCH(CP_PACKET3(R600_IT_EVENT_WRITE, 2));
+	R600_OUT_BATCH(ZPASS_DONE);
+	R600_OUT_BATCH(query->curr_offset + 8); /* hw writes qwords */
+	R600_OUT_BATCH(0x00000000);
+	R600_OUT_BATCH_RELOC(VGT_EVENT_INITIATOR, query->bo, 0, 0, RADEON_GEM_DOMAIN_GTT, 0);
+	END_BATCH();
+	assert(query->curr_offset < RADEON_QUERY_PAGE_SIZE); /* NOTE(review): checked only after the offset was already emitted */
+	query->emitted_begin = GL_FALSE; /* presumably marks the query as no longer open - confirm */
+}
+
 static void r600_init_vtbl(radeonContextPtr radeon)
 {
 	radeon->vtbl.get_lock = r600_get_lock;
@@ -206,6 +230,101 @@ static void r600_init_vtbl(radeonContextPtr radeon)
 	radeon->vtbl.swtcl_flush = NULL;
 	radeon->vtbl.pre_emit_atoms = r600_vtbl_pre_emit_atoms;
 	radeon->vtbl.fallback = r600_fallback;
+	radeon->vtbl.emit_query_finish = r600_emit_query_finish;
+}
+
+static void r600InitConstValues(GLcontext *ctx, radeonScreenPtr screen)
+{ /* Sets the ctx->Const implementation limits; screen is consulted only for the chip family. */
+	context_t *r600 = R700_CONTEXT(ctx);
+
+	ctx->Const.MaxTextureImageUnits =
+	    driQueryOptioni(&r600->radeon.optionCache, "texture_image_units");
+	ctx->Const.MaxTextureCoordUnits =
+	    driQueryOptioni(&r600->radeon.optionCache, "texture_coord_units");
+	ctx->Const.MaxTextureUnits =
+	    MIN2(ctx->Const.MaxTextureImageUnits,
+		 ctx->Const.MaxTextureCoordUnits);
+	ctx->Const.MaxTextureMaxAnisotropy = 16.0;
+	ctx->Const.MaxTextureLodBias = 16.0;
+
+	ctx->Const.MaxTextureLevels = 13; /* hw supports 14 */
+	ctx->Const.MaxTextureRectSize = 4096; /* hw supports 8192 */
+
+	ctx->Const.MinPointSize   = 0x0001 / 8.0;
+	ctx->Const.MinPointSizeAA = 0x0001 / 8.0;
+	ctx->Const.MaxPointSize   = 0xffff / 8.0;
+	ctx->Const.MaxPointSizeAA = 0xffff / 8.0;
+
+	ctx->Const.MinLineWidth   = 0x0001 / 8.0;
+	ctx->Const.MinLineWidthAA = 0x0001 / 8.0;
+	ctx->Const.MaxLineWidth   = 0xffff / 8.0;
+	ctx->Const.MaxLineWidthAA = 0xffff / 8.0;
+
+	ctx->Const.MaxDrawBuffers = 1; /* hw supports 8 */
+
+	/* 256 for reg-based consts, inline consts also supported */
+	ctx->Const.VertexProgram.MaxInstructions = 8192; /* in theory no limit */
+	ctx->Const.VertexProgram.MaxNativeInstructions = 8192;
+	ctx->Const.VertexProgram.MaxNativeAttribs = 160;
+	ctx->Const.VertexProgram.MaxTemps = 128;
+	ctx->Const.VertexProgram.MaxNativeTemps = 128;
+	ctx->Const.VertexProgram.MaxNativeParameters = 256;
+	ctx->Const.VertexProgram.MaxNativeAddressRegs = 1; /* ??? */
+
+	ctx->Const.FragmentProgram.MaxNativeTemps = 128;
+	ctx->Const.FragmentProgram.MaxNativeAttribs = 32;
+	ctx->Const.FragmentProgram.MaxNativeParameters = 256;
+	ctx->Const.FragmentProgram.MaxNativeAluInstructions = 8192;
+	/* 8 per clause on r6xx, 16 on rv670/r7xx */
+	if ((screen->chip_family == CHIP_FAMILY_RV670) ||
+	    (screen->chip_family >= CHIP_FAMILY_RV770)) /* relies on the CHIP_FAMILY_* enum order placing r7xx parts at or after RV770 */
+		ctx->Const.FragmentProgram.MaxNativeTexInstructions = 16;
+	else
+		ctx->Const.FragmentProgram.MaxNativeTexInstructions = 8;
+	ctx->Const.FragmentProgram.MaxNativeInstructions = 8192;
+	ctx->Const.FragmentProgram.MaxNativeTexIndirections = 8; /* ??? */
+	ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;	/* and these are?? */
+}
+
+static void r600ParseOptions(context_t *r600, radeonScreenPtr screen)
+{
+	/* Parse the configuration files for the "r600" driver into the
+	 * context's option cache, then cache def_max_anisotropy. Done here
+	 * so initialMaxAnisotropy is set before the default textures exist.
+	 */
+	driParseConfigFiles(&r600->radeon.optionCache, &screen->optionCache,
+			    screen->driScreen->myNum, "r600");
+
+	r600->radeon.initialMaxAnisotropy = driQueryOptionf(&r600->radeon.optionCache,
+							    "def_max_anisotropy");
+
+}
+
+static void r600InitGLExtensions(GLcontext *ctx)
+{ /* Registers the driver's extension tables, then applies option- and chip-specific overrides. */
+	context_t *r600 = R700_CONTEXT(ctx);
+
+	driInitExtensions(ctx, card_extensions, GL_TRUE);
+	if (r600->radeon.radeonScreen->kernel_mm) /* mm_extensions (framebuffer objects) only with the kernel memory manager */
+	  driInitExtensions(ctx, mm_extensions, GL_FALSE);
+
+	if (driQueryOptionb
+	    (&r600->radeon.optionCache, "disable_stencil_two_side"))
+		_mesa_disable_extension(ctx, "GL_EXT_stencil_two_side");
+
+	if (r600->radeon.glCtx->Mesa_DXTn
+	    && !driQueryOptionb(&r600->radeon.optionCache, "disable_s3tc")) { /* Mesa_DXTn set and not disabled: expose both s3tc extensions */
+		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+		_mesa_enable_extension(ctx, "GL_S3_s3tc");
+	} else
+	    if (driQueryOptionb(&r600->radeon.optionCache, "force_s3tc_enable")) /* forced path enables only the EXT extension, not GL_S3_s3tc */
+	{
+		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+	}
+
+	/* XXX: RV740 only seems to report results from half of its DBs */
+	if (r600->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV740)
+		_mesa_disable_extension(ctx, "GL_ARB_occlusion_query");
+}
 
 /* Create the device specific rendering context.
@@ -231,19 +350,10 @@ GLboolean r600CreateContext(const __GLcontextModes * glVisual,
 		return GL_FALSE;
 	}
 
-	if (!(screen->chip_flags & RADEON_CHIPSET_TCL))
-		hw_tcl_on = future_hw_tcl_on = 0;
+	r600ParseOptions(r600, screen);
 
+	r600->radeon.radeonScreen = screen;
 	r600_init_vtbl(&r600->radeon);
-	/* Parse configuration files.
-	 * Do this here so that initialMaxAnisotropy is set before we create
-	 * the default textures.
-	 */
-	driParseConfigFiles(&r600->radeon.optionCache, &screen->optionCache,
-			    screen->driScreen->myNum, "r600");
-
-	r600->radeon.initialMaxAnisotropy = driQueryOptionf(&r600->radeon.optionCache,
-						     "def_max_anisotropy");
 
 	/* Init default driver functions then plug in our R600-specific functions
 	 * (the texture functions are especially important)
@@ -253,7 +363,9 @@ GLboolean r600CreateContext(const __GLcontextModes * glVisual,
 	r700InitStateFuncs(&functions);
 	r600InitTextureFuncs(&functions);
 	r700InitShaderFuncs(&functions);
+	radeonInitQueryObjFunctions(&functions);
 	r700InitIoctlFuncs(&functions);
+	radeonInitBufferObjectFuncs(&functions);
 
 	if (!radeonInitContext(&r600->radeon, &functions,
 			       glVisual, driContextPriv,
@@ -263,44 +375,14 @@ GLboolean r600CreateContext(const __GLcontextModes * glVisual,
 		return GL_FALSE;
 	}
 
-	/* Init r600 context data */
-	/* Set the maximum texture size small enough that we can guarentee that
-	 * all texture units can bind a maximal texture and have them both in
-	 * texturable memory at once.
-	 */
-
 	ctx = r600->radeon.glCtx;
 
-	ctx->Const.MaxTextureImageUnits =
-	    driQueryOptioni(&r600->radeon.optionCache, "texture_image_units");
-	ctx->Const.MaxTextureCoordUnits =
-	    driQueryOptioni(&r600->radeon.optionCache, "texture_coord_units");
-	ctx->Const.MaxTextureUnits =
-	    MIN2(ctx->Const.MaxTextureImageUnits,
-		 ctx->Const.MaxTextureCoordUnits);
-	ctx->Const.MaxTextureMaxAnisotropy = 16.0;
-	ctx->Const.MaxTextureLodBias = 16.0;
-
-	ctx->Const.MaxTextureLevels = 13; /* hw support 14 */
-	ctx->Const.MaxTextureRectSize = 4096; /* hw support 8192 */
-
-	ctx->Const.MinPointSize   = 0x0001 / 8.0;
-	ctx->Const.MinPointSizeAA = 0x0001 / 8.0;
-	ctx->Const.MaxPointSize   = 0xffff / 8.0;
-	ctx->Const.MaxPointSizeAA = 0xffff / 8.0;
-
-	ctx->Const.MinLineWidth   = 0x0001 / 8.0;
-	ctx->Const.MinLineWidthAA = 0x0001 / 8.0;
-	ctx->Const.MaxLineWidth   = 0xffff / 8.0;
-	ctx->Const.MaxLineWidthAA = 0xffff / 8.0;
+	ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
+	ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
 
-	/* Needs further modifications */
-#if 0
-	ctx->Const.MaxArrayLockSize =
-	    ( /*512 */ RADEON_BUFFER_SIZE * 16 * 1024) / (4 * 4);
-#endif
+	r600InitConstValues(ctx, screen);
 
-	ctx->Const.MaxDrawBuffers = 1;
+	_mesa_set_mvp_with_dp4( ctx, GL_TRUE );
 
 	/* Initialize the software rasterizer and helper modules.
 	 */
@@ -309,16 +391,12 @@ GLboolean r600CreateContext(const __GLcontextModes * glVisual,
 	_tnl_CreateContext(ctx);
 	_swsetup_CreateContext(ctx);
 	_swsetup_Wakeup(ctx);
-	_ae_create_context(ctx);
 
 	/* Install the customized pipeline:
 	 */
 	_tnl_destroy_pipeline(ctx);
-	_tnl_install_pipeline(ctx, r700_pipeline);
-
-	/* Try and keep materials and vertices separate:
-	 */
-/* 	_tnl_isolate_materials(ctx, GL_TRUE); */
+	_tnl_install_pipeline(ctx, r600_pipeline);
+	TNL_CONTEXT(ctx)->Driver.RunPipeline = _tnl_run_pipeline;
 
 	/* Configure swrast and TNL to match hardware characteristics:
 	 */
@@ -327,66 +405,16 @@ GLboolean r600CreateContext(const __GLcontextModes * glVisual,
 	_tnl_allow_pixel_fog(ctx, GL_FALSE);
 	_tnl_allow_vertex_fog(ctx, GL_TRUE);
 
-	/* 256 for reg-based consts, inline consts also supported */
-	ctx->Const.VertexProgram.MaxInstructions = 8192; /* in theory no limit */
-	ctx->Const.VertexProgram.MaxNativeInstructions = 8192;
-	ctx->Const.VertexProgram.MaxNativeAttribs = 160;
-	ctx->Const.VertexProgram.MaxTemps = 128;
-	ctx->Const.VertexProgram.MaxNativeTemps = 128;
-	ctx->Const.VertexProgram.MaxNativeParameters = 256;
-	ctx->Const.VertexProgram.MaxNativeAddressRegs = 1; /* ??? */
-
-	ctx->Const.FragmentProgram.MaxNativeTemps = 128;
-	ctx->Const.FragmentProgram.MaxNativeAttribs = 32;
-	ctx->Const.FragmentProgram.MaxNativeParameters = 256;
-	ctx->Const.FragmentProgram.MaxNativeAluInstructions = 8192;
-	/* 8 per clause on r6xx, 16 on rv670/r7xx */
-	if ((screen->chip_family == CHIP_FAMILY_RV670) ||
-	    (screen->chip_family >= CHIP_FAMILY_RV770))
-		ctx->Const.FragmentProgram.MaxNativeTexInstructions = 16;
-	else
-		ctx->Const.FragmentProgram.MaxNativeTexInstructions = 8;
-	ctx->Const.FragmentProgram.MaxNativeInstructions = 8192;
-	ctx->Const.FragmentProgram.MaxNativeTexIndirections = 8; /* ??? */
-	ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;	/* and these are?? */
-	ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
-	ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
-
 	radeon_init_debug();
 
-	driInitExtensions(ctx, card_extensions, GL_TRUE);
-	if (r600->radeon.radeonScreen->kernel_mm)
-	  driInitExtensions(ctx, mm_extensions, GL_FALSE);
-
-	if (driQueryOptionb
-	    (&r600->radeon.optionCache, "disable_stencil_two_side"))
-		_mesa_disable_extension(ctx, "GL_EXT_stencil_two_side");
-
-	if (r600->radeon.glCtx->Mesa_DXTn
-	    && !driQueryOptionb(&r600->radeon.optionCache, "disable_s3tc")) {
-		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
-		_mesa_enable_extension(ctx, "GL_S3_s3tc");
-	} else
-	    if (driQueryOptionb(&r600->radeon.optionCache, "force_s3tc_enable"))
-	{
-		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
-	}
+	r700InitDraw(ctx);
 
 	radeon_fbo_init(&r600->radeon);
    	radeonInitSpanFuncs( ctx );
-
 	r600InitCmdBuf(r600);
-
 	r700InitState(r600->radeon.glCtx);
 
-	TNL_CONTEXT(ctx)->Driver.RunPipeline = r600RunPipeline;
-
-	if (driQueryOptionb(&r600->radeon.optionCache, "no_rast")) {
-		radeon_warning("disabling 3D acceleration\n");
-#if R200_MERGED
-		FALLBACK(&r600->radeon, RADEON_FALLBACK_DISABLE, 1);
-#endif
-	}
+	r600InitGLExtensions(ctx);
 
 	return GL_TRUE;
 }
diff --git a/r600/r600_context.h b/r600/r600_context.h
index 9397ecd..394fd75 100644
--- a/r600/r600_context.h
+++ b/r600/r600_context.h
@@ -58,29 +58,6 @@ typedef struct r600_context context_t;
 
 #include "main/mm.h"
 
-/************ DMA BUFFERS **************/
-
-/* The blit width for texture uploads
- */
-#define R600_BLIT_WIDTH_BYTES 1024
-#define R600_MAX_TEXTURE_UNITS 8
-
-struct r600_texture_state {
-	int tc_count;		/* number of incoming texture coordinates from VAP */
-};
-
-/* Perhaps more if we store programs in vmem? */
-/* drm_r600_cmd_header_t->vpu->count is unsigned char */
-#define VSF_MAX_FRAGMENT_LENGTH (255*4)
-
-/* Can be tested with colormat currently. */
-#define VSF_MAX_FRAGMENT_TEMPS (14)
-
-#define STATE_R600_WINDOW_DIMENSION (STATE_INTERNAL_DRIVER+0)
-#define STATE_R600_TEXRECT_FACTOR (STATE_INTERNAL_DRIVER+1)
-
-extern int hw_tcl_on;
-
 #define COLOR_IS_RGBA
 #define TAG(x) r600##x
 #include "tnl_dd/t_dd_vertex.h"
@@ -126,6 +103,32 @@ struct r600_hw_state {
 	struct radeon_state_atom tx_brdr_clr;
 };
 
+typedef struct StreamDesc /* one entry per slot of r600_context.stream_desc[VERT_ATTRIB_MAX] */
+{
+	GLint   size;   // number of data elements
+	GLenum  type;  // data element type
+	GLsizei stride; // NOTE(review): presumably the distance between consecutive entries -- confirm units
+	/* Buffer object associated with this stream and an offset into it (units not shown here). */
+	struct radeon_bo *bo;
+	GLint  bo_offset;
+	/* NOTE(review): the meaning of the fields below is not established in this header; confirm against their users. */
+	GLuint    dwords;
+	GLuint    dst_loc;
+	GLuint    _signed;
+	GLboolean normalize;
+	GLboolean is_named_bo;
+	GLubyte   element;
+} StreamDesc;
+
+typedef struct r700_index_buffer /* stored by value as r600_context.ind_buf */
+{
+	struct radeon_bo *bo;   /* buffer object associated with the indices */
+	int    bo_offset;       /* offset into bo (units not shown here) */
+	/* NOTE(review): presumably is_32bit selects 32-bit indices and count is the number of indices -- confirm. */
+	GLboolean is_32bit;
+	GLuint    count;
+} r700_index_buffer;
+
 /**
  * \brief R600 context structure.
  */
@@ -141,9 +144,9 @@ struct r600_context {
 
 	/* Vertex buffers
 	 */
-	GLvector4f dummy_attrib[_TNL_ATTRIB_MAX];
-	GLvector4f *temp_attrib[_TNL_ATTRIB_MAX];
-
+	GLint      nNumActiveAos;
+	StreamDesc stream_desc[VERT_ATTRIB_MAX];
+    struct r700_index_buffer ind_buf;
 };
 
 #define R700_CONTEXT(ctx)		((context_t *)(ctx->DriverCtx))
@@ -174,16 +177,13 @@ extern GLboolean r700SyncSurf(context_t *context,
 			      uint32_t write_domain,
 			      uint32_t sync_type);
 
-extern void r700SetupStreams(GLcontext * ctx);
 extern void r700Start3D(context_t *context);
 extern void r600InitAtoms(context_t *context);
+extern void r700InitDraw(GLcontext *ctx);
 
 #define RADEON_D_CAPTURE 0
 #define RADEON_D_PLAYBACK 1
 #define RADEON_D_PLAYBACK_RAW 2
 #define RADEON_D_T 3
 
-#define r600PackFloat32 radeonPackFloat32
-#define r600PackFloat24 radeonPackFloat24
-
 #endif				/* __R600_CONTEXT_H__ */
diff --git a/r600/r600_reg_r6xx.h b/r600/r600_reg_r6xx.h
index f7702c4..74af7b4 100644
--- a/r600/r600_reg_r6xx.h
+++ b/r600/r600_reg_r6xx.h
@@ -415,11 +415,11 @@ enum {
 	ALPHA_TO_MASK_ENABLE                              = 1 << 0,
 	ALPHA_TO_MASK_OFFSET0_mask                        = 0x03 << 8,
 	ALPHA_TO_MASK_OFFSET0_shift                       = 8,
-	ALPHA_TO_MASK_OFFSET1_mask                        = 0x03 << 8,
+	ALPHA_TO_MASK_OFFSET1_mask                        = 0x03 << 10,
 	ALPHA_TO_MASK_OFFSET1_shift                       = 10,
-	ALPHA_TO_MASK_OFFSET2_mask                        = 0x03 << 8,
+	ALPHA_TO_MASK_OFFSET2_mask                        = 0x03 << 12,
 	ALPHA_TO_MASK_OFFSET2_shift                       = 12,
-	ALPHA_TO_MASK_OFFSET3_mask                        = 0x03 << 8,
+	ALPHA_TO_MASK_OFFSET3_mask                        = 0x03 << 14,
 	ALPHA_TO_MASK_OFFSET3_shift                       = 14,
 
 //  SQ_VTX_CONSTANT_WORD2_0                               = 0x00038008,
diff --git a/r600/r600_reg_r7xx.h b/r600/r600_reg_r7xx.h
index e5c01c8..eb169bd 100644
--- a/r600/r600_reg_r7xx.h
+++ b/r600/r600_reg_r7xx.h
@@ -143,6 +143,8 @@ enum {
 //  SQ_TEX_SAMPLER_MISC_0                                 = 0x0003d03c,
 	R7xx_TRUNCATE_COORD_bit                           = 1 << 9,
 	R7xx_DISABLE_CUBE_WRAP_bit                        = 1 << 10,
+//  DB_RENDER_CONTROL                                     = 0x00028d0c,
+	PERFECT_ZPASS_COUNTS_bit                          = 1 << 15,
 
 } ;
 
diff --git a/r600/r600_tex.c b/r600/r600_tex.c
index d105b90..9d83a64 100644
--- a/r600/r600_tex.c
+++ b/r600/r600_tex.c
@@ -40,7 +40,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/image.h"
 #include "main/mipmap.h"
 #include "main/simple_list.h"
-#include "main/texformat.h"
 #include "main/texstore.h"
 #include "main/teximage.h"
 #include "main/texobj.h"
@@ -286,6 +285,7 @@ static void r600TexParameter(GLcontext * ctx, GLenum target,
 			     GLenum pname, const GLfloat * params)
 {
 	radeonTexObj* t = radeon_tex_obj(texObj);
+	GLenum baseFormat;
 
 	radeon_print(RADEON_STATE | RADEON_TEXTURE, RADEON_VERBOSE,
 			"%s( %s )\n", __FUNCTION__,
@@ -312,23 +312,15 @@ static void r600TexParameter(GLcontext * ctx, GLenum target,
 	case GL_TEXTURE_MAX_LEVEL:
 	case GL_TEXTURE_MIN_LOD:
 	case GL_TEXTURE_MAX_LOD:
-		/* This isn't the most efficient solution but there doesn't appear to
-		 * be a nice alternative.  Since there's no LOD clamping,
-		 * we just have to rely on loading the right subset of mipmap levels
-		 * to simulate a clamped LOD.
-		 */
-		if (t->mt) {
-			radeon_miptree_unreference(t->mt);
-			t->mt = 0;
-			t->validated = GL_FALSE;
-		}
+		t->validated = GL_FALSE;
 		break;
 
 	case GL_DEPTH_TEXTURE_MODE:
 		if (!texObj->Image[0][texObj->BaseLevel])
 			return;
-		if (texObj->Image[0][texObj->BaseLevel]->TexFormat->BaseFormat
-		    == GL_DEPTH_COMPONENT) {
+		baseFormat = texObj->Image[0][texObj->BaseLevel]->_BaseFormat;
+		if (baseFormat == GL_DEPTH_COMPONENT ||
+		    baseFormat == GL_DEPTH_STENCIL) {
 			r600SetDepthTexMode(texObj);
 			break;
 		} else {
@@ -368,10 +360,8 @@ static void r600DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
 		t->bo = NULL;
 	}
 
-	if (t->mt) {
-		radeon_miptree_unreference(t->mt);
-		t->mt = 0;
-	}
+	radeon_miptree_unreference(&t->mt);
+
 	_mesa_delete_texture_object(ctx, texObj);
 }
 
diff --git a/r600/r600_texstate.c b/r600/r600_texstate.c
index bcb8d7c..4ec315b 100644
--- a/r600/r600_texstate.c
+++ b/r600/r600_texstate.c
@@ -39,7 +39,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/imports.h"
 #include "main/context.h"
 #include "main/macros.h"
-#include "main/texformat.h"
 #include "main/teximage.h"
 #include "main/texobj.h"
 #include "main/enums.h"
@@ -78,7 +77,7 @@ void r600UpdateTextureState(GLcontext * ctx)
 	}
 }
 
-static GLboolean r600GetTexFormat(struct gl_texture_object *tObj, GLuint mesa_format)
+static GLboolean r600GetTexFormat(struct gl_texture_object *tObj, gl_format mesa_format)
 {
 	radeonTexObj *t = radeon_tex_obj(tObj);
 
@@ -87,9 +86,19 @@ static GLboolean r600GetTexFormat(struct gl_texture_object *tObj, GLuint mesa_fo
 	CLEARfield(t->SQ_TEX_RESOURCE4, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
 	CLEARfield(t->SQ_TEX_RESOURCE4, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
 
+	/* Default all four components to unsigned; the signed formats override this below. */
+	SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_UNSIGNED,
+		 FORMAT_COMP_X_shift, FORMAT_COMP_X_mask);
+	SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_UNSIGNED,
+		 FORMAT_COMP_Y_shift, FORMAT_COMP_Y_mask);
+	SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_UNSIGNED,
+		 FORMAT_COMP_Z_shift, FORMAT_COMP_Z_mask);
+	SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_UNSIGNED,
+		 FORMAT_COMP_W_shift, FORMAT_COMP_W_mask);
 	switch (mesa_format) /* This is mesa format. */
 	{
 	case MESA_FORMAT_RGBA8888:
+	case MESA_FORMAT_SIGNED_RGBA8888:
 		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
 			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
 
@@ -101,8 +110,19 @@ static GLboolean r600GetTexFormat(struct gl_texture_object *tObj, GLuint mesa_fo
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
 		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		if (mesa_format == MESA_FORMAT_SIGNED_RGBA8888) {
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_SIGNED,
+				 FORMAT_COMP_X_shift, FORMAT_COMP_X_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_SIGNED,
+				 FORMAT_COMP_Y_shift, FORMAT_COMP_Y_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_SIGNED,
+				 FORMAT_COMP_Z_shift, FORMAT_COMP_Z_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_SIGNED,
+				 FORMAT_COMP_W_shift, FORMAT_COMP_W_mask);
+		}
 		break;
 	case MESA_FORMAT_RGBA8888_REV:
+	case MESA_FORMAT_SIGNED_RGBA8888_REV:
 		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
 			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
 
@@ -114,6 +134,16 @@ static GLboolean r600GetTexFormat(struct gl_texture_object *tObj, GLuint mesa_fo
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
 		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		if (mesa_format == MESA_FORMAT_SIGNED_RGBA8888_REV) {
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_SIGNED,
+				 FORMAT_COMP_X_shift, FORMAT_COMP_X_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_SIGNED,
+				 FORMAT_COMP_Y_shift, FORMAT_COMP_Y_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_SIGNED,
+				 FORMAT_COMP_Z_shift, FORMAT_COMP_Z_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_SIGNED,
+				 FORMAT_COMP_W_shift, FORMAT_COMP_W_mask);
+		}
 		break;
 	case MESA_FORMAT_ARGB8888:
 		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
@@ -480,13 +510,21 @@ static GLboolean r600GetTexFormat(struct gl_texture_object *tObj, GLuint mesa_fo
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
 		break;
 	case MESA_FORMAT_Z16:
+	case MESA_FORMAT_X8_Z24:
+	case MESA_FORMAT_S8_Z24:
 	case MESA_FORMAT_Z24_S8:
 	case MESA_FORMAT_Z32:
+	case MESA_FORMAT_S8:
 		switch (mesa_format) {
 		case MESA_FORMAT_Z16:
 			SETfield(t->SQ_TEX_RESOURCE1, FMT_16,
 				 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
 			break;
+		case MESA_FORMAT_X8_Z24:
+		case MESA_FORMAT_S8_Z24:
+			SETfield(t->SQ_TEX_RESOURCE1, FMT_8_24,
+				 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+			break;
 		case MESA_FORMAT_Z24_S8:
 			SETfield(t->SQ_TEX_RESOURCE1, FMT_24_8,
 				 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
@@ -495,6 +533,12 @@ static GLboolean r600GetTexFormat(struct gl_texture_object *tObj, GLuint mesa_fo
 			SETfield(t->SQ_TEX_RESOURCE1, FMT_32,
 				 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
 			break;
+		case MESA_FORMAT_S8:
+			SETfield(t->SQ_TEX_RESOURCE1, FMT_8,
+				 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+			break;
+		default:
+			break;
 		};
 		switch (tObj->DepthMode) {
 		case GL_LUMINANCE:  /* X, X, X, ONE */
@@ -531,6 +575,49 @@ static GLboolean r600GetTexFormat(struct gl_texture_object *tObj, GLuint mesa_fo
 			return GL_FALSE;
 		}
 		break;
+	/* EXT_texture_sRGB */
+	case MESA_FORMAT_SRGBA8:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		SETbit(t->SQ_TEX_RESOURCE4, SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit);
+		break;
+	case MESA_FORMAT_SLA8:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		SETbit(t->SQ_TEX_RESOURCE4, SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit);
+		break;
+	case MESA_FORMAT_SL8: /* X, X, X, ONE */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_1,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		SETbit(t->SQ_TEX_RESOURCE4, SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit);
+		break;
 	default:
 		/* Not supported format */
 		return GL_FALSE;
@@ -548,7 +635,7 @@ void r600SetDepthTexMode(struct gl_texture_object *tObj)
 
 	t = radeon_tex_obj(tObj);
 
-	r600GetTexFormat(tObj, tObj->Image[0][tObj->BaseLevel]->TexFormat->MesaFormat);
+	r600GetTexFormat(tObj, tObj->Image[0][tObj->BaseLevel]->TexFormat);
 
 }
 
@@ -562,7 +649,6 @@ static void setup_hardware_state(context_t *rmesa, struct gl_texture_object *tex
 {
 	radeonTexObj *t = radeon_tex_obj(texObj);
 	const struct gl_texture_image *firstImage;
-	int firstlevel = t->mt ? t->mt->firstLevel : 0;
 	GLuint uTexelPitch, row_align;
 
 	if (rmesa->radeon.radeonScreen->driScreen->dri2.enabled &&
@@ -570,10 +656,10 @@ static void setup_hardware_state(context_t *rmesa, struct gl_texture_object *tex
 	    t->bo)
 		return;
 
-	firstImage = t->base.Image[0][firstlevel];
+	firstImage = t->base.Image[0][t->minLod];
 
 	if (!t->image_override) {
-		if (!r600GetTexFormat(texObj, firstImage->TexFormat->MesaFormat)) {
+		if (!r600GetTexFormat(texObj, firstImage->TexFormat)) {
 			radeon_error("unexpected texture format in %s\n",
 				      __FUNCTION__);
 			return;
@@ -605,7 +691,8 @@ static void setup_hardware_state(context_t *rmesa, struct gl_texture_object *tex
 	}
 
 	row_align = rmesa->radeon.texture_row_align - 1;
-	uTexelPitch = ((firstImage->Width * t->mt->bpp + row_align) & ~row_align) / t->mt->bpp;
+	uTexelPitch = (_mesa_format_row_stride(firstImage->TexFormat, firstImage->Width) + row_align) & ~row_align;
+	uTexelPitch = uTexelPitch / _mesa_get_format_bytes(firstImage->TexFormat);
 	uTexelPitch = (uTexelPitch + R700_TEXEL_PITCH_ALIGNMENT_MASK)
 		& ~R700_TEXEL_PITCH_ALIGNMENT_MASK;
 
@@ -619,10 +706,10 @@ static void setup_hardware_state(context_t *rmesa, struct gl_texture_object *tex
 	SETfield(t->SQ_TEX_RESOURCE1, firstImage->Height - 1,
 		 TEX_HEIGHT_shift, TEX_HEIGHT_mask);
 
-	if ((t->mt->lastLevel - t->mt->firstLevel) > 0) {
-		t->SQ_TEX_RESOURCE3 = t->mt->levels[0].size / 256;
-		SETfield(t->SQ_TEX_RESOURCE4, t->mt->firstLevel, BASE_LEVEL_shift, BASE_LEVEL_mask);
-		SETfield(t->SQ_TEX_RESOURCE5, t->mt->lastLevel, LAST_LEVEL_shift, LAST_LEVEL_mask);
+	if ((t->maxLod - t->minLod) > 0) {
+		t->SQ_TEX_RESOURCE3 = t->mt->levels[t->minLod].size / 256;
+		SETfield(t->SQ_TEX_RESOURCE4, 0, BASE_LEVEL_shift, BASE_LEVEL_mask);
+		SETfield(t->SQ_TEX_RESOURCE5, t->maxLod - t->minLod, LAST_LEVEL_shift, LAST_LEVEL_mask);
 	}
 }
 
@@ -721,7 +808,8 @@ void r600SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
 	struct gl_texture_object *tObj =
 	    _mesa_lookup_texture(rmesa->radeon.glCtx, texname);
 	radeonTexObjPtr t = radeon_tex_obj(tObj);
-	uint32_t pitch_val, size;
+	const struct gl_texture_image *firstImage;
+	uint32_t pitch_val, size, row_align;
 
 	if (!tObj)
 		return;
@@ -731,7 +819,9 @@ void r600SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
 	if (!offset)
 		return;
 
-	size = pitch;//h * w * (depth / 8);
+	firstImage = t->base.Image[0][t->minLod];
+	row_align = rmesa->radeon.texture_row_align - 1;
+	size = ((_mesa_format_row_stride(firstImage->TexFormat, firstImage->Width) + row_align) & ~row_align) * firstImage->Height;
 	if (t->bo) {
 		radeon_bo_unref(t->bo);
 		t->bo = NULL;
@@ -854,20 +944,14 @@ void r600SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_fo
 		radeon_bo_unref(rImage->bo);
 		rImage->bo = NULL;
 	}
-	if (t->mt) {
-		radeon_miptree_unreference(t->mt);
-		t->mt = NULL;
-	}
-	if (rImage->mt) {
-		radeon_miptree_unreference(rImage->mt);
-		rImage->mt = NULL;
-	}
+
+	radeon_miptree_unreference(&t->mt);
+	radeon_miptree_unreference(&rImage->mt);
+
 	_mesa_init_teximage_fields(radeon->glCtx, target, texImage,
 				   rb->base.Width, rb->base.Height, 1, 0, rb->cpp);
 	texImage->RowStride = rb->pitch / rb->cpp;
-	texImage->TexFormat = radeonChooseTextureFormat(radeon->glCtx,
-							internalFormat,
-							type, format, 0);
+
 	rImage->bo = rb->bo;
 	radeon_bo_ref(rImage->bo);
 	t->bo = rb->bo;
diff --git a/r600/r700_assembler.c b/r600/r700_assembler.c
index 00eda54..67e0ee7 100644
--- a/r600/r700_assembler.c
+++ b/r600/r700_assembler.c
@@ -336,7 +336,8 @@ unsigned int r700GetNumOperands(r700_AssemblerBase* pAsm)
 
     switch (pAsm->D.dst.opcode)
     {
-    case SQ_OP2_INST_ADD:                          
+    case SQ_OP2_INST_ADD:
+    case SQ_OP2_INST_KILLGT:
     case SQ_OP2_INST_MUL: 
     case SQ_OP2_INST_MAX:
     case SQ_OP2_INST_MIN:
@@ -354,9 +355,9 @@ unsigned int r700GetNumOperands(r700_AssemblerBase* pAsm)
         return 2;  
 
     case SQ_OP2_INST_MOV: 
+    case SQ_OP2_INST_MOVA_FLOOR:
     case SQ_OP2_INST_FRACT:
     case SQ_OP2_INST_FLOOR:
-    case SQ_OP2_INST_KILLGT:
     case SQ_OP2_INST_EXP_IEEE:
     case SQ_OP2_INST_LOG_CLAMPED:
     case SQ_OP2_INST_LOG_IEEE:
@@ -790,6 +791,133 @@ GLboolean assemble_vfetch_instruction(r700_AssemblerBase* pAsm,
 	return GL_TRUE;
 }
 
+/* Build, or update if one already exists, the vertex fetch instruction kept in
+ * pAsm->vfetch_instruction_ptr_array[element]; it fetches buffer 'element' into
+ * GPR 'destination_register'.  Returns GL_FALSE on allocation/registration failure. */
+GLboolean assemble_vfetch_instruction2(r700_AssemblerBase* pAsm,
+                                       GLuint destination_register, GLenum type,
+                                       GLint size, GLubyte element,
+                                       GLuint _signed, GLboolean normalize,
+                                       VTX_FETCH_METHOD  * pFetchMethod)
+{
+    GLuint client_size_inbyte;
+    GLuint data_format;
+    /* Zero defaults: the mini-fetch branch below is still a TODO and sets neither. */
+    GLuint mega_fetch_count   = 0;
+    GLuint is_mega_fetch_flag = 0;
+	R700VertexGenericFetch*   vfetch_instruction_ptr;
+	R700VertexGenericFetch*   assembled_vfetch_instruction_ptr
+                                     = pAsm->vfetch_instruction_ptr_array[element];
+
+	if (assembled_vfetch_instruction_ptr == NULL)
+	{
+		vfetch_instruction_ptr = (R700VertexGenericFetch*) CALLOC_STRUCT(R700VertexGenericFetch);
+		if (vfetch_instruction_ptr == NULL)
+		{
+			return GL_FALSE;
+		}
+        Init_R700VertexGenericFetch(vfetch_instruction_ptr);
+    }
+	else
+	{
+		vfetch_instruction_ptr = assembled_vfetch_instruction_ptr;
+	}
+
+    data_format = GetSurfaceFormat(type, size, &client_size_inbyte);
+
+	if(GL_TRUE == pFetchMethod->bEnableMini) //More conditions here
+	{
+		//TODO : mini fetch
+	}
+	else
+	{
+		mega_fetch_count = MEGA_FETCH_BYTES - 1;
+		is_mega_fetch_flag       = 0x1;
+		pFetchMethod->mega_fetch_remainder = MEGA_FETCH_BYTES - client_size_inbyte;
+	}
+
+	vfetch_instruction_ptr->m_Word0.f.vtx_inst         = SQ_VTX_INST_FETCH;
+	vfetch_instruction_ptr->m_Word0.f.fetch_type       = SQ_VTX_FETCH_VERTEX_DATA;
+	vfetch_instruction_ptr->m_Word0.f.fetch_whole_quad = 0x0;
+
+	vfetch_instruction_ptr->m_Word0.f.buffer_id        = element;
+	vfetch_instruction_ptr->m_Word0.f.src_gpr          = 0x0;
+	vfetch_instruction_ptr->m_Word0.f.src_rel          = SQ_ABSOLUTE;
+	vfetch_instruction_ptr->m_Word0.f.src_sel_x        = SQ_SEL_X;
+	vfetch_instruction_ptr->m_Word0.f.mega_fetch_count = mega_fetch_count;
+
+	vfetch_instruction_ptr->m_Word1.f.dst_sel_x        = (size < 1) ? SQ_SEL_0 : SQ_SEL_X;
+	vfetch_instruction_ptr->m_Word1.f.dst_sel_y        = (size < 2) ? SQ_SEL_0 : SQ_SEL_Y;
+	vfetch_instruction_ptr->m_Word1.f.dst_sel_z        = (size < 3) ? SQ_SEL_0 : SQ_SEL_Z;
+	vfetch_instruction_ptr->m_Word1.f.dst_sel_w        = (size < 4) ? SQ_SEL_1 : SQ_SEL_W;
+
+	vfetch_instruction_ptr->m_Word1.f.use_const_fields = 1;
+    vfetch_instruction_ptr->m_Word1.f.data_format      = data_format;
+    vfetch_instruction_ptr->m_Word2.f.endian_swap      = SQ_ENDIAN_NONE;
+
+    if(1 == _signed)
+    {
+        vfetch_instruction_ptr->m_Word1.f.format_comp_all  = SQ_FORMAT_COMP_SIGNED;
+    }
+    else
+    {
+        vfetch_instruction_ptr->m_Word1.f.format_comp_all  = SQ_FORMAT_COMP_UNSIGNED;
+    }
+
+    if(GL_TRUE == normalize)
+    {
+        vfetch_instruction_ptr->m_Word1.f.num_format_all   = SQ_NUM_FORMAT_NORM;
+    }
+    else
+    {
+        vfetch_instruction_ptr->m_Word1.f.num_format_all   = SQ_NUM_FORMAT_INT;
+    }
+
+	// Destination register
+	vfetch_instruction_ptr->m_Word1_GPR.f.dst_gpr = destination_register;
+	vfetch_instruction_ptr->m_Word1_GPR.f.dst_rel = SQ_ABSOLUTE;
+
+	vfetch_instruction_ptr->m_Word2.f.offset              = 0;
+	vfetch_instruction_ptr->m_Word2.f.const_buf_no_stride = 0x0;
+
+	vfetch_instruction_ptr->m_Word2.f.mega_fetch          = is_mega_fetch_flag;
+
+	if (assembled_vfetch_instruction_ptr == NULL)
+	{
+		if ( GL_FALSE == add_vfetch_instruction(pAsm, (R700VertexInstruction *)vfetch_instruction_ptr) )
+        {
+			return GL_FALSE;
+		}
+
+		if (pAsm->vfetch_instruction_ptr_array[element] != NULL)
+		{
+			return GL_FALSE;
+		}
+		else
+		{
+			pAsm->vfetch_instruction_ptr_array[element] = vfetch_instruction_ptr;
+		}
+	}
+
+	return GL_TRUE;
+}
+
+GLboolean cleanup_vfetch_instructions(r700_AssemblerBase* pAsm)
+{
+    /* Clears every vertex-fetch slot and the current-clause state on pAsm,
+     * then calls cleanup_vfetch_shaderinst() on the attached shader. */
+    GLint slot = 0;
+
+    while (slot < VERT_ATTRIB_MAX)
+    {
+        pAsm->vfetch_instruction_ptr_array[slot++] = NULL;
+    }
+    pAsm->cf_current_vtx_clause_ptr = NULL;
+    pAsm->cf_current_clause_type    = CF_EMPTY_CLAUSE;
+    cleanup_vfetch_shaderinst(pAsm->pR700Shader);
+    return GL_TRUE;   /* the only return value this function produces */
+}
+
 GLuint gethelpr(r700_AssemblerBase* pAsm) 
 {
     GLuint r = pAsm->uHelpReg;
@@ -1180,8 +1308,10 @@ GLboolean tex_src(r700_AssemblerBase *pAsm)
         case PROGRAM_INPUT:
             switch (pILInst->SrcReg[0].Index)
             {
+                case FRAG_ATTRIB_WPOS:
                 case FRAG_ATTRIB_COL0:
                 case FRAG_ATTRIB_COL1:
+                case FRAG_ATTRIB_FOGC:
                 case FRAG_ATTRIB_TEX0:
                 case FRAG_ATTRIB_TEX1:
                 case FRAG_ATTRIB_TEX2:
@@ -1194,7 +1324,16 @@ GLboolean tex_src(r700_AssemblerBase *pAsm)
                     pAsm->S[0].src.reg   =
                         pAsm->uiFP_AttributeMap[pILInst->SrcReg[0].Index];
                     pAsm->S[0].src.rtype = SRC_REG_INPUT;
-                break;
+                    break;
+                case FRAG_ATTRIB_FACE:
+                    fprintf(stderr, "FRAG_ATTRIB_FACE unsupported\n");
+                    break;
+                case FRAG_ATTRIB_PNTC:
+                    fprintf(stderr, "FRAG_ATTRIB_PNTC unsupported\n");
+                    break;
+                case FRAG_ATTRIB_VAR0:
+                    fprintf(stderr, "FRAG_ATTRIB_VAR0 unsupported\n");
+                    break;
             }
         break;
         }
@@ -1951,9 +2090,9 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
     GLuint contiguous_slots_needed;
 
     GLuint    uNumSrc = r700GetNumOperands(pAsm);
-    GLuint    channel_swizzle, j;
-    GLuint    chan_counter[4] = {0, 0, 0, 0};
-    PVSSRC *  pSource[3];
+    //GLuint    channel_swizzle, j;
+    //GLuint    chan_counter[4] = {0, 0, 0, 0};
+    //PVSSRC *  pSource[3];
     GLboolean bSplitInst = GL_FALSE;
 
     if (1 == pAsm->D.dst.math) 
@@ -2053,7 +2192,7 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
         }
 
         //other bits
-        alu_instruction_ptr->m_Word0.f.index_mode = SQ_INDEX_LOOP;
+        alu_instruction_ptr->m_Word0.f.index_mode = SQ_INDEX_AR_X;
 
         if(   (is_single_scalar_operation == GL_TRUE) 
            || (GL_TRUE == bSplitInst) )
@@ -2387,6 +2526,35 @@ GLboolean assemble_ADD(r700_AssemblerBase *pAsm)
     return GL_TRUE;
 }
 
+GLboolean assemble_ARL(r700_AssemblerBase *pAsm)
+{
+    /* Emits one SQ_OP2_INST_MOVA_FLOOR ALU instruction whose operand is set
+     * up by assemble_src(pAsm, 0, -1).  The destination is programmed as
+     * temporary register 0 with all four write flags cleared.
+     *
+     * Returns GL_FALSE as soon as checkop1(), assemble_src() or next_ins()
+     * reports failure, GL_TRUE otherwise.
+     *
+     * TODO: AR values don't persist between clauses.
+     */
+    if( GL_FALSE == checkop1(pAsm) )
+        return GL_FALSE;
+
+    /* Destination: absolute addressing, temporary 0, write flags all off. */
+    pAsm->D.dst.opcode = SQ_OP2_INST_MOVA_FLOOR;
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.reg    = 0;
+    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
+    pAsm->D.dst.writex = pAsm->D.dst.writey = 0;
+    pAsm->D.dst.writez = pAsm->D.dst.writew = 0;
+
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+        return GL_FALSE;
+
+    /* The final step decides the overall result. */
+    return (GL_FALSE == next_ins(pAsm)) ? GL_FALSE : GL_TRUE;
+}
+
 GLboolean assemble_BAD(char *opcode_str) 
 {
     radeon_error("Not yet implemented instruction (%s)\n", opcode_str);
@@ -2508,7 +2676,7 @@ GLboolean assemble_DOT(r700_AssemblerBase *pAsm)
     }
     else if(pAsm->pILInst[pAsm->uiCurInst].Opcode == OPCODE_DPH) 
     {
-        onecomp_PVSSRC(&(pAsm->S[1].src), 3);
+        onecomp_PVSSRC(&(pAsm->S[0].src), 3);
     } 
 
     if ( GL_FALSE == next_ins(pAsm) ) 
@@ -2561,6 +2729,133 @@ GLboolean assemble_EX2(r700_AssemblerBase *pAsm)
 {
     return assemble_math_function(pAsm, SQ_OP2_INST_EXP_IEEE);
 }
+
+GLboolean assemble_EXP(r700_AssemblerBase *pAsm)
+{
+    /* EXP: dst.x = 2^floor(a.x), dst.y = fract(a.x), dst.z = 2^a.x, dst.w = 1.0.
+     * A component is emitted only when its bit is set in the write mask of the
+     * current IL instruction.  Returns GL_FALSE as soon as a step fails. */
+    BITS tmp;
+
+    checkop1(pAsm);
+    tmp = gethelpr(pAsm);
+
+    // FLOOR   tmp.x, a.x   followed by   EX2   dst.x, tmp.x
+    if (pAsm->pILInst[pAsm->uiCurInst].DstReg.WriteMask & 0x1) {
+        pAsm->D.dst.opcode = SQ_OP2_INST_FLOOR;
+
+        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+        pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
+        pAsm->D.dst.reg    = tmp;
+        pAsm->D.dst.writex = 1;
+
+        if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+        {
+            return GL_FALSE;
+        }
+
+        if( GL_FALSE == next_ins(pAsm) )
+        {
+            return GL_FALSE;
+        }
+
+        pAsm->D.dst.opcode = SQ_OP2_INST_EXP_IEEE;
+        pAsm->D.dst.math = 1;
+
+        if( GL_FALSE == assemble_dst(pAsm) )
+        {
+            return GL_FALSE;
+        }
+
+        pAsm->D.dst.writey = pAsm->D.dst.writez = pAsm->D.dst.writew = 0;
+
+        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+        pAsm->S[0].src.reg   = tmp;
+
+        setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+        noneg_PVSSRC(&(pAsm->S[0].src));
+
+        if( GL_FALSE == next_ins(pAsm) )
+        {
+            return GL_FALSE;
+        }
+    }
+
+    // FRACT   dst.y     a.x
+
+    if ((pAsm->pILInst[pAsm->uiCurInst].DstReg.WriteMask >> 1) & 0x1) {
+        pAsm->D.dst.opcode = SQ_OP2_INST_FRACT;
+
+        if( GL_FALSE == assemble_dst(pAsm) )
+        {
+            return GL_FALSE;
+        }
+
+        if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+        {
+            return GL_FALSE;
+        }
+
+        pAsm->D.dst.writex = pAsm->D.dst.writez = pAsm->D.dst.writew = 0;
+
+        if( GL_FALSE == next_ins(pAsm) )
+        {
+            return GL_FALSE;
+        }
+    }
+
+    // EX2     dst.z,    a.x
+
+    if ((pAsm->pILInst[pAsm->uiCurInst].DstReg.WriteMask >> 2) & 0x1) {
+        pAsm->D.dst.opcode = SQ_OP2_INST_EXP_IEEE;
+        pAsm->D.dst.math = 1;
+
+        if( GL_FALSE == assemble_dst(pAsm) )
+        {
+            return GL_FALSE;
+        }
+
+        if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+        {
+            return GL_FALSE;
+        }
+
+        pAsm->D.dst.writex = pAsm->D.dst.writey = pAsm->D.dst.writew = 0;
+
+        if( GL_FALSE == next_ins(pAsm) )
+        {
+            return GL_FALSE;
+        }
+    }
+
+    // MOV     dst.w     1.0
+
+    if ((pAsm->pILInst[pAsm->uiCurInst].DstReg.WriteMask >> 3) & 0x1) {
+        pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+        if( GL_FALSE == assemble_dst(pAsm) )
+        {
+            return GL_FALSE;
+        }
+
+        pAsm->D.dst.writex = pAsm->D.dst.writey = pAsm->D.dst.writez = 0;
+
+        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+        pAsm->S[0].src.reg   = tmp;
+
+        setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_1);
+        noneg_PVSSRC(&(pAsm->S[0].src));
+
+        if( GL_FALSE == next_ins(pAsm) )
+        {
+            return GL_FALSE;
+        }
+    }
+
+    return GL_TRUE;
+}
  
 GLboolean assemble_FLR(r700_AssemblerBase *pAsm)
 {
@@ -2617,15 +2912,15 @@ GLboolean assemble_FRC(r700_AssemblerBase *pAsm)
  
 GLboolean assemble_KIL(r700_AssemblerBase *pAsm)
 {
+    /* TODO: doc says KILL has to be last(end) ALU clause */
+    
     checkop1(pAsm);
 
     pAsm->D.dst.opcode = SQ_OP2_INST_KILLGT;  
-  
-    if ( GL_FALSE == assemble_dst(pAsm) )
-    {
-        return GL_FALSE;
-    }
 
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg   = 0;
     pAsm->D.dst.writex = 0;
     pAsm->D.dst.writey = 0;
     pAsm->D.dst.writez = 0;
@@ -2638,20 +2933,11 @@ GLboolean assemble_KIL(r700_AssemblerBase *pAsm)
     setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_0);
     noneg_PVSSRC(&(pAsm->S[0].src));
 
-    pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
-
-    if(PROGRAM_TEMPORARY == pAsm->pILInst[pAsm->uiCurInst].DstReg.File)
+    if ( GL_FALSE == assemble_src(pAsm, 0, 1) )
     {
-        pAsm->S[1].src.reg = pAsm->pILInst[pAsm->uiCurInst].DstReg.Index + pAsm->starting_temp_register_number;
-    }
-    else
-    {   //PROGRAM_OUTPUT
-        pAsm->S[1].src.reg = pAsm->uiFP_OutputMap[pAsm->pILInst[pAsm->uiCurInst].DstReg.Index];
+        return GL_FALSE;
     }
   
-    setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
-    noswizzle_PVSSRC(&(pAsm->S[1].src));
-  
     if ( GL_FALSE == next_ins(pAsm) )
     {
         return GL_FALSE;
@@ -2751,6 +3037,217 @@ GLboolean assemble_LRP(r700_AssemblerBase *pAsm)
     return GL_TRUE;
 }
 
+GLboolean assemble_LOG(r700_AssemblerBase *pAsm)
+{
+    BITS tmp1, tmp2, tmp3;
+    // Emits the ALU sequence for LOG; returns GL_FALSE as soon as any step fails.
+    checkop1(pAsm);
+    // tmp1 = |a.x|, tmp2 = log2(tmp1), tmp3 = floor(tmp2), later reused for tmp2 - floor(tmp2)
+    tmp1 = gethelpr(pAsm);
+    tmp2 = gethelpr(pAsm);
+    tmp3 = gethelpr(pAsm);
+
+    // FIXME: The hardware can do fabs() directly on input
+    //        elements, but the compiler doesn't have the
+    //        capability to use that.
+
+    // MAX     tmp1.x,   a.x,    -a.x   (fabs(a.x))
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MAX;  
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg    = tmp1;
+    pAsm->D.dst.writex = 1;
+
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+    // second operand: the same source with its negate flag flipped
+    pAsm->S[1].bits = pAsm->S[0].bits;
+    flipneg_PVSSRC(&(pAsm->S[1].src));
+
+    if ( GL_FALSE == next_ins(pAsm) ) 
+    {
+        return GL_FALSE;
+    }
+
+    // Entire algo:
+    //
+    // LG2     tmp2.x,   tmp1.x              (log2(|a.x|))
+    // FLOOR   tmp3.x,   tmp2.x              (integer part of the log)
+    // MOV     dst.x,    tmp3.x
+    // ADD     tmp3.x,   tmp2.x,    -tmp3.x  (fractional part of the log)
+    // EX2     dst.y,    tmp3.x              (2^fraction)
+    // MOV     dst.z,    tmp2.x
+    // MOV     dst.w,    1.0
+
+    // LG2     tmp2.x,   tmp1.x
+    // FLOOR   tmp3.x,   tmp2.x
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_LOG_IEEE;
+    pAsm->D.dst.math = 1;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg    = tmp2;
+    pAsm->D.dst.writex = 1;
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp1;
+
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_FLOOR;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg    = tmp3;
+    pAsm->D.dst.writex = 1;
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp2;
+
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    // MOV     dst.x,    tmp3.x
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+    if( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.writey = pAsm->D.dst.writez = pAsm->D.dst.writew = 0; // restrict the write to the x channel
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp3;
+
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    // ADD     tmp3.x,   tmp2.x,    -tmp3.x
+    // EX2     dst.y,    tmp3.x
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_ADD;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg    = tmp3;
+    pAsm->D.dst.writex = 1;
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp2;
+
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
+    pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[1].src.reg   = tmp3;
+
+    setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_X);
+    neg_PVSSRC(&(pAsm->S[1].src));
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_EXP_IEEE;
+    pAsm->D.dst.math = 1;
+
+    if( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.writex = pAsm->D.dst.writez = pAsm->D.dst.writew = 0; // restrict the write to the y channel
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp3;
+
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    // MOV     dst.z,    tmp2.x
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+    if( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.writex = pAsm->D.dst.writey = pAsm->D.dst.writew = 0; // restrict the write to the z channel
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp2;
+
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    // MOV     dst.w     1.0
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+    if( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.writex = pAsm->D.dst.writey = pAsm->D.dst.writez = 0; // restrict the write to the w channel
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp1;
+
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_1); // the 1.0 comes from the swizzle; tmp1 presumably only serves as a valid register
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+
 GLboolean assemble_MAD(struct r700_AssemblerBase *pAsm) 
 {
     int tmp, ii;
@@ -2908,6 +3405,7 @@ GLboolean assemble_LIT(r700_AssemblerBase *pAsm)
     pAsm->S[0].src.rtype = srcType;
     pAsm->S[0].src.reg   = srcReg;
     setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    swizzleagain_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X, SQ_SEL_X, SQ_SEL_X, SQ_SEL_X);
     pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
     pAsm->S[1].src.reg   = tmp;
     setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
@@ -3417,22 +3915,6 @@ GLboolean assemble_TEX(r700_AssemblerBase *pAsm)
 	    need_barrier = GL_TRUE;
     }
 
-    switch (pAsm->pILInst[pAsm->uiCurInst].Opcode)
-    {
-        case OPCODE_TEX:
-            break;
-        case OPCODE_TXB:
-            radeon_error("do not support TXB yet\n");
-            return GL_FALSE;
-            break;
-        case OPCODE_TXP:
-            break;
-        default:
-            radeon_error("Internal error: bad texture op (not TEX)\n");
-            return GL_FALSE;
-            break;
-    }
-
     if (pAsm->pILInst[pAsm->uiCurInst].Opcode == OPCODE_TXP)
     {
         GLuint tmp = gethelpr(pAsm);
@@ -3611,7 +4093,15 @@ GLboolean assemble_TEX(r700_AssemblerBase *pAsm)
 
     }
 
-    pAsm->D.dst.opcode = SQ_TEX_INST_SAMPLE;
+    if(pAsm->pILInst[pAsm->uiCurInst].Opcode == OPCODE_TXB)
+    {
+        pAsm->D.dst.opcode = SQ_TEX_INST_SAMPLE_L;
+    }
+    else
+    {
+        pAsm->D.dst.opcode = SQ_TEX_INST_SAMPLE;
+    }
+
     pAsm->is_tex = GL_TRUE;
     if ( GL_TRUE == need_barrier )
     {
@@ -3809,8 +4299,7 @@ GLboolean AssembleInstr(GLuint uiNumberInsts,
             break;  
 
         case OPCODE_ARL: 
-            radeon_error("Not yet implemented instruction OPCODE_ARL \n");
-            //if ( GL_FALSE == assemble_BAD("ARL") ) 
+            if ( GL_FALSE == assemble_ARL(pR700AsmCode) ) 
                 return GL_FALSE;
             break;
         case OPCODE_ARR: 
@@ -3845,10 +4334,9 @@ GLboolean AssembleInstr(GLuint uiNumberInsts,
                 return GL_FALSE;
             break;  
         case OPCODE_EXP: 
-            radeon_error("Not yet implemented instruction OPCODE_EXP \n");
-            //if ( GL_FALSE == assemble_BAD("EXP") ) 
+            if ( GL_FALSE == assemble_EXP(pR700AsmCode) ) 
                 return GL_FALSE;
-            break; // approx of EX2
+            break;
 
         case OPCODE_FLR:     
             if ( GL_FALSE == assemble_FLR(pR700AsmCode) ) 
@@ -3881,10 +4369,9 @@ GLboolean AssembleInstr(GLuint uiNumberInsts,
                 return GL_FALSE;
             break;  
         case OPCODE_LOG: 
-            radeon_error("Not yet implemented instruction OPCODE_LOG \n");
-            //if ( GL_FALSE == assemble_BAD("LOG") ) 
+            if ( GL_FALSE == assemble_LOG(pR700AsmCode) ) 
                 return GL_FALSE;
-            break; // approx of LG2
+            break;
 
         case OPCODE_MAD: 
             if ( GL_FALSE == assemble_MAD(pR700AsmCode) ) 
@@ -4155,6 +4642,7 @@ GLboolean Process_Fragment_Exports(r700_AssemblerBase *pR700AsmCode,
                                    GLbitfield          OutputsWritten)  
 { 
     unsigned int unBit;
+    GLuint export_count = 0;
 
     if(pR700AsmCode->depth_export_register_number >= 0) 
     {
@@ -4176,6 +4664,7 @@ GLboolean Process_Fragment_Exports(r700_AssemblerBase *pR700AsmCode,
         {
             return GL_FALSE;
         }
+        export_count++;
 	}
 	unBit = 1 << FRAG_RESULT_DEPTH;
 	if(OutputsWritten & unBit)
@@ -4189,8 +4678,15 @@ GLboolean Process_Fragment_Exports(r700_AssemblerBase *pR700AsmCode,
         {
             return GL_FALSE;
         }
+        export_count++;
 	}
-
+    /* Need to export something, otherwise we'll hang
+     * results are undefined anyway */
+    if(export_count == 0)
+    {
+        Process_Export(pR700AsmCode, SQ_EXPORT_PIXEL, 0, 1, 0, GL_FALSE);
+    }
+    
     if(pR700AsmCode->cf_last_export_ptr != NULL) 
     {
         pR700AsmCode->cf_last_export_ptr->m_Word1.f.cf_inst        = SQ_CF_INST_EXPORT_DONE;
diff --git a/r600/r700_assembler.h b/r600/r700_assembler.h
index 73bb8ba..c66db50 100644
--- a/r600/r700_assembler.h
+++ b/r600/r700_assembler.h
@@ -415,6 +415,15 @@ GLboolean assemble_vfetch_instruction(r700_AssemblerBase* pAsm,
 								GLuint number_of_elements,
                                 GLenum dataElementType,
 								VTX_FETCH_METHOD* pFetchMethod);
+GLboolean assemble_vfetch_instruction2(r700_AssemblerBase* pAsm,
+                                       GLuint              destination_register,								       
+                                       GLenum              type,
+                                       GLint               size,
+                                       GLubyte             element,
+                                       GLuint              _signed,
+                                       GLboolean           normalize,
+                                       VTX_FETCH_METHOD  * pFetchMethod);
+GLboolean cleanup_vfetch_instructions(r700_AssemblerBase* pAsm);
 GLuint gethelpr(r700_AssemblerBase* pAsm);
 void resethelpr(r700_AssemblerBase* pAsm);
 void checkop_init(r700_AssemblerBase* pAsm);
@@ -461,18 +470,21 @@ GLboolean next_ins(r700_AssemblerBase *pAsm);
 GLboolean assemble_math_function(r700_AssemblerBase* pAsm, BITS opcode);
 GLboolean assemble_ABS(r700_AssemblerBase *pAsm);
 GLboolean assemble_ADD(r700_AssemblerBase *pAsm);
+GLboolean assemble_ARL(r700_AssemblerBase *pAsm);
 GLboolean assemble_BAD(char *opcode_str);
 GLboolean assemble_CMP(r700_AssemblerBase *pAsm);
 GLboolean assemble_COS(r700_AssemblerBase *pAsm);
 GLboolean assemble_DOT(r700_AssemblerBase *pAsm);
 GLboolean assemble_DST(r700_AssemblerBase *pAsm);
 GLboolean assemble_EX2(r700_AssemblerBase *pAsm);
+GLboolean assemble_EXP(r700_AssemblerBase *pAsm);
 GLboolean assemble_FLR(r700_AssemblerBase *pAsm);
 GLboolean assemble_FLR_INT(r700_AssemblerBase *pAsm);
 GLboolean assemble_FRC(r700_AssemblerBase *pAsm);
 GLboolean assemble_KIL(r700_AssemblerBase *pAsm);
 GLboolean assemble_LG2(r700_AssemblerBase *pAsm);
 GLboolean assemble_LRP(r700_AssemblerBase *pAsm);
+GLboolean assemble_LOG(r700_AssemblerBase *pAsm);
 GLboolean assemble_MAD(r700_AssemblerBase *pAsm);
 GLboolean assemble_LIT(r700_AssemblerBase *pAsm);
 GLboolean assemble_MAX(r700_AssemblerBase *pAsm);
diff --git a/r600/r700_chip.c b/r600/r700_chip.c
index 06d7e9c..02c56b9 100644
--- a/r600/r700_chip.c
+++ b/r600/r700_chip.c
@@ -54,11 +54,15 @@ static void r700SendTexState(GLcontext *ctx, struct radeon_state_atom *atom)
 	for (i = 0; i < R700_TEXTURE_NUMBERUNITS; i++) {
 		if (ctx->Texture.Unit[i]._ReallyEnabled) {
 			radeonTexObj *t = r700->textures[i];
+			uint32_t offset;
 			if (t) {
-				if (!t->image_override)
+				if (!t->image_override) {
 					bo = t->mt->bo;
-				else
+					offset = get_base_teximage_offset(t);
+				} else {
 					bo = t->bo;
+					offset = 0;
+				}
 				if (bo) {
 
 					r700SyncSurf(context, bo,
@@ -77,7 +81,7 @@ static void r700SendTexState(GLcontext *ctx, struct radeon_state_atom *atom)
 					R600_OUT_BATCH(r700->textures[i]->SQ_TEX_RESOURCE6);
 					R600_OUT_BATCH_RELOC(r700->textures[i]->SQ_TEX_RESOURCE2,
 							     bo,
-							     0,
+							     offset,
 							     RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
 					R600_OUT_BATCH_RELOC(r700->textures[i]->SQ_TEX_RESOURCE3,
 							     bo,
@@ -141,17 +145,15 @@ static void r700SendTexBorderColorState(GLcontext *ctx, struct radeon_state_atom
 	}
 }
 
+extern int getTypeSize(GLenum type);
 static void r700SetupVTXConstants(GLcontext  * ctx,
-				  unsigned int nStreamID,
 				  void *       pAos,
-				  unsigned int size,      /* number of elements in vector */
-				  unsigned int stride,
-				  unsigned int count)     /* number of vectors in stream */
+				  StreamDesc * pStreamDesc)
 {
     context_t *context = R700_CONTEXT(ctx);
     struct radeon_aos * paos = (struct radeon_aos *)pAos;
+    unsigned int nVBsize;
     BATCH_LOCALS(&context->radeon);
-    radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
 
     unsigned int uSQ_VTX_CONSTANT_WORD0_0;
     unsigned int uSQ_VTX_CONSTANT_WORD1_0;
@@ -171,18 +173,40 @@ static void r700SetupVTXConstants(GLcontext  * ctx,
     else
 	    r700SyncSurf(context, paos->bo, RADEON_GEM_DOMAIN_GTT, 0, VC_ACTION_ENA_bit);
 
+    if(0 == pStreamDesc->stride)
+    {
+        nVBsize = paos->count * pStreamDesc->size * getTypeSize(pStreamDesc->type);
+    }
+    else
+    {
+        nVBsize = paos->count * pStreamDesc->stride;
+    }
+
     uSQ_VTX_CONSTANT_WORD0_0 = paos->offset;
-    uSQ_VTX_CONSTANT_WORD1_0 = count * (size * 4) - 1;
+    uSQ_VTX_CONSTANT_WORD1_0 = nVBsize - 1;
 
     SETfield(uSQ_VTX_CONSTANT_WORD2_0, 0, BASE_ADDRESS_HI_shift, BASE_ADDRESS_HI_mask); /* TODO */
-    SETfield(uSQ_VTX_CONSTANT_WORD2_0, stride, SQ_VTX_CONSTANT_WORD2_0__STRIDE_shift,
+    SETfield(uSQ_VTX_CONSTANT_WORD2_0, pStreamDesc->stride, SQ_VTX_CONSTANT_WORD2_0__STRIDE_shift,
 	     SQ_VTX_CONSTANT_WORD2_0__STRIDE_mask);
-    SETfield(uSQ_VTX_CONSTANT_WORD2_0, GetSurfaceFormat(GL_FLOAT, size, NULL),
+    SETfield(uSQ_VTX_CONSTANT_WORD2_0, GetSurfaceFormat(pStreamDesc->type, pStreamDesc->size, NULL),
 	     SQ_VTX_CONSTANT_WORD2_0__DATA_FORMAT_shift,
 	     SQ_VTX_CONSTANT_WORD2_0__DATA_FORMAT_mask); /* TODO : trace back api for initial data type, not only GL_FLOAT */
-    SETfield(uSQ_VTX_CONSTANT_WORD2_0, SQ_NUM_FORMAT_SCALED,
-	     SQ_VTX_CONSTANT_WORD2_0__NUM_FORMAT_ALL_shift, SQ_VTX_CONSTANT_WORD2_0__NUM_FORMAT_ALL_mask);
-    SETbit(uSQ_VTX_CONSTANT_WORD2_0, SQ_VTX_CONSTANT_WORD2_0__FORMAT_COMP_ALL_bit);
+    
+    if(GL_TRUE == pStreamDesc->normalize)
+    {
+        SETfield(uSQ_VTX_CONSTANT_WORD2_0, SQ_NUM_FORMAT_NORM,
+	             SQ_VTX_CONSTANT_WORD2_0__NUM_FORMAT_ALL_shift, SQ_VTX_CONSTANT_WORD2_0__NUM_FORMAT_ALL_mask);
+    }
+    //else
+    //{
+    //    SETfield(uSQ_VTX_CONSTANT_WORD2_0, SQ_NUM_FORMAT_INT,
+	//             SQ_VTX_CONSTANT_WORD2_0__NUM_FORMAT_ALL_shift, SQ_VTX_CONSTANT_WORD2_0__NUM_FORMAT_ALL_mask);
+    //}
+
+    if(1 == pStreamDesc->_signed)
+    {
+        SETbit(uSQ_VTX_CONSTANT_WORD2_0, SQ_VTX_CONSTANT_WORD2_0__FORMAT_COMP_ALL_bit);
+    }
 
     SETfield(uSQ_VTX_CONSTANT_WORD3_0, 1, MEM_REQUEST_SIZE_shift, MEM_REQUEST_SIZE_mask);
     SETfield(uSQ_VTX_CONSTANT_WORD6_0, SQ_TEX_VTX_VALID_BUFFER,
@@ -191,7 +215,7 @@ static void r700SetupVTXConstants(GLcontext  * ctx,
     BEGIN_BATCH_NO_AUTOSTATE(9 + 2);
 
     R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_RESOURCE, 7));
-    R600_OUT_BATCH((nStreamID + SQ_FETCH_RESOURCE_VS_OFFSET) * FETCH_RESOURCE_STRIDE);
+    R600_OUT_BATCH((pStreamDesc->element + SQ_FETCH_RESOURCE_VS_OFFSET) * FETCH_RESOURCE_STRIDE);
     R600_OUT_BATCH(uSQ_VTX_CONSTANT_WORD0_0);
     R600_OUT_BATCH(uSQ_VTX_CONSTANT_WORD1_0);
     R600_OUT_BATCH(uSQ_VTX_CONSTANT_WORD2_0);
@@ -208,31 +232,6 @@ static void r700SetupVTXConstants(GLcontext  * ctx,
 
 }
 
-void r700SetupStreams(GLcontext *ctx)
-{
-    context_t         *context = R700_CONTEXT(ctx);
-    struct r700_vertex_program *vp = context->selected_vp;
-    TNLcontext *tnl = TNL_CONTEXT(ctx);
-    struct vertex_buffer *vb = &tnl->vb;
-    unsigned int i, j = 0;
-	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
-
-    R600_STATECHANGE(context, vtx);
-
-    for(i=0; i<VERT_ATTRIB_MAX; i++) {
-	    if(vp->mesa_program->Base.InputsRead & (1 << i)) {
-		    rcommon_emit_vector(ctx,
-					&context->radeon.tcl.aos[j],
-					vb->AttribPtr[i]->data,
-					vb->AttribPtr[i]->size,
-					vb->AttribPtr[i]->stride,
-					vb->Count);
-		    j++;
-	    }
-    }
-    context->radeon.tcl.aos_count = j;
-}
-
 static void r700SendVTXState(GLcontext *ctx, struct radeon_state_atom *atom)
 {
     context_t         *context = R700_CONTEXT(ctx);
@@ -256,15 +255,12 @@ static void r700SendVTXState(GLcontext *ctx, struct radeon_state_atom *atom)
     COMMIT_BATCH();
 
     for(i=0; i<VERT_ATTRIB_MAX; i++) {
-	    if(vp->mesa_program->Base.InputsRead & (1 << i)) {
-		    /* currently aos are packed */
-		    r700SetupVTXConstants(ctx,
-					  i,
-					  (void*)(&context->radeon.tcl.aos[j]),
-					  (unsigned int)context->radeon.tcl.aos[j].components,
-					  (unsigned int)context->radeon.tcl.aos[j].stride * 4,
-					  (unsigned int)context->radeon.tcl.aos[j].count);
-		    j++;
+	    if(vp->mesa_program->Base.InputsRead & (1 << i))
+	    {
+                r700SetupVTXConstants(ctx,
+				      (void*)(&context->radeon.tcl.aos[j]),
+				      &(context->stream_desc[j]));
+		j++;
 	    }
     }
 }
@@ -366,7 +362,6 @@ static void r700SendDepthTargetState(GLcontext *ctx, struct radeon_state_atom *a
 
 	rrb = radeon_get_depthbuffer(&context->radeon);
 	if (!rrb || !rrb->bo) {
-		fprintf(stderr, "no rrb\n");
 		return;
 	}
 
@@ -408,7 +403,6 @@ static void r700SendRenderTargetState(GLcontext *ctx, struct radeon_state_atom *
 
 	rrb = radeon_get_colorbuffer(&context->radeon);
 	if (!rrb || !rrb->bo) {
-		fprintf(stderr, "no rrb\n");
 		return;
 	}
 
@@ -794,8 +788,7 @@ static void r700SendDBState(GLcontext *ctx, struct radeon_state_atom *atom)
 	BATCH_LOCALS(&context->radeon);
 	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
 
-        BEGIN_BATCH_NO_AUTOSTATE(23);
-	R600_OUT_BATCH_REGVAL(DB_HTILE_DATA_BASE, r700->DB_HTILE_DATA_BASE.u32All);
+	BEGIN_BATCH_NO_AUTOSTATE(17);
 
 	R600_OUT_BATCH_REGSEQ(DB_STENCIL_CLEAR, 2);
 	R600_OUT_BATCH(r700->DB_STENCIL_CLEAR.u32All);
@@ -808,7 +801,6 @@ static void r700SendDBState(GLcontext *ctx, struct radeon_state_atom *atom)
 	R600_OUT_BATCH(r700->DB_RENDER_CONTROL.u32All);
 	R600_OUT_BATCH(r700->DB_RENDER_OVERRIDE.u32All);
 
-	R600_OUT_BATCH_REGVAL(DB_HTILE_SURFACE, r700->DB_HTILE_SURFACE.u32All);
 	R600_OUT_BATCH_REGVAL(DB_ALPHA_TO_MASK, r700->DB_ALPHA_TO_MASK.u32All);
 
 	END_BATCH();
@@ -1108,6 +1100,32 @@ static void r700SendVSConsts(GLcontext *ctx, struct radeon_state_atom *atom)
 	COMMIT_BATCH();
 }
 
+static void r700SendQueryBegin(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	struct radeon_query_object *query = radeon->query.current;
+	BATCH_LOCALS(radeon);
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+	/* Zeroes the current query's buffer, emits a ZPASS_DONE event write for it and marks its begin as emitted. */
+	/* clear the buffer (NOTE(review): the radeon_bo_map result is not checked before bo->ptr is written) */
+	radeon_bo_map(query->bo, GL_FALSE);
+	memset(query->bo->ptr, 0, 4 * 2 * sizeof(uint64_t)); /* 4 DBs, 2 qwords each */
+	radeon_bo_unmap(query->bo);
+
+	radeon_cs_space_check_with_bo(radeon->cmdbuf.cs,
+				      query->bo,
+				      0, RADEON_GEM_DOMAIN_GTT);
+	/* 4 dwords for the EVENT_WRITE packet; the "+ 2" presumably covers the relocation emitted by R600_OUT_BATCH_RELOC */
+	BEGIN_BATCH_NO_AUTOSTATE(4 + 2);
+	R600_OUT_BATCH(CP_PACKET3(R600_IT_EVENT_WRITE, 2));
+	R600_OUT_BATCH(ZPASS_DONE);
+	R600_OUT_BATCH(query->curr_offset); /* hw writes qwords */
+	R600_OUT_BATCH(0x00000000);
+	R600_OUT_BATCH_RELOC(VGT_EVENT_INITIATOR, query->bo, 0, 0, RADEON_GEM_DOMAIN_GTT, 0);
+	END_BATCH();
+	query->emitted_begin = GL_TRUE; /* check_queryobj reports size 0 for this query from now on */
+}
+
 static int check_always(GLcontext *ctx, struct radeon_state_atom *atom)
 {
 	return atom->cmd_size;
@@ -1136,7 +1154,11 @@ static int check_blnd(GLcontext *ctx, struct radeon_state_atom *atom)
 		count += 3;
 
 	if (context->radeon.radeonScreen->chip_family > CHIP_FAMILY_R600) {
-		for (ui = 0; ui < R700_MAX_RENDER_TARGETS; ui++) {
+		/* targets are enabled in r700SetRenderTarget but state
+		   size is calculated before that. Until MRT's are done
+		   hardcode target0 as enabled. */
+		count += 3;
+		for (ui = 1; ui < R700_MAX_RENDER_TARGETS; ui++) {
                         if (r700->render_target[ui].enabled)
 				count += 3;
 		}
@@ -1216,6 +1238,20 @@ static int check_vs_consts(GLcontext *ctx, struct radeon_state_atom *atom)
 	return count;
 }
 
+static int check_queryobj(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	struct radeon_query_object *query = radeon->query.current;
+	int count;
+	/* Returns 0 when there is no current query or its begin was already emitted, otherwise the atom's cmd_size. */
+	if (!query || query->emitted_begin)
+		count = 0;
+	else
+		count = atom->cmd_size;
+	radeon_print(RADEON_STATE, RADEON_TRACE, "%s %d\n", __func__, count);
+	return count;
+}
+
 #define ALLOC_STATE( ATOM, CHK, SZ, EMIT )				\
 do {									\
 	context->atoms.ATOM.cmd_size = (SZ);				\
@@ -1229,6 +1265,19 @@ do {									\
 	insert_at_tail(&context->radeon.hw.atomlist, &context->atoms.ATOM); \
 } while (0)
 
+static void r600_init_query_stateobj(radeonContextPtr radeon, int SZ)
+{ /* Fills in the "queryobj" atom stored in radeon->query and appends it to the hw atom list. */
+	radeon->query.queryobj.cmd_size = (SZ); /* the size check_queryobj returns when the atom must be emitted */
+	radeon->query.queryobj.cmd = NULL;
+	radeon->query.queryobj.name = "queryobj";
+	radeon->query.queryobj.idx = 0;
+	radeon->query.queryobj.check = check_queryobj;
+	radeon->query.queryobj.dirty = GL_FALSE;
+	radeon->query.queryobj.emit = r700SendQueryBegin;
+	radeon->hw.max_state_size += (SZ); /* same accounting ALLOC_STATE does for the other atoms */
+	insert_at_tail(&radeon->hw.atomlist, &radeon->query.queryobj);
+}
+
 void r600InitAtoms(context_t *context)
 {
 	radeon_print(RADEON_STATE, RADEON_NORMAL, "%s %p\n", __func__, context);
@@ -1239,7 +1288,7 @@ void r600InitAtoms(context_t *context)
 	context->radeon.hw.atomlist.name = "atom-list";
 
 	ALLOC_STATE(sq, always, 34, r700SendSQConfig);
-	ALLOC_STATE(db, always, 23, r700SendDBState);
+	ALLOC_STATE(db, always, 17, r700SendDBState);
 	ALLOC_STATE(stencil, always, 4, r700SendStencilState);
 	ALLOC_STATE(db_target, always, 12, r700SendDepthTargetState);
 	ALLOC_STATE(sc, always, 15, r700SendSCState);
@@ -1252,9 +1301,9 @@ void r600InitAtoms(context_t *context)
 	ALLOC_STATE(poly, always, 10, r700SendPolyState);
 	ALLOC_STATE(cb, cb, 18, r700SendCBState);
 	ALLOC_STATE(clrcmp, always, 6, r700SendCBCLRCMPState);
+	ALLOC_STATE(cb_target, always, 25, r700SendRenderTargetState);
 	ALLOC_STATE(blnd, blnd, (6 + (R700_MAX_RENDER_TARGETS * 3)), r700SendCBBlendState);
 	ALLOC_STATE(blnd_clr, always, 6, r700SendCBBlendColorState);
-	ALLOC_STATE(cb_target, always, 25, r700SendRenderTargetState);
 	ALLOC_STATE(sx, always, 9, r700SendSXState);
 	ALLOC_STATE(vgt, always, 41, r700SendVGTState);
 	ALLOC_STATE(spi, always, (59 + R700_MAX_SHADER_EXPORTS), r700SendSPIState);
@@ -1268,6 +1317,7 @@ void r600InitAtoms(context_t *context)
 	ALLOC_STATE(tx, tx, (R700_TEXTURE_NUMBERUNITS * 20), r700SendTexState);
 	ALLOC_STATE(tx_smplr, tx, (R700_TEXTURE_NUMBERUNITS * 5), r700SendTexSamplerState);
 	ALLOC_STATE(tx_brdr_clr, tx, (R700_TEXTURE_NUMBERUNITS * 6), r700SendTexBorderColorState);
+	r600_init_query_stateobj(&context->radeon, 6 * 2);
 
 	context->radeon.hw.is_dirty = GL_TRUE;
 	context->radeon.hw.all_dirty = GL_TRUE;
diff --git a/r600/r700_fragprog.c b/r600/r700_fragprog.c
index 78ce3ae..ccafd43 100644
--- a/r600/r700_fragprog.c
+++ b/r600/r700_fragprog.c
@@ -135,15 +135,19 @@ GLboolean Find_Instruction_Dependencies_fp(struct r700_fragment_program *fp,
 {
     GLuint i, j;
     GLint * puiTEMPwrites;
+    GLint * puiTEMPreads;
     struct prog_instruction * pILInst;
     InstDeps         *pInstDeps;
     struct prog_instruction * texcoord_DepInst;
     GLint              nDepInstID;
 
     puiTEMPwrites = (GLint*) MALLOC(sizeof(GLuint)*mesa_fp->Base.NumTemporaries);
+    puiTEMPreads = (GLint*) MALLOC(sizeof(GLuint)*mesa_fp->Base.NumTemporaries);
+
     for(i=0; i<mesa_fp->Base.NumTemporaries; i++)
     {
         puiTEMPwrites[i] = -1;
+        puiTEMPreads[i] = -1;
     }
 
     pInstDeps = (InstDeps*)MALLOC(sizeof(InstDeps)*mesa_fp->Base.NumInstructions);
@@ -167,6 +171,11 @@ GLboolean Find_Instruction_Dependencies_fp(struct r700_fragment_program *fp,
             {
                 //Set dep.
                 pInstDeps[i].nSrcDeps[j] = puiTEMPwrites[pILInst->SrcReg[j].Index];
+                //Set first read
+                if(puiTEMPreads[pILInst->SrcReg[j].Index] < 0 )
+                {
+                    puiTEMPreads[pILInst->SrcReg[j].Index] = i;
+                }
             }
             else
             {
@@ -177,8 +186,6 @@ GLboolean Find_Instruction_Dependencies_fp(struct r700_fragment_program *fp,
 
     fp->r700AsmCode.pInstDeps = pInstDeps;
 
-    FREE(puiTEMPwrites);
-
     //Find dep for tex inst    
     for(i=0; i<mesa_fp->Base.NumInstructions; i++)
     {
@@ -203,9 +210,25 @@ GLboolean Find_Instruction_Dependencies_fp(struct r700_fragment_program *fp,
                 {   //... other deps?
                 }
             }
+            // make sure that we dont overwrite src used earlier
+            nDepInstID = puiTEMPreads[pILInst->DstReg.Index];
+            if(nDepInstID < i)
+            {
+                pInstDeps[i].nDstDep = puiTEMPreads[pILInst->DstReg.Index];
+                texcoord_DepInst = &(mesa_fp->Base.Instructions[nDepInstID]);
+                if(GL_TRUE == IsAlu(texcoord_DepInst->Opcode) )
+                {
+                    pInstDeps[nDepInstID].nDstDep = i;
+                }
+ 
+            }
+
         }
 	}
 
+    FREE(puiTEMPwrites);
+    FREE(puiTEMPreads);
+
     return GL_TRUE;
 }
 
@@ -251,7 +274,15 @@ GLboolean r700TranslateFragmentShader(struct r700_fragment_program *fp,
 		number_of_colors_exported--;
 	}
 
-	fp->r700Shader.exportMode = number_of_colors_exported << 1 | z_enabled;
+	/* illegal to set this to 0 */
+	if(number_of_colors_exported || z_enabled)
+	{
+	    fp->r700Shader.exportMode = number_of_colors_exported << 1 | z_enabled;
+	}
+	else
+	{
+	    fp->r700Shader.exportMode = (1 << 1);
+	}
 
     fp->translated = GL_TRUE;
 
@@ -341,6 +372,11 @@ GLboolean r700SetupFragmentProgram(GLcontext * ctx)
         SETbit(r700->SPI_PS_IN_CONTROL_0.u32All, POSITION_ENA_bit);
         SETbit(r700->SPI_INPUT_Z.u32All, PROVIDE_Z_TO_SPI_bit);
     }
+    else
+    {
+        CLEARbit(r700->SPI_PS_IN_CONTROL_0.u32All, POSITION_ENA_bit);
+        CLEARbit(r700->SPI_INPUT_Z.u32All, PROVIDE_Z_TO_SPI_bit);
+    }
 
     ui = (unNumOfReg < ui) ? ui : unNumOfReg;
 
@@ -357,26 +393,6 @@ GLboolean r700SetupFragmentProgram(GLcontext * ctx)
     SETfield(r700->ps.SQ_PGM_EXPORTS_PS.u32All, fp->r700Shader.exportMode,
              EXPORT_MODE_shift, EXPORT_MODE_mask);
 
-    R600_STATECHANGE(context, db);
-
-    if(fp->r700Shader.killIsUsed)
-    {
-	    SETbit(r700->DB_SHADER_CONTROL.u32All, KILL_ENABLE_bit);
-    }
-    else
-    {
-        CLEARbit(r700->DB_SHADER_CONTROL.u32All, KILL_ENABLE_bit);
-    }
-
-    if(fp->r700Shader.depthIsExported)
-    {
-	    SETbit(r700->DB_SHADER_CONTROL.u32All, Z_EXPORT_ENABLE_bit);
-    }
-    else
-    {
-        CLEARbit(r700->DB_SHADER_CONTROL.u32All, Z_EXPORT_ENABLE_bit);
-    }
-
     // emit ps input map
     unBit = 1 << FRAG_ATTRIB_WPOS;
     if(mesa_fp->Base.InputsRead & unBit)
@@ -443,9 +459,12 @@ GLboolean r700SetupFragmentProgram(GLcontext * ctx)
 	    }
     }
 
-    R600_STATECHANGE(context, cb);
     exportCount = (r700->ps.SQ_PGM_EXPORTS_PS.u32All & EXPORT_MODE_mask) / (1 << EXPORT_MODE_shift);
-    r700->CB_SHADER_CONTROL.u32All = (1 << exportCount) - 1;
+    if (r700->CB_SHADER_CONTROL.u32All != ((1 << exportCount) - 1))
+    {
+	    R600_STATECHANGE(context, cb);
+	    r700->CB_SHADER_CONTROL.u32All = (1 << exportCount) - 1;
+    }
 
     /* sent out shader constants. */
     paramList = fp->mesa_program.Base.Parameters;
diff --git a/r600/r700_oglprog.c b/r600/r700_oglprog.c
index 5290ef3..0d476fc 100644
--- a/r600/r700_oglprog.c
+++ b/r600/r700_oglprog.c
@@ -40,6 +40,24 @@
 #include "r700_vertprog.h"
 
 
+static void freeVertProgCache(GLcontext *ctx, struct r700_vertex_program_cont *cache)
+{
+	struct r700_vertex_program *tmp, *vp = cache->progs;
+	/* Frees every program on the cache->progs list; cache->progs itself is not reset here. */
+	while (vp) {
+		tmp = vp->next; /* saved before vp is freed */
+		/* Release DMA region */
+		r600DeleteShader(ctx, vp->shaderbo);
+		/* Clean up */
+		Clean_Up_Assembler(&(vp->r700AsmCode));
+		Clean_Up_Shader(&(vp->r700Shader));
+		/* presumably drops this entry's reference on its Mesa vertex program — TODO confirm */
+		_mesa_reference_vertprog(ctx, &vp->mesa_program, NULL);
+		_mesa_free(vp);
+		vp = tmp;
+	}
+}
+
 static struct gl_program *r700NewProgram(GLcontext * ctx, 
                                          GLenum target,
 					                     GLuint id)
@@ -84,8 +102,7 @@ static struct gl_program *r700NewProgram(GLcontext * ctx,
 
 static void r700DeleteProgram(GLcontext * ctx, struct gl_program *prog)
 {
-    struct r700_vertex_program_cont   * vpc;
-    struct r700_vertex_program *vp, *tmp;
+    struct r700_vertex_program_cont *vpc = (struct r700_vertex_program_cont *)prog;
     struct r700_fragment_program * fp;
 
 	radeon_print(RADEON_SHADER, RADEON_VERBOSE,
@@ -95,20 +112,7 @@ static void r700DeleteProgram(GLcontext * ctx, struct gl_program *prog)
     {
     case GL_VERTEX_STATE_PROGRAM_NV:
     case GL_VERTEX_PROGRAM_ARB:	    
-        vpc = (struct r700_vertex_program_cont*)prog;
-        vp = vpc->progs;
-	while (vp) {
-		tmp = vp->next;
-		/* Release DMA region */
-	 
-	        r600DeleteShader(ctx, vp->shaderbo);
-
-	        /* Clean up */
-	        Clean_Up_Assembler(&(vp->r700AsmCode));
-	        Clean_Up_Shader(&(vp->r700Shader));
-		_mesa_free(vp);
-		vp = tmp;
-	}
+	    freeVertProgCache(ctx, vpc);
 	    break;
     case GL_FRAGMENT_PROGRAM_NV:
     case GL_FRAGMENT_PROGRAM_ARB:
@@ -131,7 +135,24 @@ static void r700DeleteProgram(GLcontext * ctx, struct gl_program *prog)
 static void
 r700ProgramStringNotify(GLcontext * ctx, GLenum target, struct gl_program *prog)
 {
-
+	struct r700_vertex_program_cont *vpc = (struct r700_vertex_program_cont *)prog;
+	struct r700_fragment_program * fp = (struct r700_fragment_program*)prog;
+
+	switch (target) {
+	case GL_VERTEX_PROGRAM_ARB:
+		freeVertProgCache(ctx, vpc);
+		vpc->progs = NULL;
+		break;
+	case GL_FRAGMENT_PROGRAM_ARB:
+		r600DeleteShader(ctx, fp->shaderbo);
+		Clean_Up_Assembler(&(fp->r700AsmCode));
+		Clean_Up_Shader(&(fp->r700Shader));
+		fp->translated = GL_FALSE;
+		fp->loaded     = GL_FALSE;
+		fp->shaderbo   = NULL;
+		break;
+	}
+		
 }
 
 static GLboolean r700IsProgramNative(GLcontext * ctx, GLenum target, struct gl_program *prog)
diff --git a/r600/r700_render.c b/r600/r700_render.c
index b1c3648..47f89c9 100644
--- a/r600/r700_render.c
+++ b/r600/r700_render.c
@@ -43,6 +43,7 @@
 #include "tnl/t_context.h"
 #include "tnl/t_vertex.h"
 #include "tnl/t_pipeline.h"
+#include "vbo/vbo_context.h"
 
 #include "r600_context.h"
 #include "r600_cmdbuf.h"
@@ -53,13 +54,12 @@
 #include "r700_fragprog.h"
 #include "r700_state.h"
 
+#include "radeon_buffer_objects.h"
 #include "radeon_common_context.h"
 
 void r700WaitForIdle(context_t *context);
 void r700WaitForIdleClean(context_t *context);
-GLboolean r700SendTextureState(context_t *context);
 static unsigned int r700PrimitiveType(int prim);
-void r600UpdateTextureState(GLcontext * ctx);
 GLboolean r700SyncSurf(context_t *context,
 		       struct radeon_bo *pbo,
 		       uint32_t read_domain,
@@ -249,113 +249,635 @@ static int r700NumVerts(int num_verts, int prim)
 
 static void r700RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim)
 {
-	context_t *context = R700_CONTEXT(ctx);
-	BATCH_LOCALS(&context->radeon);
-	int type, i, total_emit;
-	int num_indices;
-	uint32_t vgt_draw_initiator = 0;
-	uint32_t vgt_index_type     = 0;
-	uint32_t vgt_primitive_type = 0;
-	uint32_t vgt_num_indices    = 0;
-	TNLcontext *tnl = TNL_CONTEXT(ctx);
-	struct vertex_buffer *vb = &tnl->vb;
-
-	type = r700PrimitiveType(prim);
-	num_indices = r700NumVerts(end - start, prim);
-
-	radeon_print(RADEON_RENDER, RADEON_TRACE,
-		"%s type %x num_indices %d\n",
-		__func__, type, num_indices);
-
-	if (type < 0 || num_indices <= 0)
-		return;
+    context_t *context = R700_CONTEXT(ctx);
+    BATCH_LOCALS(&context->radeon);
+    int type, total_emit;
+    int num_indices;
+    uint32_t vgt_draw_initiator = 0;
+    uint32_t vgt_index_type     = 0;
+    uint32_t vgt_primitive_type = 0;
+    uint32_t vgt_num_indices    = 0;
+
+    type = r700PrimitiveType(prim);
+    num_indices = r700NumVerts(end - start, prim);
+
+    radeon_print(RADEON_RENDER, RADEON_TRACE,
+		 "%s type %x num_indices %d\n",
+		 __func__, type, num_indices);
+
+    if (type < 0 || num_indices <= 0)
+	    return;
+
+    SETfield(vgt_primitive_type, type,
+	     VGT_PRIMITIVE_TYPE__PRIM_TYPE_shift, VGT_PRIMITIVE_TYPE__PRIM_TYPE_mask);
+
+    SETfield(vgt_index_type, DI_INDEX_SIZE_32_BIT, INDEX_TYPE_shift, INDEX_TYPE_mask);
+
+    if(GL_TRUE != context->ind_buf.is_32bit)
+    {
+            SETfield(vgt_index_type, DI_INDEX_SIZE_16_BIT, INDEX_TYPE_shift, INDEX_TYPE_mask);
+    }
+
+    vgt_num_indices = num_indices;
+    SETfield(vgt_draw_initiator, DI_SRC_SEL_DMA, SOURCE_SELECT_shift, SOURCE_SELECT_mask);
+    SETfield(vgt_draw_initiator, DI_MAJOR_MODE_0, MAJOR_MODE_shift, MAJOR_MODE_mask);
+
+    total_emit =   3  /* VGT_PRIMITIVE_TYPE */
+	         + 2  /* VGT_INDEX_TYPE */
+	         + 2  /* NUM_INSTANCES */
+	         + 5 + 2; /* DRAW_INDEX */
+
+    BEGIN_BATCH_NO_AUTOSTATE(total_emit);
+    // prim
+    R600_OUT_BATCH_REGSEQ(VGT_PRIMITIVE_TYPE, 1);
+    R600_OUT_BATCH(vgt_primitive_type);
+    // index type
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_INDEX_TYPE, 0));
+    R600_OUT_BATCH(vgt_index_type);
+    // num instances
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_NUM_INSTANCES, 0));
+    R600_OUT_BATCH(1);
+    // draw packet
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_DRAW_INDEX, 3));
+    R600_OUT_BATCH(context->ind_buf.bo_offset);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(vgt_num_indices);
+    R600_OUT_BATCH(vgt_draw_initiator);
+    R600_OUT_BATCH_RELOC(context->ind_buf.bo_offset,
+			 context->ind_buf.bo,
+			 context->ind_buf.bo_offset,
+			 RADEON_GEM_DOMAIN_GTT, 0, 0);
+    END_BATCH();
+    COMMIT_BATCH();
+}
+
+static void r700RunRenderPrimitiveImmediate(GLcontext * ctx, int start, int end, int prim)
+{
+    context_t *context = R700_CONTEXT(ctx);
+    BATCH_LOCALS(&context->radeon);
+    int type, i;
+    uint32_t num_indices, total_emit = 0;
+    uint32_t vgt_draw_initiator = 0;
+    uint32_t vgt_index_type     = 0;
+    uint32_t vgt_primitive_type = 0;
+    uint32_t vgt_num_indices    = 0;
+
+    type = r700PrimitiveType(prim);
+    num_indices = r700NumVerts(end - start, prim);
+
+    radeon_print(RADEON_RENDER, RADEON_TRACE,
+		 "%s type %x num_indices %d\n",
+		 __func__, type, num_indices);
+
+    if (type < 0 || num_indices <= 0)
+	    return;
 
-        total_emit =   3 /* VGT_PRIMITIVE_TYPE */
-		     + 2 /* VGT_INDEX_TYPE */
-		     + 2 /* NUM_INSTANCES */
-                     + num_indices + 3; /* DRAW_INDEX_IMMD */
-
-        BEGIN_BATCH_NO_AUTOSTATE(total_emit);
-	// prim
-        SETfield(vgt_primitive_type, type,
-		 VGT_PRIMITIVE_TYPE__PRIM_TYPE_shift, VGT_PRIMITIVE_TYPE__PRIM_TYPE_mask);
-        R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CONFIG_REG, 1));
-        R600_OUT_BATCH(mmVGT_PRIMITIVE_TYPE - ASIC_CONFIG_BASE_INDEX);
-        R600_OUT_BATCH(vgt_primitive_type);
-
-	// index type
-        SETfield(vgt_index_type, DI_INDEX_SIZE_32_BIT, INDEX_TYPE_shift, INDEX_TYPE_mask);
-        R600_OUT_BATCH(CP_PACKET3(R600_IT_INDEX_TYPE, 0));
-        R600_OUT_BATCH(vgt_index_type);
-
-	// num instances
-	R600_OUT_BATCH(CP_PACKET3(R600_IT_NUM_INSTANCES, 0));
-        R600_OUT_BATCH(1);
-
-	// draw packet
-        vgt_num_indices = num_indices;
-        SETfield(vgt_draw_initiator, DI_SRC_SEL_IMMEDIATE, SOURCE_SELECT_shift, SOURCE_SELECT_mask);
-	SETfield(vgt_draw_initiator, DI_MAJOR_MODE_0, MAJOR_MODE_shift, MAJOR_MODE_mask);
-
-        R600_OUT_BATCH(CP_PACKET3(R600_IT_DRAW_INDEX_IMMD, (num_indices + 1)));
+    SETfield(vgt_primitive_type, type,
+	     VGT_PRIMITIVE_TYPE__PRIM_TYPE_shift, VGT_PRIMITIVE_TYPE__PRIM_TYPE_mask);
+
+    if (num_indices > 0xffff)
+    {
+	    SETfield(vgt_index_type, DI_INDEX_SIZE_32_BIT, INDEX_TYPE_shift, INDEX_TYPE_mask);
+    }
+    else
+    {
+            SETfield(vgt_index_type, DI_INDEX_SIZE_16_BIT, INDEX_TYPE_shift, INDEX_TYPE_mask);
+    }
+
+    vgt_num_indices = num_indices;
+    SETfield(vgt_draw_initiator, DI_MAJOR_MODE_0, MAJOR_MODE_shift, MAJOR_MODE_mask);
+
+    if (start == 0)
+    {
+	SETfield(vgt_draw_initiator, DI_SRC_SEL_AUTO_INDEX, SOURCE_SELECT_shift, SOURCE_SELECT_mask);
+    }
+    else
+    {
+	if (num_indices > 0xffff)
+	{
+		total_emit += num_indices;
+	}
+	else
+	{
+		total_emit += (num_indices + 1) / 2;
+	}
+	SETfield(vgt_draw_initiator, DI_SRC_SEL_IMMEDIATE, SOURCE_SELECT_shift, SOURCE_SELECT_mask);
+    }
+
+    total_emit +=   3 /* VGT_PRIMITIVE_TYPE */
+	          + 2 /* VGT_INDEX_TYPE */
+	          + 2 /* NUM_INSTANCES */
+	          + 3; /* DRAW */
+
+    BEGIN_BATCH_NO_AUTOSTATE(total_emit);
+    // prim
+    R600_OUT_BATCH_REGSEQ(VGT_PRIMITIVE_TYPE, 1);
+    R600_OUT_BATCH(vgt_primitive_type);
+    // index type
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_INDEX_TYPE, 0));
+    R600_OUT_BATCH(vgt_index_type);
+    // num instances
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_NUM_INSTANCES, 0));
+    R600_OUT_BATCH(1);
+    // draw packet
+    if(start == 0)
+    {
+        R600_OUT_BATCH(CP_PACKET3(R600_IT_DRAW_INDEX_AUTO, 1));
         R600_OUT_BATCH(vgt_num_indices);
         R600_OUT_BATCH(vgt_draw_initiator);
-
-        for (i = start; i < (start + num_indices); i++) {
-		if(vb->Elts)
-			R600_OUT_BATCH(vb->Elts[i]);
+    }
+    else
+    {
+	if (num_indices > 0xffff)
+        {
+	    R600_OUT_BATCH(CP_PACKET3(R600_IT_DRAW_INDEX_IMMD, (num_indices + 1)));
+	    R600_OUT_BATCH(vgt_num_indices);
+	    R600_OUT_BATCH(vgt_draw_initiator);
+	    for (i = start; i < (start + num_indices); i++)
+	    {
+		R600_OUT_BATCH(i);
+	    }
+	}
+	else
+        {
+	    R600_OUT_BATCH(CP_PACKET3(R600_IT_DRAW_INDEX_IMMD, (((num_indices + 1) / 2) + 1)));
+	    R600_OUT_BATCH(vgt_num_indices);
+	    R600_OUT_BATCH(vgt_draw_initiator);
+	    for (i = start; i < (start + num_indices); i += 2)
+	    {
+		if ((i + 1) == (start + num_indices))
+		{
+		    R600_OUT_BATCH(i);
+		}
 		else
-			R600_OUT_BATCH(i);
-        }
-        END_BATCH();
-        COMMIT_BATCH();
+		{
+		    R600_OUT_BATCH(((i + 1) << 16) | (i));
+		}
+	    }
+	}
+    }
 
+    END_BATCH();
+    COMMIT_BATCH();
 }
 
 /* start 3d, idle, cb/db flush */
 #define PRE_EMIT_STATE_BUFSZ 10 + 5 + 14
 
-static GLuint r700PredictRenderSize(GLcontext* ctx)
+static GLuint r700PredictRenderSize(GLcontext* ctx,
+				    const struct _mesa_prim *prim,
+				    const struct _mesa_index_buffer *ib,
+				    GLuint nr_prims)
 {
     context_t *context = R700_CONTEXT(ctx);
-    TNLcontext *tnl = TNL_CONTEXT(ctx);
-    struct r700_vertex_program *vp = context->selected_vp;
-    struct vertex_buffer *vb = &tnl->vb;
     GLboolean flushed;
     GLuint dwords, i;
     GLuint state_size;
-    /* pre calculate aos count so state prediction works */
-    context->radeon.tcl.aos_count = _mesa_bitcount(vp->mesa_program->Base.InputsRead);
 
     dwords = PRE_EMIT_STATE_BUFSZ;
-    for (i = 0; i < vb->PrimitiveCount; i++)
-        dwords += vb->Primitive[i].count + 10;
+    if (ib)
+	    dwords += nr_prims * 14;
+    else {
+	    for (i = 0; i < nr_prims; ++i)
+	    {
+		    if (prim[i].start == 0)
+			    dwords += 10;
+		    else if (prim[i].count > 0xffff)
+			    dwords += prim[i].count + 10;
+		    else
+			    dwords += ((prim[i].count + 1) / 2) + 10;
+	    }
+    }
+
     state_size = radeonCountStateEmitSize(&context->radeon);
     flushed = rcommonEnsureCmdBufSpace(&context->radeon,
-            dwords + state_size, __FUNCTION__);
-
+				       dwords + state_size,
+				       __FUNCTION__);
     if (flushed)
-        dwords += radeonCountStateEmitSize(&context->radeon);
+	    dwords += radeonCountStateEmitSize(&context->radeon);
     else
-        dwords += state_size;
+	    dwords += state_size;
 
-    radeon_print(RADEON_RENDER, RADEON_VERBOSE,
-	"%s: total prediction size is %d.\n", __FUNCTION__, dwords);
+    radeon_print(RADEON_RENDER, RADEON_VERBOSE, "%s: total prediction size is %d.\n", __FUNCTION__, dwords);
     return dwords;
+
+}
+
+#define CONVERT( TYPE, MACRO ) do {		\
+	GLuint i, j, sz;				\
+	sz = input->Size;				\
+	if (input->Normalized) {			\
+		for (i = 0; i < count; i++) {		\
+			const TYPE *in = (TYPE *)src_ptr;		\
+			for (j = 0; j < sz; j++) {		\
+				*dst_ptr++ = MACRO(*in);		\
+				in++;				\
+			}					\
+			src_ptr += stride;			\
+		}						\
+	} else {					\
+		for (i = 0; i < count; i++) {		\
+			const TYPE *in = (TYPE *)src_ptr;		\
+			for (j = 0; j < sz; j++) {		\
+				*dst_ptr++ = (GLfloat)(*in);		\
+				in++;				\
+			}					\
+			src_ptr += stride;			\
+		}						\
+	}						\
+} while (0)
+
+/**
+ * Convert attribute data type to float
+ * If the attribute uses named buffer object replace the bo with newly allocated bo
+ */
+static void r700ConvertAttrib(GLcontext *ctx, int count, 
+                              const struct gl_client_array *input, 
+                              struct StreamDesc *attr)
+{
+    context_t *context = R700_CONTEXT(ctx);
+    const GLvoid *src_ptr;
+    GLboolean mapped_named_bo = GL_FALSE;
+    GLfloat *dst_ptr;
+    GLuint stride;
+
+    stride = (input->StrideB == 0) ? getTypeSize(input->Type) * input->Size : input->StrideB;
+
+    /* Convert value for first element only */
+    if (input->StrideB == 0)
+    {
+        count = 1;
+    }
+
+    if (input->BufferObj->Name) 
+    {
+        if (!input->BufferObj->Pointer) 
+        {
+            ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER, GL_READ_ONLY_ARB, input->BufferObj);
+            mapped_named_bo = GL_TRUE;
+        }
+
+        src_ptr = ADD_POINTERS(input->BufferObj->Pointer, input->Ptr);
+    } 
+    else 
+    {
+        src_ptr = input->Ptr;
+    }
+
+    radeonAllocDmaRegion(&context->radeon, &attr->bo, &attr->bo_offset, 
+                         sizeof(GLfloat) * input->Size * count, 32);
+    dst_ptr = (GLfloat *)ADD_POINTERS(attr->bo->ptr, attr->bo_offset);
+
+    assert(src_ptr != NULL);
+
+    switch (input->Type) 
+    {
+        case GL_DOUBLE:
+            CONVERT(GLdouble, (GLfloat));
+            break;
+        case GL_UNSIGNED_INT:
+            CONVERT(GLuint, UINT_TO_FLOAT);
+            break;
+        case GL_INT:
+            CONVERT(GLint, INT_TO_FLOAT);
+            break;
+        case GL_UNSIGNED_SHORT:
+            CONVERT(GLushort, USHORT_TO_FLOAT);
+            break;
+        case GL_SHORT:
+            CONVERT(GLshort, SHORT_TO_FLOAT);
+            break;
+        case GL_UNSIGNED_BYTE:
+            assert(input->Format != GL_BGRA);
+            CONVERT(GLubyte, UBYTE_TO_FLOAT);
+            break;
+        case GL_BYTE:
+            CONVERT(GLbyte, BYTE_TO_FLOAT);
+            break;
+        default:
+            assert(0);
+            break;
+    }
+
+    if (mapped_named_bo) 
+    {
+        ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER, input->BufferObj);
+    }
+}
+
+static void r700AlignDataToDword(GLcontext *ctx, 
+                                 const struct gl_client_array *input, 
+                                 int count, 
+                                 struct StreamDesc *attr)
+{
+    context_t *context = R700_CONTEXT(ctx);
+    const int dst_stride = (input->StrideB + 3) & ~3;
+    const int size = dst_stride * count; /* each element occupies a full padded slot */
+    GLboolean mapped_named_bo = GL_FALSE;
+    /* Repack a buffer-object attribute into a DMA region whose stride is a multiple of 4 bytes. */
+    radeonAllocDmaRegion(&context->radeon, &attr->bo, &attr->bo_offset, size, 32);
+    /* Map the source buffer object only if it is not mapped already. */
+    if (!input->BufferObj->Pointer) 
+    {
+        ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER, GL_READ_ONLY_ARB, input->BufferObj);
+        mapped_named_bo = GL_TRUE;
+    }
+    /* Copy per element: the source advances by StrideB, the destination by the padded stride. */
+    {
+        const GLubyte *src_ptr = ADD_POINTERS(input->BufferObj->Pointer, input->Ptr);
+        GLubyte *dst_ptr = ADD_POINTERS(attr->bo->ptr, attr->bo_offset);
+        int i;
+
+        for (i = 0; i < count; ++i) 
+        {
+            _mesa_memcpy(dst_ptr, src_ptr, input->StrideB);
+            src_ptr += input->StrideB;
+            dst_ptr += dst_stride;
+        }
+    }
+    /* Undo the mapping only if this function created it. */
+    if (mapped_named_bo) 
+    {
+        ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER, input->BufferObj);
+    }
+    /* Record the padded stride so the stream is described with the layout of the copy. */
+    attr->stride = dst_stride;
+}
+
+static void r700SetupStreams(GLcontext *ctx, const struct gl_client_array *input[], int count)
+{
+	context_t *context = R700_CONTEXT(ctx);
+    GLuint stride;
+    int ret;
+    int i, index;
+
+    R600_STATECHANGE(context, vtx);
+
+    for(index = 0; index < context->nNumActiveAos; index++) 
+    {
+        struct radeon_aos *aos = &context->radeon.tcl.aos[index];
+        i = context->stream_desc[index].element;
+
+        stride = (input[i]->StrideB == 0) ? getTypeSize(input[i]->Type) * input[i]->Size : input[i]->StrideB;
+
+        if (input[i]->Type == GL_DOUBLE || input[i]->Type == GL_UNSIGNED_INT || input[i]->Type == GL_INT ||
+#if MESA_BIG_ENDIAN
+            getTypeSize(input[i]->Type) != 4 || 
+#endif
+            stride < 4) 
+        {
+            r700ConvertAttrib(ctx, count, input[i], &context->stream_desc[index]);
+        } 
+        else 
+        {
+            if (input[i]->BufferObj->Name) 
+            {
+                if (stride % 4 != 0) 
+                {
+                    assert(((intptr_t) input[i]->Ptr) % input[i]->StrideB == 0);
+                    r700AlignDataToDword(ctx, input[i], count, &context->stream_desc[index]);
+                    context->stream_desc[index].is_named_bo = GL_FALSE;
+                } 
+                else 
+                {
+                    context->stream_desc[index].stride = input[i]->StrideB;
+                    context->stream_desc[index].bo_offset = (intptr_t) input[i]->Ptr;
+                    context->stream_desc[index].bo = get_radeon_buffer_object(input[i]->BufferObj)->bo;
+                    context->stream_desc[index].is_named_bo = GL_TRUE;
+                }
+            } 
+            else 
+            {
+                int size;
+                int local_count = count;
+                uint32_t *dst;
+
+                if (input[i]->StrideB == 0) 
+                {
+                    size = getTypeSize(input[i]->Type) * input[i]->Size;
+                    local_count = 1;
+                } 
+                else 
+                {
+                    size = getTypeSize(input[i]->Type) * input[i]->Size * local_count;
+                }
+
+                radeonAllocDmaRegion(&context->radeon, &context->stream_desc[index].bo, 
+                                     &context->stream_desc[index].bo_offset, size, 32);
+                assert(context->stream_desc[index].bo->ptr != NULL);
+                dst = (uint32_t *)ADD_POINTERS(context->stream_desc[index].bo->ptr, 
+                                               context->stream_desc[index].bo_offset);
+
+                switch (context->stream_desc[index].dwords) 
+                {
+                case 1:                     
+                    radeonEmitVec4(dst, input[i]->Ptr, input[i]->StrideB, local_count);                         
+                    break;
+                case 2: 
+                    radeonEmitVec8(dst, input[i]->Ptr, input[i]->StrideB, local_count); 
+                    break;
+                case 3: 
+                    radeonEmitVec12(dst, input[i]->Ptr, input[i]->StrideB, local_count); 
+                    break;
+                case 4: 
+                    radeonEmitVec16(dst, input[i]->Ptr, input[i]->StrideB, local_count); 
+                    break;
+                default: 
+                    assert(0); 
+                    break;
+                }
+            }
+        }
+
+        aos->count = context->stream_desc[index].stride == 0 ? 1 : count;
+        aos->stride = context->stream_desc[index].stride / sizeof(float);
+        aos->components = context->stream_desc[index].dwords;
+        aos->bo = context->stream_desc[index].bo;
+        aos->offset = context->stream_desc[index].bo_offset;
+
+        if(context->stream_desc[index].is_named_bo) 
+        {
+            radeon_cs_space_add_persistent_bo(context->radeon.cmdbuf.cs, 
+                                              context->stream_desc[index].bo, 
+                                              RADEON_GEM_DOMAIN_GTT, 0);
+        }
+    }
+
+    ret = radeon_cs_space_check_with_bo(context->radeon.cmdbuf.cs, 
+                                        first_elem(&context->radeon.dma.reserved)->bo, 
+                                        RADEON_GEM_DOMAIN_GTT, 0);    
+}
+
+static void r700FreeData(GLcontext *ctx)
+{
+    /* Release the buffers referenced by the active streams and the index buffer.
+     * tcl.aos[n].bo is zeroed to prevent a double unref in radeonReleaseArrays
+     * called during context destroy (tcl.elt_dma_bo is not touched here).
+     */
+    context_t *context = R700_CONTEXT(ctx);
+
+    int i;
+
+    for (i = 0; i < context->nNumActiveAos; i++)
+    {
+        if (!context->stream_desc[i].is_named_bo) /* named-bo streams are not unreferenced here */
+        {
+	        radeon_bo_unref(context->stream_desc[i].bo);
+        }
+        context->radeon.tcl.aos[i].bo = NULL;
+    }
+    /* NOTE(review): ind_buf.bo keeps its old value after the unref below - confirm it is always reset before reuse. */
+    if (context->ind_buf.bo != NULL)
+    {
+            radeon_bo_unref(context->ind_buf.bo);
+    }
+}
+
+static void r700FixupIndexBuffer(GLcontext *ctx, const struct _mesa_index_buffer *mesa_ind_buf)
+{
+    context_t *context = R700_CONTEXT(ctx);
+    GLvoid *src_ptr;
+    GLuint *out;
+    int i;
+    GLboolean mapped_named_bo = GL_FALSE;
+
+    if (mesa_ind_buf->obj->Name && !mesa_ind_buf->obj->Pointer)
+    {
+        ctx->Driver.MapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, GL_READ_ONLY_ARB, mesa_ind_buf->obj);
+        mapped_named_bo = GL_TRUE;
+        assert(mesa_ind_buf->obj->Pointer != NULL);
+    }
+    src_ptr = ADD_POINTERS(mesa_ind_buf->obj->Pointer, mesa_ind_buf->ptr);
+
+    if (mesa_ind_buf->type == GL_UNSIGNED_BYTE)
+    {
+        GLuint size = sizeof(GLushort) * ((mesa_ind_buf->count + 1) & ~1);
+        GLubyte *in = (GLubyte *)src_ptr;
+
+	radeonAllocDmaRegion(&context->radeon, &context->ind_buf.bo,
+			     &context->ind_buf.bo_offset, size, 4);
+
+	assert(context->ind_buf.bo->ptr != NULL);
+	out = (GLuint *)ADD_POINTERS(context->ind_buf.bo->ptr, context->ind_buf.bo_offset);
+
+        for (i = 0; i + 1 < mesa_ind_buf->count; i += 2)
+        {
+            *out++ = in[i] | in[i + 1] << 16;
+        }
+
+        if (i < mesa_ind_buf->count)
+        {
+            *out++ = in[i];
+        }
+
+#if MESA_BIG_ENDIAN
+    }
+    else
+    { /* if (mesa_ind_buf->type == GL_UNSIGNED_SHORT) */
+        GLushort *in = (GLushort *)src_ptr;
+        GLuint size = sizeof(GLushort) * ((mesa_ind_buf->count + 1) & ~1);
+
+	radeonAllocDmaRegion(&context->radeon, &context->ind_buf.bo,
+			     &context->ind_buf.bo_offset, size, 4);
+
+	assert(context->ind_buf.bo->ptr != NULL);
+	out = (GLuint *)ADD_POINTERS(context->ind_buf.bo->ptr, context->ind_buf.bo_offset);
+
+        for (i = 0; i + 1 < mesa_ind_buf->count; i += 2)
+        {
+            *out++ = in[i] | in[i + 1] << 16;
+        }
+
+        if (i < mesa_ind_buf->count)
+        {
+            *out++ = in[i];
+        }
+#endif
+    }
+
+    context->ind_buf.is_32bit = GL_FALSE;
+    context->ind_buf.count = mesa_ind_buf->count;
+
+    if (mapped_named_bo)
+    {
+        ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, mesa_ind_buf->obj);
+    }
+}
+
+static void r700SetupIndexBuffer(GLcontext *ctx, const struct _mesa_index_buffer *mesa_ind_buf)
+{
+    context_t *context = R700_CONTEXT(ctx);
+
+    if (!mesa_ind_buf) {
+        context->ind_buf.bo = NULL;
+        return;
+    }
+
+#if MESA_BIG_ENDIAN
+    if (mesa_ind_buf->type == GL_UNSIGNED_INT)
+    {
+#else
+    if (mesa_ind_buf->type != GL_UNSIGNED_BYTE)
+    {
+#endif
+        const GLvoid *src_ptr;
+        GLvoid *dst_ptr;
+        GLboolean mapped_named_bo = GL_FALSE;
+
+        if (mesa_ind_buf->obj->Name && !mesa_ind_buf->obj->Pointer)
+        {
+	        ctx->Driver.MapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, GL_READ_ONLY_ARB, mesa_ind_buf->obj);
+	        assert(mesa_ind_buf->obj->Pointer != NULL);
+	        mapped_named_bo = GL_TRUE;
+        }
+
+        src_ptr = ADD_POINTERS(mesa_ind_buf->obj->Pointer, mesa_ind_buf->ptr);
+
+        const GLuint size = mesa_ind_buf->count * getTypeSize(mesa_ind_buf->type);
+
+	radeonAllocDmaRegion(&context->radeon, &context->ind_buf.bo,
+			     &context->ind_buf.bo_offset, size, 4);
+	assert(context->ind_buf.bo->ptr != NULL);
+	dst_ptr = ADD_POINTERS(context->ind_buf.bo->ptr, context->ind_buf.bo_offset);
+
+        _mesa_memcpy(dst_ptr, src_ptr, size);
+
+        context->ind_buf.is_32bit = (mesa_ind_buf->type == GL_UNSIGNED_INT);
+        context->ind_buf.count = mesa_ind_buf->count;
+
+        if (mapped_named_bo)
+        {
+	        ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, mesa_ind_buf->obj);
+        }
+    }
+    else
+    {
+	    r700FixupIndexBuffer(ctx, mesa_ind_buf);
+    }
 }
 
-static GLboolean r700RunRender(GLcontext * ctx,
-			       struct tnl_pipeline_stage *stage)
+static GLboolean r700TryDrawPrims(GLcontext *ctx,
+				  const struct gl_client_array *arrays[],
+				  const struct _mesa_prim *prim,
+				  GLuint nr_prims,
+				  const struct _mesa_index_buffer *ib,
+				  GLuint min_index,
+				  GLuint max_index )
 {
     context_t *context = R700_CONTEXT(ctx);
     radeonContextPtr radeon = &context->radeon;
-    unsigned int i, id = 0;
-    TNLcontext *tnl = TNL_CONTEXT(ctx);
-    struct vertex_buffer *vb = &tnl->vb;
+    GLuint i, id = 0;
     struct radeon_renderbuffer *rrb;
 
-    radeon_print(RADEON_RENDER, RADEON_NORMAL, "%s: cs begin at %d\n",
-                __func__, context->radeon.cmdbuf.cs->cdw);
+    if (ctx->NewState)
+        _mesa_update_state( ctx );
+
+    _tnl_UpdateFixedFunctionProgram(ctx);
+    r700SetVertexFormat(ctx, arrays, max_index + 1);
+    /* shaders need to be updated before buffers are validated */
+    r700UpdateShaders(ctx);
+    if (!r600ValidateBuffers(ctx))
+	    return GL_FALSE;
 
     /* always emit CB base to prevent
      * lock ups on some chips.
@@ -367,21 +889,29 @@ static GLboolean r700RunRender(GLcontext * ctx,
     r700SetScissor(context);
     r700SetupVertexProgram(ctx);
     r700SetupFragmentProgram(ctx);
-    r600UpdateTextureState(ctx);
+    r700UpdateShaderStates(ctx);
 
-    GLuint emit_end = r700PredictRenderSize(ctx) 
-        + context->radeon.cmdbuf.cs->cdw;
-    r700SetupStreams(ctx);
+    GLuint emit_end = r700PredictRenderSize(ctx, prim, ib, nr_prims)
+                    + context->radeon.cmdbuf.cs->cdw;
+
+    r700SetupIndexBuffer(ctx, ib);
+    r700SetupStreams(ctx, arrays, max_index + 1);
 
     radeonEmitState(radeon);
 
     radeon_debug_add_indent();
-    /* richard test code */
-    for (i = 0; i < vb->PrimitiveCount; i++) {
-        GLuint prim = _tnl_translate_prim(&vb->Primitive[i]);
-        GLuint start = vb->Primitive[i].start;
-        GLuint end = vb->Primitive[i].start + vb->Primitive[i].count;
-        r700RunRenderPrimitive(ctx, start, end, prim);
+    for (i = 0; i < nr_prims; ++i)
+    {
+	    if (context->ind_buf.bo)
+		    r700RunRenderPrimitive(ctx,
+					   prim[i].start,
+					   prim[i].start + prim[i].count,
+					   prim[i].mode);
+	    else
+		    r700RunRenderPrimitiveImmediate(ctx,
+						    prim[i].start,
+						    prim[i].start + prim[i].count,
+						    prim[i].mode);
     }
     radeon_debug_remove_indent();
 
@@ -398,83 +928,54 @@ static GLboolean r700RunRender(GLcontext * ctx,
 	    r700SyncSurf(context, rrb->bo, 0, RADEON_GEM_DOMAIN_VRAM,
 			 DB_ACTION_ENA_bit | DB_DEST_BASE_ENA_bit);
 
-    radeonReleaseArrays(ctx, ~0);
-
-    radeon_print(RADEON_RENDER, RADEON_TRACE, "%s: cs end at %d\n",
-                __func__, context->radeon.cmdbuf.cs->cdw);
+    r700FreeData(ctx);
 
-    if ( emit_end < context->radeon.cmdbuf.cs->cdw )
-       WARN_ONCE("Rendering was %d commands larger than predicted size."
-	       " We might overflow  command buffer.\n", context->radeon.cmdbuf.cs->cdw - emit_end);
-
-    return GL_FALSE;
-}
+    if (emit_end < context->radeon.cmdbuf.cs->cdw)
+    {
+        WARN_ONCE("Rendering was %d commands larger than predicted size."
+            " We might overflow  command buffer.\n", context->radeon.cmdbuf.cs->cdw - emit_end);
+    }
 
-static GLboolean r700RunNonTCLRender(GLcontext * ctx,
-				     struct tnl_pipeline_stage *stage) /* -------------------- */
-{
-	GLboolean bRet = GL_TRUE;
-	
-	return bRet;
+    return GL_TRUE;
 }
 
-static GLboolean r700RunTCLRender(GLcontext * ctx,  /*----------------------*/
-				  struct tnl_pipeline_stage *stage)
+static void r700DrawPrims(GLcontext *ctx,
+			  const struct gl_client_array *arrays[],
+			  const struct _mesa_prim *prim,
+			  GLuint nr_prims,
+			  const struct _mesa_index_buffer *ib,
+			  GLboolean index_bounds_valid,
+			  GLuint min_index,
+			  GLuint max_index)
 {
-	GLboolean bRet = GL_FALSE;
+	GLboolean retval = GL_FALSE;
 
-    /* TODO : sw fallback */
-
-    /* Need shader bo's setup before bo check */
-    r700UpdateShaders(ctx);
-    /**
+	/* This check should get folded into just the places that
+	 * min/max index are really needed.
+	 */
+	if (!index_bounds_valid) {
+		vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index);
+	}
 
-    * Ensure all enabled and complete textures are uploaded along with any buffers being used.
-    */
-    if(!r600ValidateBuffers(ctx))
-    {
-        return GL_TRUE;
-    }
+	if (min_index) {
+		vbo_rebase_prims( ctx, arrays, prim, nr_prims, ib, min_index, max_index, r700DrawPrims );
+		return;
+	}
 
-    bRet = r700RunRender(ctx, stage);
+	/* Make an attempt at drawing */
+	retval = r700TryDrawPrims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
 
-    return bRet;
-	//GL_FALSE will stop to do other pipe stage in _tnl_run_pipeline
-    //The render here DOES finish the whole pipe, so GL_FALSE should be returned for success.
+	/* If failed run tnl pipeline - it should take care of fallbacks */
+	if (!retval)
+		_tnl_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
 }
 
-const struct tnl_pipeline_stage _r700_render_stage = {
-	"r700 Hardware Rasterization",
-	NULL,
-	NULL,
-	NULL,
-	NULL,
-	r700RunNonTCLRender
-};
-
-const struct tnl_pipeline_stage _r700_tcl_stage = {
-	"r700 Hardware Transform, Clipping and Lighting",
-	NULL,
-	NULL,
-	NULL,
-	NULL,
-	r700RunTCLRender
-};
-
-const struct tnl_pipeline_stage *r700_pipeline[] = 
+void r700InitDraw(GLcontext *ctx)
 {
-    &_r700_tcl_stage,
-    &_tnl_vertex_transform_stage,
-	&_tnl_normal_transform_stage,
-	&_tnl_lighting_stage,
-	&_tnl_fog_coordinate_stage,
-	&_tnl_texgen_stage,
-	&_tnl_texture_transform_stage,
-	&_tnl_vertex_program_stage,
-
-    &_r700_render_stage,
-    &_tnl_render_stage,
-    0,
-};
+	struct vbo_context *vbo = vbo_context(ctx);
+
+	/* to be enabled */
+	vbo->draw_prims = r700DrawPrims;
+}
 
 
diff --git a/r600/r700_shader.c b/r600/r700_shader.c
index b4fd51c..955ea4e 100644
--- a/r600/r700_shader.c
+++ b/r600/r700_shader.c
@@ -60,6 +60,55 @@ void AddInstToList(TypedShaderList * plstCFInstructions, R700ShaderInstruction *
 	plstCFInstructions->uNumOfNode++;
 }
 
+void TakeInstOutFromList(TypedShaderList * plstCFInstructions, R700ShaderInstruction * pInst)
+{
+    GLuint    ulIndex = 0;
+    GLboolean bFound  = GL_FALSE;
+    R700ShaderInstruction * pPrevInst = NULL;
+    R700ShaderInstruction * pCurInst = plstCFInstructions->pHead;
+    /* Unlink pInst from the list and free it; does nothing if pInst is not a member. */
+    /* Walk the list to confirm pInst is present and to find its predecessor. */
+    while(NULL != pCurInst)
+    {
+        if(pCurInst == pInst)
+        {                        
+            bFound  = GL_TRUE;
+            break;
+        }
+        /* pPrevInst trails one node behind; it stays NULL when pInst is the head. */
+        pPrevInst = pCurInst;
+        pCurInst  = pCurInst->pNextInst;
+    }
+    if(GL_TRUE == bFound)
+    {
+        plstCFInstructions->uNumOfNode--;
+        /* Renumber the successors, starting from the index pInst used to hold. */
+        pCurInst = pInst->pNextInst;
+        ulIndex  = pInst->m_uIndex;
+        while(NULL != pCurInst)
+        {
+            pCurInst->m_uIndex = ulIndex;
+            ulIndex++;
+            pCurInst = pCurInst->pNextInst;
+        }
+        /* Patch head, tail and predecessor links so the list skips pInst. */
+        if(plstCFInstructions->pHead == pInst)
+        {
+            plstCFInstructions->pHead = pInst->pNextInst;
+        }
+        if(plstCFInstructions->pTail == pInst)
+        {
+            plstCFInstructions->pTail = pPrevInst;
+        }
+        if(NULL != pPrevInst)
+        {
+            pPrevInst->pNextInst = pInst->pNextInst;
+        }
+        /* The node is released here, so the caller's pInst pointer is dangling afterwards. */
+        FREE(pInst);
+    }
+}
+
 void Init_R700_Shader(R700_Shader * pShader)
 {
 	pShader->Type = R700_SHADER_INVALID;
@@ -488,6 +537,47 @@ void DebugPrint(void)
 {
 }
 
+void cleanup_vfetch_shaderinst(R700_Shader *pShader)
+{
+    R700ShaderInstruction      *pInst;
+    R700ShaderInstruction      *pInstToFree;
+    R700VertexInstruction      *pVTXInst;
+    R700ControlFlowInstruction *pCFInst;
+    /* Remove all vertex-fetch instructions and their linked CF clauses, shrinking the binary size to match. */
+    pInst = pShader->lstVTXInstructions.pHead;
+    while(NULL != pInst)
+    {
+        pVTXInst = (R700VertexInstruction  *)pInst;
+        pShader->uShaderBinaryDWORDSize -= GetInstructionSize(pVTXInst->m_ShaderInstType);
+
+        if(NULL != pVTXInst->m_pLinkedGenericClause)
+        {
+            pCFInst = (R700ControlFlowInstruction*)(pVTXInst->m_pLinkedGenericClause);
+            /* Read the clause's type before removal: TakeInstOutFromList() frees the node. */
+            pShader->uShaderBinaryDWORDSize -= GetInstructionSize(pCFInst->m_ShaderInstType);
+
+            TakeInstOutFromList(&(pShader->lstCFInstructions),
+                                 (R700ShaderInstruction*)pCFInst);
+        }
+
+        pInst = pInst->pNextInst;
+    };
+
+    /* Second pass: free every node of pShader->lstVTXInstructions. */
+    pInst = pShader->lstVTXInstructions.pHead;
+    while(NULL != pInst)
+    {
+        pInstToFree = pInst;
+        pInst = pInst->pNextInst; /* advance before the node is freed */
+        FREE(pInstToFree);
+    };
+
+    /* Leave the vertex-fetch list empty. */
+    pShader->lstVTXInstructions.pHead=NULL;
+    pShader->lstVTXInstructions.pTail=NULL;
+    pShader->lstVTXInstructions.uNumOfNode=0;
+}
+
 void Clean_Up_Shader(R700_Shader *pShader)
 {
     FREE(pShader->pProgram);
diff --git a/r600/r700_shader.h b/r600/r700_shader.h
index bfd01e1..c6a0586 100644
--- a/r600/r700_shader.h
+++ b/r600/r700_shader.h
@@ -128,6 +128,7 @@ typedef struct R700_Shader
 
 //Internal
 void AddInstToList(TypedShaderList * plstCFInstructions, R700ShaderInstruction * pInst);
+void TakeInstOutFromList(TypedShaderList * plstCFInstructions, R700ShaderInstruction * pInst);
 void ResolveLinks(R700_Shader *pShader);
 void Assemble(R700_Shader *pShader);
 
@@ -143,6 +144,7 @@ void LoadProgram(R700_Shader *pShader);
 void UpdateShaderRegisters(R700_Shader *pShader);
 void DeleteInstructions(R700_Shader *pShader);
 void DebugPrint(void);
+void cleanup_vfetch_shaderinst(R700_Shader *pShader);
 
 void Clean_Up_Shader(R700_Shader *pShader);
 
diff --git a/r600/r700_state.c b/r600/r700_state.c
index 124469b..16b05d5 100644
--- a/r600/r700_state.c
+++ b/r600/r700_state.c
@@ -46,7 +46,6 @@
 #include "shader/prog_parameter.h"
 #include "shader/prog_statevars.h"
 #include "vbo/vbo.h"
-#include "main/texformat.h"
 
 #include "r600_context.h"
 
@@ -55,18 +54,15 @@
 #include "r700_fragprog.h"
 #include "r700_vertprog.h"
 
-
+void r600UpdateTextureState(GLcontext * ctx);
 static void r700SetClipPlaneState(GLcontext * ctx, GLenum cap, GLboolean state);
 static void r700UpdatePolygonMode(GLcontext * ctx);
 static void r700SetPolygonOffsetState(GLcontext * ctx, GLboolean state);
 static void r700SetStencilState(GLcontext * ctx, GLboolean state);
 
-void r700UpdateShaders (GLcontext * ctx)  //----------------------------------
+void r700UpdateShaders(GLcontext * ctx)
 {
     context_t *context = R700_CONTEXT(ctx);
-    GLvector4f dummy_attrib[_TNL_ATTRIB_MAX];
-    GLvector4f *temp_attrib[_TNL_ATTRIB_MAX];
-    int i;
 
     /* should only happenen once, just after context is created */
     /* TODO: shouldn't we fallback to sw here? */
@@ -77,21 +73,6 @@ void r700UpdateShaders (GLcontext * ctx)  //----------------------------------
 
     r700SelectFragmentShader(ctx);
 
-    if (context->radeon.NewGLState) {
-	    for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) {
-		    /* mat states from state var not array for sw */
-		    dummy_attrib[i].stride = 0;
-	            temp_attrib[i] = TNL_CONTEXT(ctx)->vb.AttribPtr[i];
-		    TNL_CONTEXT(ctx)->vb.AttribPtr[i] = &(dummy_attrib[i]);
-	    }
-
-	    _tnl_UpdateFixedFunctionProgram(ctx);
-
-	    for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) {
-		    TNL_CONTEXT(ctx)->vb.AttribPtr[i] = temp_attrib[i];
-	    }
-    }
-
     r700SelectVertexShader(ctx);
     r700UpdateStateParameters(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
     context->radeon.NewGLState = 0;
@@ -171,6 +152,14 @@ static void r700InvalidateState(GLcontext * ctx, GLuint new_state) //-----------
 	    R600_STATECHANGE(context, db_target);
     }
 
+    if (new_state & (_NEW_LIGHT)) {
+	    R600_STATECHANGE(context, su);
+	    if (ctx->Light.ProvokingVertex == GL_LAST_VERTEX_CONVENTION)
+		    SETbit(r700->PA_SU_SC_MODE_CNTL.u32All, PROVOKING_VTX_LAST_bit);
+	    else
+		    CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, PROVOKING_VTX_LAST_bit);
+    }
+
     r700UpdateStateParameters(ctx, new_state);
 
     R600_STATECHANGE(context, cl);
@@ -202,6 +191,67 @@ static void r700InvalidateState(GLcontext * ctx, GLuint new_state) //-----------
     context->radeon.NewGLState |= new_state;
 }
 
+static void r700SetDBRenderState(GLcontext * ctx)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	struct r700_fragment_program *fp = (struct r700_fragment_program *)
+		(ctx->FragmentProgram._Current);
+	/* Recompute the DB_SHADER_CONTROL / DB_RENDER_OVERRIDE / DB_RENDER_CONTROL values kept in r700. */
+	R600_STATECHANGE(context, db);
+
+	SETbit(r700->DB_SHADER_CONTROL.u32All, DUAL_EXPORT_ENABLE_bit); /* set unconditionally */
+	SETfield(r700->DB_SHADER_CONTROL.u32All, EARLY_Z_THEN_LATE_Z, Z_ORDER_shift, Z_ORDER_mask);
+	/* XXX need to enable htile for hiz/s */
+	SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIZ_ENABLE_shift, FORCE_HIZ_ENABLE_mask);
+	SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIS_ENABLE0_shift, FORCE_HIS_ENABLE0_mask);
+	SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIS_ENABLE1_shift, FORCE_HIS_ENABLE1_mask);
+
+	if (context->radeon.query.current) /* a query object is current (presumably an occlusion query - confirm) */
+	{
+		SETbit(r700->DB_RENDER_OVERRIDE.u32All, NOOP_CULL_DISABLE_bit);
+		if (context->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV770) /* bit is only touched on RV770 and later */
+		{
+			SETbit(r700->DB_RENDER_CONTROL.u32All, PERFECT_ZPASS_COUNTS_bit);
+		}
+	}
+	else /* no current query: undo both bits */
+	{
+		CLEARbit(r700->DB_RENDER_OVERRIDE.u32All, NOOP_CULL_DISABLE_bit);
+		if (context->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV770)
+		{
+			CLEARbit(r700->DB_RENDER_CONTROL.u32All, PERFECT_ZPASS_COUNTS_bit);
+		}
+	}
+
+	if (fp) /* without a current fragment program KILL_ENABLE / Z_EXPORT_ENABLE keep their previous values */
+	{
+		if (fp->r700Shader.killIsUsed) /* mirror the shader's killIsUsed flag into KILL_ENABLE */
+		{
+			SETbit(r700->DB_SHADER_CONTROL.u32All, KILL_ENABLE_bit);
+		}
+		else
+		{
+			CLEARbit(r700->DB_SHADER_CONTROL.u32All, KILL_ENABLE_bit);
+		}
+
+		if (fp->r700Shader.depthIsExported) /* mirror the shader's depthIsExported flag into Z_EXPORT_ENABLE */
+		{
+			SETbit(r700->DB_SHADER_CONTROL.u32All, Z_EXPORT_ENABLE_bit);
+		}
+		else
+		{
+			CLEARbit(r700->DB_SHADER_CONTROL.u32All, Z_EXPORT_ENABLE_bit);
+		}
+	}
+}
+
+void r700UpdateShaderStates(GLcontext * ctx)
+{	/* Refresh the DB render state first, then the texture state. */
+	r700SetDBRenderState(ctx);
+	r600UpdateTextureState(ctx);
+}
+
 static void r700SetDepthState(GLcontext * ctx)
 {
 	context_t *context = R700_CONTEXT(ctx);
@@ -467,10 +517,10 @@ static void r700SetBlendState(GLcontext * ctx)
 		 eqn, COLOR_COMB_FCN_shift, COLOR_COMB_FCN_mask);
 
 	SETfield(blend_reg,
-		 blend_factor(ctx->Color.BlendSrcRGB, GL_TRUE),
+		 blend_factor(ctx->Color.BlendSrcA, GL_TRUE),
 		 ALPHA_SRCBLEND_shift, ALPHA_SRCBLEND_mask);
 	SETfield(blend_reg,
-		 blend_factor(ctx->Color.BlendDstRGB, GL_FALSE),
+		 blend_factor(ctx->Color.BlendDstA, GL_FALSE),
 		 ALPHA_DESTBLEND_shift, ALPHA_DESTBLEND_mask);
 
 	switch (ctx->Color.BlendEquationA) {
@@ -655,6 +705,10 @@ static void r700UpdateCulling(GLcontext * ctx)
             CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, FACE_bit); /* default: ccw */
             break;
     }
+
+    /* Winding is inverted when rendering to FBO */
+    if (ctx->DrawBuffer && ctx->DrawBuffer->Name)
+	    r700->PA_SU_SC_MODE_CNTL.u32All ^= FACE_bit;
 }
 
 static void r700UpdateLineStipple(GLcontext * ctx)
@@ -745,9 +799,9 @@ static void r700ColorMask(GLcontext * ctx,
 			     (b ? 4 : 0) |
 			     (a ? 8 : 0));
 
-	if (mask != r700->CB_SHADER_MASK.u32All) {
+	if (mask != r700->CB_TARGET_MASK.u32All) {
 		R600_STATECHANGE(context, cb);
-		SETfield(r700->CB_SHADER_MASK.u32All, mask, OUTPUT0_ENABLE_shift, OUTPUT0_ENABLE_mask);
+		SETfield(r700->CB_TARGET_MASK.u32All, mask, TARGET0_ENABLE_shift, TARGET0_ENABLE_mask);
 	}
 }
 
@@ -1041,6 +1095,7 @@ static void r700UpdateWindow(GLcontext * ctx, int id) //--------------------
 	GLfloat tz = v[MAT_TZ] * depthScale;
 
 	R600_STATECHANGE(context, vpt);
+	R600_STATECHANGE(context, cl);
 
 	r700->viewport[id].PA_CL_VPORT_XSCALE.f32All  = sx;
 	r700->viewport[id].PA_CL_VPORT_XOFFSET.f32All = tx;
@@ -1051,6 +1106,18 @@ static void r700UpdateWindow(GLcontext * ctx, int id) //--------------------
 	r700->viewport[id].PA_CL_VPORT_ZSCALE.f32All  = sz;
 	r700->viewport[id].PA_CL_VPORT_ZOFFSET.f32All = tz;
 
+	if (ctx->Transform.DepthClamp) {
+		r700->viewport[id].PA_SC_VPORT_ZMIN_0.f32All = MIN2(ctx->Viewport.Near, ctx->Viewport.Far);
+		r700->viewport[id].PA_SC_VPORT_ZMAX_0.f32All = MAX2(ctx->Viewport.Near, ctx->Viewport.Far);
+		SETbit(r700->PA_CL_CLIP_CNTL.u32All, ZCLIP_NEAR_DISABLE_bit);
+		SETbit(r700->PA_CL_CLIP_CNTL.u32All, ZCLIP_FAR_DISABLE_bit);
+	} else {
+		r700->viewport[id].PA_SC_VPORT_ZMIN_0.f32All = 0.0;
+		r700->viewport[id].PA_SC_VPORT_ZMAX_0.f32All = 1.0;
+		CLEARbit(r700->PA_CL_CLIP_CNTL.u32All, ZCLIP_NEAR_DISABLE_bit);
+		CLEARbit(r700->PA_CL_CLIP_CNTL.u32All, ZCLIP_FAR_DISABLE_bit);
+	}
+
 	r700->viewport[id].enabled = GL_TRUE;
 
 	r700SetScissor(context);
@@ -1164,13 +1231,8 @@ static void r700UpdatePolygonMode(GLcontext * ctx)
 		/* Handle GL_CW (clock wise and GL_CCW (counter clock wise)
 		 * correctly by selecting the correct front and back face
 		 */
-		if (ctx->Polygon.FrontFace == GL_CCW) {
-			f = ctx->Polygon.FrontMode;
-			b = ctx->Polygon.BackMode;
-		} else {
-			f = ctx->Polygon.BackMode;
-			b = ctx->Polygon.FrontMode;
-		}
+		f = ctx->Polygon.FrontMode;
+		b = ctx->Polygon.BackMode;
 
 		/* Enable polygon mode */
 		SETfield(r700->PA_SU_SC_MODE_CNTL.u32All, X_DUAL_MODE, POLY_MODE_shift, POLY_MODE_mask);
@@ -1269,11 +1331,15 @@ void r700SetScissor(context_t *context) //---------------
 		return;
 	}
 	if (context->radeon.state.scissor.enabled) {
-		/* r600 has exclusive scissors */
 		x1 = context->radeon.state.scissor.rect.x1;
 		y1 = context->radeon.state.scissor.rect.y1;
-		x2 = context->radeon.state.scissor.rect.x2 + 1;
-		y2 = context->radeon.state.scissor.rect.y2 + 1;
+		x2 = context->radeon.state.scissor.rect.x2;
+		y2 = context->radeon.state.scissor.rect.y2;
+		/* r600 has exclusive BR scissors */
+		if (context->radeon.radeonScreen->kernel_mm) {
+			x2++;
+			y2++;
+		}
 	} else {
 		if (context->radeon.radeonScreen->driScreen->dri2.enabled) {
 			x1 = 0;
@@ -1352,8 +1418,6 @@ void r700SetScissor(context_t *context) //---------------
 	SETfield(r700->viewport[id].PA_SC_VPORT_SCISSOR_0_BR.u32All, y2,
 		 PA_SC_VPORT_SCISSOR_0_BR__BR_Y_shift, PA_SC_VPORT_SCISSOR_0_BR__BR_Y_mask);
 
-	r700->viewport[id].PA_SC_VPORT_ZMIN_0.u32All = 0;
-	r700->viewport[id].PA_SC_VPORT_ZMAX_0.u32All = 0x3F800000;
 	r700->viewport[id].enabled = GL_TRUE;
 }
 
@@ -1668,19 +1732,10 @@ void r700InitState(GLcontext * ctx) //-------------------
     r700Enable(ctx, GL_DEPTH_TEST, ctx->Depth.Test);
     r700DepthMask(ctx, ctx->Depth.Mask);
     r700DepthFunc(ctx, ctx->Depth.Func);
-    SETbit(r700->DB_SHADER_CONTROL.u32All, DUAL_EXPORT_ENABLE_bit);
-
     r700->DB_DEPTH_CLEAR.u32All     = 0x3F800000;
-
-    r700->DB_RENDER_CONTROL.u32All  = 0;
     SETbit(r700->DB_RENDER_CONTROL.u32All, STENCIL_COMPRESS_DISABLE_bit);
     SETbit(r700->DB_RENDER_CONTROL.u32All, DEPTH_COMPRESS_DISABLE_bit);
-    r700->DB_RENDER_OVERRIDE.u32All = 0;
-    if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)
-	    SETbit(r700->DB_RENDER_OVERRIDE.u32All, FORCE_SHADER_Z_ORDER_bit);
-    SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIZ_ENABLE_shift, FORCE_HIZ_ENABLE_mask);
-    SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIS_ENABLE0_shift, FORCE_HIS_ENABLE0_mask);
-    SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIS_ENABLE1_shift, FORCE_HIS_ENABLE1_mask);
+    r700SetDBRenderState(ctx);
 
     r700->DB_ALPHA_TO_MASK.u32All = 0;
     SETfield(r700->DB_ALPHA_TO_MASK.u32All, 2, ALPHA_TO_MASK_OFFSET0_shift, ALPHA_TO_MASK_OFFSET0_mask);
@@ -1754,7 +1809,7 @@ void r700InitState(GLcontext * ctx) //-------------------
     r700->CB_CLRCMP_MSK.u32All = 0xFFFFFFFF;
 
     /* screen/window/view */
-    SETfield(r700->CB_TARGET_MASK.u32All, 0xF, (4 * id), TARGET0_ENABLE_mask);
+    SETfield(r700->CB_SHADER_MASK.u32All, 0xF, (4 * id), OUTPUT0_ENABLE_mask);
 
     context->radeon.hw.all_dirty = GL_TRUE;
 
diff --git a/r600/r700_state.h b/r600/r700_state.h
index 0f53d5b..60c6a7f 100644
--- a/r600/r700_state.h
+++ b/r600/r700_state.h
@@ -35,6 +35,7 @@
 
 extern void r700UpdateStateParameters(GLcontext * ctx, GLuint new_state);
 extern void r700UpdateShaders (GLcontext * ctx);
+extern void r700UpdateShaderStates(GLcontext * ctx);
 
 extern void r700UpdateViewportOffset(GLcontext * ctx);
 
diff --git a/r600/r700_vertprog.c b/r600/r700_vertprog.c
index 04726ec..6986eb0 100644
--- a/r600/r700_vertprog.c
+++ b/r600/r700_vertprog.c
@@ -159,7 +159,35 @@ GLboolean Process_Vertex_Program_Vfetch_Instructions(
 	return GL_TRUE;
 }
 
-void Map_Vertex_Program(struct r700_vertex_program *vp,
+GLboolean Process_Vertex_Program_Vfetch_Instructions2(
+    GLcontext *ctx,
+	struct r700_vertex_program *vp,
+	struct gl_vertex_program   *mesa_vp) /* mesa_vp is not referenced in this function */
+{
+    int i;
+    context_t *context = R700_CONTEXT(ctx);
+    /* Emit one vertex-fetch instruction per active stream described in context->stream_desc[]. */
+    VTX_FETCH_METHOD vtxFetchMethod;
+	vtxFetchMethod.bEnableMini          = GL_FALSE;
+	vtxFetchMethod.mega_fetch_remainder = 0;
+
+    for(i=0; i<context->nNumActiveAos; i++)
+    {
+        assemble_vfetch_instruction2(&vp->r700AsmCode,
+                                      vp->r700AsmCode.ucVP_AttributeMap[context->stream_desc[i].element], /* looked up by GL attribute index */
+                                      context->stream_desc[i].type,
+                                      context->stream_desc[i].size,
+                                      context->stream_desc[i].element,
+                                      context->stream_desc[i]._signed,
+                                      context->stream_desc[i].normalize,
+                                     &vtxFetchMethod); /* the same fetch-method state is passed to every call */
+    }
+
+    return GL_TRUE; /* always GL_TRUE: no result of assemble_vfetch_instruction2() is checked */
+}
+
+void Map_Vertex_Program(GLcontext *ctx,
+                        struct r700_vertex_program *vp,
 						struct gl_vertex_program   *mesa_vp)
 {
     GLuint ui;
@@ -175,10 +203,10 @@ void Map_Vertex_Program(struct r700_vertex_program *vp,
 	pAsm->number_used_registers += num_inputs;
 
 	// Create VFETCH instructions for inputs
-	if (GL_TRUE != Process_Vertex_Program_Vfetch_Instructions(vp, mesa_vp) ) 
+        if (GL_TRUE != Process_Vertex_Program_Vfetch_Instructions2(ctx, vp, mesa_vp) )
 	{
-		radeon_error("Calling Process_Vertex_Program_Vfetch_Instructions return error. \n");
-		return; //error
+		radeon_error("Calling Process_Vertex_Program_Vfetch_Instructions2 return error. \n");
+		return;
 	}
 
 	// Map Outputs
@@ -189,7 +217,7 @@ void Map_Vertex_Program(struct r700_vertex_program *vp,
 	pAsm->number_used_registers += pAsm->number_of_exports;
 
     pAsm->pucOutMask = (unsigned char*) MALLOC(pAsm->number_of_exports);
-    
+
     for(ui=0; ui<pAsm->number_of_exports; ui++)
     {
         pAsm->pucOutMask[ui] = 0x0;
@@ -206,7 +234,7 @@ void Map_Vertex_Program(struct r700_vertex_program *vp,
     {   /* fix func t_vp uses NumTemporaries */
         pAsm->number_used_registers += mesa_vp->Base.NumTemporaries;
     }
-	
+
     pAsm->uFirstHelpReg = pAsm->number_used_registers;
 }
 
@@ -261,13 +289,10 @@ GLboolean Find_Instruction_Dependencies_vp(struct r700_vertex_program *vp,
 }
 
 struct r700_vertex_program* r700TranslateVertexShader(GLcontext *ctx,
-						struct gl_vertex_program *mesa_vp)
+						      struct gl_vertex_program *mesa_vp)
 {
 	context_t *context = R700_CONTEXT(ctx);
 	struct r700_vertex_program *vp;
-	TNLcontext *tnl = TNL_CONTEXT(ctx);
-	struct vertex_buffer *vb = &tnl->vb;
-	unsigned int unBit;
 	unsigned int i;
 
 	vp = _mesa_calloc(sizeof(*vp));
@@ -278,17 +303,13 @@ struct r700_vertex_program* r700TranslateVertexShader(GLcontext *ctx,
                 _mesa_insert_mvp_code(ctx, vp->mesa_program);
         }
 
-	for(i=0; i<VERT_ATTRIB_MAX; i++)
+	for(i=0; i<context->nNumActiveAos; i++)
 	{
-		unBit = 1 << i;
-		if(vp->mesa_program->Base.InputsRead & unBit) /* ctx->Array.ArrayObj->xxxxxxx */
-		{
-			vp->aos_desc[i].size   = vb->AttribPtr[i]->size;
-			vp->aos_desc[i].stride = vb->AttribPtr[i]->size * sizeof(GL_FLOAT);/* when emit array, data is packed. vb->AttribPtr[i]->stride;*/
-			vp->aos_desc[i].type   = GL_FLOAT;
-		}
+		vp->aos_desc[i].size   = context->stream_desc[i].size;
+		vp->aos_desc[i].stride = context->stream_desc[i].stride;
+		vp->aos_desc[i].type   = context->stream_desc[i].type;
 	}
-	
+
 	if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)
 	{
 		vp->r700AsmCode.bR6xx = 1;
@@ -296,19 +317,19 @@ struct r700_vertex_program* r700TranslateVertexShader(GLcontext *ctx,
 
 	//Init_Program
 	Init_r700_AssemblerBase(SPT_VP, &(vp->r700AsmCode), &(vp->r700Shader) );
-	Map_Vertex_Program( vp, vp->mesa_program );
+	Map_Vertex_Program(ctx, vp, vp->mesa_program );
 
 	if(GL_FALSE == Find_Instruction_Dependencies_vp(vp, vp->mesa_program))
 	{
 		return NULL;
-    }
+	}
 
 	if(GL_FALSE == AssembleInstr(vp->mesa_program->Base.NumInstructions,
-                                 &(vp->mesa_program->Base.Instructions[0]), 
+                                 &(vp->mesa_program->Base.Instructions[0]),
                                  &(vp->r700AsmCode)) )
 	{
 		return NULL;
-	} 
+	}
 
     if(GL_FALSE == Process_Vertex_Exports(&(vp->r700AsmCode), vp->mesa_program->Base.OutputsWritten) )
     {
@@ -330,9 +351,6 @@ void r700SelectVertexShader(GLcontext *ctx)
     context_t *context = R700_CONTEXT(ctx);
     struct r700_vertex_program_cont *vpc;
     struct r700_vertex_program *vp;
-    TNLcontext *tnl = TNL_CONTEXT(ctx);
-    struct vertex_buffer *vb = &tnl->vb;
-    unsigned int unBit;
     unsigned int i;
     GLboolean match;
     GLbitfield InputsRead;
@@ -343,29 +361,27 @@ void r700SelectVertexShader(GLcontext *ctx)
     if (vpc->mesa_program.IsPositionInvariant)
     {
 	InputsRead |= VERT_BIT_POS;
-    } 
-    
+    }
+
     for (vp = vpc->progs; vp; vp = vp->next)
     {
-	match = GL_TRUE;	
-	for(i=0; i<VERT_ATTRIB_MAX; i++)
+	match = GL_TRUE;
+	for(i=0; i<context->nNumActiveAos; i++)
 	{
-		unBit = 1 << i;
-		if(InputsRead & unBit)
+		if (vp->aos_desc[i].size != context->stream_desc[i].size)
 		{
-			if (vp->aos_desc[i].size != vb->AttribPtr[i]->size)
-				match = GL_FALSE;
-				break;
+			match = GL_FALSE;
+			break;
 		}
 	}
-	if (match) 
+	if (match)
 	{
 		context->selected_vp = vp;
 		return;
 	}
     }
 
-    vp = r700TranslateVertexShader(ctx, &(vpc->mesa_program) );
+    vp = r700TranslateVertexShader(ctx, &(vpc->mesa_program));
     if(!vp)
     {
 	radeon_error("Failed to translate vertex shader. \n");
@@ -377,6 +393,146 @@ void r700SelectVertexShader(GLcontext *ctx)
     return;
 }
 
+int getTypeSize(GLenum type)
+{
+    /* Size in bytes of one component of GL data type `type`.
+     * Any type not listed below trips assert(0) and yields 0. */
+    int cbComponent = 0;
+    if (type == GL_DOUBLE)
+        cbComponent = (int)sizeof(GLdouble);
+    else if (type == GL_FLOAT)
+        cbComponent = (int)sizeof(GLfloat);
+    else if (type == GL_INT)
+        cbComponent = (int)sizeof(GLint);
+    else if (type == GL_UNSIGNED_INT)
+        cbComponent = (int)sizeof(GLuint);
+    else if (type == GL_SHORT)
+        cbComponent = (int)sizeof(GLshort);
+    else if (type == GL_UNSIGNED_SHORT)
+        cbComponent = (int)sizeof(GLushort);
+    else if (type == GL_BYTE)
+        cbComponent = (int)sizeof(GLbyte);
+    else if (type == GL_UNSIGNED_BYTE)
+        cbComponent = (int)sizeof(GLubyte);
+    else
+        assert(0);
+    return cbComponent;
+}
+
+static void r700TranslateAttrib(GLcontext *ctx, GLuint unLoc, int count, const struct gl_client_array *input) /* count is not referenced in the body */
+{
+    context_t *context = R700_CONTEXT(ctx);
+    /* Fill the next free context->stream_desc[] slot with the fetch layout of attribute unLoc. */
+    StreamDesc * pStreamDesc = &(context->stream_desc[context->nNumActiveAos]);
+
+	GLuint stride;
+	/* Effective byte stride: StrideB, or the tightly packed element size when StrideB is 0. */
+	stride = (input->StrideB == 0) ? getTypeSize(input->Type) * input->Size 
+                                   : input->StrideB;
+
+    if (input->Type == GL_DOUBLE || input->Type == GL_UNSIGNED_INT || input->Type == GL_INT ||
+#if MESA_BIG_ENDIAN
+        getTypeSize(input->Type) != 4 ||
+#endif
+        stride < 4) 
+    { /* these inputs are described as GL_FLOAT (presumably converted to float elsewhere - confirm) */
+        pStreamDesc->type = GL_FLOAT;
+
+        if (input->StrideB == 0) 
+        {
+	        pStreamDesc->stride = 0;
+        } 
+        else 
+        {
+	        pStreamDesc->stride = sizeof(GLfloat) * input->Size;
+        }
+        pStreamDesc->dwords = input->Size;
+        pStreamDesc->is_named_bo = GL_FALSE;
+    } 
+    else 
+    { /* the client type is kept as-is */
+        pStreamDesc->type = input->Type;
+        pStreamDesc->dwords = (getTypeSize(input->Type) * input->Size + 3)/ 4; /* element bytes rounded up to whole dwords */
+        if (!input->BufferObj->Name) /* NOTE(review): when Name != 0, stride and is_named_bo are not written here - confirm they are set elsewhere */
+        {
+            if (input->StrideB == 0) 
+            {
+                pStreamDesc->stride = 0;
+            } 
+            else 
+            {
+                pStreamDesc->stride = (getTypeSize(pStreamDesc->type) * input->Size + 3) & ~3; /* element bytes rounded up to a multiple of 4 */
+            }
+
+            pStreamDesc->is_named_bo = GL_FALSE;
+        }
+    }
+
+	pStreamDesc->size = input->Size;
+	pStreamDesc->dst_loc = context->nNumActiveAos;
+	pStreamDesc->element = unLoc;
+
+	switch (pStreamDesc->type) 
+	{ //GetSurfaceFormat
+	case GL_FLOAT:
+		pStreamDesc->_signed = 0;
+		pStreamDesc->normalize = GL_FALSE;
+		break;
+	case GL_SHORT:
+		pStreamDesc->_signed = 1;
+		pStreamDesc->normalize = input->Normalized;
+		break;
+	case GL_BYTE:
+		pStreamDesc->_signed = 1;
+		pStreamDesc->normalize = input->Normalized;
+		break;
+	case GL_UNSIGNED_SHORT:
+		pStreamDesc->_signed = 0;
+		pStreamDesc->normalize = input->Normalized;
+		break;
+	case GL_UNSIGNED_BYTE:
+		pStreamDesc->_signed = 0;
+		pStreamDesc->normalize = input->Normalized;
+		break;
+	default: /* any type not handled above asserts; INT/UNSIGNED_INT/DOUBLE were already replaced by GL_FLOAT earlier */
+	case GL_INT:
+	case GL_UNSIGNED_INT:
+	case GL_DOUBLE: 
+		assert(0);
+		break;
+	}
+	context->nNumActiveAos++; /* the slot filled above is now in use */
+}
+
+void r700SetVertexFormat(GLcontext *ctx, const struct gl_client_array *arrays[], int count)
+{
+    /* Rebuild context->stream_desc[] from the inputs the current vertex program reads; count is only forwarded. */
+    context_t *context = R700_CONTEXT(ctx);
+    /* Use the container type, whose mesa_program is an embedded struct, instead of casting the address of a pointer field. */
+    struct r700_vertex_program_cont *vpc
+           = (struct r700_vertex_program_cont *)ctx->VertexProgram._Current;
+    struct gl_vertex_program *mesa_vp = &(vpc->mesa_program);
+    unsigned int unLoc = 0;
+    unsigned int unBit = mesa_vp->Base.InputsRead;
+
+    context->nNumActiveAos = 0;
+    if (mesa_vp->IsPositionInvariant)
+    {
+        unBit |= VERT_BIT_POS;
+    }
+    /* Walk the input mask from bit 0 upwards; bit N set means arrays[N] is translated. */
+    while(unBit)
+    {
+        if(unBit & 1)
+        {
+            r700TranslateAttrib(ctx, unLoc, count, arrays[unLoc]);
+        }
+        unBit >>= 1;
+        ++unLoc;
+    }
+    context->radeon.tcl.aos_count = context->nNumActiveAos;
+}
+
 void * r700GetActiveVpShaderBo(GLcontext * ctx)
 {
     context_t *context = R700_CONTEXT(ctx);
diff --git a/r600/r700_vertprog.h b/r600/r700_vertprog.h
index c48764c..00824c2 100644
--- a/r600/r700_vertprog.h
+++ b/r600/r700_vertprog.h
@@ -52,8 +52,7 @@ struct r700_vertex_program
 
     GLboolean translated;
     GLboolean loaded;
-    GLboolean needUpdateVF;
-	
+
     void * shaderbo;
 
     ArrayDesc              aos_desc[VERT_ATTRIB_MAX];
@@ -76,19 +75,27 @@ unsigned int Map_Vertex_Input(r700_AssemblerBase       *pAsm,
 GLboolean Process_Vertex_Program_Vfetch_Instructions(
 	struct r700_vertex_program *vp,
 	struct gl_vertex_program   *mesa_vp);
-void Map_Vertex_Program(struct r700_vertex_program *vp,
+GLboolean Process_Vertex_Program_Vfetch_Instructions2(
+    GLcontext *ctx,
+	struct r700_vertex_program *vp,
+	struct gl_vertex_program   *mesa_vp);
+void Map_Vertex_Program(GLcontext *ctx,
+            struct r700_vertex_program *vp,
 			struct gl_vertex_program   *mesa_vp);
 GLboolean Find_Instruction_Dependencies_vp(struct r700_vertex_program *vp,
 					   struct gl_vertex_program   *mesa_vp);
 
 struct r700_vertex_program* r700TranslateVertexShader(GLcontext *ctx,
-				    struct gl_vertex_program   *mesa_vp);
+						      struct gl_vertex_program   *mesa_vp);
 
 /* Interface */
 extern void r700SelectVertexShader(GLcontext *ctx);
+extern void r700SetVertexFormat(GLcontext *ctx, const struct gl_client_array *arrays[], int count);
 
 extern GLboolean r700SetupVertexProgram(GLcontext * ctx);
 
 extern void *    r700GetActiveVpShaderBo(GLcontext * ctx);
 
+extern int getTypeSize(GLenum type);
+
 #endif /* _R700_VERTPROG_H_ */
diff --git a/radeon/Makefile.am b/radeon/Makefile.am
index 554f67e..0ff149d 100644
--- a/radeon/Makefile.am
+++ b/radeon/Makefile.am
@@ -1,6 +1,6 @@
 AM_CFLAGS = -DIN_DRI_DRIVER -DGLX_DIRECT_RENDERING -DGLX_INDIRECT_RENDERING
 
-RADEON_CFLAGS = -Iserver -DRADEON_COMMON=0
+RADEON_CFLAGS = -DRADEON_R100 -Iserver
 
 radeon_dri_la_LTLIBRARIES = radeon_dri.la
 radeon_dri_la_CFLAGS = $(AM_CFLAGS) $(DRM_CFLAGS) $(DRI_CFLAGS) $(RADEON_CFLAGS)
@@ -36,5 +36,7 @@ if HAVE_LIBDRM_RADEON
 radeon_dri_la_CFLAGS += -DHAVE_LIBDRM_RADEON=1 $(LIBDRM_RADEON_CFLAGS)
 radeon_dri_la_LDFLAGS += $(LIBDRM_RADEON_LIBS)
 radeon_dri_la_SOURCES += \
-	radeon_cs_space_drm.c
+	radeon_cs_space_drm.c \
+	radeon_bo.c \
+	radeon_cs.c
 endif
diff --git a/radeon/radeon_bo.c b/radeon/radeon_bo.c
new file mode 100644
index 0000000..393d156
--- /dev/null
+++ b/radeon/radeon_bo.c
@@ -0,0 +1,110 @@
+#include <radeon_bocs_wrapper.h>
+#include <radeon_bo_int_drm.h>
+
+void radeon_bo_debug(struct radeon_bo *bo,
+		     const char *op)
+{
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
+    /* Trace "<op> <bo pointer> <handle> <size> <cref>" on stderr; %p requires a void * argument. */
+    fprintf(stderr, "%s %p 0x%08X 0x%08X 0x%08X\n",
+            op, (void *)bo, bo->handle, boi->size, boi->cref);
+}
+
+struct radeon_bo *radeon_bo_open(struct radeon_bo_manager *bom,
+				 uint32_t handle,
+				 uint32_t size,
+				 uint32_t alignment,
+				 uint32_t domains,
+				 uint32_t flags)
+{
+    /* Opening is entirely backend specific: hand every argument to the
+     * manager's bo_open hook and return whatever it produces. */
+    return bom->funcs->bo_open(bom, handle, size, alignment, domains, flags);
+}
+
+void radeon_bo_ref(struct radeon_bo *bo)
+{   /* Take a reference: bump the local count, then call the manager's bo_ref hook. */
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
+    boi->cref++;
+    boi->bom->funcs->bo_ref(boi);
+}
+
+struct radeon_bo *radeon_bo_unref(struct radeon_bo *bo)
+{   /* Drop a reference: decrement the local count, then return what the bo_unref hook returns. */
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
+    boi->cref--;
+    return boi->bom->funcs->bo_unref(boi);
+}
+
+int radeon_bo_map(struct radeon_bo *bo, int write)
+{   /* Forward to the manager's bo_map hook; its return value is passed through. */
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
+    return boi->bom->funcs->bo_map(boi, write);
+}
+
+int radeon_bo_unmap(struct radeon_bo *bo)
+{   /* Forward to the manager's bo_unmap hook; its return value is passed through. */
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
+    return boi->bom->funcs->bo_unmap(boi);
+}
+
+int radeon_bo_wait(struct radeon_bo *bo)
+{
+    /* bo_wait is an optional hook: when the backend leaves it NULL, 0 is returned. */
+    struct radeon_bo_int *priv = (struct radeon_bo_int *)bo;
+    struct radeon_bo_funcs *ops = priv->bom->funcs;
+    return ops->bo_wait ? ops->bo_wait(priv) : 0;
+}
+
+int radeon_bo_is_busy(struct radeon_bo *bo,
+		      uint32_t *domain)
+{   /* Forward to the manager's bo_is_busy hook (called without a NULL check, unlike bo_wait). */
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
+    return boi->bom->funcs->bo_is_busy(boi, domain);
+}
+
+int radeon_bo_set_tiling(struct radeon_bo *bo,
+			 uint32_t tiling_flags, uint32_t pitch)
+{   /* Forward tiling_flags and pitch to the manager's bo_set_tiling hook. */
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
+    return boi->bom->funcs->bo_set_tiling(boi, tiling_flags, pitch);
+}
+
+int radeon_bo_get_tiling(struct radeon_bo *bo,
+			  uint32_t *tiling_flags, uint32_t *pitch)
+{   /* Forward both output pointers to the manager's bo_get_tiling hook. */
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
+    return boi->bom->funcs->bo_get_tiling(boi, tiling_flags, pitch);
+}
+
+int radeon_bo_is_static(struct radeon_bo *bo)
+{
+    /* bo_is_static is an optional hook: when the backend leaves it NULL, 0 is returned. */
+    struct radeon_bo_int *priv = (struct radeon_bo_int *)bo;
+    struct radeon_bo_funcs *ops = priv->bom->funcs;
+    return ops->bo_is_static ? ops->bo_is_static(priv) : 0;
+}
+
+int radeon_bo_is_referenced_by_cs(struct radeon_bo *bo,
+				  struct radeon_cs *cs)
+{   /* NOTE(review): cs is ignored and the bo_is_referenced_by_cs hook is not consulted; any cref above 1 counts as referenced. */
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
+    return boi->cref > 1;
+}
+
+uint32_t radeon_bo_get_handle(struct radeon_bo *bo)
+{   /* Plain accessor for the public handle field. */
+    return bo->handle;
+}
+
+uint32_t radeon_bo_get_src_domain(struct radeon_bo *bo)
+{
+    /*
+     * The low 16 bits of space_accounted take precedence; when they are
+     * all zero the upper 16 bits are returned instead.
+     */
+    const struct radeon_bo_int *priv = (const struct radeon_bo_int *)bo;
+    const uint32_t low_half = priv->space_accounted & 0xffff;
+
+    return low_half ? low_half : priv->space_accounted >> 16;
+}
diff --git a/radeon/radeon_bo_drm.h b/radeon/radeon_bo_drm.h
index 7141371..beb2369 100644
--- a/radeon/radeon_bo_drm.h
+++ b/radeon/radeon_bo_drm.h
@@ -32,188 +32,44 @@
 
 #include <stdio.h>
 #include <stdint.h>
-//#include "radeon_track.h"
 
 /* bo object */
 #define RADEON_BO_FLAGS_MACRO_TILE  1
 #define RADEON_BO_FLAGS_MICRO_TILE  2
 
 struct radeon_bo_manager;
+struct radeon_cs;
 
 struct radeon_bo {
-    uint32_t                    alignment;
+    void                        *ptr;
+    uint32_t                    flags;
     uint32_t                    handle;
     uint32_t                    size;
-    uint32_t                    domains;
-    uint32_t                    flags;
-    unsigned                    cref;
-#ifdef RADEON_BO_TRACK
-    struct radeon_track         *track;
-#endif
-    void                        *ptr;
-    struct radeon_bo_manager    *bom;
-    uint32_t                    space_accounted;
-};
-
-/* bo functions */
-struct radeon_bo_funcs {
-    struct radeon_bo *(*bo_open)(struct radeon_bo_manager *bom,
-                                 uint32_t handle,
-                                 uint32_t size,
-                                 uint32_t alignment,
-                                 uint32_t domains,
-                                 uint32_t flags);
-    void (*bo_ref)(struct radeon_bo *bo);
-    struct radeon_bo *(*bo_unref)(struct radeon_bo *bo);
-    int (*bo_map)(struct radeon_bo *bo, int write);
-    int (*bo_unmap)(struct radeon_bo *bo);
-    int (*bo_wait)(struct radeon_bo *bo);
-    int (*bo_is_static)(struct radeon_bo *bo);
-    int (*bo_set_tiling)(struct radeon_bo *bo, uint32_t tiling_flags,
-			  uint32_t pitch);
-    int (*bo_get_tiling)(struct radeon_bo *bo, uint32_t *tiling_flags,
-			  uint32_t *pitch);
-    int (*bo_is_busy)(struct radeon_bo *bo, uint32_t *domain);
 };
 
-struct radeon_bo_manager {
-    struct radeon_bo_funcs  *funcs;
-    int                     fd;
-
-#ifdef RADEON_BO_TRACK
-    struct radeon_tracker   tracker;
-#endif
-};
-    
-static inline void _radeon_bo_debug(struct radeon_bo *bo,
-                                    const char *op,
-                                    const char *file,
-                                    const char *func,
-                                    int line)
-{
-    fprintf(stderr, "%s %p 0x%08X 0x%08X 0x%08X [%s %s %d]\n",
-            op, bo, bo->handle, bo->size, bo->cref, file, func, line);
-}
-
-static inline struct radeon_bo *_radeon_bo_open(struct radeon_bo_manager *bom,
-                                                uint32_t handle,
-                                                uint32_t size,
-                                                uint32_t alignment,
-                                                uint32_t domains,
-                                                uint32_t flags,
-                                                const char *file,
-                                                const char *func,
-                                                int line)
-{
-    struct radeon_bo *bo;
-
-    bo = bom->funcs->bo_open(bom, handle, size, alignment, domains, flags);
-
-#ifdef RADEON_BO_TRACK
-    if (bo) {
-        bo->track = radeon_tracker_add_track(&bom->tracker, bo->handle);
-        radeon_track_add_event(bo->track, file, func, "open", line);
-    }
-#endif
-    return bo;
-}
-
-static inline void _radeon_bo_ref(struct radeon_bo *bo,
-                                  const char *file,
-                                  const char *func,
-                                  int line)
-{
-    bo->cref++;
-#ifdef RADEON_BO_TRACK
-    radeon_track_add_event(bo->track, file, func, "ref", line); 
-#endif
-    bo->bom->funcs->bo_ref(bo);
-}
-
-static inline struct radeon_bo *_radeon_bo_unref(struct radeon_bo *bo,
-                                                 const char *file,
-                                                 const char *func,
-                                                 int line)
-{
-    bo->cref--;
-#ifdef RADEON_BO_TRACK
-    radeon_track_add_event(bo->track, file, func, "unref", line);
-    if (bo->cref <= 0) {
-        radeon_tracker_remove_track(&bo->bom->tracker, bo->track);
-        bo->track = NULL;
-    }
-#endif
-    return bo->bom->funcs->bo_unref(bo);
-}
-
-static inline int _radeon_bo_map(struct radeon_bo *bo,
-                                 int write,
-                                 const char *file,
-                                 const char *func,
-                                 int line)
-{
-    return bo->bom->funcs->bo_map(bo, write);
-}
-
-static inline int _radeon_bo_unmap(struct radeon_bo *bo,
-                                   const char *file,
-                                   const char *func,
-                                   int line)
-{
-    return bo->bom->funcs->bo_unmap(bo);
-}
-
-static inline int _radeon_bo_wait(struct radeon_bo *bo,
-                                  const char *file,
-                                  const char *func,
-                                  int line)
-{
-    return bo->bom->funcs->bo_wait(bo);
-}
-
-static inline int _radeon_bo_is_busy(struct radeon_bo *bo,
-				     uint32_t *domain,
-                                     const char *file,
-                                     const char *func,
-                                     int line)
-{
-    return bo->bom->funcs->bo_is_busy(bo, domain);
-}
-
-static inline int radeon_bo_set_tiling(struct radeon_bo *bo,
-				       uint32_t tiling_flags, uint32_t pitch)
-{
-    return bo->bom->funcs->bo_set_tiling(bo, tiling_flags, pitch);
-}
-
-static inline int radeon_bo_get_tiling(struct radeon_bo *bo,
-				       uint32_t *tiling_flags, uint32_t *pitch)
-{
-    return bo->bom->funcs->bo_get_tiling(bo, tiling_flags, pitch);
-}
-
-static inline int radeon_bo_is_static(struct radeon_bo *bo)
-{
-	if (bo->bom->funcs->bo_is_static)
-		return bo->bom->funcs->bo_is_static(bo);
-	return 0;
-}
-
-#define radeon_bo_open(bom, h, s, a, d, f)\
-    _radeon_bo_open(bom, h, s, a, d, f, __FILE__, __FUNCTION__, __LINE__)
-#define radeon_bo_ref(bo)\
-    _radeon_bo_ref(bo, __FILE__, __FUNCTION__, __LINE__)
-#define radeon_bo_unref(bo)\
-    _radeon_bo_unref(bo, __FILE__, __FUNCTION__, __LINE__)
-#define radeon_bo_map(bo, w)\
-    _radeon_bo_map(bo, w, __FILE__, __FUNCTION__, __LINE__)
-#define radeon_bo_unmap(bo)\
-    _radeon_bo_unmap(bo, __FILE__, __FUNCTION__, __LINE__)
-#define radeon_bo_debug(bo, opcode)\
-    _radeon_bo_debug(bo, opcode, __FILE__, __FUNCTION__, __LINE__)
-#define radeon_bo_wait(bo) \
-    _radeon_bo_wait(bo, __FILE__, __func__, __LINE__)
-#define radeon_bo_is_busy(bo, domain) \
-    _radeon_bo_is_busy(bo, domain, __FILE__, __func__, __LINE__)
+struct radeon_bo_manager;
 
+void radeon_bo_debug(struct radeon_bo *bo,
+		     const char *op);
+
+struct radeon_bo *radeon_bo_open(struct radeon_bo_manager *bom,
+				  uint32_t handle,
+				  uint32_t size,
+				  uint32_t alignment,
+				  uint32_t domains,
+				  uint32_t flags);
+
+void radeon_bo_ref(struct radeon_bo *bo);
+struct radeon_bo *radeon_bo_unref(struct radeon_bo *bo);
+int radeon_bo_map(struct radeon_bo *bo, int write);
+int radeon_bo_unmap(struct radeon_bo *bo);
+int radeon_bo_wait(struct radeon_bo *bo);
+int radeon_bo_is_busy(struct radeon_bo *bo, uint32_t *domain);
+int radeon_bo_set_tiling(struct radeon_bo *bo, uint32_t tiling_flags, uint32_t pitch);
+int radeon_bo_get_tiling(struct radeon_bo *bo, uint32_t *tiling_flags, uint32_t *pitch);
+int radeon_bo_is_static(struct radeon_bo *bo);
+int radeon_bo_is_referenced_by_cs(struct radeon_bo *bo,
+				  struct radeon_cs *cs);
+uint32_t radeon_bo_get_handle(struct radeon_bo *bo);
+uint32_t radeon_bo_get_src_domain(struct radeon_bo *bo);
 #endif
diff --git a/radeon/radeon_bo_int_drm.h b/radeon/radeon_bo_int_drm.h
new file mode 100644
index 0000000..190c332
--- /dev/null
+++ b/radeon/radeon_bo_int_drm.h
@@ -0,0 +1,45 @@
+#ifndef RADEON_BO_INT
+#define RADEON_BO_INT
+
+struct radeon_bo_manager {
+    struct radeon_bo_funcs  *funcs;
+    int                     fd;
+};
+
+struct radeon_bo_int {
+    void                        *ptr;
+    uint32_t                    flags;
+    uint32_t                    handle;
+    uint32_t                    size;
+    /* private members */
+    uint32_t                    alignment;
+    uint32_t                    domains;
+    unsigned                    cref;
+    struct radeon_bo_manager    *bom;
+    uint32_t                    space_accounted;
+    uint32_t                    referenced_in_cs;
+};
+
+/* bo functions */
+struct radeon_bo_funcs {
+    struct radeon_bo *(*bo_open)(struct radeon_bo_manager *bom,
+                                 uint32_t handle,
+                                 uint32_t size,
+                                 uint32_t alignment,
+                                 uint32_t domains,
+                                 uint32_t flags);
+    void (*bo_ref)(struct radeon_bo_int *bo);
+    struct radeon_bo *(*bo_unref)(struct radeon_bo_int *bo);
+    int (*bo_map)(struct radeon_bo_int *bo, int write);
+    int (*bo_unmap)(struct radeon_bo_int *bo);
+    int (*bo_wait)(struct radeon_bo_int *bo);
+    int (*bo_is_static)(struct radeon_bo_int *bo);
+    int (*bo_set_tiling)(struct radeon_bo_int *bo, uint32_t tiling_flags,
+			  uint32_t pitch);
+    int (*bo_get_tiling)(struct radeon_bo_int *bo, uint32_t *tiling_flags,
+			  uint32_t *pitch);
+    int (*bo_is_busy)(struct radeon_bo_int *bo, uint32_t *domain);
+    int (*bo_is_referenced_by_cs)(struct radeon_bo_int *bo, struct radeon_cs *cs);
+};
+
+#endif
diff --git a/radeon/radeon_bo_legacy.c b/radeon/radeon_bo_legacy.c
index 3e7547d..cf12664 100644
--- a/radeon/radeon_bo_legacy.c
+++ b/radeon/radeon_bo_legacy.c
@@ -50,6 +50,12 @@
 #include "radeon_bocs_wrapper.h"
 #include "radeon_macros.h"
 
+#ifdef HAVE_LIBDRM_RADEON
+#include "radeon_bo_int.h"
+#else
+#include "radeon_bo_int_drm.h"
+#endif
+
 /* no seriously texmem.c is this screwed up */
 struct bo_legacy_texture_object {
     driTextureObject    base;
@@ -57,7 +63,7 @@ struct bo_legacy_texture_object {
 };
 
 struct bo_legacy {
-    struct radeon_bo    base;
+    struct radeon_bo_int    base;
     int                 map_count;
     uint32_t            pending;
     int                 is_pending;
@@ -187,10 +193,10 @@ static void legacy_get_current_age(struct bo_manager_legacy *boml)
     }
 }
 
-static int legacy_is_pending(struct radeon_bo *bo)
+static int legacy_is_pending(struct radeon_bo_int *boi)
 {
-    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
-    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)boi->bom;
+    struct bo_legacy *bo_legacy = (struct bo_legacy*)boi;
 
     if (bo_legacy->is_pending <= 0) {
         bo_legacy->is_pending = 0;
@@ -204,13 +210,13 @@ static int legacy_is_pending(struct radeon_bo *bo)
         if (bo_legacy->pnext) {
             bo_legacy->pnext->pprev = bo_legacy->pprev;
         }
-	assert(bo_legacy->is_pending <= bo->cref);
+	assert(bo_legacy->is_pending <= boi->cref);
         while (bo_legacy->is_pending--) {
-	    bo = radeon_bo_unref(bo);
-	    if (!bo)
+	    boi = (struct radeon_bo_int *)radeon_bo_unref((struct radeon_bo *)boi);
+	    if (!boi)
 	      break;
         }
-	if (bo)
+	if (boi)
 	  bo_legacy->is_pending = 0;
         boml->cpendings--;
         return 0;
@@ -218,7 +224,7 @@ static int legacy_is_pending(struct radeon_bo *bo)
     return 1;
 }
 
-static int legacy_wait_pending(struct radeon_bo *bo)
+static int legacy_wait_pending(struct radeon_bo_int *bo)
 {
     struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
     struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
@@ -323,7 +329,7 @@ static struct bo_legacy *bo_allocate(struct bo_manager_legacy *boml,
     return bo_legacy;
 }
 
-static int bo_dma_alloc(struct radeon_bo *bo)
+static int bo_dma_alloc(struct radeon_bo_int *bo)
 {
     struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
     struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
@@ -333,7 +339,7 @@ static int bo_dma_alloc(struct radeon_bo *bo)
     int r;
 
     /* align size on 4Kb */
-    size = (((4 * 1024) - 1) + bo->size) & ~((4 * 1024) - 1);
+    size = (((4 * 1024) - 1) + bo_legacy->base.size) & ~((4 * 1024) - 1);
     alloc.region = RADEON_MEM_REGION_GART;
     alloc.alignment = bo_legacy->base.alignment;
     alloc.size = size;
@@ -355,7 +361,7 @@ static int bo_dma_alloc(struct radeon_bo *bo)
     return 0;
 }
 
-static int bo_dma_free(struct radeon_bo *bo)
+static int bo_dma_free(struct radeon_bo_int *bo)
 {
     struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
     struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
@@ -428,7 +434,7 @@ static struct radeon_bo *bo_open(struct radeon_bo_manager *bom,
         bo_legacy = boml->bos.next;
         while (bo_legacy) {
             if (bo_legacy->base.handle == handle) {
-                radeon_bo_ref(&(bo_legacy->base));
+                radeon_bo_ref((struct radeon_bo *)&(bo_legacy->base));
                 return (struct radeon_bo*)bo_legacy;
             }
             bo_legacy = bo_legacy->next;
@@ -468,20 +474,20 @@ retry:
             return NULL;
         }
     }
-    radeon_bo_ref(&(bo_legacy->base));
+    radeon_bo_ref((struct radeon_bo *)&(bo_legacy->base));
 
     return (struct radeon_bo*)bo_legacy;
 }
 
-static void bo_ref(struct radeon_bo *bo)
+static void bo_ref(struct radeon_bo_int *bo)
 {
 }
 
-static struct radeon_bo *bo_unref(struct radeon_bo *bo)
+static struct radeon_bo *bo_unref(struct radeon_bo_int *boi)
 {
-    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
+    struct bo_legacy *bo_legacy = (struct bo_legacy*)boi;
 
-    if (bo->cref <= 0) {
+    if (boi->cref <= 0) {
         bo_legacy->prev->next = bo_legacy->next;
         if (bo_legacy->next) {
             bo_legacy->next->prev = bo_legacy->prev;
@@ -491,10 +497,10 @@ static struct radeon_bo *bo_unref(struct radeon_bo *bo)
         }
         return NULL;
     }
-    return bo;
+    return (struct radeon_bo *)boi;
 }
 
-static int bo_map(struct radeon_bo *bo, int write)
+static int bo_map(struct radeon_bo_int *bo, int write)
 {
     struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
     struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
@@ -528,7 +534,7 @@ static int bo_map(struct radeon_bo *bo, int write)
     return 0;
 }
 
-static int bo_unmap(struct radeon_bo *bo)
+static int bo_unmap(struct radeon_bo_int *bo)
 {
     struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
 
@@ -542,7 +548,7 @@ static int bo_unmap(struct radeon_bo *bo)
     return 0;
 }
 
-static int bo_is_busy(struct radeon_bo *bo, uint32_t *domain)
+static int bo_is_busy(struct radeon_bo_int *bo, uint32_t *domain)
 {
     *domain = 0;
     if (bo->domains & RADEON_GEM_DOMAIN_GTT)
@@ -555,7 +561,7 @@ static int bo_is_busy(struct radeon_bo *bo, uint32_t *domain)
         return 0;
 }
 
-static int bo_is_static(struct radeon_bo *bo)
+static int bo_is_static(struct radeon_bo_int *bo)
 {
     struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
     return bo_legacy->static_bo;
@@ -574,7 +580,7 @@ static struct radeon_bo_funcs bo_legacy_funcs = {
     bo_is_busy
 };
 
-static int bo_vram_validate(struct radeon_bo *bo,
+static int bo_vram_validate(struct radeon_bo_int *bo,
                             uint32_t *soffset,
                             uint32_t *eoffset)
 {
@@ -700,25 +706,30 @@ int radeon_bo_legacy_validate(struct radeon_bo *bo,
                               uint32_t *soffset,
                               uint32_t *eoffset)
 {
-    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)boi->bom;
     struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
     int r;
     int retries = 0;
 
     if (bo_legacy->map_count) {
         fprintf(stderr, "bo(%p, %d) is mapped (%d) can't valide it.\n",
-                bo, bo->size, bo_legacy->map_count);
+                bo, boi->size, bo_legacy->map_count);
+        return -EINVAL;
+    }
+    if(boi->size == 0) {
+        fprintf(stderr, "bo(%p) has size 0.\n", bo);
         return -EINVAL;
     }
     if (bo_legacy->static_bo || bo_legacy->validated) {
         *soffset = bo_legacy->offset;
-        *eoffset = bo_legacy->offset + bo->size;
+        *eoffset = bo_legacy->offset + boi->size;
 
         return 0;
     }
-    if (!(bo->domains & RADEON_GEM_DOMAIN_GTT)) {
+    if (!(boi->domains & RADEON_GEM_DOMAIN_GTT)) {
 
-        r = bo_vram_validate(bo, soffset, eoffset);
+        r = bo_vram_validate(boi, soffset, eoffset);
         if (r) {
 	    legacy_track_pending(&boml->base, 0);
 	    legacy_kick_all_buffers(boml);
@@ -732,7 +743,7 @@ int radeon_bo_legacy_validate(struct radeon_bo *bo,
         }
     }
     *soffset = bo_legacy->offset;
-    *eoffset = bo_legacy->offset + bo->size;
+    *eoffset = bo_legacy->offset + boi->size;
     bo_legacy->validated = 1;
 
     return 0;
@@ -740,7 +751,8 @@ int radeon_bo_legacy_validate(struct radeon_bo *bo,
 
 void radeon_bo_legacy_pending(struct radeon_bo *bo, uint32_t pending)
 {
-    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)boi->bom;
     struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
 
     bo_legacy->pending = pending;
@@ -795,7 +807,7 @@ static struct bo_legacy *radeon_legacy_bo_alloc_static(struct bo_manager_legacy
     if (bo->base.handle > bom->nhandle) {
         bom->nhandle = bo->base.handle + 1;
     }
-    radeon_bo_ref(&(bo->base));
+    radeon_bo_ref((struct radeon_bo *)&(bo->base));
     return bo;
 }
 
@@ -890,12 +902,13 @@ void radeon_bo_legacy_texture_age(struct radeon_bo_manager *bom)
 
 unsigned radeon_bo_legacy_relocs_size(struct radeon_bo *bo)
 {
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
     struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
 
-    if (bo_legacy->static_bo || (bo->domains & RADEON_GEM_DOMAIN_GTT)) {
+    if (bo_legacy->static_bo || (boi->domains & RADEON_GEM_DOMAIN_GTT)) {
         return 0;
     }
-    return bo->size;
+    return boi->size;
 }
 
 /*
@@ -920,7 +933,7 @@ struct radeon_bo *radeon_legacy_bo_alloc_fake(struct radeon_bo_manager *bom,
     if (bo->base.handle > boml->nhandle) {
         boml->nhandle = bo->base.handle + 1;
     }
-    radeon_bo_ref(&(bo->base));
-    return &(bo->base);
+    radeon_bo_ref((struct radeon_bo *)&(bo->base));
+    return (struct radeon_bo *)&(bo->base);
 }
 
diff --git a/radeon/radeon_bocs_wrapper.h b/radeon/radeon_bocs_wrapper.h
index 4520a7d..6c2648b 100644
--- a/radeon/radeon_bocs_wrapper.h
+++ b/radeon/radeon_bocs_wrapper.h
@@ -18,8 +18,11 @@
 #define RADEON_TILING_MACRO 0x1
 #define RADEON_TILING_MICRO 0x2
 #define RADEON_TILING_SWAP 0x4
+
+#ifndef RADEON_TILING_SURFACE
 #define RADEON_TILING_SURFACE 0x8 /* this object requires a surface
 				   * when mapped - i.e. front buffer */
+#endif
 
 /* to be used to build locally in mesa with no libdrm bits */
 #include "../radeon/radeon_bo_drm.h"
diff --git a/radeon/radeon_buffer_objects.c b/radeon/radeon_buffer_objects.c
index 8fac5c6..99d3ec7 100644
--- a/radeon/radeon_buffer_objects.c
+++ b/radeon/radeon_buffer_objects.c
@@ -136,8 +136,13 @@ radeonBufferSubData(GLcontext * ctx,
                     const GLvoid * data,
                     struct gl_buffer_object *obj)
 {
+    radeonContextPtr radeon = RADEON_CONTEXT(ctx);
     struct radeon_buffer_object *radeon_obj = get_radeon_buffer_object(obj);
 
+    if (radeon_bo_is_referenced_by_cs(radeon_obj->bo, radeon->cmdbuf.cs)) {
+        radeon_firevertices(radeon);
+    }
+
     radeon_bo_map(radeon_obj->bo, GL_TRUE);
 
     _mesa_memcpy(radeon_obj->bo->ptr + offset, data, size);
diff --git a/radeon/radeon_common.c b/radeon/radeon_common.c
index 9817ff8..9b64c21 100644
--- a/radeon/radeon_common.c
+++ b/radeon/radeon_common.c
@@ -229,16 +229,15 @@ void radeonUpdateScissor( GLcontext *ctx )
 	}
 	if (!rmesa->radeonScreen->kernel_mm) {
 	   /* Fix scissors for dri 1 */
-
 	   __DRIdrawablePrivate *dPriv = radeon_get_drawable(rmesa);
 	   x1 += dPriv->x;
-	   x2 += dPriv->x;
+	   x2 += dPriv->x + 1;
 	   min_x += dPriv->x;
-	   max_x += dPriv->x;
+	   max_x += dPriv->x + 1;
 	   y1 += dPriv->y;
-	   y2 += dPriv->y;
+	   y2 += dPriv->y + 1;
 	   min_y += dPriv->y;
-	   max_y += dPriv->y;
+	   max_y += dPriv->y + 1;
 	}
 
 	rmesa->state.scissor.rect.x1 = CLAMP(x1,  min_x, max_x);
@@ -263,29 +262,6 @@ void radeonScissor(GLcontext* ctx, GLint x, GLint y, GLsizei w, GLsizei h)
 	}
 }
 
-void radeonPolygonStipplePreKMS( GLcontext *ctx, const GLubyte *mask )
-{
-   radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-   GLuint i;
-   drm_radeon_stipple_t stipple;
-
-   /* Must flip pattern upside down.
-   */
-   for ( i = 0 ; i < 32 ; i++ ) {
-      stipple.mask[31 - i] = ((GLuint *) mask)[i];
-   }
-
-   /* TODO: push this into cmd mechanism
-   */
-   radeon_firevertices(radeon);
-   LOCK_HARDWARE( radeon );
-
-   drmCommandWrite( radeon->dri.fd, DRM_RADEON_STIPPLE,
-	 &stipple, sizeof(stipple) );
-   UNLOCK_HARDWARE( radeon );
-}
-
-
 /* ================================================================
  * SwapBuffers with client-side throttling
  */
@@ -1124,8 +1100,6 @@ void radeonFlush(GLcontext *ctx)
 	if (radeon->dma.flush)
 		radeon->dma.flush( ctx );
 
-	radeonEmitState(radeon);
-
 	if (radeon->cmdbuf.cs->cdw)
 		rcommonFlushCmdBuf(radeon, __FUNCTION__);
 
@@ -1148,9 +1122,6 @@ void radeonFlush(GLcontext *ctx)
 			}
 		}
 	}
-
-	make_empty_list(&radeon->query.not_flushed_head);
-
 }
 
 /* Make sure all commands have been sent to the hardware and have
@@ -1345,5 +1316,5 @@ void rcommonBeginBatch(radeonContextPtr rmesa, int n,
 
 void radeonUserClear(GLcontext *ctx, GLuint mask)
 {
-   _mesa_meta_clear(ctx, mask);
+   _mesa_meta_Clear(ctx, mask);
 }
diff --git a/radeon/radeon_common.h b/radeon/radeon_common.h
index f320191..0608fe2 100644
--- a/radeon/radeon_common.h
+++ b/radeon/radeon_common.h
@@ -10,7 +10,6 @@ void radeonRecalcScissorRects(radeonContextPtr radeon);
 void radeonSetCliprects(radeonContextPtr radeon);
 void radeonUpdateScissor( GLcontext *ctx );
 void radeonScissor(GLcontext* ctx, GLint x, GLint y, GLsizei w, GLsizei h);
-void radeonPolygonStipplePreKMS( GLcontext *ctx, const GLubyte *mask );
 
 void radeonWaitForIdleLocked(radeonContextPtr radeon);
 extern uint32_t radeonGetAge(radeonContextPtr radeon);
@@ -43,7 +42,7 @@ void
 radeon_renderbuffer_set_bo(struct radeon_renderbuffer *rb,
 			   struct radeon_bo *bo);
 struct radeon_renderbuffer *
-radeon_create_renderbuffer(GLenum format, __DRIdrawablePrivate *driDrawPriv);
+radeon_create_renderbuffer(gl_format format, __DRIdrawablePrivate *driDrawPriv);
 static inline struct radeon_renderbuffer *radeon_renderbuffer(struct gl_renderbuffer *rb)
 {
 	struct radeon_renderbuffer *rrb = (struct radeon_renderbuffer *)rb;
diff --git a/radeon/radeon_common_context.c b/radeon/radeon_common_context.c
index 330721a..71f70d7 100644
--- a/radeon/radeon_common_context.c
+++ b/radeon/radeon_common_context.c
@@ -47,7 +47,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "swrast_setup/swrast_setup.h"
 #include "tnl/tnl.h"
 
-#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600) /* +r6/r7 */
+#if defined(RADEON_R600)
 #include "r600_context.h"
 #endif
 
@@ -262,10 +262,9 @@ GLboolean radeonInitContext(radeonContextPtr radeon,
 		else
 			radeon->texture_row_align = 32;
 		radeon->texture_rect_row_align = 64;
-		radeon->texture_compressed_row_align = 64;
+		radeon->texture_compressed_row_align = 32;
 	}
 
-	make_empty_list(&radeon->query.not_flushed_head);
 	radeon_init_dma(radeon);
 
 	return GL_TRUE;
@@ -496,19 +495,7 @@ radeon_make_renderbuffer_current(radeonContextPtr radeon,
 static unsigned
 radeon_bits_per_pixel(const struct radeon_renderbuffer *rb)
 {
-   switch (rb->base._ActualFormat) {
-   case GL_RGB5:
-   case GL_DEPTH_COMPONENT16:
-      return 16;
-   case GL_RGB8:
-   case GL_RGBA8:
-   case GL_DEPTH_COMPONENT24:
-   case GL_DEPTH24_STENCIL8_EXT:
-   case GL_STENCIL_INDEX8_EXT:
-      return 32;
-   default:
-      return 0;
-   }
+   return _mesa_get_format_bytes(rb->base.Format) * 8; 
 }
 
 void
diff --git a/radeon/radeon_common_context.h b/radeon/radeon_common_context.h
index 0309345..6298748 100644
--- a/radeon/radeon_common_context.h
+++ b/radeon/radeon_common_context.h
@@ -208,6 +208,10 @@ struct radeon_tex_obj {
 	 * and so on.
 	 */
 	GLboolean validated;
+	/* Minimum LOD to be used during rendering */
+	unsigned minLod;
+	/* Maximum LOD to be used during rendering */
+	unsigned maxLod;
 
 	GLuint override_offset;
 	GLboolean image_override; /* Image overridden by GLX_EXT_tfp */
@@ -401,9 +405,6 @@ struct radeon_state {
 	struct radeon_depthbuffer_state depth;
 	struct radeon_scissor_state scissor;
 	struct radeon_stencilbuffer_state stencil;
-
-	struct radeon_cs_space_check bos[RADEON_MAX_BOS];
-	int validated_bo_count;
 };
 
 /**
@@ -502,7 +503,6 @@ struct radeon_context {
 
    struct {
 	struct radeon_query_object *current;
-	struct radeon_query_object not_flushed_head;
 	struct radeon_state_atom queryobj;
    } query;
 
diff --git a/radeon/radeon_context.c b/radeon/radeon_context.c
index 8f4485a..5e700be 100644
--- a/radeon/radeon_context.c
+++ b/radeon/radeon_context.c
@@ -69,7 +69,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define need_GL_EXT_fog_coord
 #define need_GL_EXT_secondary_color
 #define need_GL_EXT_framebuffer_object
-#include "extension_helper.h"
+#include "main/remap_helper.h"
 
 #define DRIVER_DATE	"20061018"
 
@@ -79,7 +79,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 /* Extension strings exported by the R100 driver.
  */
-const struct dri_extension card_extensions[] =
+static const struct dri_extension card_extensions[] =
 {
     { "GL_ARB_multitexture",               NULL },
     { "GL_ARB_occlusion_query",		   GL_ARB_occlusion_query_functions},
@@ -109,7 +109,7 @@ const struct dri_extension card_extensions[] =
     { NULL,                                NULL }
 };
 
-const struct dri_extension mm_extensions[] = {
+static const struct dri_extension mm_extensions[] = {
   { "GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions },
   { NULL, NULL }
 };
diff --git a/radeon/radeon_context.h b/radeon/radeon_context.h
index 4e2c52c..12ab33a 100644
--- a/radeon/radeon_context.h
+++ b/radeon/radeon_context.h
@@ -331,8 +331,12 @@ struct r100_hw_state {
 	struct radeon_state_atom stp;
 };
 
+struct radeon_stipple_state {
+	GLuint mask[32];
+};
 
 struct r100_state {
+	struct radeon_stipple_state stipple;
 	struct radeon_texture_state texture;
 };
 
diff --git a/radeon/radeon_cs.c b/radeon/radeon_cs.c
new file mode 100644
index 0000000..17e7433
--- /dev/null
+++ b/radeon/radeon_cs.c
@@ -0,0 +1,95 @@
+
+#include <stdio.h>
+#include <stdint.h>
+#include "drm.h"
+#include "radeon_drm.h"
+#include "radeon_bocs_wrapper.h"
+#include "radeon_cs_int_drm.h"
+
+struct radeon_cs *radeon_cs_create(struct radeon_cs_manager *csm,
+			    uint32_t ndw)
+{
+    struct radeon_cs_int *csi = csm->funcs->cs_create(csm, ndw);
+    return (struct radeon_cs *)csi;
+}
+
+int radeon_cs_write_reloc(struct radeon_cs *cs,
+			  struct radeon_bo *bo,
+			  uint32_t read_domain,
+			  uint32_t write_domain,
+			  uint32_t flags)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+
+    return csi->csm->funcs->cs_write_reloc(csi,
+					   bo,
+					   read_domain,
+					   write_domain,
+					   flags);
+}
+
+int radeon_cs_begin(struct radeon_cs *cs,
+		    uint32_t ndw,
+		    const char *file,
+		    const char *func,
+		    int line)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    return csi->csm->funcs->cs_begin(csi, ndw, file, func, line);
+}
+
+int radeon_cs_end(struct radeon_cs *cs,
+		  const char *file,
+		  const char *func,
+		  int line)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    return csi->csm->funcs->cs_end(csi, file, func, line);
+}
+
+int radeon_cs_emit(struct radeon_cs *cs)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    return csi->csm->funcs->cs_emit(csi);
+}
+
+int radeon_cs_destroy(struct radeon_cs *cs)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    return csi->csm->funcs->cs_destroy(csi);
+}
+
+int radeon_cs_erase(struct radeon_cs *cs)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    return csi->csm->funcs->cs_erase(csi);
+}
+
+int radeon_cs_need_flush(struct radeon_cs *cs)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    return csi->csm->funcs->cs_need_flush(csi);
+}
+
+void radeon_cs_print(struct radeon_cs *cs, FILE *file)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    csi->csm->funcs->cs_print(csi, file);
+}
+
+void radeon_cs_set_limit(struct radeon_cs *cs, uint32_t domain, uint32_t limit)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    if (domain == RADEON_GEM_DOMAIN_VRAM)
+	csi->csm->vram_limit = limit;
+    else /* any domain other than VRAM sets the manager's GART limit */
+	csi->csm->gart_limit = limit;
+}
+
+void radeon_cs_space_set_flush(struct radeon_cs *cs, void (*fn)(void *), void *data)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    csi->space_flush_fn = fn;
+    csi->space_flush_data = data;
+}
+
diff --git a/radeon/radeon_cs_drm.h b/radeon/radeon_cs_drm.h
index ab4eca3..a3f1750 100644
--- a/radeon/radeon_cs_drm.h
+++ b/radeon/radeon_cs_drm.h
@@ -36,6 +36,7 @@
 #include <string.h>
 #include "drm.h"
 #include "radeon_drm.h"
+#include "radeon_bo_drm.h"
 
 struct radeon_cs_reloc {
     struct radeon_bo    *bo;
@@ -49,173 +50,41 @@ struct radeon_cs_reloc {
 #define RADEON_CS_SPACE_OP_TO_BIG 1
 #define RADEON_CS_SPACE_FLUSH 2
 
-struct radeon_cs_space_check {
-    struct radeon_bo *bo;
-    uint32_t read_domains;
-    uint32_t write_domain;
-    uint32_t new_accounted;
-};
-
-#define MAX_SPACE_BOS (32)
-
-struct radeon_cs_manager;
-
 struct radeon_cs {
-    struct radeon_cs_manager    *csm;
-    void                        *relocs;
-    uint32_t                    *packets;
-    unsigned                    crelocs;
-    unsigned                    relocs_total_size;
-    unsigned                    cdw;
-    unsigned                    ndw;
-    int                         section;
+    uint32_t *packets;
+    unsigned cdw;
+    unsigned ndw;
     unsigned                    section_ndw;
     unsigned                    section_cdw;
-    const char                  *section_file;
-    const char                  *section_func;
-    int                         section_line;
-    struct radeon_cs_space_check bos[MAX_SPACE_BOS];
-    int                         bo_count;
-    void                        (*space_flush_fn)(void *);
-    void                        *space_flush_data;
-};
-
-/* cs functions */
-struct radeon_cs_funcs {
-    struct radeon_cs *(*cs_create)(struct radeon_cs_manager *csm,
-                                   uint32_t ndw);
-    int (*cs_write_reloc)(struct radeon_cs *cs,
-                          struct radeon_bo *bo,
-                          uint32_t read_domain,
-                          uint32_t write_domain,
-                          uint32_t flags);
-    int (*cs_begin)(struct radeon_cs *cs,
-                    uint32_t ndw,
-                    const char *file,
-                    const char *func,
-                    int line);
-    int (*cs_end)(struct radeon_cs *cs,
-                  const char *file,
-                  const char *func,
-                  int line);
-    int (*cs_emit)(struct radeon_cs *cs);
-    int (*cs_destroy)(struct radeon_cs *cs);
-    int (*cs_erase)(struct radeon_cs *cs);
-    int (*cs_need_flush)(struct radeon_cs *cs);
-    void (*cs_print)(struct radeon_cs *cs, FILE *file);
-};
-
-struct radeon_cs_manager {
-    struct radeon_cs_funcs  *funcs;
-    int                     fd;
-    int32_t vram_limit, gart_limit;
-    int32_t vram_write_used, gart_write_used;
-    int32_t read_used;
 };
 
-static inline struct radeon_cs *radeon_cs_create(struct radeon_cs_manager *csm,
-                                                 uint32_t ndw)
-{
-    return csm->funcs->cs_create(csm, ndw);
-}
-
-static inline int radeon_cs_write_reloc(struct radeon_cs *cs,
-                                        struct radeon_bo *bo,
-                                        uint32_t read_domain,
-                                        uint32_t write_domain,
-                                        uint32_t flags)
-{
-    return cs->csm->funcs->cs_write_reloc(cs,
-                                          bo,
-                                          read_domain,
-                                          write_domain,
-                                          flags);
-}
-
-static inline int radeon_cs_begin(struct radeon_cs *cs,
-                                  uint32_t ndw,
-                                  const char *file,
-                                  const char *func,
-                                  int line)
-{
-    return cs->csm->funcs->cs_begin(cs, ndw, file, func, line);
-}
-
-static inline int radeon_cs_end(struct radeon_cs *cs,
-                                const char *file,
-                                const char *func,
-                                int line)
-{
-    return cs->csm->funcs->cs_end(cs, file, func, line);
-}
-
-static inline int radeon_cs_emit(struct radeon_cs *cs)
-{
-    return cs->csm->funcs->cs_emit(cs);
-}
-
-static inline int radeon_cs_destroy(struct radeon_cs *cs)
-{
-    return cs->csm->funcs->cs_destroy(cs);
-}
-
-static inline int radeon_cs_erase(struct radeon_cs *cs)
-{
-    return cs->csm->funcs->cs_erase(cs);
-}
-
-static inline int radeon_cs_need_flush(struct radeon_cs *cs)
-{
-    return cs->csm->funcs->cs_need_flush(cs);
-}
-
-static inline void radeon_cs_print(struct radeon_cs *cs, FILE *file)
-{
-    cs->csm->funcs->cs_print(cs, file);
-}
-
-static inline void radeon_cs_set_limit(struct radeon_cs *cs, uint32_t domain, uint32_t limit)
-{
-    
-    if (domain == RADEON_GEM_DOMAIN_VRAM)
-	cs->csm->vram_limit = limit;
-    else
-	cs->csm->gart_limit = limit;
-}
-
-static inline void radeon_cs_write_dword(struct radeon_cs *cs, uint32_t dword)
-{
-    cs->packets[cs->cdw++] = dword;
-    if (cs->section) {
-        cs->section_cdw++;
-    }
-}
-
-static inline void radeon_cs_write_qword(struct radeon_cs *cs, uint64_t qword)
-{
-
-    memcpy(cs->packets + cs->cdw, &qword, sizeof(qword));
-    cs->cdw+=2;
-    if (cs->section) {
-        cs->section_cdw+=2;
-    }
-}
-
-static inline void radeon_cs_write_table(struct radeon_cs *cs, void *data, uint32_t size)
-{
-    memcpy(cs->packets + cs->cdw, data, size * 4);
-    cs->cdw += size;
-    if (cs->section) {
-	    cs->section_cdw += size;
-    }
-}
+#define MAX_SPACE_BOS (32)
 
-static inline void radeon_cs_space_set_flush(struct radeon_cs *cs, void (*fn)(void *), void *data)
-{
-    cs->space_flush_fn = fn;
-    cs->space_flush_data = data;
-}
+struct radeon_cs_manager;
 
+extern struct radeon_cs *radeon_cs_create(struct radeon_cs_manager *csm,
+					  uint32_t ndw);
+
+extern int radeon_cs_begin(struct radeon_cs *cs,
+			   uint32_t ndw,
+			   const char *file,
+			   const char *func, int line);
+extern int radeon_cs_end(struct radeon_cs *cs,
+			 const char *file,
+			 const char *func,
+			 int line);
+extern int radeon_cs_emit(struct radeon_cs *cs);
+extern int radeon_cs_destroy(struct radeon_cs *cs);
+extern int radeon_cs_erase(struct radeon_cs *cs);
+extern int radeon_cs_need_flush(struct radeon_cs *cs);
+extern void radeon_cs_print(struct radeon_cs *cs, FILE *file);
+extern void radeon_cs_set_limit(struct radeon_cs *cs, uint32_t domain, uint32_t limit);
+extern void radeon_cs_space_set_flush(struct radeon_cs *cs, void (*fn)(void *), void *data);
+extern int radeon_cs_write_reloc(struct radeon_cs *cs,
+				 struct radeon_bo *bo,
+				 uint32_t read_domain,
+				 uint32_t write_domain,
+				 uint32_t flags);
 
 /*
  * add a persistent BO to the list
@@ -243,4 +112,30 @@ int radeon_cs_space_check_with_bo(struct radeon_cs *cs,
 				  uint32_t read_domains,
 				  uint32_t write_domain);
 
+static inline void radeon_cs_write_dword(struct radeon_cs *cs, uint32_t dword)
+{
+    cs->packets[cs->cdw++] = dword;
+    if (cs->section_ndw) {
+        cs->section_cdw++;
+    }
+}
+
+static inline void radeon_cs_write_qword(struct radeon_cs *cs, uint64_t qword)
+{
+    memcpy(cs->packets + cs->cdw, &qword, sizeof(uint64_t));
+    cs->cdw += 2;
+    if (cs->section_ndw) {
+        cs->section_cdw += 2;
+    }
+}
+
+static inline void radeon_cs_write_table(struct radeon_cs *cs,
+					 void *data, uint32_t size)
+{
+    memcpy(cs->packets + cs->cdw, data, size * 4);
+    cs->cdw += size;
+    if (cs->section_ndw) {
+	cs->section_cdw += size;
+    }
+}
 #endif
diff --git a/radeon/radeon_cs_int_drm.h b/radeon/radeon_cs_int_drm.h
new file mode 100644
index 0000000..8ba76bf
--- /dev/null
+++ b/radeon/radeon_cs_int_drm.h
@@ -0,0 +1,66 @@
+
+#ifndef _RADEON_CS_INT_H_
+#define _RADEON_CS_INT_H_
+
+struct radeon_cs_space_check {
+    struct radeon_bo_int *bo;
+    uint32_t read_domains;
+    uint32_t write_domain;
+    uint32_t new_accounted;
+};
+
+struct radeon_cs_int {
+    /* public members: keep in the same order and place as struct radeon_cs */
+    uint32_t                    *packets;    
+    unsigned                    cdw;
+    unsigned                    ndw;
+    unsigned                    section_ndw;
+    unsigned                    section_cdw;
+    /* private members */
+    struct radeon_cs_manager    *csm;
+    void                        *relocs;
+    unsigned                    crelocs;
+    unsigned                    relocs_total_size;
+    const char                  *section_file;
+    const char                  *section_func;
+    int                         section_line;
+    struct radeon_cs_space_check bos[MAX_SPACE_BOS];
+    int                         bo_count;
+    void                        (*space_flush_fn)(void *);
+    void                        *space_flush_data;
+};
+
+/* cs functions */
+struct radeon_cs_funcs {
+    struct radeon_cs_int *(*cs_create)(struct radeon_cs_manager *csm,
+                                   uint32_t ndw);
+    int (*cs_write_reloc)(struct radeon_cs_int *cs,
+                          struct radeon_bo *bo,
+                          uint32_t read_domain,
+                          uint32_t write_domain,
+                          uint32_t flags);
+    int (*cs_begin)(struct radeon_cs_int *cs,
+                    uint32_t ndw,
+		    const char *file,
+		    const char *func,
+		    int line);
+    int (*cs_end)(struct radeon_cs_int *cs,
+		  const char *file, const char *func,
+		  int line);
+
+
+    int (*cs_emit)(struct radeon_cs_int *cs);
+    int (*cs_destroy)(struct radeon_cs_int *cs);
+    int (*cs_erase)(struct radeon_cs_int *cs);
+    int (*cs_need_flush)(struct radeon_cs_int *cs);
+    void (*cs_print)(struct radeon_cs_int *cs, FILE *file);
+};
+
+struct radeon_cs_manager {
+    struct radeon_cs_funcs  *funcs;
+    int                     fd;
+    int32_t vram_limit, gart_limit;
+    int32_t vram_write_used, gart_write_used;
+    int32_t read_used;
+};
+#endif
diff --git a/radeon/radeon_cs_legacy.c b/radeon/radeon_cs_legacy.c
index f1addb2..45b608a 100644
--- a/radeon/radeon_cs_legacy.c
+++ b/radeon/radeon_cs_legacy.c
@@ -30,10 +30,18 @@
  *      Jérôme Glisse <glisse@freedesktop.org>
  */
 #include <errno.h>
+#include <unistd.h>
+#include <stdint.h>
+#include "drm.h"
+#include "radeon_drm.h"
 
 #include "radeon_bocs_wrapper.h"
 #include "radeon_common.h"
-
+#ifdef HAVE_LIBDRM_RADEON
+#include "radeon_cs_int.h"
+#else
+#include "radeon_cs_int_drm.h"
+#endif
 struct cs_manager_legacy {
     struct radeon_cs_manager    base;
     struct radeon_context       *ctx;
@@ -51,27 +59,27 @@ struct cs_reloc_legacy {
 };
 
 
-static struct radeon_cs *cs_create(struct radeon_cs_manager *csm,
-                                   uint32_t ndw)
+static struct radeon_cs_int *cs_create(struct radeon_cs_manager *csm,
+				       uint32_t ndw)
 {
-    struct radeon_cs *cs;
+    struct radeon_cs_int *csi;
 
-    cs = (struct radeon_cs*)calloc(1, sizeof(struct radeon_cs));
-    if (cs == NULL) {
+    csi = (struct radeon_cs_int*)calloc(1, sizeof(struct radeon_cs_int));
+    if (csi == NULL) {
         return NULL;
     }
-    cs->csm = csm;
-    cs->ndw = (ndw + 0x3FF) & (~0x3FF);
-    cs->packets = (uint32_t*)malloc(4*cs->ndw);
-    if (cs->packets == NULL) {
-        free(cs);
+    csi->csm = csm;
+    csi->ndw = (ndw + 0x3FF) & (~0x3FF);
+    csi->packets = (uint32_t*)malloc(4*csi->ndw);
+    if (csi->packets == NULL) {
+        free(csi);
         return NULL;
     }
-    cs->relocs_total_size = 0;
-    return cs;
+    csi->relocs_total_size = 0;
+    return csi;
 }
 
-static int cs_write_reloc(struct radeon_cs *cs,
+static int cs_write_reloc(struct radeon_cs_int *cs,
                           struct radeon_bo *bo,
                           uint32_t read_domain,
                           uint32_t write_domain,
@@ -150,20 +158,19 @@ static int cs_write_reloc(struct radeon_cs *cs,
     return 0;
 }
 
-static int cs_begin(struct radeon_cs *cs,
+static int cs_begin(struct radeon_cs_int *cs,
                     uint32_t ndw,
                     const char *file,
                     const char *func,
                     int line)
 {
-    if (cs->section) {
+    if (cs->section_ndw) {
         fprintf(stderr, "CS already in a section(%s,%s,%d)\n",
                 cs->section_file, cs->section_func, cs->section_line);
         fprintf(stderr, "CS can't start section(%s,%s,%d)\n",
                 file, func, line);
         return -EPIPE;
     }
-    cs->section = 1;
     cs->section_ndw = ndw;
     cs->section_cdw = 0;
     cs->section_file = file;
@@ -187,18 +194,17 @@ static int cs_begin(struct radeon_cs *cs,
     return 0;
 }
 
-static int cs_end(struct radeon_cs *cs,
+static int cs_end(struct radeon_cs_int *cs,
                   const char *file,
                   const char *func,
                   int line)
 
 {
-    if (!cs->section) {
+    if (!cs->section_ndw) {
         fprintf(stderr, "CS no section to end at (%s,%s,%d)\n",
                 file, func, line);
         return -EPIPE;
     }
-    cs->section = 0;
     if (cs->section_ndw != cs->section_cdw) {
         fprintf(stderr, "CS section size missmatch start at (%s,%s,%d) %d vs %d\n",
                 cs->section_file, cs->section_func, cs->section_line, cs->section_ndw, cs->section_cdw);
@@ -206,10 +212,12 @@ static int cs_end(struct radeon_cs *cs,
                 file, func, line);
         return -EPIPE;
     }
+    cs->section_ndw = 0;
+
     return 0;
 }
 
-static int cs_process_relocs(struct radeon_cs *cs)
+static int cs_process_relocs(struct radeon_cs_int *cs)
 {
     struct cs_manager_legacy *csm = (struct cs_manager_legacy*)cs->csm;
     struct cs_reloc_legacy *relocs;
@@ -254,7 +262,7 @@ restart:
     return 0;
 }
 
-static int cs_set_age(struct radeon_cs *cs)
+static int cs_set_age(struct radeon_cs_int *cs)
 {
     struct cs_manager_legacy *csm = (struct cs_manager_legacy*)cs->csm;
     struct cs_reloc_legacy *relocs;
@@ -268,7 +276,7 @@ static int cs_set_age(struct radeon_cs *cs)
     return 0;
 }
 
-static int cs_emit(struct radeon_cs *cs)
+static int cs_emit(struct radeon_cs_int *cs)
 {
     struct cs_manager_legacy *csm = (struct cs_manager_legacy*)cs->csm;
     drm_radeon_cmd_buffer_t cmd;
@@ -276,7 +284,7 @@ static int cs_emit(struct radeon_cs *cs)
     uint64_t ull;
     int r;
 
-    csm->ctx->vtbl.emit_cs_header(cs, csm->ctx);
+    csm->ctx->vtbl.emit_cs_header((struct radeon_cs *)cs, csm->ctx);
 
     /* append buffer age */
     if ( IS_R300_CLASS(csm->ctx->radeonScreen) )
@@ -289,9 +297,9 @@ static int cs_emit(struct radeon_cs *cs)
       age.scratch.reg = 2;
       age.scratch.n_bufs = 1;
       age.scratch.flags = 0;
-      radeon_cs_write_dword(cs, age.u);
-      radeon_cs_write_qword(cs, ull);
-      radeon_cs_write_dword(cs, 0);
+      radeon_cs_write_dword((struct radeon_cs *)cs, age.u);
+      radeon_cs_write_qword((struct radeon_cs *)cs, ull);
+      radeon_cs_write_dword((struct radeon_cs *)cs, 0);
     }
 
     r = cs_process_relocs(cs);
@@ -342,7 +350,7 @@ static void inline cs_free_reloc(void *relocs_p, int crelocs)
       free(relocs[i].indices);
 }
 
-static int cs_destroy(struct radeon_cs *cs)
+static int cs_destroy(struct radeon_cs_int *cs)
 {
     cs_free_reloc(cs->relocs, cs->crelocs);
     free(cs->relocs);
@@ -351,7 +359,7 @@ static int cs_destroy(struct radeon_cs *cs)
     return 0;
 }
 
-static int cs_erase(struct radeon_cs *cs)
+static int cs_erase(struct radeon_cs_int *cs)
 {
     cs_free_reloc(cs->relocs, cs->crelocs);
     free(cs->relocs);
@@ -359,18 +367,18 @@ static int cs_erase(struct radeon_cs *cs)
     cs->relocs = NULL;
     cs->crelocs = 0;
     cs->cdw = 0;
-    cs->section = 0;
+    cs->section_ndw = 0;
     return 0;
 }
 
-static int cs_need_flush(struct radeon_cs *cs)
+static int cs_need_flush(struct radeon_cs_int *cs)
 {
     /* this function used to flush when the BO usage got to
      * a certain size, now the higher levels handle this better */
     return 0;
 }
 
-static void cs_print(struct radeon_cs *cs, FILE *file)
+static void cs_print(struct radeon_cs_int *cs, FILE *file)
 {
 }
 
diff --git a/radeon/radeon_cs_space_drm.c b/radeon/radeon_cs_space_drm.c
index 89cbbb5..e22b437 100644
--- a/radeon/radeon_cs_space_drm.c
+++ b/radeon/radeon_cs_space_drm.c
@@ -29,6 +29,8 @@
 #include <errno.h>
 #include <stdlib.h>
 #include "radeon_bocs_wrapper.h"
+#include "radeon_bo_int_drm.h"
+#include "radeon_cs_int_drm.h"
 
 struct rad_sizes {
     int32_t op_read;
@@ -39,7 +41,7 @@ struct rad_sizes {
 static inline int radeon_cs_setup_bo(struct radeon_cs_space_check *sc, struct rad_sizes *sizes)
 {
     uint32_t read_domains, write_domain;
-    struct radeon_bo *bo;
+    struct radeon_bo_int *bo;
 
     bo = sc->bo;
     sc->new_accounted = 0;
@@ -47,7 +49,7 @@ static inline int radeon_cs_setup_bo(struct radeon_cs_space_check *sc, struct ra
     write_domain = sc->write_domain;
 
     /* legacy needs a static check */
-    if (radeon_bo_is_static(bo)) {
+    if (radeon_bo_is_static((struct radeon_bo *)sc->bo)) {
 	bo->space_accounted = sc->new_accounted = (read_domains << 16) | write_domain;
 	return 0;
     }
@@ -100,11 +102,11 @@ static inline int radeon_cs_setup_bo(struct radeon_cs_space_check *sc, struct ra
     return 0;
 }
 
-static int radeon_cs_do_space_check(struct radeon_cs *cs, struct radeon_cs_space_check *new_tmp)
+static int radeon_cs_do_space_check(struct radeon_cs_int *cs, struct radeon_cs_space_check *new_tmp)
 {
     struct radeon_cs_manager *csm = cs->csm;
     int i;
-    struct radeon_bo *bo;
+    struct radeon_bo_int *bo;
     struct rad_sizes sizes;
     int ret;
 
@@ -158,25 +160,28 @@ static int radeon_cs_do_space_check(struct radeon_cs *cs, struct radeon_cs_space
 
 void radeon_cs_space_add_persistent_bo(struct radeon_cs *cs, struct radeon_bo *bo, uint32_t read_domains, uint32_t write_domain)
 {
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
     int i;
-    for (i = 0; i < cs->bo_count; i++) {
-	if (cs->bos[i].bo == bo &&
-	    cs->bos[i].read_domains == read_domains &&
-	    cs->bos[i].write_domain == write_domain)
+    for (i = 0; i < csi->bo_count; i++) {
+	if (csi->bos[i].bo == boi &&
+	    csi->bos[i].read_domains == read_domains &&
+	    csi->bos[i].write_domain == write_domain)
 	    return;
     }
     radeon_bo_ref(bo);
-    i = cs->bo_count;
-    cs->bos[i].bo = bo;
-    cs->bos[i].read_domains = read_domains;
-    cs->bos[i].write_domain = write_domain;
-    cs->bos[i].new_accounted = 0;
-    cs->bo_count++;
-
-    assert(cs->bo_count < MAX_SPACE_BOS);
+    i = csi->bo_count;
+    csi->bos[i].bo = boi;
+    csi->bos[i].read_domains = read_domains;
+    csi->bos[i].write_domain = write_domain;
+    csi->bos[i].new_accounted = 0;
+    csi->bo_count++;
+
+    assert(csi->bo_count < MAX_SPACE_BOS);
 }
 
-static int radeon_cs_check_space_internal(struct radeon_cs *cs, struct radeon_cs_space_check *tmp_bo)
+static int radeon_cs_check_space_internal(struct radeon_cs_int *cs,
+					  struct radeon_cs_space_check *tmp_bo)
 {
     int ret;
     int flushed = 0;
@@ -198,37 +203,42 @@ again:
 int radeon_cs_space_check_with_bo(struct radeon_cs *cs,
 				  struct radeon_bo *bo,
 				  uint32_t read_domains, uint32_t write_domain)
-{									
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
     struct radeon_cs_space_check temp_bo;
+    
     int ret = 0;
 
     if (bo) {
-	temp_bo.bo = bo;
+	temp_bo.bo = boi;
 	temp_bo.read_domains = read_domains;
 	temp_bo.write_domain = write_domain;
 	temp_bo.new_accounted = 0;
     }
 
-    ret = radeon_cs_check_space_internal(cs, bo ? &temp_bo : NULL);
+    ret = radeon_cs_check_space_internal(csi, bo ? &temp_bo : NULL);
     return ret;
 }
 
 int radeon_cs_space_check(struct radeon_cs *cs)
 {
-    return radeon_cs_check_space_internal(cs, NULL);
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    return radeon_cs_check_space_internal(csi, NULL);
 }
 
 void radeon_cs_space_reset_bos(struct radeon_cs *cs)
 {
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
     int i;
-    for (i = 0; i < cs->bo_count; i++) {
-	radeon_bo_unref(cs->bos[i].bo);
-	cs->bos[i].bo = NULL;
-	cs->bos[i].read_domains = 0;
-	cs->bos[i].write_domain = 0;
-	cs->bos[i].new_accounted = 0;
+    for (i = 0; i < csi->bo_count; i++) {
+	radeon_bo_unref((struct radeon_bo *)csi->bos[i].bo);
+	csi->bos[i].bo = NULL;
+	csi->bos[i].read_domains = 0;
+	csi->bos[i].write_domain = 0;
+	csi->bos[i].new_accounted = 0;
     }
-    cs->bo_count = 0;
+    csi->bo_count = 0;
 }
 
 
diff --git a/radeon/radeon_dma.c b/radeon/radeon_dma.c
index c6edbae..232972d 100644
--- a/radeon/radeon_dma.c
+++ b/radeon/radeon_dma.c
@@ -306,10 +306,6 @@ static int radeon_bo_is_idle(struct radeon_bo* bo)
 		WARN_ONCE("Your libdrm or kernel doesn't have support for busy query.\n"
 			"This may cause small performance drop for you.\n");
 	}
-	/* Protect against bug in legacy bo handling that causes bos stay
-	 * referenced even after they should be freed */
-	if (bo->cref != 1)
-		return 0;
 	return ret != -EBUSY;
 }
 
@@ -346,9 +342,7 @@ void radeonReleaseDmaRegions(radeonContextPtr rmesa)
 	foreach_s(dma_bo, temp, &rmesa->dma.wait) {
 		if (dma_bo->expire_counter == time) {
 			WARN_ONCE("Leaking dma buffer object!\n");
-			/* force free of buffer so we don't realy start
-			 * leaking stuff now*/
-			while ((dma_bo->bo = radeon_bo_unref(dma_bo->bo))) {}
+			radeon_bo_unref(dma_bo->bo);
 			remove_from_list(dma_bo);
 			FREE(dma_bo);
 			continue;
diff --git a/radeon/radeon_fbo.c b/radeon/radeon_fbo.c
index d83b166..fc21069 100644
--- a/radeon/radeon_fbo.c
+++ b/radeon/radeon_fbo.c
@@ -33,7 +33,6 @@
 #include "main/framebuffer.h"
 #include "main/renderbuffer.h"
 #include "main/context.h"
-#include "main/texformat.h"
 #include "main/texrender.h"
 #include "drivers/common/meta.h"
 
@@ -91,11 +90,8 @@ radeon_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
    case GL_R3_G3_B2:
    case GL_RGB4:
    case GL_RGB5:
-      rb->_ActualFormat = GL_RGB5;
+      rb->Format = _dri_texformat_rgb565;
       rb->DataType = GL_UNSIGNED_BYTE;
-      rb->RedBits = 5;
-      rb->GreenBits = 6;
-      rb->BlueBits = 5;
       cpp = 2;
       break;
    case GL_RGB:
@@ -103,12 +99,8 @@ radeon_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
    case GL_RGB10:
    case GL_RGB12:
    case GL_RGB16:
-      rb->_ActualFormat = GL_RGB8;
+      rb->Format = _dri_texformat_argb8888;
       rb->DataType = GL_UNSIGNED_BYTE;
-      rb->RedBits = 8;
-      rb->GreenBits = 8;
-      rb->BlueBits = 8;
-      rb->AlphaBits = 0;
       cpp = 4;
       break;
    case GL_RGBA:
@@ -119,12 +111,8 @@ radeon_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
    case GL_RGB10_A2:
    case GL_RGBA12:
    case GL_RGBA16:
-      rb->_ActualFormat = GL_RGBA8;
+      rb->Format = _dri_texformat_argb8888;
       rb->DataType = GL_UNSIGNED_BYTE;
-      rb->RedBits = 8;
-      rb->GreenBits = 8;
-      rb->BlueBits = 8;
-      rb->AlphaBits = 8;
       cpp = 4;
       break;
    case GL_STENCIL_INDEX:
@@ -133,39 +121,36 @@ radeon_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
    case GL_STENCIL_INDEX8_EXT:
    case GL_STENCIL_INDEX16_EXT:
       /* alloc a depth+stencil buffer */
-      rb->_ActualFormat = GL_DEPTH24_STENCIL8_EXT;
+      rb->Format = MESA_FORMAT_S8_Z24;
       rb->DataType = GL_UNSIGNED_INT_24_8_EXT;
-      rb->StencilBits = 8;
       cpp = 4;
       break;
    case GL_DEPTH_COMPONENT16:
-      rb->_ActualFormat = GL_DEPTH_COMPONENT16;
+      rb->Format = MESA_FORMAT_Z16;
       rb->DataType = GL_UNSIGNED_SHORT;
-      rb->DepthBits = 16;
       cpp = 2;
       break;
    case GL_DEPTH_COMPONENT:
    case GL_DEPTH_COMPONENT24:
    case GL_DEPTH_COMPONENT32:
-      rb->_ActualFormat = GL_DEPTH_COMPONENT24;
+      rb->Format = MESA_FORMAT_X8_Z24;
       rb->DataType = GL_UNSIGNED_INT;
-      rb->DepthBits = 24;
       cpp = 4;
       break;
    case GL_DEPTH_STENCIL_EXT:
    case GL_DEPTH24_STENCIL8_EXT:
-      rb->_ActualFormat = GL_DEPTH24_STENCIL8_EXT;
+      rb->Format = MESA_FORMAT_S8_Z24;
       rb->DataType = GL_UNSIGNED_INT_24_8_EXT;
-      rb->DepthBits = 24;
-      rb->StencilBits = 8;
       cpp = 4;
       break;
    default:
       _mesa_problem(ctx,
-                    "Unexpected format in intel_alloc_renderbuffer_storage");
+                    "Unexpected format in radeon_alloc_renderbuffer_storage");
       return GL_FALSE;
    }
 
+  rb->_BaseFormat = _mesa_base_fbo_format(ctx, internalFormat);
+
   if (ctx->Driver.Flush)
 	  ctx->Driver.Flush(ctx); /* +r6/r7 */
 
@@ -213,7 +198,7 @@ radeon_alloc_window_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
    ASSERT(rb->Name == 0);
    rb->Width = width;
    rb->Height = height;
-   rb->_ActualFormat = internalFormat;
+   rb->InternalFormat = internalFormat;
 
    return GL_TRUE;
 }
@@ -255,8 +240,13 @@ radeon_nop_alloc_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
    return GL_FALSE;
 }
 
+
+/**
+ * Create a renderbuffer for a window's color, depth and/or stencil buffer.
+ * Not used for user-created renderbuffers.
+ */
 struct radeon_renderbuffer *
-radeon_create_renderbuffer(GLenum format, __DRIdrawablePrivate *driDrawPriv)
+radeon_create_renderbuffer(gl_format format, __DRIdrawablePrivate *driDrawPriv)
 {
     struct radeon_renderbuffer *rrb;
 
@@ -267,67 +257,64 @@ radeon_create_renderbuffer(GLenum format, __DRIdrawablePrivate *driDrawPriv)
     _mesa_init_renderbuffer(&rrb->base, 0);
     rrb->base.ClassID = RADEON_RB_CLASS;
 
-    /* XXX format junk */
+    rrb->base.Format = format;
+
     switch (format) {
-	case GL_RGB5:
-	    rrb->base._ActualFormat = GL_RGB5;
-	    rrb->base._BaseFormat = GL_RGBA;
-	    rrb->base.RedBits = 5;
-	    rrb->base.GreenBits = 6;
-	    rrb->base.BlueBits = 5;
+        case MESA_FORMAT_RGB565:
+	    assert(_mesa_little_endian());
+	    rrb->base.DataType = GL_UNSIGNED_BYTE;
+            rrb->base._BaseFormat = GL_RGB;
+	    break;
+        case MESA_FORMAT_RGB565_REV:
+	    assert(!_mesa_little_endian());
+	    rrb->base.DataType = GL_UNSIGNED_BYTE;
+            rrb->base._BaseFormat = GL_RGB;
+	    break;
+        case MESA_FORMAT_XRGB8888:
+	    assert(_mesa_little_endian());
 	    rrb->base.DataType = GL_UNSIGNED_BYTE;
+            rrb->base._BaseFormat = GL_RGB;
 	    break;
-	case GL_RGB8:
-	    rrb->base._ActualFormat = GL_RGB8;
-	    rrb->base._BaseFormat = GL_RGB;
-	    rrb->base.RedBits = 8;
-	    rrb->base.GreenBits = 8;
-	    rrb->base.BlueBits = 8;
-	    rrb->base.AlphaBits = 0;
+        case MESA_FORMAT_XRGB8888_REV:
+	    assert(!_mesa_little_endian());
 	    rrb->base.DataType = GL_UNSIGNED_BYTE;
+            rrb->base._BaseFormat = GL_RGB;
 	    break;
-	case GL_RGBA8:
-	    rrb->base._ActualFormat = GL_RGBA8;
-	    rrb->base._BaseFormat = GL_RGBA;
-	    rrb->base.RedBits = 8;
-	    rrb->base.GreenBits = 8;
-	    rrb->base.BlueBits = 8;
-	    rrb->base.AlphaBits = 8;
+	case MESA_FORMAT_ARGB8888:
+	    assert(_mesa_little_endian());
 	    rrb->base.DataType = GL_UNSIGNED_BYTE;
+            rrb->base._BaseFormat = GL_RGBA;
 	    break;
-	case GL_STENCIL_INDEX8_EXT:
-	    rrb->base._ActualFormat = GL_STENCIL_INDEX8_EXT;
-	    rrb->base._BaseFormat = GL_STENCIL_INDEX;
-	    rrb->base.StencilBits = 8;
+	case MESA_FORMAT_ARGB8888_REV:
+	    assert(!_mesa_little_endian());
 	    rrb->base.DataType = GL_UNSIGNED_BYTE;
+            rrb->base._BaseFormat = GL_RGBA;
 	    break;
-	case GL_DEPTH_COMPONENT16:
-	    rrb->base._ActualFormat = GL_DEPTH_COMPONENT16;
-	    rrb->base._BaseFormat = GL_DEPTH_COMPONENT;
-	    rrb->base.DepthBits = 16;
+	case MESA_FORMAT_S8:
+	    rrb->base.DataType = GL_UNSIGNED_BYTE;
+            rrb->base._BaseFormat = GL_STENCIL_INDEX;
+	    break;
+	case MESA_FORMAT_Z16:
 	    rrb->base.DataType = GL_UNSIGNED_SHORT;
+            rrb->base._BaseFormat = GL_DEPTH_COMPONENT;
 	    break;
-	case GL_DEPTH_COMPONENT24:
-	    rrb->base._ActualFormat = GL_DEPTH_COMPONENT24;
-	    rrb->base._BaseFormat = GL_DEPTH_COMPONENT;
-	    rrb->base.DepthBits = 24;
+	case MESA_FORMAT_X8_Z24:
 	    rrb->base.DataType = GL_UNSIGNED_INT;
+            rrb->base._BaseFormat = GL_DEPTH_COMPONENT;
 	    break;
-	case GL_DEPTH24_STENCIL8_EXT:
-	    rrb->base._ActualFormat = GL_DEPTH24_STENCIL8_EXT;
-	    rrb->base._BaseFormat = GL_DEPTH_STENCIL_EXT;
-	    rrb->base.DepthBits = 24;
-	    rrb->base.StencilBits = 8;
+	case MESA_FORMAT_S8_Z24:
 	    rrb->base.DataType = GL_UNSIGNED_INT_24_8_EXT;
+            rrb->base._BaseFormat = GL_DEPTH_STENCIL;
 	    break;
 	default:
-	    fprintf(stderr, "%s: Unknown format 0x%04x\n", __FUNCTION__, format);
+	    fprintf(stderr, "%s: Unknown format %s\n",
+                    __FUNCTION__, _mesa_get_format_name(format));
 	    _mesa_delete_renderbuffer(&rrb->base);
 	    return NULL;
     }
 
     rrb->dPriv = driDrawPriv;
-    rrb->base.InternalFormat = format;
+    rrb->base.InternalFormat = _mesa_get_format_base_format(format);
 
     rrb->base.Delete = radeon_delete_renderbuffer;
     rrb->base.AllocStorage = radeon_alloc_window_storage;
@@ -382,51 +369,41 @@ radeon_framebuffer_renderbuffer(GLcontext * ctx,
 }
 
 
+/* TODO: According to the EXT_fbo spec, the internal format of a texture
+ * image, once set during the glTexImage call, should be preserved when
+ * attaching the image to a renderbuffer. When the HW doesn't support
+ * rendering to the format of the attached image, set framebuffer
+ * completeness accordingly in radeon_validate_framebuffer (issue #79).
+ */
 static GLboolean
 radeon_update_wrapper(GLcontext *ctx, struct radeon_renderbuffer *rrb, 
 		     struct gl_texture_image *texImage)
 {
 	int retry = 0;
+	gl_format texFormat;
+
 restart:
-	if (texImage->TexFormat == &_mesa_texformat_argb8888) {
-		rrb->cpp = 4;
-		rrb->base._ActualFormat = GL_RGBA8;
-		rrb->base._BaseFormat = GL_RGBA;
+	if (texImage->TexFormat == _dri_texformat_argb8888) {
 		rrb->base.DataType = GL_UNSIGNED_BYTE;
 		DBG("Render to RGBA8 texture OK\n");
 	}
-	else if (texImage->TexFormat == &_mesa_texformat_rgb565) {
-		rrb->cpp = 2;
-		rrb->base._ActualFormat = GL_RGB5;
-		rrb->base._BaseFormat = GL_RGB;
+	else if (texImage->TexFormat == _dri_texformat_rgb565) {
 		rrb->base.DataType = GL_UNSIGNED_BYTE;
 		DBG("Render to RGB5 texture OK\n");
 	}
-	else if (texImage->TexFormat == &_mesa_texformat_argb1555) {
-		rrb->cpp = 2;
-		rrb->base._ActualFormat = GL_RGB5_A1;
-		rrb->base._BaseFormat = GL_RGBA;
+	else if (texImage->TexFormat == _dri_texformat_argb1555) {
 		rrb->base.DataType = GL_UNSIGNED_BYTE;
 		DBG("Render to ARGB1555 texture OK\n");
 	}
-	else if (texImage->TexFormat == &_mesa_texformat_argb4444) {
-		rrb->cpp = 2;
-		rrb->base._ActualFormat = GL_RGBA4;
-		rrb->base._BaseFormat = GL_RGBA;
+	else if (texImage->TexFormat == _dri_texformat_argb4444) {
 		rrb->base.DataType = GL_UNSIGNED_BYTE;
-		DBG("Render to ARGB1555 texture OK\n");
+		DBG("Render to ARGB4444 texture OK\n");
 	}
-	else if (texImage->TexFormat == &_mesa_texformat_z16) {
-		rrb->cpp = 2;
-		rrb->base._ActualFormat = GL_DEPTH_COMPONENT16;
-		rrb->base._BaseFormat = GL_DEPTH_COMPONENT;
+	else if (texImage->TexFormat == MESA_FORMAT_Z16) {
 		rrb->base.DataType = GL_UNSIGNED_SHORT;
 		DBG("Render to DEPTH16 texture OK\n");
 	}
-	else if (texImage->TexFormat == &_mesa_texformat_s8_z24) {
-		rrb->cpp = 4;
-		rrb->base._ActualFormat = GL_DEPTH24_STENCIL8_EXT;
-		rrb->base._BaseFormat = GL_DEPTH_STENCIL_EXT;
+	else if (texImage->TexFormat == MESA_FORMAT_S8_Z24) {
 		rrb->base.DataType = GL_UNSIGNED_INT_24_8_EXT;
 		DBG("Render to DEPTH_STENCIL texture OK\n");
 	}
@@ -434,27 +411,31 @@ restart:
 		/* try redoing the FBO */
 		if (retry == 1) {
 			DBG("Render to texture BAD FORMAT %d\n",
-			    texImage->TexFormat->MesaFormat);
+			    texImage->TexFormat);
 			return GL_FALSE;
 		}
+                /* XXX why is the tex format being set here?
+                 * I think this can be removed.
+                 */
 		texImage->TexFormat = radeonChooseTextureFormat(ctx, texImage->InternalFormat, 0,
-								texImage->TexFormat->DataType,
+								_mesa_get_format_datatype(texImage->TexFormat),
 								1);
 
 		retry++;
 		goto restart;
 	}
 	
+	texFormat = texImage->TexFormat;
+
+	rrb->base.Format = texFormat;
+
+        rrb->cpp = _mesa_get_format_bytes(texFormat);
 	rrb->pitch = texImage->Width * rrb->cpp;
-	rrb->base.InternalFormat = rrb->base._ActualFormat;
+	rrb->base.InternalFormat = texImage->InternalFormat;
+        rrb->base._BaseFormat = _mesa_base_fbo_format(ctx, rrb->base.InternalFormat);
+
 	rrb->base.Width = texImage->Width;
 	rrb->base.Height = texImage->Height;
-	rrb->base.RedBits = texImage->TexFormat->RedBits;
-	rrb->base.GreenBits = texImage->TexFormat->GreenBits;
-	rrb->base.BlueBits = texImage->TexFormat->BlueBits;
-	rrb->base.AlphaBits = texImage->TexFormat->AlphaBits;
-	rrb->base.DepthBits = texImage->TexFormat->DepthBits;
-	rrb->base.StencilBits = texImage->TexFormat->StencilBits;
 	
 	rrb->base.Delete = radeon_delete_renderbuffer;
 	rrb->base.AllocStorage = radeon_nop_alloc_storage;
@@ -555,8 +536,10 @@ radeon_render_texture(GLcontext * ctx,
       imageOffset += offsets[att->Zoffset];
    }
 
-   /* store that offset in the region */
+   /* store that offset in the region, along with the correct pitch for
+    * the image we are rendering to */
    rrb->draw_offset = imageOffset;
+   rrb->pitch = radeon_image->mt->levels[att->TextureLevel].rowstride;
 
    /* update drawing region, etc */
    radeon_draw_buffer(ctx, fb);
@@ -583,7 +566,7 @@ void radeon_fbo_init(struct radeon_context *radeon)
   radeon->glCtx->Driver.FinishRenderTexture = radeon_finish_render_texture;
   radeon->glCtx->Driver.ResizeBuffers = radeon_resize_buffers;
   radeon->glCtx->Driver.ValidateFramebuffer = radeon_validate_framebuffer;
-  radeon->glCtx->Driver.BlitFramebuffer = _mesa_meta_blit_framebuffer;
+  radeon->glCtx->Driver.BlitFramebuffer = _mesa_meta_BlitFramebuffer;
 }
 
   
diff --git a/radeon/radeon_lock.c b/radeon/radeon_lock.c
index 02de8e5..7ad781b 100644
--- a/radeon/radeon_lock.c
+++ b/radeon/radeon_lock.c
@@ -62,8 +62,6 @@ void radeonGetLock(radeonContextPtr rmesa, GLuint flags)
 	__DRIdrawablePrivate *const readable = radeon_get_readable(rmesa);
 	__DRIscreenPrivate *sPriv = rmesa->dri.screen;
 
-	assert(drawable != NULL);
-
 	drmGetLock(rmesa->dri.fd, rmesa->dri.hwContext, flags);
 
 	/* The window might have moved, so we might need to get new clip
@@ -74,12 +72,13 @@ void radeonGetLock(radeonContextPtr rmesa, GLuint flags)
 	 * Since the hardware state depends on having the latest drawable
 	 * clip rects, all state checking must be done _after_ this call.
 	 */
-	DRI_VALIDATE_DRAWABLE_INFO(sPriv, drawable);
-	if (drawable != readable) {
+	if (drawable)
+		DRI_VALIDATE_DRAWABLE_INFO(sPriv, drawable);
+	if (readable && drawable != readable) {
 		DRI_VALIDATE_DRAWABLE_INFO(sPriv, readable);
 	}
 
-	if (rmesa->lastStamp != drawable->lastStamp) {
+	if (drawable && (rmesa->lastStamp != drawable->lastStamp)) {
 		radeon_window_moved(rmesa);
 		rmesa->lastStamp = drawable->lastStamp;
 	}
diff --git a/radeon/radeon_mipmap_tree.c b/radeon/radeon_mipmap_tree.c
index 38db305..5a346c5 100644
--- a/radeon/radeon_mipmap_tree.c
+++ b/radeon/radeon_mipmap_tree.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (C) 2009 Maciej Cencora.
  * Copyright (C) 2008 Nicolai Haehnle.
  *
  * All Rights Reserved.
@@ -32,51 +33,39 @@
 
 #include "main/simple_list.h"
 #include "main/texcompress.h"
-#include "main/texformat.h"
-
-static GLuint radeon_compressed_texture_size(GLcontext *ctx,
-		GLsizei width, GLsizei height, GLsizei depth,
-		GLuint mesaFormat)
+#include "main/teximage.h"
+#include "main/texobj.h"
+#include "radeon_texture.h"
+
+static unsigned get_aligned_compressed_row_stride(
+		gl_format format,
+		unsigned width,
+		unsigned minStride)
 {
-	GLuint size = _mesa_compressed_texture_size(ctx, width, height, depth, mesaFormat);
-
-	if (mesaFormat == MESA_FORMAT_RGB_DXT1 ||
-	    mesaFormat == MESA_FORMAT_RGBA_DXT1) {
-		if (width + 3 < 8)	/* width one block */
-			size = size * 4;
-		else if (width + 3 < 16)
-			size = size * 2;
-	} else {
-		/* DXT3/5, 16 bytes per block */
-	  //		WARN_ONCE("DXT 3/5 suffers from multitexturing problems!\n");
-		if (width + 3 < 8)
-			size = size * 2;
+	const unsigned blockSize = _mesa_get_format_bytes(format);
+	unsigned blockWidth, blockHeight, numXBlocks;
+
+	_mesa_get_format_block_size(format, &blockWidth, &blockHeight);
+	numXBlocks = (width + blockWidth - 1) / blockWidth;
+
+	while (numXBlocks * blockSize < minStride)
+	{
+		++numXBlocks;
 	}
 
-	return size;
+	return numXBlocks * blockSize;
 }
 
-
-static int radeon_compressed_num_bytes(GLuint mesaFormat)
+static unsigned get_compressed_image_size(
+		gl_format format,
+		unsigned rowStride,
+		unsigned height)
 {
-   int bytes = 0;
-   switch(mesaFormat) {
-     
-   case MESA_FORMAT_RGB_FXT1:
-   case MESA_FORMAT_RGBA_FXT1:
-   case MESA_FORMAT_RGB_DXT1:
-   case MESA_FORMAT_RGBA_DXT1:
-     bytes = 2;
-     break;
-     
-   case MESA_FORMAT_RGBA_DXT3:
-   case MESA_FORMAT_RGBA_DXT5:
-     bytes = 4;
-   default:
-     break;
-   }
-   
-   return bytes;
+	unsigned blockWidth, blockHeight;
+
+	_mesa_get_format_block_size(format, &blockWidth, &blockHeight);
+
+	return rowStride * ((height + blockHeight - 1) / blockHeight);
 }
 
 /**
@@ -93,25 +82,22 @@ static void compute_tex_image_offset(radeonContextPtr rmesa, radeon_mipmap_tree
 	uint32_t row_align;
 
 	/* Find image size in bytes */
-	if (mt->compressed) {
-		/* TODO: Is this correct? Need test cases for compressed textures! */
-		row_align = rmesa->texture_compressed_row_align - 1;
-		lvl->rowstride = (lvl->width * mt->bpp + row_align) & ~row_align;
-		lvl->size = radeon_compressed_texture_size(mt->radeon->glCtx,
-							   lvl->width, lvl->height, lvl->depth, mt->compressed);
+	if (_mesa_is_format_compressed(mt->mesaFormat)) {
+		lvl->rowstride = get_aligned_compressed_row_stride(mt->mesaFormat, lvl->width, rmesa->texture_compressed_row_align);
+		lvl->size = get_compressed_image_size(mt->mesaFormat, lvl->rowstride, lvl->height);
 	} else if (mt->target == GL_TEXTURE_RECTANGLE_NV) {
 		row_align = rmesa->texture_rect_row_align - 1;
-		lvl->rowstride = (lvl->width * mt->bpp + row_align) & ~row_align;
+		lvl->rowstride = (_mesa_format_row_stride(mt->mesaFormat, lvl->width) + row_align) & ~row_align;
 		lvl->size = lvl->rowstride * lvl->height;
 	} else if (mt->tilebits & RADEON_TXO_MICRO_TILE) {
 		/* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
 		 * though the actual offset may be different (if texture is less than
 		 * 32 bytes width) to the untiled case */
-		lvl->rowstride = (lvl->width * mt->bpp * 2 + 31) & ~31;
+		lvl->rowstride = (_mesa_format_row_stride(mt->mesaFormat, lvl->width) * 2 + 31) & ~31;
 		lvl->size = lvl->rowstride * ((lvl->height + 1) / 2) * lvl->depth;
 	} else {
 		row_align = rmesa->texture_row_align - 1;
-		lvl->rowstride = (lvl->width * mt->bpp + row_align) & ~row_align;
+		lvl->rowstride = (_mesa_format_row_stride(mt->mesaFormat, lvl->width) + row_align) & ~row_align;
 		lvl->size = lvl->rowstride * lvl->height * lvl->depth;
 	}
 	assert(lvl->size > 0);
@@ -138,22 +124,19 @@ static GLuint minify(GLuint size, GLuint levels)
 
 static void calculate_miptree_layout_r100(radeonContextPtr rmesa, radeon_mipmap_tree *mt)
 {
-	GLuint curOffset;
-	GLuint numLevels;
-	GLuint i;
-	GLuint face;
+	GLuint curOffset, i, face, level;
 
-	numLevels = mt->lastLevel - mt->firstLevel + 1;
-	assert(numLevels <= rmesa->glCtx->Const.MaxTextureLevels);
+	assert(mt->numLevels <= rmesa->glCtx->Const.MaxTextureLevels);
 
 	curOffset = 0;
 	for(face = 0; face < mt->faces; face++) {
 
-		for(i = 0; i < numLevels; i++) {
-			mt->levels[i].width = minify(mt->width0, i);
-			mt->levels[i].height = minify(mt->height0, i);
-			mt->levels[i].depth = minify(mt->depth0, i);
-			compute_tex_image_offset(rmesa, mt, face, i, &curOffset);
+		for(i = 0, level = mt->baseLevel; i < mt->numLevels; i++, level++) {
+			mt->levels[level].valid = 1;
+			mt->levels[level].width = minify(mt->width0, i);
+			mt->levels[level].height = minify(mt->height0, i);
+			mt->levels[level].depth = minify(mt->depth0, i);
+			compute_tex_image_offset(rmesa, mt, face, level, &curOffset);
 		}
 	}
 
@@ -163,23 +146,21 @@ static void calculate_miptree_layout_r100(radeonContextPtr rmesa, radeon_mipmap_
 
 static void calculate_miptree_layout_r300(radeonContextPtr rmesa, radeon_mipmap_tree *mt)
 {
-	GLuint curOffset;
-	GLuint numLevels;
-	GLuint i;
+	GLuint curOffset, i, level;
 
-	numLevels = mt->lastLevel - mt->firstLevel + 1;
-	assert(numLevels <= rmesa->glCtx->Const.MaxTextureLevels);
+	assert(mt->numLevels <= rmesa->glCtx->Const.MaxTextureLevels);
 
 	curOffset = 0;
-	for(i = 0; i < numLevels; i++) {
+	for(i = 0, level = mt->baseLevel; i < mt->numLevels; i++, level++) {
 		GLuint face;
 
-		mt->levels[i].width = minify(mt->width0, i);
-		mt->levels[i].height = minify(mt->height0, i);
-		mt->levels[i].depth = minify(mt->depth0, i);
+		mt->levels[level].valid = 1;
+		mt->levels[level].width = minify(mt->width0, i);
+		mt->levels[level].height = minify(mt->height0, i);
+		mt->levels[level].depth = minify(mt->depth0, i);
 
 		for(face = 0; face < mt->faces; face++)
-			compute_tex_image_offset(rmesa, mt, face, i, &curOffset);
+			compute_tex_image_offset(rmesa, mt, face, level, &curOffset);
 	}
 
 	/* Note the required size in memory */
@@ -189,27 +170,22 @@ static void calculate_miptree_layout_r300(radeonContextPtr rmesa, radeon_mipmap_
 /**
  * Create a new mipmap tree, calculate its layout and allocate memory.
  */
-radeon_mipmap_tree* radeon_miptree_create(radeonContextPtr rmesa, radeonTexObj *t,
-		GLenum target, GLenum internal_format, GLuint firstLevel, GLuint lastLevel,
-		GLuint width0, GLuint height0, GLuint depth0,
-		GLuint bpp, GLuint tilebits, GLuint compressed)
+static radeon_mipmap_tree* radeon_miptree_create(radeonContextPtr rmesa,
+		GLenum target, gl_format mesaFormat, GLuint baseLevel, GLuint numLevels,
+		GLuint width0, GLuint height0, GLuint depth0, GLuint tilebits)
 {
 	radeon_mipmap_tree *mt = CALLOC_STRUCT(_radeon_mipmap_tree);
 
-	mt->radeon = rmesa;
-	mt->internal_format = internal_format;
+	mt->mesaFormat = mesaFormat;
 	mt->refcount = 1;
-	mt->t = t;
 	mt->target = target;
 	mt->faces = (target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
-	mt->firstLevel = firstLevel;
-	mt->lastLevel = lastLevel;
+	mt->baseLevel = baseLevel;
+	mt->numLevels = numLevels;
 	mt->width0 = width0;
 	mt->height0 = height0;
 	mt->depth0 = depth0;
-	mt->bpp = compressed ? radeon_compressed_num_bytes(compressed) : bpp;
 	mt->tilebits = tilebits;
-	mt->compressed = compressed;
 
 	if (rmesa->radeonScreen->chip_family >= CHIP_FAMILY_R300)
 		calculate_miptree_layout_r300(rmesa, mt);
@@ -224,53 +200,43 @@ radeon_mipmap_tree* radeon_miptree_create(radeonContextPtr rmesa, radeonTexObj *
 	return mt;
 }
 
-void radeon_miptree_reference(radeon_mipmap_tree *mt)
+void radeon_miptree_reference(radeon_mipmap_tree *mt, radeon_mipmap_tree **ptr)
 {
+	assert(!*ptr);
+
 	mt->refcount++;
 	assert(mt->refcount > 0);
+
+	*ptr = mt;
 }
 
-void radeon_miptree_unreference(radeon_mipmap_tree *mt)
+void radeon_miptree_unreference(radeon_mipmap_tree **ptr)
 {
+	radeon_mipmap_tree *mt = *ptr;
 	if (!mt)
 		return;
 
 	assert(mt->refcount > 0);
+
 	mt->refcount--;
 	if (!mt->refcount) {
 		radeon_bo_unref(mt->bo);
 		free(mt);
 	}
-}
 
+	*ptr = 0;
+}
 
 /**
- * Calculate first and last mip levels for the given texture object,
- * where the dimensions are taken from the given texture image at
- * the given level.
- *
- * Note: level is the OpenGL level number, which is not necessarily the same
- * as the first level that is actually present.
- *
- * The base level image of the given texture face must be non-null,
- * or this will fail.
+ * Calculate min and max LOD for the given texture object.
+ * @param[in] tObj texture object whose LOD values to calculate
+ * @param[out] pminLod minimal LOD
+ * @param[out] pmaxLod maximal LOD
  */
-static void calculate_first_last_level(struct gl_texture_object *tObj,
-				       GLuint *pfirstLevel, GLuint *plastLevel,
-				       GLuint face, GLuint level)
+static void calculate_min_max_lod(struct gl_texture_object *tObj,
+				       unsigned *pminLod, unsigned *pmaxLod)
 {
-	const struct gl_texture_image * const baseImage =
-		tObj->Image[face][level];
-
-	assert(baseImage);
-	
-	/* These must be signed values.  MinLod and MaxLod can be negative numbers,
-	* and having firstLevel and lastLevel as signed prevents the need for
-	* extra sign checks.
-	*/
-	int   firstLevel;
-	int   lastLevel;
-
+	int minLod, maxLod;
 	/* Yes, this looks overly complicated, but it's all needed.
 	*/
 	switch (tObj->Target) {
@@ -281,32 +247,30 @@ static void calculate_first_last_level(struct gl_texture_object *tObj,
 		if (tObj->MinFilter == GL_NEAREST || tObj->MinFilter == GL_LINEAR) {
 			/* GL_NEAREST and GL_LINEAR only care about GL_TEXTURE_BASE_LEVEL.
 			*/
-			firstLevel = lastLevel = tObj->BaseLevel;
+			minLod = maxLod = tObj->BaseLevel;
 		} else {
-			firstLevel = tObj->BaseLevel + (GLint)(tObj->MinLod + 0.5);
-			firstLevel = MAX2(firstLevel, tObj->BaseLevel);
-			firstLevel = MIN2(firstLevel, level + baseImage->MaxLog2);
-			lastLevel = tObj->BaseLevel + (GLint)(tObj->MaxLod + 0.5);
-			lastLevel = MAX2(lastLevel, tObj->BaseLevel);
-			lastLevel = MIN2(lastLevel, level + baseImage->MaxLog2);
-			lastLevel = MIN2(lastLevel, tObj->MaxLevel);
-			lastLevel = MAX2(firstLevel, lastLevel); /* need at least one level */
+			minLod = tObj->BaseLevel + (GLint)(tObj->MinLod);
+			minLod = MAX2(minLod, tObj->BaseLevel);
+			minLod = MIN2(minLod, tObj->MaxLevel);
+			maxLod = tObj->BaseLevel + (GLint)(tObj->MaxLod + 0.5);
+			maxLod = MIN2(maxLod, tObj->MaxLevel);
+			maxLod = MIN2(maxLod, tObj->Image[0][minLod]->MaxLog2 + minLod);
+			maxLod = MAX2(maxLod, minLod); /* need at least one level */
 		}
 		break;
 	case GL_TEXTURE_RECTANGLE_NV:
 	case GL_TEXTURE_4D_SGIS:
-		firstLevel = lastLevel = 0;
+		minLod = maxLod = 0;
 		break;
 	default:
 		return;
 	}
 
 	/* save these values */
-	*pfirstLevel = firstLevel;
-	*plastLevel = lastLevel;
+	*pminLod = minLod;
+	*pmaxLod = maxLod;
 }
 
-
 /**
  * Checks whether the given miptree can hold the given texture image at the
  * given face and level.
@@ -316,20 +280,15 @@ GLboolean radeon_miptree_matches_image(radeon_mipmap_tree *mt,
 {
 	radeon_mipmap_level *lvl;
 
-	if (face >= mt->faces || level < mt->firstLevel || level > mt->lastLevel)
-		return GL_FALSE;
-
-	if (texImage->InternalFormat != mt->internal_format ||
-	    texImage->IsCompressed != mt->compressed)
+	if (face >= mt->faces)
 		return GL_FALSE;
 
-	if (!texImage->IsCompressed &&
-	    !mt->compressed &&
-	    texImage->TexFormat->TexelBytes != mt->bpp)
+	if (texImage->TexFormat != mt->mesaFormat)
 		return GL_FALSE;
 
-	lvl = &mt->levels[level - mt->firstLevel];
-	if (lvl->width != texImage->Width ||
+	lvl = &mt->levels[level];
+	if (!lvl->valid ||
+	    lvl->width != texImage->Width ||
 	    lvl->height != texImage->Height ||
 	    lvl->depth != texImage->Depth)
 		return GL_FALSE;
@@ -337,59 +296,72 @@ GLboolean radeon_miptree_matches_image(radeon_mipmap_tree *mt,
 	return GL_TRUE;
 }
 
-
 /**
  * Checks whether the given miptree has the right format to store the given texture object.
  */
-GLboolean radeon_miptree_matches_texture(radeon_mipmap_tree *mt, struct gl_texture_object *texObj)
+static GLboolean radeon_miptree_matches_texture(radeon_mipmap_tree *mt, struct gl_texture_object *texObj)
 {
 	struct gl_texture_image *firstImage;
-	GLuint compressed;
-	GLuint numfaces = 1;
-	GLuint firstLevel, lastLevel;
-
-	calculate_first_last_level(texObj, &firstLevel, &lastLevel, 0, texObj->BaseLevel);
-	if (texObj->Target == GL_TEXTURE_CUBE_MAP)
-		numfaces = 6;
-
-	firstImage = texObj->Image[0][firstLevel];
-	compressed = firstImage->IsCompressed ? firstImage->TexFormat->MesaFormat : 0;
-
-	return (mt->firstLevel == firstLevel &&
-	        mt->lastLevel == lastLevel &&
-	        mt->width0 == firstImage->Width &&
-	        mt->height0 == firstImage->Height &&
-	        mt->depth0 == firstImage->Depth &&
-	        mt->compressed == compressed &&
-	        (!mt->compressed ? (mt->bpp == firstImage->TexFormat->TexelBytes) : 1));
-}
+	unsigned numLevels;
+	radeon_mipmap_level *mtBaseLevel;
 
+	if (texObj->BaseLevel < mt->baseLevel)
+		return GL_FALSE;
+
+	mtBaseLevel = &mt->levels[texObj->BaseLevel - mt->baseLevel];
+	firstImage = texObj->Image[0][texObj->BaseLevel];
+	numLevels = MIN2(texObj->MaxLevel - texObj->BaseLevel + 1, firstImage->MaxLog2 + 1);
+
+	if (RADEON_DEBUG & RADEON_TEXTURE) {
+		fprintf(stderr, "Checking if miptree %p matches texObj %p\n", mt, texObj);
+		fprintf(stderr, "target %d vs %d\n", mt->target, texObj->Target);
+		fprintf(stderr, "format %d vs %d\n", mt->mesaFormat, firstImage->TexFormat);
+		fprintf(stderr, "numLevels %d vs %d\n", mt->numLevels, numLevels);
+		fprintf(stderr, "width0 %d vs %d\n", mtBaseLevel->width, firstImage->Width);
+		fprintf(stderr, "height0 %d vs %d\n", mtBaseLevel->height, firstImage->Height);
+		fprintf(stderr, "depth0 %d vs %d\n", mtBaseLevel->depth, firstImage->Depth);
+		if (mt->target == texObj->Target &&
+	        mt->mesaFormat == firstImage->TexFormat &&
+	        mt->numLevels >= numLevels &&
+	        mtBaseLevel->width == firstImage->Width &&
+	        mtBaseLevel->height == firstImage->Height &&
+	        mtBaseLevel->depth == firstImage->Depth) {
+			fprintf(stderr, "MATCHED\n");
+		} else {
+			fprintf(stderr, "NOT MATCHED\n");
+		}
+	}
+
+	return (mt->target == texObj->Target &&
+	        mt->mesaFormat == firstImage->TexFormat &&
+	        mt->numLevels >= numLevels &&
+	        mtBaseLevel->width == firstImage->Width &&
+	        mtBaseLevel->height == firstImage->Height &&
+	        mtBaseLevel->depth == firstImage->Depth);
+}
 
 /**
- * Try to allocate a mipmap tree for the given texture that will fit the
- * given image in the given position.
+ * Try to allocate a mipmap tree for the given texture object.
+ * @param[in] rmesa radeon context
+ * @param[in] t radeon texture object
  */
-void radeon_try_alloc_miptree(radeonContextPtr rmesa, radeonTexObj *t,
-		radeon_texture_image *image, GLuint face, GLuint level)
+void radeon_try_alloc_miptree(radeonContextPtr rmesa, radeonTexObj *t)
 {
-	GLuint compressed = image->base.IsCompressed ? image->base.TexFormat->MesaFormat : 0;
-	GLuint numfaces = 1;
-	GLuint firstLevel, lastLevel;
+	struct gl_texture_object *texObj = &t->base;
+	struct gl_texture_image *texImg = texObj->Image[0][texObj->BaseLevel];
+	GLuint numLevels;
 
 	assert(!t->mt);
 
-	calculate_first_last_level(&t->base, &firstLevel, &lastLevel, face, level);
-	if (t->base.Target == GL_TEXTURE_CUBE_MAP)
-		numfaces = 6;
-
-	if (level != firstLevel || face >= numfaces)
+	if (!texImg)
 		return;
 
-	t->mt = radeon_miptree_create(rmesa, t, t->base.Target,
-		image->base.InternalFormat,
-		firstLevel, lastLevel,
-		image->base.Width, image->base.Height, image->base.Depth,
-		image->base.TexFormat->TexelBytes, t->tile_bits, compressed);
+	numLevels = MIN2(texObj->MaxLevel - texObj->BaseLevel + 1, texImg->MaxLog2 + 1);
+
+	t->mt = radeon_miptree_create(rmesa, t->base.Target,
+		texImg->TexFormat, texObj->BaseLevel,
+		numLevels, texImg->Width, texImg->Height,
+		texImg->Depth, t->tile_bits);
 }
 
 /* Although we use the image_offset[] array to store relative offsets
@@ -401,21 +373,236 @@ void radeon_try_alloc_miptree(radeonContextPtr rmesa, radeonTexObj *t,
 void
 radeon_miptree_depth_offsets(radeon_mipmap_tree *mt, GLuint level, GLuint *offsets)
 {
-     if (mt->target != GL_TEXTURE_3D || mt->faces == 1)
-        offsets[0] = 0;
-     else {
-	int i;
-	for (i = 0; i < 6; i++)
-		offsets[i] = mt->levels[level].faces[i].offset;
-     }
+	if (mt->target != GL_TEXTURE_3D || mt->faces == 1) {
+		offsets[0] = 0;
+	} else {
+		int i;
+		for (i = 0; i < 6; i++) {
+			offsets[i] = mt->levels[level].faces[i].offset;
+		}
+	}
 }
 
 GLuint
 radeon_miptree_image_offset(radeon_mipmap_tree *mt,
 			    GLuint face, GLuint level)
 {
-   if (mt->target == GL_TEXTURE_CUBE_MAP_ARB)
-      return (mt->levels[level].faces[face].offset);
-   else
-      return mt->levels[level].faces[0].offset;
+	if (mt->target == GL_TEXTURE_CUBE_MAP_ARB)
+		return (mt->levels[level].faces[face].offset);
+	else
+		return mt->levels[level].faces[0].offset;
+}
+
+/**
+ * Ensure that the given image is stored in the given miptree from now on.
+ */
+static void migrate_image_to_miptree(radeon_mipmap_tree *mt,
+									 radeon_texture_image *image,
+									 int face, int level)
+{
+	radeon_mipmap_level *dstlvl = &mt->levels[level];
+	unsigned char *dest;
+
+	assert(image->mt != mt);
+	assert(dstlvl->valid);
+	assert(dstlvl->width == image->base.Width);
+	assert(dstlvl->height == image->base.Height);
+	assert(dstlvl->depth == image->base.Depth);
+
+	radeon_bo_map(mt->bo, GL_TRUE);
+	dest = mt->bo->ptr + dstlvl->faces[face].offset;
+
+	if (image->mt) {
+		/* Format etc. should match, so we really just need a memcpy().
+		 * In fact, that memcpy() could be done by the hardware in many
+		 * cases, provided that we have a proper memory manager.
+		 */
+		assert(mt->mesaFormat == image->base.TexFormat);
+
+		radeon_mipmap_level *srclvl = &image->mt->levels[image->mtlevel];
+
+		/* TODO: bring back these assertions once the FBOs are fixed */
+#if 0
+		assert(image->mtlevel == level);
+		assert(srclvl->size == dstlvl->size);
+		assert(srclvl->rowstride == dstlvl->rowstride);
+#endif
+
+		radeon_bo_map(image->mt->bo, GL_FALSE);
+
+		memcpy(dest,
+			image->mt->bo->ptr + srclvl->faces[face].offset,
+			dstlvl->size);
+		radeon_bo_unmap(image->mt->bo);
+
+		radeon_miptree_unreference(&image->mt);
+	} else if (image->base.Data) {
+		/* This condition should be removed, it's here to workaround
+		 * a segfault when mapping textures during software fallbacks.
+		 */
+		const uint32_t srcrowstride = _mesa_format_row_stride(image->base.TexFormat, image->base.Width);
+		uint32_t rows = image->base.Height * image->base.Depth;
+
+		if (_mesa_is_format_compressed(image->base.TexFormat)) {
+			uint32_t blockWidth, blockHeight;
+			_mesa_get_format_block_size(image->base.TexFormat, &blockWidth, &blockHeight);
+			rows = (rows + blockHeight - 1) / blockHeight;
+		}
+
+		copy_rows(dest, dstlvl->rowstride, image->base.Data, srcrowstride,
+				  rows, srcrowstride);
+
+		_mesa_free_texmemory(image->base.Data);
+		image->base.Data = 0;
+	}
+
+	radeon_bo_unmap(mt->bo);
+
+	radeon_miptree_reference(mt, &image->mt);
+	image->mtface = face;
+	image->mtlevel = level;
+}
+
+/**
+ * Filter matching miptrees, and select the one holding the most data.
+ * @param[in] texObj radeon texture object
+ * @param[in] firstLevel first texture level to check
+ * @param[in] lastLevel last texture level to check
+ * @return the selected miptree, or NULL if none matches (or on allocation failure)
+ */
+static radeon_mipmap_tree * get_biggest_matching_miptree(radeonTexObj *texObj,
+														 unsigned firstLevel,
+														 unsigned lastLevel)
+{
+	const unsigned numLevels = lastLevel - firstLevel + 1;
+	unsigned *mtSizes = calloc(numLevels, sizeof(unsigned));
+	radeon_mipmap_tree **mts = calloc(numLevels, sizeof(radeon_mipmap_tree *));
+	unsigned mtCount = 0;
+	unsigned maxMtIndex = 0;
+	radeon_mipmap_tree *tmp = NULL;
+	unsigned level;
+	unsigned i;
+
+	if (!mtSizes || !mts)
+		goto out;
+
+	for (level = firstLevel; level <= lastLevel; ++level) {
+		radeon_texture_image *img = get_radeon_texture_image(texObj->base.Image[0][level]);
+		unsigned found = 0;
+		/* Stop at the first level that has no image. TODO: why this hack?? */
+		if (!img)
+			break;
+
+		if (!img->mt)
+			continue;
+
+		for (i = 0; i < mtCount; ++i) {
+			if (mts[i] == img->mt) {
+				found = 1;
+				mtSizes[i] += img->mt->levels[img->mtlevel].size;
+				break;
+			}
+		}
+
+		if (!found && radeon_miptree_matches_texture(img->mt, &texObj->base)) {
+			mtSizes[mtCount] = img->mt->levels[img->mtlevel].size;
+			mts[mtCount] = img->mt;
+			mtCount++;
+		}
+	}
+
+	for (i = 1; i < mtCount; ++i) {
+		if (mtSizes[i] > mtSizes[maxMtIndex]) {
+			maxMtIndex = i;
+		}
+	}
+	tmp = (mtCount > 0) ? mts[maxMtIndex] : NULL;
+out:
+	/* Single exit: the scratch arrays are freed even when nothing matched. */
+	free(mtSizes);
+	free(mts);
+	return tmp;
+}
+
+/**
+ * Validate the mipmap tree of a texture object.
+ * If individual images are stored in different mipmap trees, the tree that
+ * already holds most of the matching data is chosen and every image used
+ * for rendering (minLod..maxLod on each face) is migrated into it.
+ * @return GL_FALSE if the texture has a border or is incomplete, else GL_TRUE.
+ */
+int radeon_validate_texture_miptree(GLcontext * ctx, struct gl_texture_object *texObj)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	radeonTexObj *t = radeon_tex_obj(texObj);
+
+	if (t->validated || t->image_override)
+		return GL_TRUE;
+
+	if (texObj->Image[0][texObj->BaseLevel]->Border > 0)
+		return GL_FALSE;
+
+	_mesa_test_texobj_completeness(rmesa->glCtx, texObj);
+	if (!texObj->_Complete)
+		return GL_FALSE;
+
+	calculate_min_max_lod(&t->base, &t->minLod, &t->maxLod);
+
+	if (RADEON_DEBUG & RADEON_TEXTURE)
+		fprintf(stderr, "%s: Validating texture %p now, minLod = %d, maxLod = %d\n",
+				__FUNCTION__, texObj ,t->minLod, t->maxLod);
+
+	radeon_mipmap_tree *dst_miptree = get_biggest_matching_miptree(t, t->minLod, t->maxLod);
+
+	if (!dst_miptree) {
+		radeon_miptree_unreference(&t->mt);
+		radeon_try_alloc_miptree(rmesa, t);
+		dst_miptree = t->mt;
+		if (RADEON_DEBUG & RADEON_TEXTURE)
+			fprintf(stderr, "%s: No matching miptree found, allocated new one %p\n", __FUNCTION__, t->mt);
+	} else {
+		if (t->mt != dst_miptree) {
+			/* Rebind the texture object to the tree the images end up in. */
+			radeon_miptree_unreference(&t->mt);
+			radeon_miptree_reference(dst_miptree, &t->mt);
+		}
+		if (RADEON_DEBUG & RADEON_TEXTURE)
+			fprintf(stderr, "%s: Using miptree %p\n", __FUNCTION__, t->mt);
+	}
+
+	const unsigned faces = texObj->Target == GL_TEXTURE_CUBE_MAP ? 6 : 1;
+	unsigned face, level;
+	radeon_texture_image *img;
+	/* Validate only the levels that will actually be used during rendering */
+	for (face = 0; face < faces; ++face) {
+		for (level = t->minLod; level <= t->maxLod; ++level) {
+			img = get_radeon_texture_image(texObj->Image[face][level]);
+
+			if (RADEON_DEBUG & RADEON_TEXTURE)
+				fprintf(stderr, "Checking image level %d, face %d, mt %p ... ", level, face, img->mt);
+
+			if (img->mt != dst_miptree) {
+				if (RADEON_DEBUG & RADEON_TEXTURE)
+					fprintf(stderr, "MIGRATING\n");
+				struct radeon_bo *src_bo = (img->mt) ? img->mt->bo : img->bo;
+				if (src_bo && radeon_bo_is_referenced_by_cs(src_bo, rmesa->cmdbuf.cs))
+					radeon_firevertices(rmesa);
+				migrate_image_to_miptree(dst_miptree, img, face, level);
+			} else if (RADEON_DEBUG & RADEON_TEXTURE) {
+				fprintf(stderr, "OK\n");
+			}
+		}
+	}
+
+	t->validated = GL_TRUE;
+	return GL_TRUE;
+}
+
+uint32_t get_base_teximage_offset(radeonTexObj *texObj)
+{
+	if (!texObj->mt) {
+		return 0;
+	} else {
+		return radeon_miptree_image_offset(texObj->mt, 0, texObj->minLod);
+	}
 }
diff --git a/radeon/radeon_mipmap_tree.h b/radeon/radeon_mipmap_tree.h
index db28252..a10649b 100644
--- a/radeon/radeon_mipmap_tree.h
+++ b/radeon/radeon_mipmap_tree.h
@@ -44,6 +44,7 @@ struct _radeon_mipmap_level {
 	GLuint depth;
 	GLuint size; /** Size of each image, in bytes */
 	GLuint rowstride; /** in bytes */
+	GLuint valid;
 	radeon_mipmap_image faces[6];
 };
 
@@ -59,43 +60,35 @@ struct _radeon_mipmap_level {
  * changed.
  */
 struct _radeon_mipmap_tree {
-	radeonContextPtr radeon;
-	radeonTexObj *t;
 	struct radeon_bo *bo;
 	GLuint refcount;
 
 	GLuint totalsize; /** total size of the miptree, in bytes */
 
 	GLenum target; /** GL_TEXTURE_xxx */
-	GLenum internal_format;
+	GLenum mesaFormat; /** MESA_FORMAT_xxx */
 	GLuint faces; /** # of faces: 6 for cubemaps, 1 otherwise */
-	GLuint firstLevel; /** First mip level stored in this mipmap tree */
-	GLuint lastLevel; /** Last mip level stored in this mipmap tree */
+	GLuint baseLevel; /** gl_texture_object->baseLevel it was created for */
+	GLuint numLevels; /** Number of mip levels stored in this mipmap tree */
 
-	GLuint width0; /** Width of firstLevel image */
-	GLuint height0; /** Height of firstLevel image */
-	GLuint depth0; /** Depth of firstLevel image */
+	GLuint width0; /** Width of baseLevel image */
+	GLuint height0; /** Height of baseLevel image */
+	GLuint depth0; /** Depth of baseLevel image */
 
-	GLuint bpp; /** Bytes per texel */
 	GLuint tilebits; /** RADEON_TXO_xxx_TILE */
-	GLuint compressed; /** MESA_FORMAT_xxx indicating a compressed format, or 0 if uncompressed */
 
 	radeon_mipmap_level levels[RADEON_MIPTREE_MAX_TEXTURE_LEVELS];
 };
 
-radeon_mipmap_tree* radeon_miptree_create(radeonContextPtr rmesa, radeonTexObj *t,
-		GLenum target, GLenum internal_format, GLuint firstLevel, GLuint lastLevel,
-		GLuint width0, GLuint height0, GLuint depth0,
-		GLuint bpp, GLuint tilebits, GLuint compressed);
-void radeon_miptree_reference(radeon_mipmap_tree *mt);
-void radeon_miptree_unreference(radeon_mipmap_tree *mt);
+void radeon_miptree_reference(radeon_mipmap_tree *mt, radeon_mipmap_tree **ptr);
+void radeon_miptree_unreference(radeon_mipmap_tree **ptr);
 
 GLboolean radeon_miptree_matches_image(radeon_mipmap_tree *mt,
 		struct gl_texture_image *texImage, GLuint face, GLuint level);
-GLboolean radeon_miptree_matches_texture(radeon_mipmap_tree *mt, struct gl_texture_object *texObj);
-void radeon_try_alloc_miptree(radeonContextPtr rmesa, radeonTexObj *t,
-			      radeon_texture_image *texImage, GLuint face, GLuint level);
+void radeon_try_alloc_miptree(radeonContextPtr rmesa, radeonTexObj *t);
 GLuint radeon_miptree_image_offset(radeon_mipmap_tree *mt,
 				   GLuint face, GLuint level);
 void radeon_miptree_depth_offsets(radeon_mipmap_tree *mt, GLuint level, GLuint *offsets);
+
+uint32_t get_base_teximage_offset(radeonTexObj *texObj);
 #endif /* __RADEON_MIPMAP_TREE_H_ */
diff --git a/radeon/radeon_queryobj.c b/radeon/radeon_queryobj.c
index b79d864..98117cd 100644
--- a/radeon/radeon_queryobj.c
+++ b/radeon/radeon_queryobj.c
@@ -31,24 +31,11 @@
 #include "main/imports.h"
 #include "main/simple_list.h"
 
-static int radeonQueryIsFlushed(GLcontext *ctx, struct gl_query_object *q)
-{
-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-	struct radeon_query_object *tmp, *query = (struct radeon_query_object *)q;
-
-	foreach(tmp, &radeon->query.not_flushed_head) {
-		if (tmp == query) {
-			return 0;
-		}
-	}
-
-	return 1;
-}
-
 static void radeonQueryGetResult(GLcontext *ctx, struct gl_query_object *q)
 {
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
 	struct radeon_query_object *query = (struct radeon_query_object *)q;
-	uint32_t *result;
+        uint32_t *result;
 	int i;
 
 	radeon_print(RADEON_STATE, RADEON_VERBOSE,
@@ -56,13 +43,35 @@ static void radeonQueryGetResult(GLcontext *ctx, struct gl_query_object *q)
 			__FUNCTION__, query->Base.Id, (int) query->Base.Result);
 
 	radeon_bo_map(query->bo, GL_FALSE);
-
-	result = query->bo->ptr;
+        result = query->bo->ptr;
 
 	query->Base.Result = 0;
-	for (i = 0; i < query->curr_offset/sizeof(uint32_t); ++i) {
-		query->Base.Result += result[i];
-		radeon_print(RADEON_STATE, RADEON_TRACE, "result[%d] = %d\n", i, result[i]);
+	if (IS_R600_CLASS(radeon->radeonScreen)) {
+		/* ZPASS EVENT writes alternating qwords
+		 * At query start we set the start offset to 0 and
+		 * hw writes zpass start counts to qwords 0, 2, 4, 6.
+		 * At query end we set the start offset to 8 and
+		 * hw writes zpass end counts to qwords 1, 3, 5, 7.
+		 * then we subtract. MSB is the valid bit.
+		 */
+		for (i = 0; i < 16; i += 4) {
+			uint64_t start = (uint64_t)LE32_TO_CPU(result[i]) |
+					 (uint64_t)LE32_TO_CPU(result[i + 1]) << 32;
+			uint64_t end = (uint64_t)LE32_TO_CPU(result[i + 2]) |
+				       (uint64_t)LE32_TO_CPU(result[i + 3]) << 32;
+			if ((start & 0x8000000000000000) && (end & 0x8000000000000000)) {
+				uint64_t query_count = end - start;
+				query->Base.Result += query_count;
+
+			}
+			radeon_print(RADEON_STATE, RADEON_TRACE,
+				     "%d start: %lx, end: %lx %ld\n", i, start, end, end - start);
+		}
+	} else {
+		for (i = 0; i < query->curr_offset/sizeof(uint32_t); ++i) {
+			query->Base.Result += LE32_TO_CPU(result[i]);
+			radeon_print(RADEON_STATE, RADEON_TRACE, "result[%d] = %d\n", i, LE32_TO_CPU(result[i]));
+		}
 	}
 
 	radeon_bo_unmap(query->bo);
@@ -99,10 +108,11 @@ static void radeonDeleteQuery(GLcontext *ctx, struct gl_query_object *q)
 
 static void radeonWaitQuery(GLcontext *ctx, struct gl_query_object *q)
 {
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
 	struct radeon_query_object *query = (struct radeon_query_object *)q;
 
 	/* If the cmdbuf with packets for this query hasn't been flushed yet, do it now */
-	if (!radeonQueryIsFlushed(ctx, q))
+	if (radeon_bo_is_referenced_by_cs(query->bo, radeon->cmdbuf.cs))
 		ctx->Driver.Flush(ctx);
 
 	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s: query id %d, bo %p, offset %d\n", __FUNCTION__, q->Id, query->bo, query->curr_offset);
@@ -134,8 +144,6 @@ static void radeonBeginQuery(GLcontext *ctx, struct gl_query_object *q)
 
 	radeon->query.queryobj.dirty = GL_TRUE;
 	radeon->hw.is_dirty = GL_TRUE;
-	insert_at_tail(&radeon->query.not_flushed_head, query);
-
 }
 
 void radeonEmitQueryEnd(GLcontext *ctx)
@@ -183,7 +191,7 @@ static void radeonCheckQuery(GLcontext *ctx, struct gl_query_object *q)
 		uint32_t domain;
 
 		/* Need to perform a flush, as per ARB_occlusion_query spec */
-		if (!radeonQueryIsFlushed(ctx, q)) {
+		if (radeon_bo_is_referenced_by_cs(query->bo, radeon->cmdbuf.cs)) {
 			ctx->Driver.Flush(ctx);
 		}
 
diff --git a/radeon/radeon_screen.c b/radeon/radeon_screen.c
index 5ffb55d..be2d836 100644
--- a/radeon/radeon_screen.c
+++ b/radeon/radeon_screen.c
@@ -48,17 +48,17 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_screen.h"
 #include "radeon_common.h"
 #include "radeon_span.h"
-#if !RADEON_COMMON
+#if defined(RADEON_R100)
 #include "radeon_context.h"
 #include "radeon_tex.h"
-#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+#elif defined(RADEON_R200)
 #include "r200_context.h"
 #include "r200_ioctl.h"
 #include "r200_tex.h"
-#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+#elif defined(RADEON_R300)
 #include "r300_context.h"
 #include "r300_tex.h"
-#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+#elif defined(RADEON_R600)
 #include "r600_context.h"
 #include "r700_driconf.h" /* +r6/r7 */
 #include "r600_tex.h"     /* +r6/r7 */
@@ -82,7 +82,7 @@ DRI_CONF_OPT_BEGIN_V(command_buffer_size,int,def, # min ":" # max ) \
         DRI_CONF_DESC(de,"Grösse des Befehlspuffers (in KB)") \
 DRI_CONF_OPT_END
 
-#if !RADEON_COMMON	/* R100 */
+#if defined(RADEON_R100)	/* R100 */
 PUBLIC const char __driConfigOptions[] =
 DRI_CONF_BEGIN
     DRI_CONF_SECTION_PERFORMANCE
@@ -109,7 +109,7 @@ DRI_CONF_BEGIN
 DRI_CONF_END;
 static const GLuint __driNConfigOptions = 15;
 
-#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+#elif defined(RADEON_R200)
 
 PUBLIC const char __driConfigOptions[] =
 DRI_CONF_BEGIN
@@ -141,13 +141,7 @@ DRI_CONF_BEGIN
 DRI_CONF_END;
 static const GLuint __driNConfigOptions = 17;
 
-extern const struct dri_extension blend_extensions[];
-extern const struct dri_extension ARB_vp_extension[];
-extern const struct dri_extension NV_vp_extension[];
-extern const struct dri_extension ATI_fs_extension[];
-extern const struct dri_extension point_extensions[];
-
-#elif RADEON_COMMON && (defined(RADEON_COMMON_FOR_R300) || defined(RADEON_COMMON_FOR_R600))
+#elif defined(RADEON_R300) || defined(RADEON_R600)
 
 #define DRI_CONF_FP_OPTIMIZATION_SPEED   0
 #define DRI_CONF_FP_OPTIMIZATION_QUALITY 1
@@ -218,12 +212,7 @@ DRI_CONF_BEGIN
 DRI_CONF_END;
 static const GLuint __driNConfigOptions = 17;
 
-extern const struct dri_extension gl_20_extension[];
-
-#endif /* RADEON_COMMON && defined(RADEON_COMMON_FOR_R300) */
-
-extern const struct dri_extension card_extensions[];
-extern const struct dri_extension mm_extensions[];
+#endif
 
 static int getSwapInfo( __DRIdrawablePrivate *dPriv, __DRIswapInfo * sInfo );
 
@@ -337,7 +326,7 @@ radeonFillInModes( __DRIscreenPrivate *psp,
     return (const __DRIconfig **) configs;
 }
 
-#if !RADEON_COMMON
+#if defined(RADEON_R100)
 static const __DRItexOffsetExtension radeonTexOffsetExtension = {
     { __DRI_TEX_OFFSET, __DRI_TEX_OFFSET_VERSION },
     radeonSetTexOffset,
@@ -350,7 +339,7 @@ static const __DRItexBufferExtension radeonTexBufferExtension = {
 };
 #endif
 
-#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+#if defined(RADEON_R200)
 static const __DRIallocateExtension r200AllocateExtension = {
     { __DRI_ALLOCATE, __DRI_ALLOCATE_VERSION },
     r200AllocateMemoryMESA,
@@ -370,7 +359,7 @@ static const __DRItexBufferExtension r200TexBufferExtension = {
 };
 #endif
 
-#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+#if defined(RADEON_R300)
 static const __DRItexOffsetExtension r300texOffsetExtension = {
     { __DRI_TEX_OFFSET, __DRI_TEX_OFFSET_VERSION },
    r300SetTexOffset,
@@ -383,7 +372,7 @@ static const __DRItexBufferExtension r300TexBufferExtension = {
 };
 #endif
 
-#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+#if defined(RADEON_R600)
 static const __DRItexOffsetExtension r600texOffsetExtension = {
     { __DRI_TEX_OFFSET, __DRI_TEX_OFFSET_VERSION },
    r600SetTexOffset, /* +r6/r7 */
@@ -401,12 +390,14 @@ static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
    screen->device_id = device_id;
    screen->chip_flags = 0;
    switch ( device_id ) {
+   case PCI_CHIP_RN50_515E:
+   case PCI_CHIP_RN50_5969:
+	return -1;
+
    case PCI_CHIP_RADEON_LY:
    case PCI_CHIP_RADEON_LZ:
    case PCI_CHIP_RADEON_QY:
    case PCI_CHIP_RADEON_QZ:
-   case PCI_CHIP_RN50_515E:
-   case PCI_CHIP_RN50_5969:
       screen->chip_family = CHIP_FAMILY_RV100;
       break;
 
@@ -1222,22 +1213,22 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
        screen->extensions[i++] = &driMediaStreamCounterExtension.base;
    }
 
-#if !RADEON_COMMON
+#if defined(RADEON_R100)
    screen->extensions[i++] = &radeonTexOffsetExtension.base;
 #endif
 
-#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+#if defined(RADEON_R200)
    if (IS_R200_CLASS(screen))
       screen->extensions[i++] = &r200AllocateExtension.base;
 
    screen->extensions[i++] = &r200texOffsetExtension.base;
 #endif
 
-#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+#if defined(RADEON_R300)
    screen->extensions[i++] = &r300texOffsetExtension.base;
 #endif
 
-#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+#if defined(RADEON_R600)
    screen->extensions[i++] = &r600texOffsetExtension.base;
 #endif
 
@@ -1376,22 +1367,22 @@ radeonCreateScreen2(__DRIscreenPrivate *sPriv)
        screen->extensions[i++] = &driMediaStreamCounterExtension.base;
    }
 
-#if !RADEON_COMMON
+#if defined(RADEON_R100)
    screen->extensions[i++] = &radeonTexBufferExtension.base;
 #endif
 
-#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+#if defined(RADEON_R200)
    if (IS_R200_CLASS(screen))
        screen->extensions[i++] = &r200AllocateExtension.base;
 
    screen->extensions[i++] = &r200TexBufferExtension.base;
 #endif
 
-#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+#if defined(RADEON_R300)
    screen->extensions[i++] = &r300TexBufferExtension.base;
 #endif
 
-#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+#if defined(RADEON_R600)
    screen->extensions[i++] = &r600TexBufferExtension.base;
 #endif
 
@@ -1480,7 +1471,7 @@ radeonCreateBuffer( __DRIscreenPrivate *driScrnPriv,
     const GLboolean swAccum = mesaVis->accumRedBits > 0;
     const GLboolean swStencil = mesaVis->stencilBits > 0 &&
 	mesaVis->depthBits != 24;
-    GLenum rgbFormat;
+    gl_format rgbFormat;
     struct radeon_framebuffer *rfb;
 
     if (isPixmap)
@@ -1493,11 +1484,11 @@ radeonCreateBuffer( __DRIscreenPrivate *driScrnPriv,
     _mesa_initialize_framebuffer(&rfb->base, mesaVis);
 
     if (mesaVis->redBits == 5)
-        rgbFormat = GL_RGB5;
+        rgbFormat = _mesa_little_endian() ? MESA_FORMAT_RGB565 : MESA_FORMAT_RGB565_REV;
     else if (mesaVis->alphaBits == 0)
-        rgbFormat = GL_RGB8;
+        rgbFormat = _mesa_little_endian() ? MESA_FORMAT_XRGB8888 : MESA_FORMAT_XRGB8888_REV;
     else
-        rgbFormat = GL_RGBA8;
+        rgbFormat = _mesa_little_endian() ? MESA_FORMAT_ARGB8888 : MESA_FORMAT_ARGB8888_REV;
 
     /* front color renderbuffer */
     rfb->color_rb[0] = radeon_create_renderbuffer(rgbFormat, driDrawPriv);
@@ -1513,19 +1504,22 @@ radeonCreateBuffer( __DRIscreenPrivate *driScrnPriv,
 
     if (mesaVis->depthBits == 24) {
       if (mesaVis->stencilBits == 8) {
-	struct radeon_renderbuffer *depthStencilRb = radeon_create_renderbuffer(GL_DEPTH24_STENCIL8_EXT, driDrawPriv);
+	struct radeon_renderbuffer *depthStencilRb =
+           radeon_create_renderbuffer(MESA_FORMAT_S8_Z24, driDrawPriv);
 	_mesa_add_renderbuffer(&rfb->base, BUFFER_DEPTH, &depthStencilRb->base);
 	_mesa_add_renderbuffer(&rfb->base, BUFFER_STENCIL, &depthStencilRb->base);
 	depthStencilRb->has_surface = screen->depthHasSurface;
       } else {
 	/* depth renderbuffer */
-	struct radeon_renderbuffer *depth = radeon_create_renderbuffer(GL_DEPTH_COMPONENT24, driDrawPriv);
+	struct radeon_renderbuffer *depth =
+           radeon_create_renderbuffer(MESA_FORMAT_X8_Z24, driDrawPriv);
 	_mesa_add_renderbuffer(&rfb->base, BUFFER_DEPTH, &depth->base);
 	depth->has_surface = screen->depthHasSurface;
       }
     } else if (mesaVis->depthBits == 16) {
-      /* just 16-bit depth buffer, no hw stencil */
-	struct radeon_renderbuffer *depth = radeon_create_renderbuffer(GL_DEPTH_COMPONENT16, driDrawPriv);
+        /* just 16-bit depth buffer, no hw stencil */
+	struct radeon_renderbuffer *depth =
+           radeon_create_renderbuffer(MESA_FORMAT_Z16, driDrawPriv);
 	_mesa_add_renderbuffer(&rfb->base, BUFFER_DEPTH, &depth->base);
 	depth->has_surface = screen->depthHasSurface;
     }
@@ -1589,22 +1583,22 @@ radeonDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
 static const __DRIconfig **
 radeonInitScreen(__DRIscreenPrivate *psp)
 {
-#if !RADEON_COMMON
+#if defined(RADEON_R100)
    static const char *driver_name = "Radeon";
    static const __DRIutilversion2 ddx_expected = { 4, 5, 0, 0 };
    static const __DRIversion dri_expected = { 4, 0, 0 };
    static const __DRIversion drm_expected = { 1, 6, 0 };
-#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+#elif defined(RADEON_R200)
    static const char *driver_name = "R200";
    static const __DRIutilversion2 ddx_expected = { 4, 5, 0, 0 };
    static const __DRIversion dri_expected = { 4, 0, 0 };
    static const __DRIversion drm_expected = { 1, 6, 0 };
-#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+#elif defined(RADEON_R300)
    static const char *driver_name = "R300";
    static const __DRIutilversion2 ddx_expected = { 4, 5, 0, 0 };
    static const __DRIversion dri_expected = { 4, 0, 0 };
    static const __DRIversion drm_expected = { 1, 24, 0 };
-#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+#elif defined(RADEON_R600)
    static const char *driver_name = "R600";
    static const __DRIutilversion2 ddx_expected = { 4, 5, 0, 0 };
    static const __DRIversion dri_expected = { 4, 0, 0 };
@@ -1619,27 +1613,6 @@ radeonInitScreen(__DRIscreenPrivate *psp)
       return NULL;
    }
 
-   /* Calling driInitExtensions here, with a NULL context pointer,
-    * does not actually enable the extensions.  It just makes sure
-    * that all the dispatch offsets for all the extensions that
-    * *might* be enables are known.  This is needed because the
-    * dispatch offsets need to be known when _mesa_context_create
-    * is called, but we can't enable the extensions until we have a
-    * context pointer.
-    *
-    * Hello chicken.  Hello egg.  How are you two today?
-    */
-   driInitExtensions( NULL, card_extensions, GL_FALSE );
-#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
-   driInitExtensions( NULL, blend_extensions, GL_FALSE );
-   driInitSingleExtension( NULL, ARB_vp_extension );
-   driInitSingleExtension( NULL, NV_vp_extension );
-   driInitSingleExtension( NULL, ATI_fs_extension );
-   driInitExtensions( NULL, point_extensions, GL_FALSE );
-#elif (defined(RADEON_COMMON_FOR_R300) || defined(RADEON_COMMON_FOR_R600))
-   driInitSingleExtension( NULL, gl_20_extension );
-#endif
-
    if (!radeonInitDriver(psp))
        return NULL;
 
@@ -1672,28 +1645,6 @@ __DRIconfig **radeonInitScreen2(__DRIscreenPrivate *psp)
    int color;
    __DRIconfig **configs = NULL;
 
-   /* Calling driInitExtensions here, with a NULL context pointer,
-    * does not actually enable the extensions.  It just makes sure
-    * that all the dispatch offsets for all the extensions that
-    * *might* be enables are known.  This is needed because the
-    * dispatch offsets need to be known when _mesa_context_create
-    * is called, but we can't enable the extensions until we have a
-    * context pointer.
-    *
-    * Hello chicken.  Hello egg.  How are you two today?
-    */
-   driInitExtensions( NULL, card_extensions, GL_FALSE );
-   driInitExtensions( NULL, mm_extensions, GL_FALSE );
-#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
-   driInitExtensions( NULL, blend_extensions, GL_FALSE );
-   driInitSingleExtension( NULL, ARB_vp_extension );
-   driInitSingleExtension( NULL, NV_vp_extension );
-   driInitSingleExtension( NULL, ATI_fs_extension );
-   driInitExtensions( NULL, point_extensions, GL_FALSE );
-#elif (defined(RADEON_COMMON_FOR_R300) || defined(RADEON_COMMON_FOR_R600))
-   driInitSingleExtension( NULL, gl_20_extension );
-#endif
-
    if (!radeonInitDriver(psp)) {
        return NULL;
     }
@@ -1772,13 +1723,13 @@ getSwapInfo( __DRIdrawablePrivate *dPriv, __DRIswapInfo * sInfo )
 const struct __DriverAPIRec driDriverAPI = {
    .InitScreen      = radeonInitScreen,
    .DestroyScreen   = radeonDestroyScreen,
-#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+#if defined(RADEON_R200)
    .CreateContext   = r200CreateContext,
    .DestroyContext  = r200DestroyContext,
-#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+#elif defined(RADEON_R600)
    .CreateContext   = r600CreateContext,
    .DestroyContext  = radeonDestroyContext,
-#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+#elif defined(RADEON_R300)
    .CreateContext   = r300CreateContext,
    .DestroyContext  = radeonDestroyContext,
 #else
diff --git a/radeon/radeon_span.c b/radeon/radeon_span.c
index d603f52..665f2b6 100644
--- a/radeon/radeon_span.c
+++ b/radeon/radeon_span.c
@@ -41,6 +41,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
 #include "main/glheader.h"
+#include "main/texformat.h"
 #include "swrast/swrast.h"
 
 #include "radeon_common.h"
@@ -55,7 +56,7 @@ static void radeonSetSpanFunctions(struct radeon_renderbuffer *rrb);
 /* r200 depth buffer is always tiled - this is the formula
    according to the docs unless I typo'ed in it
 */
-#if defined(RADEON_COMMON_FOR_R200)
+#if defined(RADEON_R200)
 static GLubyte *r200_depth_2byte(const struct radeon_renderbuffer * rrb,
 				 GLint x, GLint y)
 {
@@ -112,7 +113,7 @@ static GLubyte *r200_depth_4byte(const struct radeon_renderbuffer * rrb,
  * - 2D (akin to macro-tiled/micro-tiled on older asics)
  * only 1D tiling is implemented below
  */
-#if defined(RADEON_COMMON_FOR_R600)
+#if defined(RADEON_R600)
 static inline GLint r600_1d_tile_helper(const struct radeon_renderbuffer * rrb,
 					GLint x, GLint y, GLint is_depth, GLint is_stencil)
 {
@@ -334,22 +335,6 @@ static GLubyte *radeon_ptr_2byte_8x2(const struct radeon_renderbuffer * rrb,
 
 #endif
 
-#ifndef COMPILE_R300
-#ifndef COMPILE_R600
-static uint32_t
-z24s8_to_s8z24(uint32_t val)
-{
-   return (val << 24) | (val >> 8);
-}
-
-static uint32_t
-s8z24_to_z24s8(uint32_t val)
-{
-   return (val >> 24) | (val << 8);
-}
-#endif
-#endif
-
 /*
  * Note that all information needed to access pixels in a renderbuffer
  * should be obtained through the gl_renderbuffer parameter, not per-context
@@ -409,7 +394,19 @@ s8z24_to_z24s8(uint32_t val)
 
 #define TAG(x)    radeon##x##_RGB565
 #define TAG2(x,y) radeon##x##_RGB565##y
-#if defined(RADEON_COMMON_FOR_R600)
+#if defined(RADEON_R600)
+#define GET_PTR(X,Y) r600_ptr_color(rrb, (X) + x_off, (Y) + y_off)
+#else
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#endif
+#include "spantmp2.h"
+
+#define SPANTMP_PIXEL_FMT GL_RGB
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5_REV
+
+#define TAG(x)    radeon##x##_RGB565_REV
+#define TAG2(x,y) radeon##x##_RGB565_REV##y
+#if defined(RADEON_R600)
 #define GET_PTR(X,Y) r600_ptr_color(rrb, (X) + x_off, (Y) + y_off)
 #else
 #define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
@@ -423,7 +420,19 @@ s8z24_to_z24s8(uint32_t val)
 
 #define TAG(x)    radeon##x##_ARGB1555
 #define TAG2(x,y) radeon##x##_ARGB1555##y
-#if defined(RADEON_COMMON_FOR_R600)
+#if defined(RADEON_R600)
+#define GET_PTR(X,Y) r600_ptr_color(rrb, (X) + x_off, (Y) + y_off)
+#else
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#endif
+#include "spantmp2.h"
+
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_1_5_5_5
+
+#define TAG(x)    radeon##x##_ARGB1555_REV
+#define TAG2(x,y) radeon##x##_ARGB1555_REV##y
+#if defined(RADEON_R600)
 #define GET_PTR(X,Y) r600_ptr_color(rrb, (X) + x_off, (Y) + y_off)
 #else
 #define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
@@ -437,7 +446,19 @@ s8z24_to_z24s8(uint32_t val)
 
 #define TAG(x)    radeon##x##_ARGB4444
 #define TAG2(x,y) radeon##x##_ARGB4444##y
-#if defined(RADEON_COMMON_FOR_R600)
+#if defined(RADEON_R600)
+#define GET_PTR(X,Y) r600_ptr_color(rrb, (X) + x_off, (Y) + y_off)
+#else
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#endif
+#include "spantmp2.h"
+
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_4_4_4_4
+
+#define TAG(x)    radeon##x##_ARGB4444_REV
+#define TAG2(x,y) radeon##x##_ARGB4444_REV##y
+#if defined(RADEON_R600)
 #define GET_PTR(X,Y) r600_ptr_color(rrb, (X) + x_off, (Y) + y_off)
 #else
 #define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
@@ -451,7 +472,7 @@ s8z24_to_z24s8(uint32_t val)
 
 #define TAG(x)    radeon##x##_xRGB8888
 #define TAG2(x,y) radeon##x##_xRGB8888##y
-#if defined(RADEON_COMMON_FOR_R600)
+#if defined(RADEON_R600)
 #define GET_VALUE(_x, _y) ((*(GLuint*)(r600_ptr_color(rrb, _x + x_off, _y + y_off)) | 0xff000000))
 #define PUT_VALUE(_x, _y, d) { \
    GLuint *_ptr = (GLuint*)r600_ptr_color( rrb, _x + x_off, _y + y_off );		\
@@ -473,7 +494,7 @@ s8z24_to_z24s8(uint32_t val)
 
 #define TAG(x)    radeon##x##_ARGB8888
 #define TAG2(x,y) radeon##x##_ARGB8888##y
-#if defined(RADEON_COMMON_FOR_R600)
+#if defined(RADEON_R600)
 #define GET_VALUE(_x, _y) (*(GLuint*)(r600_ptr_color(rrb, _x + x_off, _y + y_off)))
 #define PUT_VALUE(_x, _y, d) { \
    GLuint *_ptr = (GLuint*)r600_ptr_color( rrb, _x + x_off, _y + y_off );		\
@@ -488,6 +509,42 @@ s8z24_to_z24s8(uint32_t val)
 #endif
 #include "spantmp2.h"
 
+/* 32 bit, BGRx8888 color spanline and pixel functions.  Reads OR 0xff into
+ * the low (unused) byte; writes store the value unchanged. */
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8
+
+#define TAG(x)    radeon##x##_BGRx8888
+#define TAG2(x,y) radeon##x##_BGRx8888##y
+#if defined(RADEON_R600)
+#define GET_VALUE(_x, _y) ((*(GLuint*)(r600_ptr_color(rrb, _x + x_off, _y + y_off)) | 0x000000ff))
+#define PUT_VALUE(_x, _y, d) do { \
+   GLuint *_ptr = (GLuint*)r600_ptr_color( rrb, _x + x_off, _y + y_off );		\
+   *_ptr = (d);								\
+} while (0)
+#else
+#define GET_VALUE(_x, _y) ((*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)) | 0x000000ff))
+#define PUT_VALUE(_x, _y, d) do { \
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
+   *_ptr = (d);								\
+} while (0)
+#endif
+#include "spantmp2.h"
+
+/* 32 bit, BGRA8888 color spanline and pixel functions
+ */
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8
+
+#define TAG(x)    radeon##x##_BGRA8888
+#define TAG2(x,y) radeon##x##_BGRA8888##y
+#if defined(RADEON_R600)
+#define GET_PTR(X,Y) r600_ptr_color(rrb, (X) + x_off, (Y) + y_off)
+#else
+#define GET_PTR(X,Y) radeon_ptr_4byte(rrb, (X) + x_off, (Y) + y_off)
+#endif
+#include "spantmp2.h"
+
 /* ================================================================
  * Depth buffer
  */
@@ -506,10 +563,10 @@ s8z24_to_z24s8(uint32_t val)
  */
 #define VALUE_TYPE GLushort
 
-#if defined(RADEON_COMMON_FOR_R200)
+#if defined(RADEON_R200)
 #define WRITE_DEPTH( _x, _y, d )					\
    *(GLushort *)r200_depth_2byte(rrb, _x + x_off, _y + y_off) = d
-#elif defined(RADEON_COMMON_FOR_R600)
+#elif defined(RADEON_R600)
 #define WRITE_DEPTH( _x, _y, d )					\
    *(GLushort *)r600_ptr_depth(rrb, _x + x_off, _y + y_off) = d
 #else
@@ -517,10 +574,10 @@ s8z24_to_z24s8(uint32_t val)
    *(GLushort *)radeon_ptr_2byte_8x2(rrb, _x + x_off, _y + y_off) = d
 #endif
 
-#if defined(RADEON_COMMON_FOR_R200)
+#if defined(RADEON_R200)
 #define READ_DEPTH( d, _x, _y )						\
    d = *(GLushort *)r200_depth_2byte(rrb, _x + x_off, _y + y_off)
-#elif defined(RADEON_COMMON_FOR_R600)
+#elif defined(RADEON_R600)
 #define READ_DEPTH( d, _x, _y )						\
    d = *(GLushort *)r600_ptr_depth(rrb, _x + x_off, _y + y_off)
 #else
@@ -538,16 +595,16 @@ s8z24_to_z24s8(uint32_t val)
  */
 #define VALUE_TYPE GLuint
 
-#if defined(COMPILE_R300)
+#if defined(RADEON_R300)
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
-   GLuint tmp = *_ptr;				\
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    tmp &= 0x000000ff;							\
    tmp |= ((d << 8) & 0xffffff00);					\
-   *_ptr = tmp;					\
+   *_ptr = CPU_TO_LE32(tmp);                                            \
 } while (0)
-#elif defined(RADEON_COMMON_FOR_R600)
+#elif defined(RADEON_R600)
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)r600_ptr_depth( rrb, _x + x_off, _y + y_off );		\
@@ -556,44 +613,44 @@ do {									\
    tmp |= ((d) & 0x00ffffff);					\
    *_ptr = tmp;					\
 } while (0)
-#elif defined(RADEON_COMMON_FOR_R200)
+#elif defined(RADEON_R200)
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)r200_depth_4byte( rrb, _x + x_off, _y + y_off );		\
-   GLuint tmp = *_ptr;				\
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    tmp &= 0xff000000;							\
    tmp |= ((d) & 0x00ffffff);						\
-   *_ptr = tmp;					\
+   *_ptr = CPU_TO_LE32(tmp);                                            \
 } while (0)
 #else
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );	\
-   GLuint tmp = *_ptr;							\
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    tmp &= 0xff000000;							\
    tmp |= ((d) & 0x00ffffff);						\
-   *_ptr = tmp;					\
+   *_ptr = CPU_TO_LE32(tmp);                                            \
 } while (0)
 #endif
 
-#if defined(COMPILE_R300)
+#if defined(RADEON_R300)
 #define READ_DEPTH( d, _x, _y )						\
   do {									\
-    d = (*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)) & 0xffffff00) >> 8; \
+    d = (LE32_TO_CPU(*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off))) & 0xffffff00) >> 8; \
   }while(0)
-#elif defined(RADEON_COMMON_FOR_R600)
+#elif defined(RADEON_R600)
 #define READ_DEPTH( d, _x, _y )						\
   do {									\
     d = (*(GLuint*)(r600_ptr_depth(rrb, _x + x_off, _y + y_off)) & 0x00ffffff); \
   }while(0)
-#elif defined(RADEON_COMMON_FOR_R200)
+#elif defined(RADEON_R200)
 #define READ_DEPTH( d, _x, _y )						\
   do {									\
-    d = *(GLuint*)(r200_depth_4byte(rrb, _x + x_off, _y + y_off)) & 0x00ffffff; \
+    d = LE32_TO_CPU(*(GLuint*)(r200_depth_4byte(rrb, _x + x_off, _y + y_off))) & 0x00ffffff; \
   }while(0)
 #else
 #define READ_DEPTH( d, _x, _y )	\
-  d = *(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off,	_y + y_off)) & 0x00ffffff;
+  d = LE32_TO_CPU(*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off,	_y + y_off))) & 0x00ffffff;
 #endif
 
 #define TAG(x) radeon##x##_z24
@@ -607,65 +664,64 @@ do {									\
  */
 #define VALUE_TYPE GLuint
 
-#if defined(COMPILE_R300)
+#if defined(RADEON_R300)
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
-   *_ptr = d;								\
+   *_ptr = CPU_TO_LE32((((d) & 0xff000000) >> 24) | (((d) & 0x00ffffff) << 8));   \
 } while (0)
-#elif defined(RADEON_COMMON_FOR_R600)
+#elif defined(RADEON_R600)
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)r600_ptr_depth( rrb, _x + x_off, _y + y_off );		\
    GLuint tmp = *_ptr;				\
    tmp &= 0xff000000;							\
-   tmp |= (((d) >> 8) & 0x00ffffff);					\
+   tmp |= ((d) & 0x00ffffff);					\
    *_ptr = tmp;					\
    _ptr = (GLuint*)r600_ptr_stencil(rrb, _x + x_off, _y + y_off);		\
    tmp = *_ptr;				\
    tmp &= 0xffffff00;							\
-   tmp |= (d) & 0xff;							\
+   tmp |= ((d) >> 24) & 0xff;						\
    *_ptr = tmp;					\
 } while (0)
-#elif defined(RADEON_COMMON_FOR_R200)
+#elif defined(RADEON_R200)
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)r200_depth_4byte( rrb, _x + x_off, _y + y_off );		\
-   GLuint tmp = z24s8_to_s8z24(d);					\
-   *_ptr = tmp;								\
+   *_ptr = CPU_TO_LE32(d);						\
 } while (0)
 #else
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );	\
-   GLuint tmp = z24s8_to_s8z24(d);					\
-   *_ptr = tmp;					\
+   *_ptr = CPU_TO_LE32(d);						\
 } while (0)
 #endif
 
-#if defined(COMPILE_R300)
+#if defined(RADEON_R300)
 #define READ_DEPTH( d, _x, _y )						\
   do { \
-    d = (*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)));	\
+    GLuint tmp = LE32_TO_CPU(*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off))); /* to CPU order first */ \
+    d = ((tmp & 0x000000ff) << 24) | ((tmp & 0xffffff00) >> 8); /* then repack, mirroring WRITE_DEPTH */ \
   }while(0)
-#elif defined(RADEON_COMMON_FOR_R600)
+#elif defined(RADEON_R600)
 #define READ_DEPTH( d, _x, _y )						\
   do { \
-    d = ((*(GLuint*)(r600_ptr_depth(rrb, _x + x_off, _y + y_off))) << 8) & 0xffffff00; \
-    d |= (*(GLuint*)(r600_ptr_stencil(rrb, _x + x_off, _y + y_off))) & 0x000000ff;	\
+    d = (*(GLuint*)(r600_ptr_depth(rrb, _x + x_off, _y + y_off))) & 0x00ffffff; \
+    d |= ((*(GLuint*)(r600_ptr_stencil(rrb, _x + x_off, _y + y_off))) << 24) & 0xff000000; \
   }while(0)
-#elif defined(RADEON_COMMON_FOR_R200)
+#elif defined(RADEON_R200)
 #define READ_DEPTH( d, _x, _y )						\
   do { \
-    d = s8z24_to_z24s8(*(GLuint*)(r200_depth_4byte(rrb, _x + x_off, _y + y_off)));	\
+    d = LE32_TO_CPU(*(GLuint*)(r200_depth_4byte(rrb, _x + x_off, _y + y_off))); \
   }while(0)
 #else
 #define READ_DEPTH( d, _x, _y )	do {					\
-    d = s8z24_to_z24s8(*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off,	_y + y_off ))); \
+    d = LE32_TO_CPU(*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off))); \
   } while (0)
 #endif
 
-#define TAG(x) radeon##x##_z24_s8
+#define TAG(x) radeon##x##_s8_z24
 #include "depthtmp.h"
 
 /* ================================================================
@@ -674,16 +730,16 @@ do {									\
 
 /* 24 bit depth, 8 bit stencil depthbuffer functions
  */
-#ifdef COMPILE_R300
+#ifdef RADEON_R300
 #define WRITE_STENCIL( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)radeon_ptr_4byte(rrb, _x + x_off, _y + y_off);		\
-   GLuint tmp = *_ptr;				\
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    tmp &= 0xffffff00;							\
    tmp |= (d) & 0xff;							\
-   *_ptr = tmp;					\
+   *_ptr = CPU_TO_LE32(tmp);                                            \
 } while (0)
-#elif defined(RADEON_COMMON_FOR_R600)
+#elif defined(RADEON_R600)
 #define WRITE_STENCIL( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)r600_ptr_stencil(rrb, _x + x_off, _y + y_off);		\
@@ -692,57 +748,57 @@ do {									\
    tmp |= (d) & 0xff;							\
    *_ptr = tmp;					\
 } while (0)
-#elif defined(RADEON_COMMON_FOR_R200)
+#elif defined(RADEON_R200)
 #define WRITE_STENCIL( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)r200_depth_4byte(rrb, _x + x_off, _y + y_off);		\
-   GLuint tmp = *_ptr;				\
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    tmp &= 0x00ffffff;							\
    tmp |= (((d) & 0xff) << 24);						\
-   *_ptr = tmp;					\
+   *_ptr = CPU_TO_LE32(tmp);                                            \
 } while (0)
 #else
 #define WRITE_STENCIL( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)radeon_ptr_4byte(rrb, _x + x_off, _y + y_off);		\
-   GLuint tmp = *_ptr;				\
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    tmp &= 0x00ffffff;							\
    tmp |= (((d) & 0xff) << 24);						\
-   *_ptr = tmp;					\
+   *_ptr = CPU_TO_LE32(tmp);                                            \
 } while (0)
 #endif
 
-#ifdef COMPILE_R300
+#ifdef RADEON_R300
 #define READ_STENCIL( d, _x, _y )					\
 do {									\
    GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
-   GLuint tmp = *_ptr;				\
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    d = tmp & 0x000000ff;						\
 } while (0)
-#elif defined(RADEON_COMMON_FOR_R600)
+#elif defined(RADEON_R600)
 #define READ_STENCIL( d, _x, _y )					\
 do {									\
    GLuint *_ptr = (GLuint*)r600_ptr_stencil( rrb, _x + x_off, _y + y_off );		\
    GLuint tmp = *_ptr;				\
    d = tmp & 0x000000ff;						\
 } while (0)
-#elif defined(RADEON_COMMON_FOR_R200)
+#elif defined(RADEON_R200)
 #define READ_STENCIL( d, _x, _y )					\
 do {									\
    GLuint *_ptr = (GLuint*)r200_depth_4byte( rrb, _x + x_off, _y + y_off );		\
-   GLuint tmp = *_ptr;				\
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    d = (tmp & 0xff000000) >> 24;					\
 } while (0)
 #else
 #define READ_STENCIL( d, _x, _y )					\
 do {									\
    GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
-   GLuint tmp = *_ptr;				\
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    d = (tmp & 0xff000000) >> 24;					\
 } while (0)
 #endif
 
-#define TAG(x) radeon##x##_z24_s8
+#define TAG(x) radeon##x##_s8_z24
 #include "stenciltmp.h"
 
 
@@ -755,8 +811,7 @@ static void map_unmap_rb(struct gl_renderbuffer *rb, int flag)
 		return;
 
 	if (flag) {
-		if (rrb->bo->bom->funcs->bo_wait)
-			radeon_bo_wait(rrb->bo);
+	        radeon_bo_wait(rrb->bo);
 		r = radeon_bo_map(rrb->bo, 1);
 		if (r) {
 			fprintf(stderr, "(%s) error(%d) mapping buffer.\n",
@@ -864,25 +919,35 @@ void radeonInitSpanFuncs(GLcontext * ctx)
  */
 static void radeonSetSpanFunctions(struct radeon_renderbuffer *rrb)
 {
-	if (rrb->base._ActualFormat == GL_RGB5) {
+	if (rrb->base.Format == MESA_FORMAT_RGB565) {
 		radeonInitPointers_RGB565(&rrb->base);
-	} else if (rrb->base._ActualFormat == GL_RGB8) {
+	} else if (rrb->base.Format == MESA_FORMAT_RGB565_REV) {
+		radeonInitPointers_RGB565_REV(&rrb->base);
+	} else if (rrb->base.Format == MESA_FORMAT_XRGB8888) {
 		radeonInitPointers_xRGB8888(&rrb->base);
-	} else if (rrb->base._ActualFormat == GL_RGBA8) {
+        } else if (rrb->base.Format == MESA_FORMAT_XRGB8888_REV) {
+		radeonInitPointers_BGRx8888(&rrb->base);
+	} else if (rrb->base.Format == MESA_FORMAT_ARGB8888) {
 		radeonInitPointers_ARGB8888(&rrb->base);
-	} else if (rrb->base._ActualFormat == GL_RGBA4) {
+        } else if (rrb->base.Format == MESA_FORMAT_ARGB8888_REV) {
+		radeonInitPointers_BGRA8888(&rrb->base);
+	} else if (rrb->base.Format == MESA_FORMAT_ARGB4444) {
 		radeonInitPointers_ARGB4444(&rrb->base);
-	} else if (rrb->base._ActualFormat == GL_RGB5_A1) {
+	} else if (rrb->base.Format == MESA_FORMAT_ARGB4444_REV) {
+		radeonInitPointers_ARGB4444_REV(&rrb->base);
+	} else if (rrb->base.Format == MESA_FORMAT_ARGB1555) {
 		radeonInitPointers_ARGB1555(&rrb->base);
-	} else if (rrb->base._ActualFormat == GL_DEPTH_COMPONENT16) {
+	} else if (rrb->base.Format == MESA_FORMAT_ARGB1555_REV) {
+		radeonInitPointers_ARGB1555_REV(&rrb->base);
+	} else if (rrb->base.Format == MESA_FORMAT_Z16) {
 		radeonInitDepthPointers_z16(&rrb->base);
-	} else if (rrb->base._ActualFormat == GL_DEPTH_COMPONENT24) {
+	} else if (rrb->base.Format == MESA_FORMAT_X8_Z24) {
 		radeonInitDepthPointers_z24(&rrb->base);
-	} else if (rrb->base._ActualFormat == GL_DEPTH24_STENCIL8_EXT) {
-		radeonInitDepthPointers_z24_s8(&rrb->base);
-	} else if (rrb->base._ActualFormat == GL_STENCIL_INDEX8_EXT) {
-		radeonInitStencilPointers_z24_s8(&rrb->base);
+	} else if (rrb->base.Format == MESA_FORMAT_S8_Z24) {
+		radeonInitDepthPointers_s8_z24(&rrb->base);
+	} else if (rrb->base.Format == MESA_FORMAT_S8) {
+		radeonInitStencilPointers_s8_z24(&rrb->base);
 	} else {
-		fprintf(stderr, "radeonSetSpanFunctions: bad actual format: 0x%04X\n", rrb->base._ActualFormat);
+		fprintf(stderr, "radeonSetSpanFunctions: bad format: 0x%04X\n", rrb->base.Format);
 	}
 }
diff --git a/radeon/radeon_state.c b/radeon/radeon_state.c
index 4d0d35e..f6c733a 100644
--- a/radeon/radeon_state.c
+++ b/radeon/radeon_state.c
@@ -550,6 +550,31 @@ static void radeonPolygonOffset( GLcontext *ctx,
    rmesa->hw.zbs.cmd[ZBS_SE_ZBIAS_CONSTANT] = constant.ui32;
 }
 
+static void radeonPolygonStipplePreKMS( GLcontext *ctx, const GLubyte *mask )
+{
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   GLuint i;
+   drm_radeon_stipple_t stipple;
+   /* Store the 32-row stipple mask in driver state and send it via DRM_RADEON_STIPPLE. */
+   /* Must flip pattern upside down: row i of the GL mask lands at 31 - i.
+    * NOTE(review): reads mask as 32 GLuints; assumes suitable alignment. */
+   for ( i = 0 ; i < 32 ; i++ ) {
+      rmesa->state.stipple.mask[31 - i] = ((const GLuint *) mask)[i];
+   }
+
+   /* TODO: push this into cmd mechanism
+    */
+   radeon_firevertices(&rmesa->radeon);
+   LOCK_HARDWARE( &rmesa->radeon );
+
+   /* FIXME: Use window x,y offsets into stipple RAM.
+    * NOTE(review): the drmCommandWrite() result is ignored here. */
+   stipple.mask = rmesa->state.stipple.mask;
+   drmCommandWrite( rmesa->radeon.dri.fd, DRM_RADEON_STIPPLE,
+		    &stipple, sizeof(drm_radeon_stipple_t) );
+   UNLOCK_HARDWARE( &rmesa->radeon );
+}
+
 static void radeonPolygonMode( GLcontext *ctx, GLenum face, GLenum mode )
 {
    r100ContextPtr rmesa = R100_CONTEXT(ctx);
diff --git a/radeon/radeon_state_init.c b/radeon/radeon_state_init.c
index f3ad0dd..dd82888 100644
--- a/radeon/radeon_state_init.c
+++ b/radeon/radeon_state_init.c
@@ -440,16 +440,18 @@ static void ctx_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
    atom->cmd[CTX_RB3D_CNTL] &= ~(0xf << 10);
    if (rrb->cpp == 4)
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB8888;
-   else switch (rrb->base._ActualFormat) {
-   case GL_RGB5:
+   else switch (rrb->base.Format) {
+   case MESA_FORMAT_RGB565:
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_RGB565;
 	break;
-   case GL_RGBA4:
+   case MESA_FORMAT_ARGB4444:
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB4444;
 	break;
-   case GL_RGB5_A1:
+   case MESA_FORMAT_ARGB1555:
 	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB1555;
 	break;
+   default:
+	_mesa_problem(ctx, "unexpected format in ctx_emit_cs()");
    }
 
    cbpitch = (rrb->pitch / rrb->cpp);
@@ -643,11 +645,11 @@ static void tex_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
      OUT_BATCH(CP_PACKET0(RADEON_PP_TXOFFSET_0 + (24 * i), 0));
      if (t->mt && !t->image_override) {
         if ((ctx->Texture.Unit[i]._ReallyEnabled & TEXTURE_CUBE_BIT)) {
-            lvl = &t->mt->levels[0];
+            lvl = &t->mt->levels[t->minLod];
 	    OUT_BATCH_RELOC(lvl->faces[5].offset, t->mt->bo, lvl->faces[5].offset,
 			RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
         } else {
-           OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
+           OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, get_base_teximage_offset(t),
 		     RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
         }
       } else {
diff --git a/radeon/radeon_tex.c b/radeon/radeon_tex.c
index 99865ff..749ab75 100644
--- a/radeon/radeon_tex.c
+++ b/radeon/radeon_tex.c
@@ -38,7 +38,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/enums.h"
 #include "main/image.h"
 #include "main/simple_list.h"
-#include "main/texformat.h"
 #include "main/texstore.h"
 #include "main/teximage.h"
 #include "main/texobj.h"
@@ -349,17 +348,7 @@ static void radeonTexParameter( GLcontext *ctx, GLenum target,
    case GL_TEXTURE_MAX_LEVEL:
    case GL_TEXTURE_MIN_LOD:
    case GL_TEXTURE_MAX_LOD:
-
-      /* This isn't the most efficient solution but there doesn't appear to
-       * be a nice alternative.  Since there's no LOD clamping,
-       * we just have to rely on loading the right subset of mipmap levels
-       * to simulate a clamped LOD.
-       */
-      if (t->mt) {
-         radeon_miptree_unreference(t->mt);
-	 t->mt = 0;
-	 t->validated = GL_FALSE;
-      }
+      t->validated = GL_FALSE;
       break;
 
    default:
@@ -389,10 +378,8 @@ static void radeonDeleteTexture( GLcontext *ctx,
      }
    }
 
-   if (t->mt) {
-      radeon_miptree_unreference(t->mt);
-      t->mt = 0;
-   }
+   radeon_miptree_unreference(&t->mt);
+
    /* Free mipmap images and the texture object itself */
    _mesa_delete_texture_object(ctx, texObj);
 }
diff --git a/radeon/radeon_texstate.c b/radeon/radeon_texstate.c
index 9d252aa..3cbe3b4 100644
--- a/radeon/radeon_texstate.c
+++ b/radeon/radeon_texstate.c
@@ -38,8 +38,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/colormac.h"
 #include "main/context.h"
 #include "main/macros.h"
-#include "main/texformat.h"
 #include "main/teximage.h"
+#include "main/texstate.h"
 #include "main/texobj.h"
 #include "main/enums.h"
 
@@ -81,8 +81,10 @@ struct tx_table {
    GLuint format, filter;
 };
 
+/* XXX verify this table against MESA_FORMAT_x values */
 static const struct tx_table tx_table[] =
 {
+   _INVALID(NONE), /* MESA_FORMAT_NONE */
    _ALPHA(RGBA8888),
    _ALPHA_REV(RGBA8888),
    _ALPHA(ARGB8888),
@@ -660,7 +662,7 @@ void radeonSetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_
 	rmesa = pDRICtx->driverPrivate;
 
 	rfb = dPriv->driverPrivate;
-        texUnit = &radeon->glCtx->Texture.Unit[radeon->glCtx->Texture.CurrentUnit];
+        texUnit = _mesa_get_current_tex_unit(radeon->glCtx);
 	texObj = _mesa_select_tex_object(radeon->glCtx, texUnit, target);
         texImage = _mesa_get_tex_image(radeon->glCtx, texObj, target, 0);
 
@@ -697,20 +699,14 @@ void radeonSetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_
 		radeon_bo_unref(rImage->bo);
 		rImage->bo = NULL;
 	}
-	if (t->mt) {
-		radeon_miptree_unreference(t->mt);
-		t->mt = NULL;
-	}
-	if (rImage->mt) {
-		radeon_miptree_unreference(rImage->mt);
-		rImage->mt = NULL;
-	}
+
+	radeon_miptree_unreference(&t->mt);
+	radeon_miptree_unreference(&rImage->mt);
+
 	_mesa_init_teximage_fields(radeon->glCtx, target, texImage,
 				   rb->base.Width, rb->base.Height, 1, 0, rb->cpp);
 	texImage->RowStride = rb->pitch / rb->cpp;
-	texImage->TexFormat = radeonChooseTextureFormat(radeon->glCtx,
-							internalFormat,
-							type, format, 0);
+
 	rImage->bo = rb->bo;
 	radeon_bo_ref(rImage->bo);
 	t->bo = rb->bo;
@@ -718,8 +714,6 @@ void radeonSetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_
 	t->tile_bits = 0;
 	t->image_override = GL_TRUE;
 	t->override_offset = 0;
-	t->pp_txpitch &= (1 << 13) -1;
-	pitch_val = rb->pitch;
 	switch (rb->cpp) {
 	case 4:
 		if (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT)
@@ -738,12 +732,17 @@ void radeonSetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_
 		t->pp_txfilter |= tx_table[MESA_FORMAT_RGB565].filter;
 		break;
 	}
-        t->pp_txsize = ((rb->base.Width - 1) << RADEON_TEX_USIZE_SHIFT)
-		   | ((rb->base.Height - 1) << RADEON_TEX_VSIZE_SHIFT);
-        t->pp_txformat |= RADEON_TXFORMAT_NON_POWER2;
-	t->pp_txpitch = pitch_val;
-        t->pp_txpitch -= 32;
 
+	t->pp_txpitch &= (1 << 13) -1;
+	pitch_val = rb->pitch;
+
+        t->pp_txsize = ((rb->base.Width - 1) << RADEON_TEX_USIZE_SHIFT)
+		| ((rb->base.Height - 1) << RADEON_TEX_VSIZE_SHIFT);
+	if (target == GL_TEXTURE_RECTANGLE_NV) {
+		t->pp_txformat |= RADEON_TXFORMAT_NON_POWER2;
+		t->pp_txpitch = pitch_val;
+		t->pp_txpitch -= 32;
+	}
 	t->validated = GL_TRUE;
 	_mesa_unlock_texture(radeon->glCtx, texObj);
 	return;
@@ -833,11 +832,14 @@ static void import_tex_obj_state( r100ContextPtr rmesa,
    cmd[TEX_PP_TXFORMAT] |= texobj->pp_txformat & TEXOBJ_TXFORMAT_MASK;
    cmd[TEX_PP_BORDER_COLOR] = texobj->pp_border_color;
 
-   if (texobj->base.Target == GL_TEXTURE_RECTANGLE_NV) {
-      GLuint *txr_cmd = RADEON_DB_STATE( txr[unit] );
+   if (texobj->pp_txformat & RADEON_TXFORMAT_NON_POWER2) {
+      uint32_t *txr_cmd = &rmesa->hw.txr[unit].cmd[TXR_CMD_0];
       txr_cmd[TXR_PP_TEX_SIZE] = texobj->pp_txsize; /* NPOT only! */
       txr_cmd[TXR_PP_TEX_PITCH] = texobj->pp_txpitch; /* NPOT only! */
-      RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.txr[unit] );
+      RADEON_STATECHANGE( rmesa, txr[unit] );
+   }
+
+   if (texobj->base.Target == GL_TEXTURE_RECTANGLE_NV) {
       se_coord_fmt |= RADEON_VTX_ST0_NONPARAMETRIC << unit;
    }
    else {
@@ -1018,7 +1020,7 @@ static GLboolean setup_hardware_state(r100ContextPtr rmesa, radeonTexObj *t, int
 	return GL_TRUE;
    }
 
-   firstImage = t->base.Image[0][t->mt->firstLevel];   
+   firstImage = t->base.Image[0][t->minLod];
 
    if (firstImage->Border > 0) {
       fprintf(stderr, "%s: border\n", __FUNCTION__);
@@ -1028,27 +1030,27 @@ static GLboolean setup_hardware_state(r100ContextPtr rmesa, radeonTexObj *t, int
    log2Width  = firstImage->WidthLog2;
    log2Height = firstImage->HeightLog2;
    log2Depth  = firstImage->DepthLog2;
-   texelBytes = firstImage->TexFormat->TexelBytes;
+   texelBytes = _mesa_get_format_bytes(firstImage->TexFormat);
 
    if (!t->image_override) {
-      if (VALID_FORMAT(firstImage->TexFormat->MesaFormat)) {
+      if (VALID_FORMAT(firstImage->TexFormat)) {
 	const struct tx_table *table = tx_table;
 
 	 t->pp_txformat &= ~(RADEON_TXFORMAT_FORMAT_MASK |
 			     RADEON_TXFORMAT_ALPHA_IN_MAP);
 	 t->pp_txfilter &= ~RADEON_YUV_TO_RGB;	 
 	 
-	 t->pp_txformat |= table[ firstImage->TexFormat->MesaFormat ].format;
-	 t->pp_txfilter |= table[ firstImage->TexFormat->MesaFormat ].filter;
+	 t->pp_txformat |= table[ firstImage->TexFormat ].format;
+	 t->pp_txfilter |= table[ firstImage->TexFormat ].filter;
       } else {
 	 _mesa_problem(NULL, "unexpected texture format in %s",
 		       __FUNCTION__);
 	 return GL_FALSE;
       }
    }
-   
+
    t->pp_txfilter &= ~RADEON_MAX_MIP_LEVEL_MASK;
-   t->pp_txfilter |= (t->mt->lastLevel - t->mt->firstLevel) << RADEON_MAX_MIP_LEVEL_SHIFT;
+   t->pp_txfilter |= (t->maxLod - t->minLod) << RADEON_MAX_MIP_LEVEL_SHIFT;
 	
    t->pp_txformat &= ~(RADEON_TXFORMAT_WIDTH_MASK |
 		       RADEON_TXFORMAT_HEIGHT_MASK |
@@ -1057,9 +1059,9 @@ static GLboolean setup_hardware_state(r100ContextPtr rmesa, radeonTexObj *t, int
 		       RADEON_TXFORMAT_F5_HEIGHT_MASK);
    t->pp_txformat |= ((log2Width << RADEON_TXFORMAT_WIDTH_SHIFT) |
 		      (log2Height << RADEON_TXFORMAT_HEIGHT_SHIFT));
-   
+
    t->tile_bits = 0;
-   
+
    if (t->base.Target == GL_TEXTURE_CUBE_MAP) {
       ASSERT(log2Width == log2Height);
       t->pp_txformat |= ((log2Width << RADEON_TXFORMAT_F5_WIDTH_SHIFT) |
@@ -1080,7 +1082,7 @@ static GLboolean setup_hardware_state(r100ContextPtr rmesa, radeonTexObj *t, int
 		   | ((firstImage->Height - 1) << RADEON_TEX_VSIZE_SHIFT));
 
    if ( !t->image_override ) {
-      if (firstImage->IsCompressed)
+      if (_mesa_is_format_compressed(firstImage->TexFormat))
          t->pp_txpitch = (firstImage->Width + 63) & ~(63);
       else
          t->pp_txpitch = ((firstImage->Width * texelBytes) + 63) & ~(63);
@@ -1114,7 +1116,6 @@ static GLboolean radeon_validate_texture(GLcontext *ctx, struct gl_texture_objec
    RADEON_STATECHANGE( rmesa, ctx );
    rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= 
      (RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE) << unit;
-
    RADEON_STATECHANGE( rmesa, tcl );
    rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_ST_BIT(unit);
 
diff --git a/radeon/radeon_texture.c b/radeon/radeon_texture.c
index fad3d1c..0317811 100644
--- a/radeon/radeon_texture.c
+++ b/radeon/radeon_texture.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (C) 2009 Maciej Cencora.
  * Copyright (C) 2008 Nicolai Haehnle.
  * Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
  *
@@ -34,7 +35,6 @@
 #include "main/convolve.h"
 #include "main/mipmap.h"
 #include "main/texcompress.h"
-#include "main/texformat.h"
 #include "main/texstore.h"
 #include "main/teximage.h"
 #include "main/texobj.h"
@@ -47,7 +47,7 @@
 #include "radeon_mipmap_tree.h"
 
 
-static void copy_rows(void* dst, GLuint dststride, const void* src, GLuint srcstride,
+void copy_rows(void* dst, GLuint dststride, const void* src, GLuint srcstride,
 	GLuint numrows, GLuint rowsize)
 {
 	assert(rowsize <= dststride);
@@ -82,8 +82,7 @@ void radeonFreeTexImageData(GLcontext *ctx, struct gl_texture_image *timage)
 	radeon_texture_image* image = get_radeon_texture_image(timage);
 
 	if (image->mt) {
-		radeon_miptree_unreference(image->mt);
-		image->mt = 0;
+		radeon_miptree_unreference(&image->mt);
 		assert(!image->base.Data);
 	} else {
 		_mesa_free_texture_image_data(ctx, timage);
@@ -101,10 +100,15 @@ void radeonFreeTexImageData(GLcontext *ctx, struct gl_texture_image *timage)
 /* Set Data pointer and additional data for mapped texture image */
 static void teximage_set_map_data(radeon_texture_image *image)
 {
-	radeon_mipmap_level *lvl = &image->mt->levels[image->mtlevel];
+	radeon_mipmap_level *lvl;
+
+	if (!image->mt)
+		return;
+
+	lvl = &image->mt->levels[image->mtlevel];
 
 	image->base.Data = image->mt->bo->ptr + lvl->faces[image->mtface].offset;
-	image->base.RowStride = lvl->rowstride / image->mt->bpp;
+	image->base.RowStride = lvl->rowstride / _mesa_get_format_bytes(image->base.TexFormat);
 }
 
 
@@ -139,7 +143,6 @@ static void map_override(GLcontext *ctx, radeonTexObj *t)
 	radeon_bo_map(t->bo, GL_FALSE);
 
 	img->base.Data = t->bo->ptr;
-	_mesa_set_fetch_functions(&img->base, 2);
 }
 
 static void unmap_override(GLcontext *ctx, radeonTexObj *t)
@@ -171,7 +174,7 @@ void radeonMapTexture(GLcontext *ctx, struct gl_texture_object *texObj)
 
 	radeon_bo_map(t->mt->bo, GL_FALSE);
 	for(face = 0; face < t->mt->faces; ++face) {
-		for(level = t->mt->firstLevel; level <= t->mt->lastLevel; ++level)
+		for(level = t->minLod; level <= t->maxLod; ++level)
 			teximage_set_map_data(get_radeon_texture_image(texObj->Image[face][level]));
 	}
 }
@@ -188,7 +191,7 @@ void radeonUnmapTexture(GLcontext *ctx, struct gl_texture_object *texObj)
 	  return;
 
 	for(face = 0; face < t->mt->faces; ++face) {
-		for(level = t->mt->firstLevel; level <= t->mt->lastLevel; ++level)
+		for(level = t->minLod; level <= t->maxLod; ++level)
 			texObj->Image[face][level]->Data = 0;
 	}
 	radeon_bo_unmap(t->mt->bo);
@@ -237,8 +240,7 @@ static void radeon_generate_mipmap(GLcontext *ctx, GLenum target,
 			image->mtlevel = i;
 			image->mtface = face;
 
-			radeon_miptree_unreference(image->mt);
-			image->mt = NULL;
+			radeon_miptree_unreference(&image->mt);
 		}
 	}
 	
@@ -256,9 +258,9 @@ void radeonGenerateMipmap(GLcontext* ctx, GLenum target, struct gl_texture_objec
 
 
 /* try to find a format which will only need a memcopy */
-static const struct gl_texture_format *radeonChoose8888TexFormat(radeonContextPtr rmesa,
-								 GLenum srcFormat,
-								 GLenum srcType, GLboolean fbo)
+static gl_format radeonChoose8888TexFormat(radeonContextPtr rmesa,
+					   GLenum srcFormat,
+					   GLenum srcType, GLboolean fbo)
 {
 	const GLuint ui = 1;
 	const GLubyte littleEndian = *((const GLubyte *)&ui);
@@ -271,37 +273,37 @@ static const struct gl_texture_format *radeonChoose8888TexFormat(radeonContextPt
 	    (srcFormat == GL_RGBA && srcType == GL_UNSIGNED_BYTE && !littleEndian) ||
 	    (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_INT_8_8_8_8_REV) ||
 	    (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_BYTE && littleEndian)) {
-		return &_mesa_texformat_rgba8888;
+		return MESA_FORMAT_RGBA8888;
 	} else if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8_REV) ||
 		   (srcFormat == GL_RGBA && srcType == GL_UNSIGNED_BYTE && littleEndian) ||
 		   (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_INT_8_8_8_8) ||
 		   (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_BYTE && !littleEndian)) {
-		return &_mesa_texformat_rgba8888_rev;
+		return MESA_FORMAT_RGBA8888_REV;
 	} else if (IS_R200_CLASS(rmesa->radeonScreen)) {
 		return _dri_texformat_argb8888;
 	} else if (srcFormat == GL_BGRA && ((srcType == GL_UNSIGNED_BYTE && !littleEndian) ||
 					    srcType == GL_UNSIGNED_INT_8_8_8_8)) {
-		return &_mesa_texformat_argb8888_rev;
+		return MESA_FORMAT_ARGB8888_REV;
 	} else if (srcFormat == GL_BGRA && ((srcType == GL_UNSIGNED_BYTE && littleEndian) ||
 					    srcType == GL_UNSIGNED_INT_8_8_8_8_REV)) {
-		return &_mesa_texformat_argb8888;
+		return MESA_FORMAT_ARGB8888;
 	} else
 		return _dri_texformat_argb8888;
 }
 
-const struct gl_texture_format *radeonChooseTextureFormat_mesa(GLcontext * ctx,
-							  GLint internalFormat,
-							  GLenum format,
-							  GLenum type)
+gl_format radeonChooseTextureFormat_mesa(GLcontext * ctx,
+					 GLint internalFormat,
+					 GLenum format,
+					 GLenum type)
 {
 	return radeonChooseTextureFormat(ctx, internalFormat, format,
 					 type, 0);
 }
 
-const struct gl_texture_format *radeonChooseTextureFormat(GLcontext * ctx,
-							  GLint internalFormat,
-							  GLenum format,
-							  GLenum type, GLboolean fbo)
+gl_format radeonChooseTextureFormat(GLcontext * ctx,
+				    GLint internalFormat,
+				    GLenum format,
+				    GLenum type, GLboolean fbo)
 {
 	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
 	const GLboolean do32bpt =
@@ -425,58 +427,72 @@ const struct gl_texture_format *radeonChooseTextureFormat(GLcontext * ctx,
 	case GL_YCBCR_MESA:
 		if (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
 		    type == GL_UNSIGNED_BYTE)
-			return &_mesa_texformat_ycbcr;
+			return MESA_FORMAT_YCBCR;
 		else
-			return &_mesa_texformat_ycbcr_rev;
+			return MESA_FORMAT_YCBCR_REV;
 
 	case GL_RGB_S3TC:
 	case GL_RGB4_S3TC:
 	case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
-		return &_mesa_texformat_rgb_dxt1;
+		return MESA_FORMAT_RGB_DXT1;
 
 	case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
-		return &_mesa_texformat_rgba_dxt1;
+		return MESA_FORMAT_RGBA_DXT1;
 
 	case GL_RGBA_S3TC:
 	case GL_RGBA4_S3TC:
 	case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
-		return &_mesa_texformat_rgba_dxt3;
+		return MESA_FORMAT_RGBA_DXT3;
 
 	case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
-		return &_mesa_texformat_rgba_dxt5;
+		return MESA_FORMAT_RGBA_DXT5;
 
 	case GL_ALPHA16F_ARB:
-		return &_mesa_texformat_alpha_float16;
+		return MESA_FORMAT_ALPHA_FLOAT16;
 	case GL_ALPHA32F_ARB:
-		return &_mesa_texformat_alpha_float32;
+		return MESA_FORMAT_ALPHA_FLOAT32;
 	case GL_LUMINANCE16F_ARB:
-		return &_mesa_texformat_luminance_float16;
+		return MESA_FORMAT_LUMINANCE_FLOAT16;
 	case GL_LUMINANCE32F_ARB:
-		return &_mesa_texformat_luminance_float32;
+		return MESA_FORMAT_LUMINANCE_FLOAT32;
 	case GL_LUMINANCE_ALPHA16F_ARB:
-		return &_mesa_texformat_luminance_alpha_float16;
+		return MESA_FORMAT_LUMINANCE_ALPHA_FLOAT16;
 	case GL_LUMINANCE_ALPHA32F_ARB:
-		return &_mesa_texformat_luminance_alpha_float32;
+		return MESA_FORMAT_LUMINANCE_ALPHA_FLOAT32;
 	case GL_INTENSITY16F_ARB:
-		return &_mesa_texformat_intensity_float16;
+		return MESA_FORMAT_INTENSITY_FLOAT16;
 	case GL_INTENSITY32F_ARB:
-		return &_mesa_texformat_intensity_float32;
+		return MESA_FORMAT_INTENSITY_FLOAT32;
 	case GL_RGB16F_ARB:
-		return &_mesa_texformat_rgba_float16;
+		return MESA_FORMAT_RGBA_FLOAT16;
 	case GL_RGB32F_ARB:
-		return &_mesa_texformat_rgba_float32;
+		return MESA_FORMAT_RGBA_FLOAT32;
 	case GL_RGBA16F_ARB:
-		return &_mesa_texformat_rgba_float16;
+		return MESA_FORMAT_RGBA_FLOAT16;
 	case GL_RGBA32F_ARB:
-		return &_mesa_texformat_rgba_float32;
+		return MESA_FORMAT_RGBA_FLOAT32;
 
+#ifdef RADEON_R300
+	case GL_DEPTH_COMPONENT:
+	case GL_DEPTH_COMPONENT16:
+		return MESA_FORMAT_Z16;
+	case GL_DEPTH_COMPONENT24:
+	case GL_DEPTH_COMPONENT32:
+	case GL_DEPTH_STENCIL_EXT:
+	case GL_DEPTH24_STENCIL8_EXT:
+		if (rmesa->radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+			return MESA_FORMAT_S8_Z24;
+		else
+			return MESA_FORMAT_Z16;
+#else
 	case GL_DEPTH_COMPONENT:
 	case GL_DEPTH_COMPONENT16:
 	case GL_DEPTH_COMPONENT24:
 	case GL_DEPTH_COMPONENT32:
 	case GL_DEPTH_STENCIL_EXT:
 	case GL_DEPTH24_STENCIL8_EXT:
-		return &_mesa_texformat_s8_z24;
+		return MESA_FORMAT_S8_Z24;
+#endif
 
 	/* EXT_texture_sRGB */
 	case GL_SRGB:
@@ -485,26 +501,193 @@ const struct gl_texture_format *radeonChooseTextureFormat(GLcontext * ctx,
 	case GL_SRGB8_ALPHA8:
 	case GL_COMPRESSED_SRGB:
 	case GL_COMPRESSED_SRGB_ALPHA:
-		return &_mesa_texformat_srgba8;
+		return MESA_FORMAT_SRGBA8;
 
 	case GL_SLUMINANCE:
 	case GL_SLUMINANCE8:
 	case GL_COMPRESSED_SLUMINANCE:
-		return &_mesa_texformat_sl8;
+		return MESA_FORMAT_SL8;
 
 	case GL_SLUMINANCE_ALPHA:
 	case GL_SLUMINANCE8_ALPHA8:
 	case GL_COMPRESSED_SLUMINANCE_ALPHA:
-		return &_mesa_texformat_sla8;
+		return MESA_FORMAT_SLA8;
 
 	default:
 		_mesa_problem(ctx,
 			      "unexpected internalFormat 0x%x in %s",
 			      (int)internalFormat, __func__);
+		return MESA_FORMAT_NONE;
+	}
+
+	return MESA_FORMAT_NONE;		/* never get here */
+}
+
+/** Check if given image is valid within current texture object.
+ */
+static int image_matches_texture_obj(struct gl_texture_object *texObj,
+	struct gl_texture_image *texImage,
+	unsigned level)
+{
+	const struct gl_texture_image *baseImage = texObj->Image[0][texObj->BaseLevel];
+
+	if (!baseImage)
+		return 0;
+
+	if (level < texObj->BaseLevel || level > texObj->MaxLevel)
+		return 0;
+
+	const unsigned levelDiff = level - texObj->BaseLevel;
+	const unsigned refWidth = MAX2(baseImage->Width >> levelDiff, 1);
+	const unsigned refHeight = MAX2(baseImage->Height >> levelDiff, 1);
+	const unsigned refDepth = MAX2(baseImage->Depth >> levelDiff, 1);
+
+	return (texImage->Width == refWidth &&
+			texImage->Height == refHeight &&
+			texImage->Depth == refDepth);
+}
+
+static void teximage_assign_miptree(radeonContextPtr rmesa,
+	struct gl_texture_object *texObj,
+	struct gl_texture_image *texImage,
+	unsigned face,
+	unsigned level)
+{
+	radeonTexObj *t = radeon_tex_obj(texObj);
+	radeon_texture_image* image = get_radeon_texture_image(texImage);
+
+	/* Since miptree holds only images for levels <BaseLevel..MaxLevel>
+	 * don't allocate the miptree if the teximage won't fit.
+	 */
+	if (!image_matches_texture_obj(texObj, texImage, level))
+		return;
+
+	/* Try using current miptree, or create new if there isn't any */
+	if (!t->mt || !radeon_miptree_matches_image(t->mt, texImage, face, level)) {
+		radeon_miptree_unreference(&t->mt);
+		radeon_try_alloc_miptree(rmesa, t);
+		if (RADEON_DEBUG & RADEON_TEXTURE) {
+			fprintf(stderr, "%s: texObj %p, texImage %p, face %d, level %d, "
+				"texObj miptree doesn't match, allocated new miptree %p\n",
+				__FUNCTION__, texObj, texImage, face, level, t->mt);
+		}
+	}
+
+	/* Miptree allocation may have failed,
+	 * when there was no image for baselevel specified */
+	if (t->mt) {
+		image->mtface = face;
+		image->mtlevel = level;
+		radeon_miptree_reference(t->mt, &image->mt);
+	}
+}
+
+static GLuint * allocate_image_offsets(GLcontext *ctx,
+	unsigned alignedWidth,
+	unsigned height,
+	unsigned depth)
+{
+	int i;
+	GLuint *offsets;
+
+	offsets = _mesa_malloc(depth * sizeof(GLuint)) ;
+	if (!offsets) {
+		_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTex[Sub]Image");
 		return NULL;
 	}
 
-	return NULL;		/* never get here */
+	for (i = 0; i < depth; ++i) {
+		offsets[i] = alignedWidth * height * i;
+	}
+
+	return offsets;
+}
+
+/**
+ * Update a subregion of the given texture image.
+ */
+static void radeon_store_teximage(GLcontext* ctx, int dims,
+		GLint xoffset, GLint yoffset, GLint zoffset,
+		GLsizei width, GLsizei height, GLsizei depth,
+		GLsizei imageSize,
+		GLenum format, GLenum type,
+		const GLvoid * pixels,
+		const struct gl_pixelstore_attrib *packing,
+		struct gl_texture_object *texObj,
+		struct gl_texture_image *texImage,
+		int compressed)
+{
+	radeonTexObj *t = radeon_tex_obj(texObj);
+	radeon_texture_image* image = get_radeon_texture_image(texImage);
+
+	GLuint dstRowStride;
+	GLuint *dstImageOffsets;
+
+	if (image->mt) {
+		dstRowStride = image->mt->levels[image->mtlevel].rowstride;
+	} else if (t->bo) {
+		/* TFP case */
+		/* TODO */
+		assert(0);
+	} else {
+		dstRowStride = _mesa_format_row_stride(texImage->TexFormat, texImage->Width);
+	}
+
+	assert(dstRowStride);
+
+	if (dims == 3) {
+		unsigned alignedWidth = dstRowStride/_mesa_get_format_bytes(texImage->TexFormat);
+		dstImageOffsets = allocate_image_offsets(ctx, alignedWidth, texImage->Height, texImage->Depth);
+		if (!dstImageOffsets) {
+			return;
+		}
+	} else {
+		dstImageOffsets = texImage->ImageOffsets;
+	}
+
+	radeon_teximage_map(image, GL_TRUE);
+
+	if (compressed) {
+		uint32_t srcRowStride, bytesPerRow, rows, block_width, block_height;
+		GLubyte *img_start;
+
+		_mesa_get_format_block_size(texImage->TexFormat, &block_width, &block_height);
+
+		if (!image->mt) {
+			dstRowStride = _mesa_format_row_stride(texImage->TexFormat, texImage->Width);
+			img_start = _mesa_compressed_image_address(xoffset, yoffset, 0,
+									texImage->TexFormat,
+									texImage->Width, texImage->Data);
+		}
+		else {
+			uint32_t offset;
+			offset = dstRowStride / _mesa_get_format_bytes(texImage->TexFormat) * yoffset / block_height + xoffset / block_width;
+			offset *= _mesa_get_format_bytes(texImage->TexFormat);
+			img_start = texImage->Data + offset;
+		}
+		srcRowStride = _mesa_format_row_stride(texImage->TexFormat, width);
+		bytesPerRow = srcRowStride;
+		rows = (height + block_height - 1) / block_height;
+
+		copy_rows(img_start, dstRowStride, pixels, srcRowStride, rows, bytesPerRow);
+	}
+	else {
+		if (!_mesa_texstore(ctx, dims, texImage->_BaseFormat,
+					texImage->TexFormat, texImage->Data,
+					xoffset, yoffset, zoffset,
+					dstRowStride,
+					dstImageOffsets,
+					width, height, depth,
+					format, type, pixels, packing)) {
+			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage");
+		}
+	}
+
+	if (dims == 3) {
+		_mesa_free(dstImageOffsets);
+	}
+
+	radeon_teximage_unmap(image);
 }
 
 /**
@@ -525,13 +708,22 @@ static void radeon_teximage(
 	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
 	radeonTexObj* t = radeon_tex_obj(texObj);
 	radeon_texture_image* image = get_radeon_texture_image(texImage);
-	GLuint dstRowStride;
 	GLint postConvWidth = width;
 	GLint postConvHeight = height;
-	GLuint texelBytes;
 	GLuint face = radeon_face_for_target(target);
 
-	radeon_firevertices(rmesa);
+	{
+		struct radeon_bo *bo;
+		bo = !image->mt ? image->bo : image->mt->bo;
+		if (bo && radeon_bo_is_referenced_by_cs(bo, rmesa->cmdbuf.cs)) {
+			radeon_firevertices(rmesa);
+		}
+	}
+
+	if (RADEON_DEBUG & RADEON_TEXTURE) {
+		fprintf(stderr, "radeon_teximage%dd: texObj %p, texImage %p, face %d, level %d\n",
+				dims, texObj, texImage, face, level);
+	}
 
 	t->validated = GL_FALSE;
 
@@ -540,62 +732,35 @@ static void radeon_teximage(
 						  &postConvHeight);
 	}
 
-	/* Choose and fill in the texture format for this image */
-	texImage->TexFormat = radeonChooseTextureFormat(ctx, internalFormat, format, type, 0);
-	_mesa_set_fetch_functions(texImage, dims);
-
-	if (texImage->TexFormat->TexelBytes == 0) {
-		texelBytes = 0;
-		texImage->IsCompressed = GL_TRUE;
-		texImage->CompressedSize =
-			ctx->Driver.CompressedTextureSize(ctx, texImage->Width,
-					   texImage->Height, texImage->Depth,
-					   texImage->TexFormat->MesaFormat);
-	} else {
-		texImage->IsCompressed = GL_FALSE;
-		texImage->CompressedSize = 0;
-
-		texelBytes = texImage->TexFormat->TexelBytes;
+	if (!_mesa_is_format_compressed(texImage->TexFormat)) {
+		GLuint texelBytes = _mesa_get_format_bytes(texImage->TexFormat);
 		/* Minimum pitch of 32 bytes */
 		if (postConvWidth * texelBytes < 32) {
-		  postConvWidth = 32 / texelBytes;
-		  texImage->RowStride = postConvWidth;
+			postConvWidth = 32 / texelBytes;
+			texImage->RowStride = postConvWidth;
 		}
-		if (!image->mt) {      
+		if (!image->mt) {
 			assert(texImage->RowStride == postConvWidth);
 		}
 	}
 
-	/* Allocate memory for image */
-	radeonFreeTexImageData(ctx, texImage); /* Mesa core only clears texImage->Data but not image->mt */
-
-	if (t->mt &&
-	    t->mt->firstLevel == level &&
-	    t->mt->lastLevel == level &&
-	    t->mt->target != GL_TEXTURE_CUBE_MAP_ARB &&
-	    !radeon_miptree_matches_image(t->mt, texImage, face, level)) {
-	  radeon_miptree_unreference(t->mt);
-	  t->mt = NULL;
-	}
-
-	if (!t->mt)
-		radeon_try_alloc_miptree(rmesa, t, image, face, level);
-	if (t->mt && radeon_miptree_matches_image(t->mt, texImage, face, level)) {
-		radeon_mipmap_level *lvl;
-		image->mt = t->mt;
-		image->mtlevel = level - t->mt->firstLevel;
-		image->mtface = face;
-		radeon_miptree_reference(t->mt);
-		lvl = &image->mt->levels[image->mtlevel];
-		dstRowStride = lvl->rowstride;
-	} else {
-		int size;
-		if (texImage->IsCompressed) {
-			size = texImage->CompressedSize;
-		} else {
-			size = texImage->Width * texImage->Height * texImage->Depth * texImage->TexFormat->TexelBytes;
+	/* Mesa core only clears texImage->Data but not image->mt */
+	radeonFreeTexImageData(ctx, texImage);
+
+	if (!t->bo) {
+		teximage_assign_miptree(rmesa, texObj, texImage, face, level);
+		if (!image->mt) {
+			int size = _mesa_format_image_size(texImage->TexFormat,
+								texImage->Width,
+								texImage->Height,
+								texImage->Depth);
+			texImage->Data = _mesa_alloc_texmemory(size);
+			if (RADEON_DEBUG & RADEON_TEXTURE) {
+				fprintf(stderr, "radeon_teximage%dd: texObj %p, texImage %p, "
+					" no miptree assigned, using local memory %p\n",
+					dims, texObj, texImage, texImage->Data);
+			}
 		}
-		texImage->Data = _mesa_alloc_texmemory(size);
 	}
 
 	/* Upload texture image; note that the spec allows pixels to be NULL */
@@ -609,69 +774,16 @@ static void radeon_teximage(
 	}
 
 	if (pixels) {
-		radeon_teximage_map(image, GL_TRUE);
-		if (compressed) {
-			if (image->mt) {
-				uint32_t srcRowStride, bytesPerRow, rows;
-				srcRowStride = _mesa_compressed_row_stride(texImage->TexFormat->MesaFormat, width);
-				bytesPerRow = srcRowStride;
-				rows = (height + 3) / 4;
-				copy_rows(texImage->Data, image->mt->levels[level].rowstride,
-					  pixels, srcRowStride, rows, bytesPerRow);
-			} else {
-				memcpy(texImage->Data, pixels, imageSize);
-			}
-		} else {
-			GLuint dstRowStride;
-			GLuint *dstImageOffsets;
-
-			if (image->mt) {
-				radeon_mipmap_level *lvl = &image->mt->levels[image->mtlevel];
-				dstRowStride = lvl->rowstride;
-			} else {
-				dstRowStride = texImage->Width * texImage->TexFormat->TexelBytes;
-			}
-
-			if (dims == 3) {
-				int i;
-
-				dstImageOffsets = _mesa_malloc(depth * sizeof(GLuint)) ;
-				if (!dstImageOffsets)
-					_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage");
-
-				for (i = 0; i < depth; ++i) {
-					dstImageOffsets[i] = dstRowStride/texImage->TexFormat->TexelBytes * height * i;
-				}
-			} else {
-				dstImageOffsets = texImage->ImageOffsets;
-			}
-
-			if (!texImage->TexFormat->StoreImage(ctx, dims,
-						texImage->_BaseFormat,
-						texImage->TexFormat,
-						texImage->Data, 0, 0, 0, /* dstX/Y/Zoffset */
-						dstRowStride,
-						dstImageOffsets,
-						width, height, depth,
-						format, type, pixels, packing))
-				_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage");
-
-			if (dims == 3)
-				_mesa_free(dstImageOffsets);
-		}
-
-		/* SGIS_generate_mipmap */
-		if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-			radeon_generate_mipmap(ctx, target, texObj);
-		}
+		radeon_store_teximage(ctx, dims,
+			0, 0, 0,
+			width, height, depth,
+			imageSize, format, type,
+			pixels, packing,
+			texObj, texImage,
+			compressed);
 	}
 
 	_mesa_unmap_teximage_pbo(ctx, packing);
-
-	if (pixels)
-	  radeon_teximage_unmap(image);
-
-
 }
 
 void radeonTexImage1D(GLcontext * ctx, GLenum target, GLint level,
@@ -724,7 +836,7 @@ void radeonTexImage3D(GLcontext * ctx, GLenum target, GLint level,
 }
 
 /**
- * Update a subregion of the given texture image.
+ * All glTexSubImage calls go through this function.
  */
 static void radeon_texsubimage(GLcontext* ctx, int dims, GLenum target, int level,
 		GLint xoffset, GLint yoffset, GLint zoffset,
@@ -741,69 +853,39 @@ static void radeon_texsubimage(GLcontext* ctx, int dims, GLenum target, int leve
 	radeonTexObj* t = radeon_tex_obj(texObj);
 	radeon_texture_image* image = get_radeon_texture_image(texImage);
 
-	radeon_firevertices(rmesa);
+	{
+		struct radeon_bo *bo;
+		bo = !image->mt ? image->bo : image->mt->bo;
+		if (bo && radeon_bo_is_referenced_by_cs(bo, rmesa->cmdbuf.cs)) {
+			radeon_firevertices(rmesa);
+		}
+	}
+
+	if (RADEON_DEBUG & RADEON_TEXTURE) {
+		fprintf(stderr, "radeon_texsubimage%dd: texObj %p, texImage %p, face %d, level %d\n",
+				dims, texObj, texImage, radeon_face_for_target(target), level);
+	}
 
 	t->validated = GL_FALSE;
 	if (compressed) {
 		pixels = _mesa_validate_pbo_compressed_teximage(
-			ctx, imageSize, pixels, packing, "glCompressedTexImage");
+			ctx, imageSize, pixels, packing, "glCompressedTexSubImage");
 	} else {
 		pixels = _mesa_validate_pbo_teximage(ctx, dims,
-			width, height, depth, format, type, pixels, packing, "glTexSubImage1D");
+			width, height, depth, format, type, pixels, packing, "glTexSubImage");
 	}
 
 	if (pixels) {
-		GLint dstRowStride;
-		radeon_teximage_map(image, GL_TRUE);
-
-		if (image->mt) {
-			radeon_mipmap_level *lvl = &image->mt->levels[image->mtlevel];
-			dstRowStride = lvl->rowstride;
-		} else {
-			dstRowStride = texImage->RowStride * texImage->TexFormat->TexelBytes;
-		}
-
-		if (compressed) {
-			uint32_t srcRowStride, bytesPerRow, rows;
-			GLubyte *img_start;
-			if (!image->mt) {
-				dstRowStride = _mesa_compressed_row_stride(texImage->TexFormat->MesaFormat, texImage->Width);
-				img_start = _mesa_compressed_image_address(xoffset, yoffset, 0,
-									   texImage->TexFormat->MesaFormat,
-									   texImage->Width, texImage->Data);
-			}
-			else {
-				uint32_t blocks_x = dstRowStride / (image->mt->bpp * 4);
-				img_start = texImage->Data + image->mt->bpp * 4 * (blocks_x * (yoffset / 4) + xoffset / 4);
-			}
-			srcRowStride = _mesa_compressed_row_stride(texImage->TexFormat->MesaFormat, width);
-			bytesPerRow = srcRowStride;
-			rows = (height + 3) / 4;
-
-			copy_rows(img_start, dstRowStride,  pixels, srcRowStride, rows,  bytesPerRow);
-			
-		} else {
-			if (!texImage->TexFormat->StoreImage(ctx, dims, texImage->_BaseFormat,
-							     texImage->TexFormat, texImage->Data,
-							     xoffset, yoffset, zoffset,
-							     dstRowStride,
-							     texImage->ImageOffsets,
-							     width, height, depth,
-							     format, type, pixels, packing))
-				_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage");
-		}
-
-		/* GL_SGIS_generate_mipmap */
-		if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-			radeon_generate_mipmap(ctx, target, texObj);
-		}
+		radeon_store_teximage(ctx, dims,
+			xoffset, yoffset, zoffset,
+			width, height, depth,
+			imageSize, format, type,
+			pixels, packing,
+			texObj, texImage,
+			compressed);
 	}
 
-	radeon_teximage_unmap(image);
-
 	_mesa_unmap_teximage_pbo(ctx, packing);
-
-
 }
 
 void radeonTexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
@@ -859,143 +941,6 @@ void radeonTexSubImage3D(GLcontext * ctx, GLenum target, GLint level,
 		format, type, pixels, packing, texObj, texImage, 0);
 }
 
-
-
-/**
- * Ensure that the given image is stored in the given miptree from now on.
- */
-static void migrate_image_to_miptree(radeon_mipmap_tree *mt, radeon_texture_image *image, int face, int level)
-{
-	radeon_mipmap_level *dstlvl = &mt->levels[level - mt->firstLevel];
-	unsigned char *dest;
-
-	assert(image->mt != mt);
-	assert(dstlvl->width == image->base.Width);
-	assert(dstlvl->height == image->base.Height);
-	assert(dstlvl->depth == image->base.Depth);
-
-
-	radeon_bo_map(mt->bo, GL_TRUE);
-	dest = mt->bo->ptr + dstlvl->faces[face].offset;
-
-	if (image->mt) {
-		/* Format etc. should match, so we really just need a memcpy().
-		 * In fact, that memcpy() could be done by the hardware in many
-		 * cases, provided that we have a proper memory manager.
-		 */
-		radeon_mipmap_level *srclvl = &image->mt->levels[image->mtlevel-image->mt->firstLevel];
-
-		assert(srclvl->size == dstlvl->size);
-		assert(srclvl->rowstride == dstlvl->rowstride);
-
-		radeon_bo_map(image->mt->bo, GL_FALSE);
-
-		memcpy(dest,
-			image->mt->bo->ptr + srclvl->faces[face].offset,
-			dstlvl->size);
-		radeon_bo_unmap(image->mt->bo);
-
-		radeon_miptree_unreference(image->mt);
-	} else {
-		uint32_t srcrowstride;
-		uint32_t height;
-		/* need to confirm this value is correct */
-		if (mt->compressed) {
-			height = (image->base.Height + 3) / 4;
-			srcrowstride = _mesa_compressed_row_stride(image->base.TexFormat->MesaFormat, image->base.Width);
-		} else {
-			height = image->base.Height * image->base.Depth;
-			srcrowstride = image->base.Width * image->base.TexFormat->TexelBytes;
-		}
-
-//		if (mt->tilebits)
-//			WARN_ONCE("%s: tiling not supported yet", __FUNCTION__);
-
-		copy_rows(dest, dstlvl->rowstride, image->base.Data, srcrowstride,
-			  height, srcrowstride);
-
-		_mesa_free_texmemory(image->base.Data);
-		image->base.Data = 0;
-	}
-
-	radeon_bo_unmap(mt->bo);
-
-	image->mt = mt;
-	image->mtface = face;
-	image->mtlevel = level;
-	radeon_miptree_reference(image->mt);
-}
-
-int radeon_validate_texture_miptree(GLcontext * ctx, struct gl_texture_object *texObj)
-{
-	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-	radeonTexObj *t = radeon_tex_obj(texObj);
-	radeon_texture_image *baseimage = get_radeon_texture_image(texObj->Image[0][texObj->BaseLevel]);
-	int face, level;
-
-	if (t->validated || t->image_override)
-		return GL_TRUE;
-
-	if (RADEON_DEBUG & RADEON_TEXTURE)
-		fprintf(stderr, "%s: Validating texture %p now\n", __FUNCTION__, texObj);
-
-	if (baseimage->base.Border > 0)
-		return GL_FALSE;
-
-	/* Ensure a matching miptree exists.
-	 *
-	 * Differing mipmap trees can result when the app uses TexImage to
-	 * change texture dimensions.
-	 *
-	 * Prefer to use base image's miptree if it
-	 * exists, since that most likely contains more valid data (remember
-	 * that the base level is usually significantly larger than the rest
-	 * of the miptree, so cubemaps are the only possible exception).
-	 */
-	if (baseimage->mt &&
-	    baseimage->mt != t->mt &&
-	    radeon_miptree_matches_texture(baseimage->mt, &t->base)) {
-		radeon_miptree_unreference(t->mt);
-		t->mt = baseimage->mt;
-		radeon_miptree_reference(t->mt);
-	} else if (t->mt && !radeon_miptree_matches_texture(t->mt, &t->base)) {
-		radeon_miptree_unreference(t->mt);
-		t->mt = 0;
-	}
-
-	if (!t->mt) {
-		if (RADEON_DEBUG & RADEON_TEXTURE)
-			fprintf(stderr, " Allocate new miptree\n");
-		radeon_try_alloc_miptree(rmesa, t, baseimage, 0, texObj->BaseLevel);
-		if (!t->mt) {
-			_mesa_problem(ctx, "radeon_validate_texture failed to alloc miptree");
-			return GL_FALSE;
-		}
-	}
-
-	/* Ensure all images are stored in the single main miptree */
-	for(face = 0; face < t->mt->faces; ++face) {
-		for(level = t->mt->firstLevel; level <= t->mt->lastLevel; ++level) {
-			radeon_texture_image *image = get_radeon_texture_image(texObj->Image[face][level]);
-			if (RADEON_DEBUG & RADEON_TEXTURE)
-				fprintf(stderr, " face %i, level %i... %p vs %p ", face, level, t->mt, image->mt);
-			if (t->mt == image->mt) {
-				if (RADEON_DEBUG & RADEON_TEXTURE)
-					fprintf(stderr, "OK\n");
-
-				continue;
-			}
-
-			if (RADEON_DEBUG & RADEON_TEXTURE)
-				fprintf(stderr, "migrating\n");
-			migrate_image_to_miptree(t->mt, image, face, level);
-		}
-	}
-
-	return GL_TRUE;
-}
-
-
 /**
  * Need to map texture image into memory before copying image data,
  * then unmap it.
diff --git a/radeon/radeon_texture.h b/radeon/radeon_texture.h
index 888a55b..906daf1 100644
--- a/radeon/radeon_texture.h
+++ b/radeon/radeon_texture.h
@@ -30,6 +30,11 @@
 
 #ifndef RADEON_TEXTURE_H
 #define RADEON_TEXTURE_H
+
+#include "main/formats.h"
+
+void copy_rows(void* dst, GLuint dststride, const void* src, GLuint srcstride,
+	GLuint numrows, GLuint rowsize);
 struct gl_texture_image *radeonNewTextureImage(GLcontext *ctx);
 void radeonFreeTexImageData(GLcontext *ctx, struct gl_texture_image *timage);
 
@@ -40,14 +45,16 @@ void radeonUnmapTexture(GLcontext *ctx, struct gl_texture_object *texObj);
 void radeonGenerateMipmap(GLcontext* ctx, GLenum target, struct gl_texture_object *texObj);
 int radeon_validate_texture_miptree(GLcontext * ctx, struct gl_texture_object *texObj);
 GLuint radeon_face_for_target(GLenum target);
-const struct gl_texture_format *radeonChooseTextureFormat_mesa(GLcontext * ctx,
-							  GLint internalFormat,
-							  GLenum format,
-							  GLenum type);
-const struct gl_texture_format *radeonChooseTextureFormat(GLcontext * ctx,
-							  GLint internalFormat,
-							  GLenum format,
-							  GLenum type, GLboolean fbo);
+
+gl_format radeonChooseTextureFormat_mesa(GLcontext * ctx,
+                                         GLint internalFormat,
+                                         GLenum format,
+                                         GLenum type);
+
+gl_format radeonChooseTextureFormat(GLcontext * ctx,
+                                    GLint internalFormat,
+                                    GLenum format,
+                                    GLenum type, GLboolean fbo);
 
 void radeonTexImage1D(GLcontext * ctx, GLenum target, GLint level,
 		      GLint internalFormat,
author	Luc Verhaegen <libv@skynet.be>	2010-03-14 22:15:41 +0000
committer	Luc Verhaegen <libv@skynet.be>	2010-03-14 22:15:41 +0000
commit	534eb0f6eea95ff5851d3cb74663679fcd375572 (patch)
tree	fe6b3c1c482725e17e0fed3ad9d9004e8870c988
parent	0c8469d1892b441c38d1cb09d6bbf85692c89e92 (diff)