Import radeon, r200 and r300 dri drivers from mesa 7.0.3.

author: Luc Verhaegen <libv@skynet.be> 2010-03-14 07:04:46 +0100
committer: Luc Verhaegen <libv@skynet.be> 2010-03-14 07:04:46 +0100
commit: 50d4922305e925896a71e705c438ededbaedb80f (patch)
tree: d9a44227dcdda1de61337280b20170d0deb6211d
parent: 5dee9b7b19c1aa3a13618b08bc24f00677b5364b (diff)
109 files changed, 57751 insertions, 11 deletions
diff --git a/Makefile.am b/Makefile.am
index 15ea2b3..37b4466 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,3 +1,3 @@
 AUTOMAKE_OPTIONS = foreign
 
-SUBDIRS = src
+SUBDIRS = radeon r200 r300
diff --git a/configure.ac b/configure.ac
index 2a78cfd..c6e6dfe 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,7 +1,7 @@
 # Process this file with autoconf to produce a configure script
 
 AC_PREREQ(2.57)
-AC_INIT([mesa-dri-xxx], 7.0.3, [], mesa-dri-xxx)
+AC_INIT([mesa-dri-radeon], 7.0.3, [], mesa-dri-radeon)
 
 AM_INIT_AUTOMAKE([dist-bzip2])
 
@@ -16,9 +16,12 @@ AC_PROG_CC
 AC_HEADER_STDC
 
 PKG_CHECK_MODULES([DRM], [libdrm >= 2.3.0])
-PKG_CHECK_MODULES([DRI], [libmesadri = 7.0.3 libmesadricommon = 7.0.3])
+PKG_CHECK_MODULES([DRI], [libmesadri >= 7.0.3 libmesadri < 7.1.0
+			  libmesadricommon >= 7.0.3 libmesadricommon < 7.1.0])
 
 AC_OUTPUT([
 	Makefile
-	src/Makefile
+	radeon/Makefile
+	r200/Makefile
+	r300/Makefile
 ])
diff --git a/r200/Doxyfile b/r200/Doxyfile
new file mode 100644
index 0000000..27b3d03
--- /dev/null
+++ b/r200/Doxyfile
@@ -0,0 +1,232 @@
+# Doxyfile 1.3.2-Gideon
+
+#---------------------------------------------------------------------------
+# General configuration options
+#---------------------------------------------------------------------------
+PROJECT_NAME           = r200
+PROJECT_NUMBER         = $VERSION$
+OUTPUT_DIRECTORY       = 
+OUTPUT_LANGUAGE        = English
+USE_WINDOWS_ENCODING   = NO
+EXTRACT_ALL            = NO
+EXTRACT_PRIVATE        = NO
+EXTRACT_STATIC         = NO
+EXTRACT_LOCAL_CLASSES  = YES
+HIDE_UNDOC_MEMBERS     = NO
+HIDE_UNDOC_CLASSES     = NO
+HIDE_FRIEND_COMPOUNDS  = NO
+HIDE_IN_BODY_DOCS      = NO
+BRIEF_MEMBER_DESC      = YES
+REPEAT_BRIEF           = YES
+ALWAYS_DETAILED_SEC    = NO
+INLINE_INHERITED_MEMB  = NO
+FULL_PATH_NAMES        = NO
+STRIP_FROM_PATH        = 
+INTERNAL_DOCS          = NO
+CASE_SENSE_NAMES       = YES
+SHORT_NAMES            = NO
+HIDE_SCOPE_NAMES       = NO
+SHOW_INCLUDE_FILES     = YES
+JAVADOC_AUTOBRIEF      = NO
+MULTILINE_CPP_IS_BRIEF = NO
+DETAILS_AT_TOP         = NO
+INHERIT_DOCS           = YES
+INLINE_INFO            = YES
+SORT_MEMBER_DOCS       = YES
+DISTRIBUTE_GROUP_DOC   = NO
+TAB_SIZE               = 8
+GENERATE_TODOLIST      = YES
+GENERATE_TESTLIST      = YES
+GENERATE_BUGLIST       = YES
+GENERATE_DEPRECATEDLIST= YES
+ALIASES                = 
+ENABLED_SECTIONS       = 
+MAX_INITIALIZER_LINES  = 30
+OPTIMIZE_OUTPUT_FOR_C  = NO
+OPTIMIZE_OUTPUT_JAVA   = NO
+SHOW_USED_FILES        = YES
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+QUIET                  = NO
+WARNINGS               = YES
+WARN_IF_UNDOCUMENTED   = YES
+WARN_IF_DOC_ERROR      = YES
+WARN_FORMAT            = "$file:$line: $text"
+WARN_LOGFILE           = 
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+INPUT                  = /home/temp/Mesa/src/drv/r200
+FILE_PATTERNS          = *.c \
+                         *.cc \
+                         *.cxx \
+                         *.cpp \
+                         *.c++ \
+                         *.java \
+                         *.ii \
+                         *.ixx \
+                         *.ipp \
+                         *.i++ \
+                         *.inl \
+                         *.h \
+                         *.hh \
+                         *.hxx \
+                         *.hpp \
+                         *.h++ \
+                         *.idl \
+                         *.odl \
+                         *.cs \
+                         *.C \
+                         *.H \
+                         *.tlh \
+                         *.diff \
+                         *.patch \
+                         *.moc \
+                         *.xpm
+RECURSIVE              = yes
+EXCLUDE                = 
+EXCLUDE_SYMLINKS       = NO
+EXCLUDE_PATTERNS       = 
+EXAMPLE_PATH           = 
+EXAMPLE_PATTERNS       = *
+EXAMPLE_RECURSIVE      = NO
+IMAGE_PATH             = 
+INPUT_FILTER           = 
+FILTER_SOURCE_FILES    = NO
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+SOURCE_BROWSER         = NO
+INLINE_SOURCES         = NO
+STRIP_CODE_COMMENTS    = YES
+REFERENCED_BY_RELATION = YES
+REFERENCES_RELATION    = YES
+VERBATIM_HEADERS       = YES
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+ALPHABETICAL_INDEX     = NO
+COLS_IN_ALPHA_INDEX    = 5
+IGNORE_PREFIX          = 
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+GENERATE_HTML          = YES
+HTML_OUTPUT            = html
+HTML_FILE_EXTENSION    = .html
+HTML_HEADER            = 
+HTML_FOOTER            = 
+HTML_STYLESHEET        = 
+HTML_ALIGN_MEMBERS     = YES
+GENERATE_HTMLHELP      = NO
+CHM_FILE               = 
+HHC_LOCATION           = 
+GENERATE_CHI           = NO
+BINARY_TOC             = NO
+TOC_EXPAND             = NO
+DISABLE_INDEX          = NO
+ENUM_VALUES_PER_LINE   = 4
+GENERATE_TREEVIEW      = NO
+TREEVIEW_WIDTH         = 250
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+GENERATE_LATEX         = YES
+LATEX_OUTPUT           = latex
+LATEX_CMD_NAME         = latex
+MAKEINDEX_CMD_NAME     = makeindex
+COMPACT_LATEX          = NO
+PAPER_TYPE             = a4wide
+EXTRA_PACKAGES         = 
+LATEX_HEADER           = 
+PDF_HYPERLINKS         = NO
+USE_PDFLATEX           = NO
+LATEX_BATCHMODE        = NO
+LATEX_HIDE_INDICES     = NO
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+GENERATE_RTF           = NO
+RTF_OUTPUT             = rtf
+COMPACT_RTF            = NO
+RTF_HYPERLINKS         = NO
+RTF_STYLESHEET_FILE    = 
+RTF_EXTENSIONS_FILE    = 
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+GENERATE_MAN           = NO
+MAN_OUTPUT             = man
+MAN_EXTENSION          = .3
+MAN_LINKS              = NO
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+GENERATE_XML           = yes
+XML_OUTPUT             = xml
+XML_SCHEMA             = 
+XML_DTD                = 
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+GENERATE_AUTOGEN_DEF   = NO
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+GENERATE_PERLMOD       = NO
+PERLMOD_LATEX          = NO
+PERLMOD_PRETTY         = YES
+PERLMOD_MAKEVAR_PREFIX = 
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor   
+#---------------------------------------------------------------------------
+ENABLE_PREPROCESSING   = YES
+MACRO_EXPANSION        = NO
+EXPAND_ONLY_PREDEF     = NO
+SEARCH_INCLUDES        = YES
+INCLUDE_PATH           = 
+INCLUDE_FILE_PATTERNS  = 
+PREDEFINED             = 
+EXPAND_AS_DEFINED      = 
+SKIP_FUNCTION_MACROS   = YES
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references   
+#---------------------------------------------------------------------------
+TAGFILES               = 
+GENERATE_TAGFILE       = 
+ALLEXTERNALS           = NO
+EXTERNAL_GROUPS        = YES
+PERL_PATH              = /usr/bin/perl
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool   
+#---------------------------------------------------------------------------
+CLASS_DIAGRAMS         = YES
+HIDE_UNDOC_RELATIONS   = YES
+HAVE_DOT               = NO
+CLASS_GRAPH            = YES
+COLLABORATION_GRAPH    = YES
+UML_LOOK               = NO
+TEMPLATE_RELATIONS     = NO
+INCLUDE_GRAPH          = YES
+INCLUDED_BY_GRAPH      = YES
+CALL_GRAPH             = NO
+GRAPHICAL_HIERARCHY    = YES
+DOT_IMAGE_FORMAT       = png
+DOT_PATH               = 
+DOTFILE_DIRS           = 
+MAX_DOT_GRAPH_WIDTH    = 1024
+MAX_DOT_GRAPH_HEIGHT   = 1024
+MAX_DOT_GRAPH_DEPTH    = 1000
+GENERATE_LEGEND        = YES
+DOT_CLEANUP            = YES
+#---------------------------------------------------------------------------
+# Configuration::additions related to the search engine   
+#---------------------------------------------------------------------------
+SEARCHENGINE           = NO
+CGI_NAME               = search.cgi
+CGI_URL                = 
+DOC_URL                = 
+DOC_ABSPATH            = 
+BIN_ABSPATH            = /usr/local/bin/
+EXT_DOC_PATHS          = 
diff --git a/r200/Makefile.am b/r200/Makefile.am
new file mode 100644
index 0000000..0234e1d
--- /dev/null
+++ b/r200/Makefile.am
@@ -0,0 +1,29 @@
+AM_CFLAGS = -DIN_DRI_DRIVER -DGLX_DIRECT_RENDERING -DGLX_INDIRECT_RENDERING
+
+R200_CFLAGS = -DRADEON_COMMON=1 -DRADEON_COMMON_FOR_R200
+R200_CFLAGS += -I../radeon -I../radeon/server
+
+r200_dri_la_LTLIBRARIES = r200_dri.la
+r200_dri_la_CFLAGS = $(AM_CFLAGS) $(DRM_CFLAGS) $(DRI_CFLAGS) $(R200_CFLAGS)
+r200_dri_la_LDFLAGS = -module -noprefix -avoid-version -lm -ldl \
+		$(DRM_LIBS) $(DRI_LIBS)
+r200_dri_ladir = @libdir@/dri
+r200_dri_la_SOURCES = \
+	r200_context.c \
+	r200_ioctl.c \
+	r200_lock.c \
+	r200_state.c \
+	r200_state_init.c \
+	r200_cmdbuf.c \
+	r200_pixel.c \
+	r200_tex.c \
+	r200_texmem.c \
+	r200_texstate.c \
+	r200_tcl.c \
+	r200_swtcl.c \
+	r200_span.c \
+	r200_maos.c \
+	r200_sanity.c \
+	r200_fragshader.c \
+	r200_vertprog.c \
+	../radeon/radeon_screen.c
diff --git a/r200/r200_cmdbuf.c b/r200/r200_cmdbuf.c
new file mode 100644
index 0000000..2920cea
--- /dev/null
+++ b/r200/r200_cmdbuf.c
@@ -0,0 +1,429 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_cmdbuf.c,v 1.1 2002/10/30 12:51:51 alanh Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "macros.h"
+#include "context.h"
+#include "swrast/swrast.h"
+#include "simple_list.h"
+
+#include "r200_context.h"
+#include "r200_state.h"
+#include "r200_ioctl.h"
+#include "r200_tcl.h"
+#include "r200_sanity.h"
+#include "radeon_reg.h"
+
+static void print_state_atom( struct r200_state_atom *state )
+{
+   int i;
+   /* Debug helper: always log the atom's name and cmd_size to stderr. */
+   fprintf(stderr, "emit %s/%d\n", state->name, state->cmd_size);
+   /* The leading "0 &" makes this test constant-false, so the per-word dump below is disabled. */
+   if (0 & R200_DEBUG & DEBUG_VERBOSE) 
+      for (i = 0 ; i < state->cmd_size ; i++) 
+	 fprintf(stderr, "\t%s[%d]: %x\n", state->name, i, state->cmd[i]);
+
+}
+
+/* The state atoms will be emitted in the order they appear in the atom list,
+ * so this step is important.
+ */
+void r200SetUpAtomList( r200ContextPtr rmesa )
+{
+   int i, mtu;
+   /* mtu sizes the tex and cube loops and contributes to the mat loop bound (3 + mtu). */
+   mtu = rmesa->glCtx->Const.MaxTextureUnits;
+   /* Reset the list head, then append every atom; list order is emission order. */
+   make_empty_list(&rmesa->hw.atomlist);
+   rmesa->hw.atomlist.name = "atom-list";
+
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.ctx );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.set );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.lin );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.msk );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpt );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vtx );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vap );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vte );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.msc );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.cst );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.zbs );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.tcl );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.msl );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.tcg );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.grd );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.fog );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.tam );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.tf );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.atf );
+   for (i = 0; i < mtu; ++i)
+       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.tex[i] );
+   for (i = 0; i < mtu; ++i)
+       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.cube[i] );
+   for (i = 0; i < 6; ++i)
+       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.pix[i] );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.afs[0] );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.afs[1] );
+   for (i = 0; i < 8; ++i)
+       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.lit[i] );
+   for (i = 0; i < 3 + mtu; ++i)
+       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.mat[i] );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.eye );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.glt );
+   for (i = 0; i < 2; ++i)
+      insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.mtl[i] );
+   for (i = 0; i < 6; ++i)
+       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.ucp[i] );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.spr );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.ptp );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.prf );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.pvs );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpp[0] );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpp[1] );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpi[0] );
+   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpi[1] );
+}
+
+static void r200SaveHwState( r200ContextPtr rmesa )
+{
+   struct r200_state_atom *atom;
+   char * dest = rmesa->backup_store.cmd_buf;
+   /* Snapshot: copy the commands of every atom whose check() passes into backup_store.cmd_buf. */
+   if (R200_DEBUG & DEBUG_STATE)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+   /* Start the backup buffer from scratch. */
+   rmesa->backup_store.cmd_used = 0;
+   /* Unlike r200EmitState, the atoms' dirty flags are neither consulted nor cleared here. */
+   foreach( atom, &rmesa->hw.atomlist ) {
+      if ( atom->check( rmesa->glCtx, atom->idx ) ) {
+	 int size = atom->cmd_size * 4; /* bytes; presumably cmd_size counts 32-bit words - confirm */
+	 memcpy( dest, atom->cmd, size);
+	 dest += size;
+	 rmesa->backup_store.cmd_used += size;
+	 if (R200_DEBUG & DEBUG_STATE)
+	    print_state_atom( atom );
+      }
+   }
+   /* NOTE(review): the size bound is only asserted after the copies have been made. */
+   assert( rmesa->backup_store.cmd_used <= R200_CMD_BUF_SZ );
+   if (R200_DEBUG & DEBUG_STATE)
+      fprintf(stderr, "Returning to r200EmitState\n");
+}
+
+void r200EmitState( r200ContextPtr rmesa )
+{
+   /* Append the commands of all dirty state atoms to the command buffer. */
+   char *dest;
+   struct r200_state_atom *atom;
+
+   if (R200_DEBUG & (DEBUG_STATE|DEBUG_PRIMS))
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+   /* Take the pending backup snapshot (if one was requested) before emitting. */
+   if (rmesa->save_on_next_emit) {
+      r200SaveHwState(rmesa);
+      rmesa->save_on_next_emit = GL_FALSE;
+   }
+
+   /* Nothing to do when no atom has been marked dirty. */
+   if (!rmesa->hw.is_dirty && !rmesa->hw.all_dirty)
+      return;
+
+   /* To avoid going across the entire set of states multiple times, just check
+    * for enough space for the case of emitting all state, and inline the
+    * r200AllocCmdBuf code here without all the checks.
+    */
+   r200EnsureCmdBufSpace( rmesa, rmesa->hw.max_state_size );
+
+   /* we need to calculate dest after EnsureCmdBufSpace
+      as we may flush the buffer - airlied */
+   dest = rmesa->store.cmd_buf + rmesa->store.cmd_used;
+   if (R200_DEBUG & DEBUG_STATE) {
+      foreach( atom, &rmesa->hw.atomlist ) {
+	 if ( atom->dirty || rmesa->hw.all_dirty ) {
+	    if ( atom->check( rmesa->glCtx, atom->idx ) )
+	       print_state_atom( atom );
+	    else
+	       fprintf(stderr, "skip state %s\n", atom->name);
+	 }
+      }
+   }
+   /* Emit pass: all_dirty forces every atom dirty; atoms failing check() stay dirty. */
+   foreach( atom, &rmesa->hw.atomlist ) {
+      if ( rmesa->hw.all_dirty )
+	 atom->dirty = GL_TRUE;
+      if ( atom->dirty ) {
+	 if ( atom->check( rmesa->glCtx, atom->idx ) ) {
+	    int size = atom->cmd_size * 4;
+	    memcpy( dest, atom->cmd, size);
+	    dest += size;
+	    rmesa->store.cmd_used += size;
+	    atom->dirty = GL_FALSE;
+	 }
+      }
+   }
+
+   assert( rmesa->store.cmd_used <= R200_CMD_BUF_SZ );
+
+   rmesa->hw.is_dirty = GL_FALSE;
+   rmesa->hw.all_dirty = GL_FALSE;
+}
+
+/* Fire a section of the retained (indexed_verts) buffer as a regular
+ * primitive.  
+ */
+void r200EmitVbufPrim( r200ContextPtr rmesa,
+                       GLuint primitive,
+                       GLuint vertex_nr )
+{
+   drm_radeon_cmd_header_t *cmd;
+   /* Indexed-walk primitives are rejected here (r200AllocEltsOpenEnded requires them instead). */
+   assert(!(primitive & R200_VF_PRIM_WALK_IND));
+   /* Emit dirty state atoms before the draw packet is queued. */
+   r200EmitState( rmesa );
+   
+   if (R200_DEBUG & (DEBUG_IOCTL|DEBUG_PRIMS))
+      fprintf(stderr, "%s cmd_used/4: %d prim %x nr %d\n", __FUNCTION__,
+	      rmesa->store.cmd_used/4, primitive, vertex_nr);
+   /* Three words are written: command header, draw opcode, primitive/vertex-count word. */
+   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, VBUF_BUFSZ,
+						  __FUNCTION__ );
+   cmd[0].i = 0;
+   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
+   cmd[1].i = R200_CP_CMD_3D_DRAW_VBUF_2;
+   cmd[2].i = (primitive | 
+	       R200_VF_PRIM_WALK_LIST |
+	       R200_VF_COLOR_ORDER_RGBA |
+	       (vertex_nr << R200_VF_VERTEX_NUMBER_SHIFT));
+}
+
+
+void r200FlushElts( r200ContextPtr rmesa )
+{
+   int *cmd = (int *)(rmesa->store.cmd_buf + rmesa->store.elts_start);
+   int dwords;
+   int nr = (rmesa->store.cmd_used - (rmesa->store.elts_start + 12)) / 2;
+   /* nr: bytes appended after the 12-byte (three-word) header, halved - presumably 16-bit elts. */
+   if (R200_DEBUG & (DEBUG_IOCTL|DEBUG_PRIMS))
+      fprintf(stderr, "%s\n", __FUNCTION__);
+   /* This function is installed as dma.flush by r200AllocEltsOpenEnded; disarm it. */
+   assert( rmesa->dma.flush == r200FlushElts );
+   rmesa->dma.flush = NULL;
+
+   /* Cope with odd number of elts: round cmd_used up to a multiple of 4
+    * (this form assumes cmd_used is even at this point). */
+   rmesa->store.cmd_used = (rmesa->store.cmd_used + 2) & ~2;
+   dwords = (rmesa->store.cmd_used - rmesa->store.elts_start) / 4;
+   /* OR the final sizes into the header words written when the packet was opened. */
+   cmd[1] |= (dwords - 3) << 16;
+   cmd[2] |= nr << R200_VF_VERTEX_NUMBER_SHIFT;
+
+   if (R200_DEBUG & DEBUG_SYNC) {
+      fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
+      r200Finish( rmesa->glCtx );
+   }
+}
+
+
+GLushort *r200AllocEltsOpenEnded( r200ContextPtr rmesa,
+				    GLuint primitive,
+				    GLuint min_nr )
+{
+   drm_radeon_cmd_header_t *cmd;
+   GLushort *retval;
+   /* Opens an indexed draw packet and returns a pointer to where the caller writes elements. */
+   if (R200_DEBUG & DEBUG_IOCTL)
+      fprintf(stderr, "%s %d prim %x\n", __FUNCTION__, min_nr, primitive);
+   /* Only indexed-walk primitives are accepted here. */
+   assert((primitive & R200_VF_PRIM_WALK_IND));
+   
+   r200EmitState( rmesa );
+   
+   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, ELTS_BUFSZ(min_nr),
+						__FUNCTION__ );
+   cmd[0].i = 0;
+   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
+   cmd[1].i = R200_CP_CMD_3D_DRAW_INDX_2;
+   cmd[2].i = (primitive | 
+	       R200_VF_PRIM_WALK_IND |
+	       R200_VF_COLOR_ORDER_RGBA);
+   /* Element data starts right after the three header words; the packet length
+    * and element count are OR-ed into cmd[1]/cmd[2] later by r200FlushElts. */
+   retval = (GLushort *)(cmd+3);
+
+   if (R200_DEBUG & DEBUG_PRIMS)
+      fprintf(stderr, "%s: header 0x%x prim %x \n",
+	      __FUNCTION__,
+	      cmd[1].i, primitive);
+   /* Arm r200FlushElts as the pending flush callback; none may already be pending. */
+   assert(!rmesa->dma.flush);
+   rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
+   rmesa->dma.flush = r200FlushElts;
+   /* Remember the packet's byte offset within cmd_buf for r200FlushElts. */
+   rmesa->store.elts_start = ((char *)cmd) - rmesa->store.cmd_buf;
+
+   return retval;
+}
+
+
+
+void r200EmitVertexAOS( r200ContextPtr rmesa,
+			  GLuint vertex_size,
+			  GLuint offset )
+{
+   drm_radeon_cmd_header_t *cmd;
+   /* Queue a LOAD_VBPNTR packet for a single array: cmd[2] = 1 array, size and stride both vertex_size. */
+   if (R200_DEBUG & (DEBUG_PRIMS|DEBUG_IOCTL))
+      fprintf(stderr, "%s:  vertex_size 0x%x offset 0x%x \n",
+	      __FUNCTION__, vertex_size, offset);
+
+   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, VERT_AOS_BUFSZ,
+						  __FUNCTION__ );
+   cmd[0].i = 0; /* clear the whole header word first, as the sibling emitters do */
+   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
+   cmd[1].i = R200_CP_CMD_3D_LOAD_VBPNTR | (2 << 16);
+   cmd[2].i = 1;
+   cmd[3].i = vertex_size | (vertex_size << 8);
+   cmd[4].i = offset;
+}
+		       
+
+void r200EmitAOS( r200ContextPtr rmesa,
+		    struct r200_dma_region **component,
+		    GLuint nr,
+		    GLuint offset )
+{
+   drm_radeon_cmd_header_t *cmd;
+   int sz = AOS_BUFSZ(nr);
+   int i;
+   int *tmp;
+   /* Queue a LOAD_VBPNTR packet describing nr arrays; sz is the packet size in bytes. */
+   if (R200_DEBUG & DEBUG_IOCTL)
+      fprintf(stderr, "%s nr arrays: %d\n", __FUNCTION__, nr);
+
+   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, sz, __FUNCTION__ );
+   cmd[0].i = 0;
+   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
+   cmd[1].i = R200_CP_CMD_3D_LOAD_VBPNTR | (((sz / sizeof(int)) - 3) << 16);
+   cmd[2].i = nr;
+   tmp = &cmd[0].i;
+   cmd += 3;
+   /* Arrays are packed in pairs: one shared stride/size word, then one start word each. */
+   for (i = 0 ; i < nr ; i++) {
+      if (i & 1) {
+	 cmd[0].i |= ((component[i]->aos_stride << 24) | 
+		      (component[i]->aos_size << 16));
+	 cmd[2].i = (component[i]->aos_start + 
+		     offset * component[i]->aos_stride * 4);
+	 cmd += 3;
+      }
+      else {
+	 cmd[0].i = ((component[i]->aos_stride << 8) | 
+		     (component[i]->aos_size << 0));
+	 cmd[1].i = (component[i]->aos_start + 
+		     offset * component[i]->aos_stride * 4);
+      }
+   }
+   /* sz is a byte count, so dump sz / sizeof(int) words; looping to sz reads past the packet. */
+   if (R200_DEBUG & DEBUG_VERTS) {
+      fprintf(stderr, "%s:\n", __FUNCTION__);
+      for (i = 0 ; i < sz / (int)sizeof(int) ; i++)
+	 fprintf(stderr, "   %d: %x\n", i, tmp[i]);
+   }
+}
+
+void r200EmitBlit( r200ContextPtr rmesa,
+		   GLuint color_fmt,
+		   GLuint src_pitch,
+		   GLuint src_offset,
+		   GLuint dst_pitch,
+		   GLuint dst_offset,
+		   GLint srcx, GLint srcy,
+		   GLint dstx, GLint dsty,
+		   GLuint w, GLuint h )
+{
+   drm_radeon_cmd_header_t *cmd;
+   /* Queue an eight-word R200_CP_CMD_BITBLT_MULTI packet built from the arguments. */
+   if (R200_DEBUG & DEBUG_IOCTL)
+      fprintf(stderr, "%s src %x/%x %d,%d dst: %x/%x %d,%d sz: %dx%d\n",
+	      __FUNCTION__, 
+	      src_pitch, src_offset, srcx, srcy,
+	      dst_pitch, dst_offset, dstx, dsty,
+	      w, h);
+   /* Pitches are packed in units of 64 and offsets in units of 1024; w and h share one word. */
+   assert( (src_pitch & 63) == 0 );
+   assert( (dst_pitch & 63) == 0 );
+   assert( (src_offset & 1023) == 0 );
+   assert( (dst_offset & 1023) == 0 );
+   assert( w < (1<<16) );
+   assert( h < (1<<16) );
+
+   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, 8 * sizeof(int),
+						  __FUNCTION__ );
+
+   cmd[0].i = 0; /* clear the whole header word first, as the sibling emitters do */
+   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
+   cmd[1].i = R200_CP_CMD_BITBLT_MULTI | (5 << 16);
+   cmd[2].i = (RADEON_GMC_SRC_PITCH_OFFSET_CNTL |
+	       RADEON_GMC_DST_PITCH_OFFSET_CNTL |
+	       RADEON_GMC_BRUSH_NONE |
+	       (color_fmt << 8) |
+	       RADEON_GMC_SRC_DATATYPE_COLOR |
+	       RADEON_ROP3_S |
+	       RADEON_DP_SRC_SOURCE_MEMORY |
+	       RADEON_GMC_CLR_CMP_CNTL_DIS |
+	       RADEON_GMC_WR_MSK_DIS );
+
+   cmd[3].i = ((src_pitch/64)<<22) | (src_offset >> 10);
+   cmd[4].i = ((dst_pitch/64)<<22) | (dst_offset >> 10);
+   cmd[5].i = (srcx << 16) | srcy;
+   cmd[6].i = (dstx << 16) | dsty; /* dst */
+   cmd[7].i = (w << 16) | h;
+}
+
+
+void r200EmitWait( r200ContextPtr rmesa, GLuint flags )
+{
+   drm_radeon_cmd_header_t *cmd;
+   /* Only RADEON_WAIT_2D and/or RADEON_WAIT_3D may be set in flags. */
+   assert( !(flags & ~(RADEON_WAIT_2D|RADEON_WAIT_3D)) );
+   /* A wait command is a single header word: zero it, then set its type and flags. */
+   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, 1 * sizeof(int),
+					   __FUNCTION__ );
+   cmd[0].i = 0;
+   cmd[0].wait.cmd_type = RADEON_CMD_WAIT;
+   cmd[0].wait.flags = flags;
+}
diff --git a/r200/r200_context.c b/r200/r200_context.c
new file mode 100644
index 0000000..786a298
--- /dev/null
+++ b/r200/r200_context.c
@@ -0,0 +1,714 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_context.c,v 1.3 2003/05/06 23:52:08 daenzer Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "api_arrayelt.h"
+#include "context.h"
+#include "simple_list.h"
+#include "imports.h"
+#include "matrix.h"
+#include "extensions.h"
+#include "framebuffer.h"
+#include "state.h"
+
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+#include "vbo/vbo.h"
+
+#include "tnl/tnl.h"
+#include "tnl/t_pipeline.h"
+
+#include "drivers/common/driverfuncs.h"
+
+#include "r200_context.h"
+#include "r200_ioctl.h"
+#include "r200_state.h"
+#include "r200_span.h"
+#include "r200_pixel.h"
+#include "r200_tex.h"
+#include "r200_swtcl.h"
+#include "r200_tcl.h"
+#include "r200_maos.h"
+#include "r200_vertprog.h"
+
+#define need_GL_ARB_multisample
+#define need_GL_ARB_texture_compression
+#define need_GL_ARB_vertex_buffer_object
+#define need_GL_ARB_vertex_program
+#define need_GL_ATI_fragment_shader
+#define need_GL_EXT_blend_minmax
+#define need_GL_EXT_fog_coord
+#define need_GL_EXT_secondary_color
+#define need_GL_EXT_blend_equation_separate
+#define need_GL_EXT_blend_func_separate
+#define need_GL_NV_vertex_program
+#define need_GL_ARB_point_parameters
+#include "extension_helper.h"
+
+#define DRIVER_DATE	"20060602"
+
+#include "vblank.h"
+#include "utils.h"
+#include "xmlpool.h" /* for symbolic values of enum-type options */
+#ifndef R200_DEBUG
+int R200_DEBUG = (0);
+#endif
+
+/* glGetString() driver hook: answers GL_VENDOR and GL_RENDERER, NULL for any
+ * other name.  The renderer string is rebuilt into a function-static buffer. */
+static const GLubyte *r200GetString( GLcontext *ctx, GLenum name )
+{
+   static char renderer[128];
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLuint agp_mode;
+   unsigned tail;
+
+   /* PCI cards pass 0 as the AGP mode; everything else passes the screen's AGPMode. */
+   if (rmesa->r200Screen->card_type == RADEON_CARD_PCI)
+      agp_mode = 0;
+   else
+      agp_mode = rmesa->r200Screen->AGPMode;
+
+   if ( name == GL_VENDOR )
+      return (GLubyte *)"Tungsten Graphics, Inc.";
+
+   if ( name != GL_RENDERER )
+      return NULL;
+
+   /* Common prefix first, then " TCL" or " NO-TCL" appended at the returned offset. */
+   tail = driGetRendererString( renderer, "R200", DRIVER_DATE, agp_mode );
+   sprintf( & renderer[ tail ], " %sTCL",
+	    (rmesa->TclFallback & R200_TCL_FALLBACK_TCL_DISABLE) ? "NO-" : "" );
+
+   return (GLubyte *)renderer;
+}
+
+
+/* Extension strings exported by the R200 driver.  NULL-terminated table; passed
+ * to driInitExtensions() in r200CreateContext() with no hardware/DRM condition. */
+const struct dri_extension card_extensions[] =
+{
+    { "GL_ARB_multisample",                GL_ARB_multisample_functions },
+    { "GL_ARB_multitexture",               NULL },
+    { "GL_ARB_texture_border_clamp",       NULL },
+    { "GL_ARB_texture_compression",        GL_ARB_texture_compression_functions },
+    { "GL_ARB_texture_env_add",            NULL },
+    { "GL_ARB_texture_env_combine",        NULL },
+    { "GL_ARB_texture_env_dot3",           NULL },
+    { "GL_ARB_texture_env_crossbar",       NULL },
+    { "GL_ARB_texture_mirrored_repeat",    NULL },
+    { "GL_ARB_vertex_buffer_object",       GL_ARB_vertex_buffer_object_functions },
+    { "GL_EXT_blend_minmax",               GL_EXT_blend_minmax_functions },
+    { "GL_EXT_blend_subtract",             NULL },
+    { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
+    { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
+    { "GL_EXT_stencil_wrap",               NULL },
+    { "GL_EXT_texture_edge_clamp",         NULL },
+    { "GL_EXT_texture_env_combine",        NULL },
+    { "GL_EXT_texture_env_dot3",           NULL },
+    { "GL_EXT_texture_filter_anisotropic", NULL },
+    { "GL_EXT_texture_lod_bias",           NULL },
+    { "GL_EXT_texture_mirror_clamp",       NULL },
+    { "GL_EXT_texture_rectangle",          NULL },
+    { "GL_ATI_texture_env_combine3",       NULL },
+    { "GL_ATI_texture_mirror_once",        NULL },
+    { "GL_MESA_pack_invert",               NULL },
+    { "GL_NV_blend_square",                NULL },
+    { "GL_SGIS_generate_mipmap",           NULL },
+    { NULL,                                NULL }
+};
+
+const struct dri_extension blend_extensions[] = {	/* enabled only when drmSupportsBlendColor is set */
+    { "GL_EXT_blend_equation_separate",    GL_EXT_blend_equation_separate_functions },
+    { "GL_EXT_blend_func_separate",        GL_EXT_blend_func_separate_functions },
+    { NULL,                                NULL }
+};
+
+const struct dri_extension ARB_vp_extension[] = {	/* single entry, no NULL terminator: used with driInitSingleExtension() */
+    { "GL_ARB_vertex_program",             GL_ARB_vertex_program_functions }
+};
+
+const struct dri_extension NV_vp_extension[] = {	/* single entry; enabled by the "nv_vertex_program" option */
+    { "GL_NV_vertex_program",              GL_NV_vertex_program_functions }
+};
+
+const struct dri_extension ATI_fs_extension[] = {	/* single entry; needs 6 texture units and drmSupportsFragShader */
+    { "GL_ATI_fragment_shader",            GL_ATI_fragment_shader_functions }
+};
+
+const struct dri_extension point_extensions[] = {	/* enabled only when drmSupportsPointSprites is set */
+    { "GL_ARB_point_sprite",               NULL },
+    { "GL_ARB_point_parameters",           GL_ARB_point_parameters_functions },
+    { NULL,                                NULL }
+};
+
+extern const struct tnl_pipeline_stage _r200_render_stage;
+extern const struct tnl_pipeline_stage _r200_tcl_stage;
+
+static const struct tnl_pipeline_stage *r200_pipeline[] = {
+   /* NULL-terminated stage list; installed by _tnl_install_pipeline() in r200CreateContext(). */
+   /* Try and go straight to t&l
+    */
+   &_r200_tcl_stage,  
+
+   /* Catch any t&l fallbacks
+    */
+   &_tnl_vertex_transform_stage,
+   &_tnl_normal_transform_stage,
+   &_tnl_lighting_stage,
+   &_tnl_fog_coordinate_stage,
+   &_tnl_texgen_stage,
+   &_tnl_texture_transform_stage,
+   &_tnl_point_attenuation_stage,
+   &_tnl_vertex_program_stage,
+   /* Try again to go to tcl? 
+    *     - no good for asymmetric-twoside (do with multipass)
+    *     - no good for asymmetric-unfilled (do with multipass)
+    *     - good for material
+    *     - good for texgen
+    *     - need to manipulate a bit of state
+    *
+    * - worth it/not worth it?
+    */
+			
+   /* Else do them here.
+    */
+/*    &_r200_render_stage,  */ /* FIXME: bugs with ut2003 */
+   &_tnl_render_stage,		/* FALLBACK:  */
+   NULL,
+};
+
+
+
+/* Plug the driver's miscellaneous hooks into the dd_function_table:
+ * GetString is ours, GetBufferSize is explicitly cleared. */
+static void r200InitDriverFuncs( struct dd_function_table *functions )
+{
+    functions->GetString		= r200GetString;
+    functions->GetBufferSize		= NULL; /* OBSOLETE */
+}
+
+static const struct dri_debug_control debug_control[] =	/* keywords parsed from the R200_DEBUG / RADEON_DEBUG env vars */
+{
+    { "fall",  DEBUG_FALLBACKS },
+    { "tex",   DEBUG_TEXTURE },
+    { "ioctl", DEBUG_IOCTL },
+    { "prim",  DEBUG_PRIMS },
+    { "vert",  DEBUG_VERTS },
+    { "state", DEBUG_STATE },
+    { "code",  DEBUG_CODEGEN },
+    { "vfmt",  DEBUG_VFMT },
+    { "vtxf",  DEBUG_VFMT },	/* second keyword for the same flag as "vfmt" */
+    { "verb",  DEBUG_VERBOSE },
+    { "dri",   DEBUG_DRI },
+    { "dma",   DEBUG_DMA },
+    { "san",   DEBUG_SANITY },
+    { "sync",  DEBUG_SYNC },
+    { "pix",   DEBUG_PIXEL },
+    { "mem",   DEBUG_MEMORY },
+    { NULL,    0 }	/* terminator */
+};
+
+
+/* Create the device specific rendering context and store it in driContextPriv.
+ * Returns GL_TRUE on success, GL_FALSE if either allocation below fails. */
+GLboolean r200CreateContext( const __GLcontextModes *glVisual,
+			     __DRIcontextPrivate *driContextPriv,
+			     void *sharedContextPrivate)
+{
+   __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+   radeonScreenPtr screen = (radeonScreenPtr)(sPriv->private);
+   struct dd_function_table functions;
+   r200ContextPtr rmesa;
+   GLcontext *ctx, *shareCtx;
+   int i;
+   int tcl_mode, fthrottle_mode;
+
+   assert(glVisual);
+   assert(driContextPriv);
+   assert(screen);
+
+   /* Allocate the R200 context */
+   rmesa = (r200ContextPtr) CALLOC( sizeof(*rmesa) );
+   if ( !rmesa )
+      return GL_FALSE;
+
+   /* init exp fog table data */
+   r200InitStaticFogData();
+
+   /* Parse configuration files.
+    * Do this here so that initialMaxAnisotropy is set before we create
+    * the default textures.
+    */
+   driParseConfigFiles (&rmesa->optionCache, &screen->optionCache,
+			screen->driScreen->myNum, "r200");
+   rmesa->initialMaxAnisotropy = driQueryOptionf(&rmesa->optionCache,
+                                                 "def_max_anisotropy");
+
+   if ( driQueryOptionb( &rmesa->optionCache, "hyperz" ) ) {
+      if ( sPriv->drmMinor < 13 )
+	 fprintf( stderr, "DRM version 1.%d too old to support HyperZ, "
+			  "disabling.\n",sPriv->drmMinor );
+      else
+	 rmesa->using_hyperz = GL_TRUE;
+   }
+ 
+   if ( sPriv->drmMinor >= 15 )
+      rmesa->texmicrotile = GL_TRUE;
+
+   /* Init default driver functions then plug in our R200-specific functions
+    * (the texture functions are especially important)
+    */
+   _mesa_init_driver_functions(&functions);
+   r200InitDriverFuncs(&functions);
+   r200InitIoctlFuncs(&functions);
+   r200InitStateFuncs(&functions);
+   r200InitTextureFuncs(&functions);
+   r200InitShaderFuncs(&functions); 
+
+   /* Allocate and initialize the Mesa context */
+   if (sharedContextPrivate)
+      shareCtx = ((r200ContextPtr) sharedContextPrivate)->glCtx;
+   else
+      shareCtx = NULL;
+   rmesa->glCtx = _mesa_create_context(glVisual, shareCtx,
+                                       &functions, (void *) rmesa);
+   if (!rmesa->glCtx) {
+      /* Release the option cache filled by driParseConfigFiles() above;
+       * r200DestroyContext() never runs for a context that failed here. */
+      driDestroyOptionCache (&rmesa->optionCache);
+      FREE(rmesa);
+      return GL_FALSE;
+   }
+   driContextPriv->driverPrivate = rmesa;
+
+   /* Init r200 context data */
+   rmesa->dri.context = driContextPriv;
+   rmesa->dri.screen = sPriv;
+   rmesa->dri.drawable = NULL; /* Set by XMesaMakeCurrent */
+   rmesa->dri.hwContext = driContextPriv->hHWContext;
+   rmesa->dri.hwLock = &sPriv->pSAREA->lock;
+   rmesa->dri.fd = sPriv->fd;
+   rmesa->dri.drmMinor = sPriv->drmMinor;
+
+   rmesa->r200Screen = screen;
+   rmesa->sarea = (drm_radeon_sarea_t *)((GLubyte *)sPriv->pSAREA +
+				       screen->sarea_priv_offset);
+
+   rmesa->dma.buf0_address = rmesa->r200Screen->buffers->list[0].address;
+
+   (void) memset( rmesa->texture_heaps, 0, sizeof( rmesa->texture_heaps ) );
+   make_empty_list( & rmesa->swapped );
+
+   rmesa->nr_heaps = 1 /* screen->numTexHeaps */ ;
+   assert(rmesa->nr_heaps < RADEON_NR_TEX_HEAPS);
+   for ( i = 0 ; i < rmesa->nr_heaps ; i++ ) {
+      rmesa->texture_heaps[i] = driCreateTextureHeap( i, rmesa,
+	    screen->texSize[i],
+	    12,
+	    RADEON_NR_TEX_REGIONS,
+	    (drmTextureRegionPtr)rmesa->sarea->tex_list[i],
+	    & rmesa->sarea->tex_age[i],
+	    & rmesa->swapped,
+	    sizeof( r200TexObj ),
+	    (destroy_texture_object_t *) r200DestroyTexObj );
+   }
+   rmesa->texture_depth = driQueryOptioni (&rmesa->optionCache,
+					   "texture_depth");
+   if (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
+      rmesa->texture_depth = ( screen->cpp == 4 ) ?
+	 DRI_CONF_TEXTURE_DEPTH_32 : DRI_CONF_TEXTURE_DEPTH_16;
+
+   rmesa->swtcl.RenderIndex = ~0;
+   rmesa->hw.all_dirty = 1;
+
+   /* Set the maximum texture size small enough that we can guarantee that
+    * all texture units can bind a maximal texture and have all of them in
+    * texturable memory at once. Depending on the allow_large_textures driconf
+    * setting allow larger textures.
+    */
+
+   ctx = rmesa->glCtx;
+   ctx->Const.MaxTextureUnits = driQueryOptioni (&rmesa->optionCache,
+						 "texture_units");
+   ctx->Const.MaxTextureImageUnits = ctx->Const.MaxTextureUnits;
+   ctx->Const.MaxTextureCoordUnits = ctx->Const.MaxTextureUnits;
+
+   i = driQueryOptioni( &rmesa->optionCache, "allow_large_textures");
+
+   driCalculateMaxTextureLevels( rmesa->texture_heaps,
+				 rmesa->nr_heaps,
+				 & ctx->Const,
+				 4,
+				 11, /* max 2D texture size is 2048x2048 */
+#if ENABLE_HW_3D_TEXTURE
+				 8,  /* max 3D texture size is 256^3 */
+#else
+				 0,  /* 3D textures unsupported */
+#endif
+				 11, /* max cube texture size is 2048x2048 */
+				 11, /* max texture rectangle size is 2048x2048 */
+				 12,
+				 GL_FALSE,
+				 i );
+
+   ctx->Const.MaxTextureMaxAnisotropy = 16.0;
+
+   /* No wide AA points.
+    */
+   ctx->Const.MinPointSize = 1.0;
+   ctx->Const.MinPointSizeAA = 1.0;
+   ctx->Const.MaxPointSizeAA = 1.0;
+   ctx->Const.PointSizeGranularity = 0.0625;
+   if (rmesa->r200Screen->drmSupportsPointSprites)
+      ctx->Const.MaxPointSize = 2047.0;
+   else
+      ctx->Const.MaxPointSize = 1.0;
+
+   /* mesa initialization problem - _mesa_init_point was already called */
+   ctx->Point.MaxSize = ctx->Const.MaxPointSize;
+
+   ctx->Const.MinLineWidth = 1.0;
+   ctx->Const.MinLineWidthAA = 1.0;
+   ctx->Const.MaxLineWidth = 10.0;
+   ctx->Const.MaxLineWidthAA = 10.0;
+   ctx->Const.LineWidthGranularity = 0.0625;
+
+   ctx->Const.VertexProgram.MaxNativeInstructions = R200_VSF_MAX_INST;
+   ctx->Const.VertexProgram.MaxNativeAttribs = 12;
+   ctx->Const.VertexProgram.MaxNativeTemps = R200_VSF_MAX_TEMPS;
+   ctx->Const.VertexProgram.MaxNativeParameters = R200_VSF_MAX_PARAM;
+   ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
+
+   /* Initialize the software rasterizer and helper modules.
+    */
+   _swrast_CreateContext( ctx );
+   _vbo_CreateContext( ctx );
+   _tnl_CreateContext( ctx );
+   _swsetup_CreateContext( ctx );
+   _ae_create_context( ctx );
+
+   /* Install the customized pipeline:
+    */
+   _tnl_destroy_pipeline( ctx );
+   _tnl_install_pipeline( ctx, r200_pipeline );
+
+   /* Try and keep materials and vertices separate:
+    */
+/*    _tnl_isolate_materials( ctx, GL_TRUE ); */
+
+   /* Configure swrast and TNL to match hardware characteristics:
+    */
+   _swrast_allow_pixel_fog( ctx, GL_FALSE );
+   _swrast_allow_vertex_fog( ctx, GL_TRUE );
+   _tnl_allow_pixel_fog( ctx, GL_FALSE );
+   _tnl_allow_vertex_fog( ctx, GL_TRUE );
+
+
+   for ( i = 0 ; i < R200_MAX_TEXTURE_UNITS ; i++ ) {
+      _math_matrix_ctr( &rmesa->TexGenMatrix[i] );
+      _math_matrix_set_identity( &rmesa->TexGenMatrix[i] );
+   }
+   _math_matrix_ctr( &rmesa->tmpmat );
+   _math_matrix_set_identity( &rmesa->tmpmat );
+
+   driInitExtensions( ctx, card_extensions, GL_TRUE );
+   if (!(rmesa->r200Screen->chip_flags & R200_CHIPSET_YCBCR_BROKEN)) {
+     /* yuv textures don't work with some chips - R200 / rv280 okay so far
+	others get the bit ordering right but don't actually do YUV-RGB conversion */
+      _mesa_enable_extension( ctx, "GL_MESA_ycbcr_texture" );
+   }
+   if (rmesa->glCtx->Mesa_DXTn) {
+      _mesa_enable_extension( ctx, "GL_EXT_texture_compression_s3tc" );
+      _mesa_enable_extension( ctx, "GL_S3_s3tc" );
+   }
+   else if (driQueryOptionb (&rmesa->optionCache, "force_s3tc_enable")) {
+      _mesa_enable_extension( ctx, "GL_EXT_texture_compression_s3tc" );
+   }
+
+   if (rmesa->r200Screen->drmSupportsCubeMapsR200)
+      _mesa_enable_extension( ctx, "GL_ARB_texture_cube_map" );
+   if (rmesa->r200Screen->drmSupportsBlendColor) {
+       driInitExtensions( ctx, blend_extensions, GL_FALSE );
+   }
+   if(rmesa->r200Screen->drmSupportsVertexProgram)
+      driInitSingleExtension( ctx, ARB_vp_extension );
+   if(driQueryOptionb(&rmesa->optionCache, "nv_vertex_program"))
+      driInitSingleExtension( ctx, NV_vp_extension );
+
+   if ((ctx->Const.MaxTextureUnits == 6) && rmesa->r200Screen->drmSupportsFragShader)
+      driInitSingleExtension( ctx, ATI_fs_extension );
+   if (rmesa->r200Screen->drmSupportsPointSprites)
+      driInitExtensions( ctx, point_extensions, GL_FALSE );
+#if 0
+   r200InitDriverFuncs( ctx );
+   r200InitIoctlFuncs( ctx );
+   r200InitStateFuncs( ctx );
+   r200InitTextureFuncs( ctx );
+#endif
+   /* plug in a few more device driver functions */
+   /* XXX these should really go right after _mesa_init_driver_functions() */
+   r200InitPixelFuncs( ctx );
+   r200InitSpanFuncs( ctx );
+   r200InitTnlFuncs( ctx );
+   r200InitState( rmesa );
+   r200InitSwtcl( ctx );
+
+   fthrottle_mode = driQueryOptioni(&rmesa->optionCache, "fthrottle_mode");
+   rmesa->iw.irq_seq = -1;
+   rmesa->irqsEmitted = 0;
+   rmesa->do_irqs = (fthrottle_mode == DRI_CONF_FTHROTTLE_IRQS &&
+		     rmesa->r200Screen->irq);
+
+   rmesa->do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
+
+   if (!rmesa->do_irqs)
+      fprintf(stderr,
+	      "IRQ's not enabled, falling back to %s: %d %d\n",
+	      rmesa->do_usleeps ? "usleeps" : "busy waits",
+	      fthrottle_mode,
+	      rmesa->r200Screen->irq);
+
+   rmesa->vblank_flags = (rmesa->r200Screen->irq != 0)
+       ? driGetDefaultVBlankFlags(&rmesa->optionCache) : VBLANK_FLAG_NO_IRQ;
+
+   rmesa->prefer_gart_client_texturing = 
+      (getenv("R200_GART_CLIENT_TEXTURES") != 0);
+
+   (*dri_interface->getUST)( & rmesa->swap_ust );
+
+
+#if DO_DEBUG
+   R200_DEBUG  = driParseDebugString( getenv( "R200_DEBUG" ),
+				      debug_control );
+   R200_DEBUG |= driParseDebugString( getenv( "RADEON_DEBUG" ),
+				      debug_control );
+#endif
+
+   tcl_mode = driQueryOptioni(&rmesa->optionCache, "tcl_mode");
+   if (driQueryOptionb(&rmesa->optionCache, "no_rast")) {
+      fprintf(stderr, "disabling 3D acceleration\n");
+      FALLBACK(rmesa, R200_FALLBACK_DISABLE, 1);
+   }
+   else if (tcl_mode == DRI_CONF_TCL_SW || getenv("R200_NO_TCL") ||
+	    !(rmesa->r200Screen->chip_flags & RADEON_CHIPSET_TCL)) {
+      if (rmesa->r200Screen->chip_flags & RADEON_CHIPSET_TCL) {
+	 rmesa->r200Screen->chip_flags &= ~RADEON_CHIPSET_TCL;
+	 fprintf(stderr, "Disabling HW TCL support\n");
+      }
+      TCL_FALLBACK(rmesa->glCtx, R200_TCL_FALLBACK_TCL_DISABLE, 1);
+   }
+
+   return GL_TRUE;
+}
+
+
+/* Destroy the Mesa and driver specific context data for driContextPriv.
+ * If it is the current context, pending vertices are fired and the context
+ * is unbound first.  Texture heaps are released only when the shared state's
+ * RefCount is 1, i.e. no other context still uses the share group. */
+void r200DestroyContext( __DRIcontextPrivate *driContextPriv )
+{
+   GET_CURRENT_CONTEXT(ctx);
+   r200ContextPtr rmesa = (r200ContextPtr) driContextPriv->driverPrivate;
+   r200ContextPtr current = ctx ? R200_CONTEXT(ctx) : NULL;
+
+   assert(rmesa); /* should never be null; checked before the first dereference */
+
+   /* check if we're deleting the currently bound context */
+   if (rmesa && rmesa == current) {
+      R200_FIREVERTICES( rmesa );
+      _mesa_make_current(NULL, NULL, NULL);
+   }
+
+   /* Free r200 context resources */
+   if ( rmesa ) {
+      GLboolean   release_texture_heaps;
+
+      release_texture_heaps = (rmesa->glCtx->Shared->RefCount == 1);
+      _swsetup_DestroyContext( rmesa->glCtx );
+      _tnl_DestroyContext( rmesa->glCtx );
+      _vbo_DestroyContext( rmesa->glCtx );
+      _swrast_DestroyContext( rmesa->glCtx );
+
+      r200DestroySwtcl( rmesa->glCtx );
+      r200ReleaseArrays( rmesa->glCtx, ~0 );
+
+      if (rmesa->dma.current.buf) {
+	 r200ReleaseDmaRegion( rmesa, &rmesa->dma.current, __FUNCTION__ );
+	 r200FlushCmdBuf( rmesa, __FUNCTION__ );
+      }
+
+      if (rmesa->state.scissor.pClipRects) {
+	 FREE(rmesa->state.scissor.pClipRects);
+	 rmesa->state.scissor.pClipRects = NULL;
+      }
+
+      if ( release_texture_heaps ) {
+         /* This share group is about to go away, free our private
+          * texture object data.
+          */
+         int i;
+
+         for ( i = 0 ; i < rmesa->nr_heaps ; i++ ) {
+	    driDestroyTextureHeap( rmesa->texture_heaps[ i ] );
+	    rmesa->texture_heaps[ i ] = NULL;
+         }
+
+	 assert( is_empty_list( & rmesa->swapped ) );
+      }
+
+      /* free the Mesa context */
+      rmesa->glCtx->DriverCtx = NULL;
+      _mesa_destroy_context( rmesa->glCtx );
+
+      /* free the option cache */
+      driDestroyOptionCache (&rmesa->optionCache);
+
+      FREE( rmesa );
+   }
+}
+
+
+
+
+void
+r200SwapBuffers( __DRIdrawablePrivate *dPriv )
+{
+   r200ContextPtr rmesa;
+   GLcontext *ctx;
+   /* Present the drawable's back buffer: page flip when enabled, else copy. */
+   if (!dPriv->driContextPriv || !dPriv->driContextPriv->driverPrivate) {
+      /* XXX this shouldn't be an error but we can't handle it for now */
+      _mesa_problem(NULL, "%s: drawable has no context!", __FUNCTION__);
+      return;
+   }
+
+   rmesa = (r200ContextPtr) dPriv->driContextPriv->driverPrivate;
+   ctx = rmesa->glCtx;
+   if (!ctx->Visual.doubleBufferMode)
+      return;				/* single-buffered visual: nothing to do */
+
+   _mesa_notifySwapBuffers( ctx );  /* flush pending rendering commands */
+   if ( rmesa->doPageFlip )
+      r200PageFlip( dPriv );
+   else
+      r200CopyBuffer( dPriv, NULL );
+}
+
+void
+r200CopySubBuffer( __DRIdrawablePrivate *dPriv,
+		   int x, int y, int w, int h )
+{
+   drm_clip_rect_t rect;
+   GLcontext *ctx;
+
+   if (!dPriv->driContextPriv || !dPriv->driContextPriv->driverPrivate) {
+      /* XXX this shouldn't be an error but we can't handle it for now */
+      _mesa_problem(NULL, "%s: drawable has no context!", __FUNCTION__);
+      return;
+   }
+   ctx = ((r200ContextPtr) dPriv->driContextPriv->driverPrivate)->glCtx;
+   if (!ctx->Visual.doubleBufferMode)
+      return;				/* single-buffered visual: nothing to do */
+
+   /* y counts up from the drawable's bottom edge; build a top-down rect offset by dPriv->x/y. */
+   rect.x1 = x + dPriv->x;
+   rect.y1 = (dPriv->h - y - h) + dPriv->y;
+   rect.x2 = rect.x1 + w;
+   rect.y2 = rect.y1 + h;
+   _mesa_notifySwapBuffers( ctx );  /* flush pending rendering commands */
+   r200CopyBuffer( dPriv, &rect );
+}
+
+/* Make driContextPriv's context current with driDrawPriv as the draw buffer
+ * and driReadPriv as the read buffer.  A NULL driContextPriv unbinds the
+ * current context instead.  Always returns GL_TRUE. */
+GLboolean
+r200MakeCurrent( __DRIcontextPrivate *driContextPriv,
+                   __DRIdrawablePrivate *driDrawPriv,
+                   __DRIdrawablePrivate *driReadPriv )
+{
+   if ( driContextPriv ) {
+      r200ContextPtr newCtx = 
+	 (r200ContextPtr) driContextPriv->driverPrivate;
+
+      if (R200_DEBUG & DEBUG_DRI)
+	 fprintf(stderr, "%s ctx %p\n", __FUNCTION__, (void *)newCtx->glCtx);
+      /* Must run before dri.drawable is overwritten below: it tests the old drawable. */
+      if ( newCtx->dri.drawable != driDrawPriv ) {
+	 driDrawableInitVBlank( driDrawPriv, newCtx->vblank_flags,
+				&newCtx->vbl_seq );
+      }
+
+      newCtx->dri.readable = driReadPriv;
+      /* New drawable, or its stamp differs from ours: redo cliprects and viewport offset. */
+      if ( newCtx->dri.drawable != driDrawPriv ||
+           newCtx->lastStamp != driDrawPriv->lastStamp ) {
+	 newCtx->dri.drawable = driDrawPriv;
+
+	 r200SetCliprects(newCtx);
+	 r200UpdateViewportOffset( newCtx->glCtx );
+      }
+
+      _mesa_make_current( newCtx->glCtx,
+			  (GLframebuffer *) driDrawPriv->driverPrivate,
+			  (GLframebuffer *) driReadPriv->driverPrivate );
+
+      _mesa_update_state( newCtx->glCtx );
+      r200ValidateState( newCtx->glCtx );
+
+   } else {
+      if (R200_DEBUG & DEBUG_DRI)
+	 fprintf(stderr, "%s ctx is null\n", __FUNCTION__);
+      _mesa_make_current( NULL, NULL, NULL );
+   }
+
+   if (R200_DEBUG & DEBUG_DRI)
+      fprintf(stderr, "End %s\n", __FUNCTION__);
+   return GL_TRUE;
+}
+
+/* Unbind hook for driContextPriv.  Changes no driver state: it only logs the
+ * Mesa context pointer when DEBUG_DRI is set, then reports success. */
+GLboolean
+r200UnbindContext( __DRIcontextPrivate *driContextPriv )
+{
+   r200ContextPtr r200 = (r200ContextPtr) driContextPriv->driverPrivate;
+
+   if (R200_DEBUG & DEBUG_DRI) {
+      fprintf(stderr, "%s ctx %p\n", __FUNCTION__, (void *)r200->glCtx);
+   }
+   return GL_TRUE;
+}
diff --git a/r200/r200_context.h b/r200/r200_context.h
new file mode 100644
index 0000000..a06a2f5
--- /dev/null
+++ b/r200/r200_context.h
@@ -0,0 +1,988 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_context.h,v 1.2 2002/12/16 16:18:54 dawes Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef __R200_CONTEXT_H__
+#define __R200_CONTEXT_H__
+
+#include "tnl/t_vertex.h"
+#include "drm.h"
+#include "radeon_drm.h"
+#include "dri_util.h"
+#include "texmem.h"
+
+#include "macros.h"
+#include "mtypes.h"
+#include "colormac.h"
+#include "r200_reg.h"
+#include "r200_vertprog.h"
+
+#define ENABLE_HW_3D_TEXTURE 1  /* XXX this is temporary! */
+
+#ifndef R200_EMIT_VAP_PVS_CNTL
+#error This driver requires a newer libdrm to compile
+#endif
+
+struct r200_context;
+typedef struct r200_context r200ContextRec;
+typedef struct r200_context *r200ContextPtr;
+
+/* This union is used to avoid warnings/miscompilation
+   with float to uint32_t casts due to strict-aliasing */
+typedef union { GLfloat f; uint32_t ui32; } float_ui32_type;
+
+#include "r200_lock.h"
+#include "radeon_screen.h"
+#include "mm.h"
+
+/* Flags for software fallback cases */
+/* See corresponding strings in r200_swtcl.c */
+#define R200_FALLBACK_TEXTURE           0x01
+#define R200_FALLBACK_DRAW_BUFFER       0x02
+#define R200_FALLBACK_STENCIL           0x04
+#define R200_FALLBACK_RENDER_MODE       0x08
+#define R200_FALLBACK_DISABLE           0x10
+#define R200_FALLBACK_BORDER_MODE       0x20
+
+/* The blit width for texture uploads
+ */
+#define BLIT_WIDTH_BYTES 1024
+
+/* Use the templated vertex format:
+ */
+#define COLOR_IS_RGBA
+#define TAG(x) r200##x
+#include "tnl_dd/t_dd_vertex.h"
+#undef TAG
+
+typedef void (*r200_tri_func)( r200ContextPtr,
+				 r200Vertex *,
+				 r200Vertex *,
+				 r200Vertex * );
+
+typedef void (*r200_line_func)( r200ContextPtr,
+				  r200Vertex *,
+				  r200Vertex * );
+
+typedef void (*r200_point_func)( r200ContextPtr,
+				   r200Vertex * );
+
+
+struct r200_vertex_program {
+        struct gl_vertex_program mesa_program; /* Must be first (presumably so pointers to the two structs can be cast — confirm) */
+        int translated;
+        /* need excess instr: 1 for late loop checking, 2 for 
+           additional instr due to instr/attr, 3 for fog (1 + 2 + 3 = the 6 extra slots) */
+        VERTEX_SHADER_INSTRUCTION instr[R200_VSF_MAX_INST + 6];
+        int pos_end;
+        int inputs[VERT_ATTRIB_MAX];	/* one entry per Mesa vertex attribute */
+        GLubyte inputmap_rev[16];
+        int native;
+        int fogpidx;
+        int fogmode;
+};
+
+struct r200_colorbuffer_state {
+   GLuint clear;
+#if 000	/* compiled out: drawOffset/drawPitch are not part of the struct */
+   GLint drawOffset, drawPitch;
+#endif
+   int roundEnable;
+};
+
+
+struct r200_depthbuffer_state {
+   GLuint clear;
+   GLfloat scale;
+};
+
+#if 000
+struct r200_pixel_state {
+   GLint readOffset, readPitch;
+};
+#endif
+
+struct r200_scissor_state {
+   drm_clip_rect_t rect;
+   GLboolean enabled;
+
+   GLuint numClipRects;			/* Cliprects active */
+   GLuint numAllocedClipRects;		/* Cliprects available */
+   drm_clip_rect_t *pClipRects;
+};
+
+struct r200_stencilbuffer_state {
+   GLboolean hwBuffer;
+   GLuint clear;			/* rb3d_stencilrefmask value */
+};
+
+struct r200_stipple_state {
+   GLuint mask[32];
+};
+
+
+
+#define TEX_0   0x1	/* one bit per texture unit, unit n is (1 << n) */
+#define TEX_1   0x2
+#define TEX_2	0x4
+#define TEX_3	0x8
+#define TEX_4	0x10
+#define TEX_5	0x20
+#define TEX_ALL 0x3f	/* TEX_0 | TEX_1 | TEX_2 | TEX_3 | TEX_4 | TEX_5 */
+
+typedef struct r200_tex_obj r200TexObj, *r200TexObjPtr;
+
+/* Texture object in locally shared texture space.
+ */
+struct r200_tex_obj {
+   driTextureObject   base;
+
+   GLuint bufAddr;			/* Offset to start of locally
+					   shared texture block */
+
+   GLuint dirty_state;		        /* Flags (1 per texunit) for
+					   whether or not this texobj
+					   has dirty hardware state
+					   (pp_*) that needs to be
+					   brought into the
+					   texunit. */
+
+   drm_radeon_tex_image_t image[6][RADEON_MAX_TEXTURE_LEVELS];
+					/* Six, for the cube faces */
+   GLboolean image_override;		/* Image overridden by GLX_EXT_tfp */
+
+   GLuint pp_txfilter;		        /* hardware register values */
+   GLuint pp_txformat;
+   GLuint pp_txformat_x;
+   GLuint pp_txoffset;		        /* Image location in texmem.
+					   All cube faces follow. */
+   GLuint pp_txsize;		        /* npot only */
+   GLuint pp_txpitch;		        /* npot only */
+   GLuint pp_border_color;
+   GLuint pp_cubic_faces;	        /* cube face 1,2,3,4 log2 sizes */
+
+   GLboolean  border_fallback;
+
+   GLuint tile_bits;			/* hw texture tile bits used on this texture */
+};
+
+
+struct r200_texture_env_state {
+   r200TexObjPtr texobj;
+   GLuint outputreg;
+   GLuint unitneeded;
+};
+
+#define R200_MAX_TEXTURE_UNITS 6
+
+struct r200_texture_state {
+   struct r200_texture_env_state unit[R200_MAX_TEXTURE_UNITS];
+};
+
+
+struct r200_state_atom {
+   struct r200_state_atom *next, *prev;	 /* presumably doubly-linked list links — confirm */
+   const char *name;		         /* for debug */
+   int cmd_size;		         /* size in bytes; NOTE(review): confirm unit against users */
+   GLuint idx;				 /* second argument passed to check(), presumably — confirm */
+   int *cmd;			         /* one or more cmd's */
+   int *lastcmd;			 /* one or more cmd's */
+   GLboolean dirty;
+   GLboolean (*check)( GLcontext *, int );    /* is this state active? */
+};
+   
+
+
+/* Trying to keep these relatively short as the variables are becoming
+ * extravagantly long.  Drop the driver name prefix off the front of
+ * everything - I think we know which driver we're in by now, and keep the
+ * prefix to 3 letters unless absolutely impossible.  
+ */
+
+#define CTX_CMD_0             0
+#define CTX_PP_MISC           1
+#define CTX_PP_FOG_COLOR      2
+#define CTX_RE_SOLID_COLOR    3
+#define CTX_RB3D_BLENDCNTL    4
+#define CTX_RB3D_DEPTHOFFSET  5
+#define CTX_RB3D_DEPTHPITCH   6
+#define CTX_RB3D_ZSTENCILCNTL 7
+#define CTX_CMD_1             8
+#define CTX_PP_CNTL           9
+#define CTX_RB3D_CNTL         10
+#define CTX_RB3D_COLOROFFSET  11
+#define CTX_CMD_2             12 /* why */
+#define CTX_RB3D_COLORPITCH   13 /* why */
+#define CTX_STATE_SIZE_OLDDRM 14 /* old-DRM layout ends here; same value as CTX_CMD_3 */
+#define CTX_CMD_3             14 /* indices 14..17 exist only in the new-DRM layout */
+#define CTX_RB3D_BLENDCOLOR   15
+#define CTX_RB3D_ABLENDCNTL   16
+#define CTX_RB3D_CBLENDCNTL   17
+#define CTX_STATE_SIZE_NEWDRM 18
+
+#define SET_CMD_0               0
+#define SET_SE_CNTL             1
+#define SET_RE_CNTL             2 /* replace se_coord_fmt */
+#define SET_STATE_SIZE          3
+
+#define VTE_CMD_0               0
+#define VTE_SE_VTE_CNTL         1
+#define VTE_STATE_SIZE          2
+
+#define LIN_CMD_0               0
+#define LIN_RE_LINE_PATTERN     1
+#define LIN_RE_LINE_STATE       2
+#define LIN_CMD_1               3
+#define LIN_SE_LINE_WIDTH       4
+#define LIN_STATE_SIZE          5
+
+#define MSK_CMD_0               0
+#define MSK_RB3D_STENCILREFMASK 1
+#define MSK_RB3D_ROPCNTL        2
+#define MSK_RB3D_PLANEMASK      3
+#define MSK_STATE_SIZE          4
+
+#define VPT_CMD_0           0
+#define VPT_SE_VPORT_XSCALE          1
+#define VPT_SE_VPORT_XOFFSET         2
+#define VPT_SE_VPORT_YSCALE          3
+#define VPT_SE_VPORT_YOFFSET         4
+#define VPT_SE_VPORT_ZSCALE          5
+#define VPT_SE_VPORT_ZOFFSET         6
+#define VPT_STATE_SIZE      7
+
+#define ZBS_CMD_0               0
+#define ZBS_SE_ZBIAS_FACTOR     1
+#define ZBS_SE_ZBIAS_CONSTANT   2
+#define ZBS_STATE_SIZE          3
+
+#define MSC_CMD_0               0
+#define MSC_RE_MISC             1
+#define MSC_STATE_SIZE          2
+
+#define TAM_CMD_0               0
+#define TAM_DEBUG3              1
+#define TAM_STATE_SIZE          2
+
+#define TEX_CMD_0                   0
+#define TEX_PP_TXFILTER             1  /*2c00*/
+#define TEX_PP_TXFORMAT             2  /*2c04*/
+#define TEX_PP_TXFORMAT_X           3  /*2c08*/
+#define TEX_PP_TXSIZE               4  /*2c0c*/
+#define TEX_PP_TXPITCH              5  /*2c10*/
+#define TEX_PP_BORDER_COLOR         6  /*2c14*/
+#define TEX_CMD_1_OLDDRM            7
+#define TEX_PP_TXOFFSET_OLDDRM      8  /*2d00 */
+#define TEX_STATE_SIZE_OLDDRM       9
+#define TEX_PP_CUBIC_FACES          7
+#define TEX_PP_TXMULTI_CTL          8
+#define TEX_CMD_1_NEWDRM            9
+#define TEX_PP_TXOFFSET_NEWDRM     10
+#define TEX_STATE_SIZE_NEWDRM      11
+
+#define CUBE_CMD_0                  0  /* 1 register follows */ /* this command unnecessary */
+#define CUBE_PP_CUBIC_FACES         1  /* 0x2c18 */             /* with new enough drm */
+#define CUBE_CMD_1                  2  /* 5 registers follow */
+#define CUBE_PP_CUBIC_OFFSET_F1     3  /* 0x2d04 */
+#define CUBE_PP_CUBIC_OFFSET_F2     4  /* 0x2d08 */
+#define CUBE_PP_CUBIC_OFFSET_F3     5  /* 0x2d0c */
+#define CUBE_PP_CUBIC_OFFSET_F4     6  /* 0x2d10 */
+#define CUBE_PP_CUBIC_OFFSET_F5     7  /* 0x2d14 */
+#define CUBE_STATE_SIZE             8
+
+#define PIX_CMD_0                   0
+#define PIX_PP_TXCBLEND             1
+#define PIX_PP_TXCBLEND2            2
+#define PIX_PP_TXABLEND             3
+#define PIX_PP_TXABLEND2            4
+#define PIX_STATE_SIZE              5
+
+#define TF_CMD_0                    0
+#define TF_TFACTOR_0                1
+#define TF_TFACTOR_1                2
+#define TF_TFACTOR_2                3
+#define TF_TFACTOR_3                4
+#define TF_TFACTOR_4                5
+#define TF_TFACTOR_5                6
+#define TF_STATE_SIZE               7
+
+#define ATF_CMD_0                   0
+#define ATF_TFACTOR_0               1
+#define ATF_TFACTOR_1               2
+#define ATF_TFACTOR_2               3
+#define ATF_TFACTOR_3               4
+#define ATF_TFACTOR_4               5
+#define ATF_TFACTOR_5               6
+#define ATF_TFACTOR_6               7
+#define ATF_TFACTOR_7               8
+#define ATF_STATE_SIZE              9
+
+/* ATI_FRAGMENT_SHADER */
+#define AFS_CMD_0                 0
+#define AFS_IC0                   1 /* 2f00 */
+#define AFS_IC1                   2 /* 2f04 */
+#define AFS_IA0                   3 /* 2f08 */
+#define AFS_IA1                   4 /* 2f0c */
+#define AFS_STATE_SIZE           33 /* CMD_0 + 32 dwords; SET_INST in r200_fragshader.c uses 4 per instruction (IC0, IC1, IA0, IA1) -- presumably 8 instruction slots */
+
+#define PVS_CMD_0                 0
+#define PVS_CNTL_1                1
+#define PVS_CNTL_2                2
+#define PVS_STATE_SIZE            3
+
+/* those are quite big... */
+#define VPI_CMD_0                 0
+#define VPI_OPDST_0               1
+#define VPI_SRC0_0                2
+#define VPI_SRC1_0                3
+#define VPI_SRC2_0                4
+#define VPI_OPDST_63              253
+#define VPI_SRC0_63               254
+#define VPI_SRC1_63               255
+#define VPI_SRC2_63               256
+#define VPI_STATE_SIZE            257
+
+#define VPP_CMD_0                0
+#define VPP_PARAM0_0             1
+#define VPP_PARAM1_0             2
+#define VPP_PARAM2_0             3
+#define VPP_PARAM3_0             4
+#define VPP_PARAM0_95            381
+#define VPP_PARAM1_95            382
+#define VPP_PARAM2_95            383
+#define VPP_PARAM3_95            384
+#define VPP_STATE_SIZE           385
+
+#define TCL_CMD_0                 0
+#define TCL_LIGHT_MODEL_CTL_0     1
+#define TCL_LIGHT_MODEL_CTL_1     2
+#define TCL_PER_LIGHT_CTL_0       3
+#define TCL_PER_LIGHT_CTL_1       4
+#define TCL_PER_LIGHT_CTL_2       5
+#define TCL_PER_LIGHT_CTL_3       6
+#define TCL_CMD_1                 7
+#define TCL_UCP_VERT_BLEND_CTL    8
+#define TCL_STATE_SIZE            9
+
+#define MSL_CMD_0                     0
+#define MSL_MATRIX_SELECT_0           1
+#define MSL_MATRIX_SELECT_1           2
+#define MSL_MATRIX_SELECT_2           3
+#define MSL_MATRIX_SELECT_3           4
+#define MSL_MATRIX_SELECT_4           5
+#define MSL_STATE_SIZE                6
+
+#define TCG_CMD_0                 0
+#define TCG_TEX_PROC_CTL_2            1
+#define TCG_TEX_PROC_CTL_3            2
+#define TCG_TEX_PROC_CTL_0            3
+#define TCG_TEX_PROC_CTL_1            4
+#define TCG_TEX_CYL_WRAP_CTL      5
+#define TCG_STATE_SIZE            6
+
+#define MTL_CMD_0            0	
+#define MTL_EMMISSIVE_RED    1	
+#define MTL_EMMISSIVE_GREEN  2	
+#define MTL_EMMISSIVE_BLUE   3	
+#define MTL_EMMISSIVE_ALPHA  4	
+#define MTL_AMBIENT_RED      5
+#define MTL_AMBIENT_GREEN    6
+#define MTL_AMBIENT_BLUE     7
+#define MTL_AMBIENT_ALPHA    8
+#define MTL_DIFFUSE_RED      9
+#define MTL_DIFFUSE_GREEN    10
+#define MTL_DIFFUSE_BLUE     11
+#define MTL_DIFFUSE_ALPHA    12
+#define MTL_SPECULAR_RED     13
+#define MTL_SPECULAR_GREEN   14
+#define MTL_SPECULAR_BLUE    15
+#define MTL_SPECULAR_ALPHA   16
+#define MTL_CMD_1            17
+#define MTL_SHININESS        18
+#define MTL_STATE_SIZE       19
+
+#define VAP_CMD_0                   0
+#define VAP_SE_VAP_CNTL             1
+#define VAP_STATE_SIZE              2
+
+/* Replaces a lot of packet info from radeon
+ */
+#define VTX_CMD_0                   0
+#define VTX_VTXFMT_0            1
+#define VTX_VTXFMT_1            2
+#define VTX_TCL_OUTPUT_VTXFMT_0 3
+#define VTX_TCL_OUTPUT_VTXFMT_1 4
+#define VTX_CMD_1               5
+#define VTX_TCL_OUTPUT_COMPSEL  6
+#define VTX_CMD_2               7
+#define VTX_STATE_CNTL          8
+#define VTX_STATE_SIZE          9
+
+/* SPR - point sprite state
+ */
+#define SPR_CMD_0              0
+#define SPR_POINT_SPRITE_CNTL  1
+#define SPR_STATE_SIZE         2
+
+#define PTP_CMD_0              0
+#define PTP_VPORT_SCALE_0      1
+#define PTP_VPORT_SCALE_1      2
+#define PTP_VPORT_SCALE_PTSIZE 3
+#define PTP_VPORT_SCALE_3      4
+#define PTP_CMD_1              5
+#define PTP_ATT_CONST_QUAD     6
+#define PTP_ATT_CONST_LIN      7
+#define PTP_ATT_CONST_CON      8
+#define PTP_ATT_CONST_3        9
+#define PTP_EYE_X             10
+#define PTP_EYE_Y             11
+#define PTP_EYE_Z             12
+#define PTP_EYE_3             13
+#define PTP_CLAMP_MIN         14
+#define PTP_CLAMP_MAX         15
+#define PTP_CLAMP_2           16
+#define PTP_CLAMP_3           17
+#define PTP_STATE_SIZE        18
+
+#define VTX_COLOR(v,n)   (((v)>>(R200_VTX_COLOR_0_SHIFT+(n)*2))&\
+                         R200_VTX_COLOR_MASK)
+
+/**
+ * Given the \c R200_SE_VTX_FMT_1 for the current vertex state, determine
+ * how many components are in texture coordinate \c n.
+ */
+#define VTX_TEXn_COUNT(v,n)   (((v) >> (3 * (n))) & 0x07)
+
+#define MAT_CMD_0              0
+#define MAT_ELT_0              1
+#define MAT_STATE_SIZE         17
+
+#define GRD_CMD_0                  0
+#define GRD_VERT_GUARD_CLIP_ADJ    1
+#define GRD_VERT_GUARD_DISCARD_ADJ 2
+#define GRD_HORZ_GUARD_CLIP_ADJ    3
+#define GRD_HORZ_GUARD_DISCARD_ADJ 4
+#define GRD_STATE_SIZE             5
+
+/* position changes frequently when lighting in modelpos - separate
+ * out to new state item?  
+ */
+#define LIT_CMD_0                  0
+#define LIT_AMBIENT_RED            1
+#define LIT_AMBIENT_GREEN          2
+#define LIT_AMBIENT_BLUE           3
+#define LIT_AMBIENT_ALPHA          4
+#define LIT_DIFFUSE_RED            5
+#define LIT_DIFFUSE_GREEN          6
+#define LIT_DIFFUSE_BLUE           7
+#define LIT_DIFFUSE_ALPHA          8
+#define LIT_SPECULAR_RED           9
+#define LIT_SPECULAR_GREEN         10
+#define LIT_SPECULAR_BLUE          11
+#define LIT_SPECULAR_ALPHA         12
+#define LIT_POSITION_X             13
+#define LIT_POSITION_Y             14
+#define LIT_POSITION_Z             15
+#define LIT_POSITION_W             16
+#define LIT_DIRECTION_X            17
+#define LIT_DIRECTION_Y            18
+#define LIT_DIRECTION_Z            19
+#define LIT_DIRECTION_W            20
+#define LIT_ATTEN_QUADRATIC        21
+#define LIT_ATTEN_LINEAR           22
+#define LIT_ATTEN_CONST            23
+#define LIT_ATTEN_XXX              24
+#define LIT_CMD_1                  25
+#define LIT_SPOT_DCD               26
+#define LIT_SPOT_DCM               27
+#define LIT_SPOT_EXPONENT          28
+#define LIT_SPOT_CUTOFF            29
+#define LIT_SPECULAR_THRESH        30
+#define LIT_RANGE_CUTOFF           31 /* ? */
+#define LIT_ATTEN_CONST_INV        32
+#define LIT_STATE_SIZE             33
+
+/* Fog
+ */
+#define FOG_CMD_0      0
+#define FOG_R          1
+#define FOG_C          2
+#define FOG_D          3
+#define FOG_PAD        4
+#define FOG_STATE_SIZE 5
+
+/* UCP
+ */
+#define UCP_CMD_0      0
+#define UCP_X          1
+#define UCP_Y          2
+#define UCP_Z          3
+#define UCP_W          4
+#define UCP_STATE_SIZE 5
+
+/* GLT - Global ambient
+ */
+#define GLT_CMD_0      0
+#define GLT_RED        1
+#define GLT_GREEN      2
+#define GLT_BLUE       3
+#define GLT_ALPHA      4
+#define GLT_STATE_SIZE 5
+
+/* EYE
+ */
+#define EYE_CMD_0          0
+#define EYE_X              1
+#define EYE_Y              2
+#define EYE_Z              3
+#define EYE_RESCALE_FACTOR 4
+#define EYE_STATE_SIZE     5
+
+/* CST - constant state
+ */
+#define CST_CMD_0                             0
+#define CST_PP_CNTL_X                         1
+#define CST_CMD_1                             2
+#define CST_RB3D_DEPTHXY_OFFSET               3
+#define CST_CMD_2                             4
+#define CST_RE_AUX_SCISSOR_CNTL               5
+#define CST_CMD_3                             6
+#define CST_RE_SCISSOR_TL_0                   7
+#define CST_RE_SCISSOR_BR_0                   8
+#define CST_CMD_4                             9
+#define CST_SE_VAP_CNTL_STATUS                10
+#define CST_CMD_5                             11
+#define CST_RE_POINTSIZE                      12
+#define CST_CMD_6                             13
+#define CST_SE_TCL_INPUT_VTX_0                14
+#define CST_SE_TCL_INPUT_VTX_1                15
+#define CST_SE_TCL_INPUT_VTX_2                16
+#define CST_SE_TCL_INPUT_VTX_3                17
+#define CST_STATE_SIZE                        18
+
+#define PRF_CMD_0         0
+#define PRF_PP_TRI_PERF   1
+#define PRF_PP_PERF_CNTL  2
+#define PRF_STATE_SIZE    3
+
+
+struct r200_hw_state {
+   /* Head of the linked list of state atoms. */
+   struct r200_state_atom atomlist;
+
+   /* Hardware state, stored as cmdbuf commands:  
+    *   -- Need to doublebuffer for
+    *           - reviving state after loss of context
+    *           - eliding noop statechange loops? (except line stipple count)
+    */
+   struct r200_state_atom ctx;
+   struct r200_state_atom set;
+   struct r200_state_atom vte;
+   struct r200_state_atom lin;
+   struct r200_state_atom msk;
+   struct r200_state_atom vpt;
+   struct r200_state_atom vap;
+   struct r200_state_atom vtx;
+   struct r200_state_atom tcl;
+   struct r200_state_atom msl;
+   struct r200_state_atom tcg;
+   struct r200_state_atom msc;
+   struct r200_state_atom cst;
+   struct r200_state_atom tam;
+   struct r200_state_atom tf;
+   struct r200_state_atom tex[6];
+   struct r200_state_atom cube[6];
+   struct r200_state_atom zbs;
+   struct r200_state_atom mtl[2];
+   struct r200_state_atom mat[9];
+   struct r200_state_atom lit[8]; /* includes vec, scl commands */
+   struct r200_state_atom ucp[6];
+   struct r200_state_atom pix[6]; /* pixshader stages */
+   struct r200_state_atom eye; /* eye pos */
+   struct r200_state_atom grd; /* guard band clipping */
+   struct r200_state_atom fog;
+   struct r200_state_atom glt;
+   struct r200_state_atom prf;
+   struct r200_state_atom afs[2];
+   struct r200_state_atom pvs;
+   struct r200_state_atom vpi[2];
+   struct r200_state_atom vpp[2];
+   struct r200_state_atom atf;
+   struct r200_state_atom spr;
+   struct r200_state_atom ptp;
+
+   int max_state_size;	/* Number of bytes necessary for a full state emit. */
+   GLboolean is_dirty, all_dirty;
+};
+
+struct r200_state {
+   /* Derived state for internal purposes:
+    */
+   struct r200_colorbuffer_state color;
+   struct r200_depthbuffer_state depth;
+#if 00
+   struct r200_pixel_state pixel;
+#endif
+   struct r200_scissor_state scissor;
+   struct r200_stencilbuffer_state stencil;
+   struct r200_stipple_state stipple;
+   struct r200_texture_state texture;
+   GLuint envneeded;
+};
+
+/* Need refcounting on dma buffers:
+ */
+struct r200_dma_buffer {
+   int refcount;		/* the number of retained regions in buf */
+   drmBufPtr buf;
+};
+
+#define GET_START(rvb) (rmesa->r200Screen->gart_buffer_offset +		\
+			(rvb)->address - rmesa->dma.buf0_address +	\
+			(rvb)->start)
+
+/* A retained region, eg vertices for indexed vertices.
+ */
+struct r200_dma_region {
+   struct r200_dma_buffer *buf;
+   char *address;		/* presumably == buf->buf->address (struct r200_dma_buffer itself has no address member) -- TODO confirm */
+   int start, end, ptr;		/* offsets from start of buf */
+   int aos_start;
+   int aos_stride;
+   int aos_size;
+};
+
+
+struct r200_dma {
+   /* Active dma region.  Allocations for vertices and retained
+    * regions come from here.  Also used for emitting random vertices,
+    * these may be flushed by calling flush_current();
+    */
+   struct r200_dma_region current;
+   
+   void (*flush)( r200ContextPtr );
+
+   char *buf0_address;		/* start of buf[0], for index calcs */
+   GLuint nr_released_bufs;	/* flush after so many buffers released */
+};
+
+struct r200_dri_mirror {
+   __DRIcontextPrivate	*context;	/* DRI context */
+   __DRIscreenPrivate	*screen;	/* DRI screen */
+   __DRIdrawablePrivate	*drawable;	/* DRI drawable bound to this ctx */
+   __DRIdrawablePrivate	*readable;	/* DRI readable bound to this ctx */
+
+   drm_context_t hwContext;
+   drm_hw_lock_t *hwLock;
+   int fd;
+   int drmMinor;
+};
+
+
+#define R200_CMD_BUF_SZ  (16*1024) 
+
+struct r200_store {
+   GLuint statenr;
+   GLuint primnr;
+   char cmd_buf[R200_CMD_BUF_SZ];
+   int cmd_used;   
+   int elts_start;
+};
+
+
+/* r200_tcl.c
+ */
+struct r200_tcl_info {
+   GLuint hw_primitive;
+
+/* hw can handle 12 components max */
+   struct r200_dma_region *aos_components[12];
+   GLuint nr_aos_components;
+
+   GLuint *Elts;
+
+   struct r200_dma_region indexed_verts;
+   struct r200_dma_region vertex_data[15];
+};
+
+
+/* r200_swtcl.c
+ */
+struct r200_swtcl_info {
+   GLuint RenderIndex;
+   
+   /**
+    * Size of a hardware vertex.  This is calculated when \c ::vertex_attrs is
+    * installed in the Mesa state vector.
+    */
+   GLuint vertex_size;
+
+   /**
+    * Attributes instructing the Mesa TCL pipeline where / how to put vertex
+    * data in the hardware buffer.
+    */
+   struct tnl_attr_map vertex_attrs[VERT_ATTRIB_MAX];
+
+   /**
+    * Number of elements of \c ::vertex_attrs that are actually used.
+    */
+   GLuint vertex_attr_count;
+
+   /**
+    * Cached pointer to the buffer where Mesa will store vertex data.
+    */
+   GLubyte *verts;
+
+   /* Fallback rasterization functions
+    */
+   r200_point_func draw_point;
+   r200_line_func draw_line;
+   r200_tri_func draw_tri;
+
+   GLuint hw_primitive;
+   GLenum render_primitive;
+   GLuint numverts;
+
+   /**
+    * Offset of the 4UB color data within a hardware (swtcl) vertex.
+    */
+   GLuint coloroffset;
+
+   /**
+    * Offset of the 3UB specular color data within a hardware (swtcl) vertex.
+    */
+   GLuint specoffset;
+
+   /**
+    * Should Mesa project vertex data or will the hardware do it?
+    */
+   GLboolean needproj;
+
+   struct r200_dma_region indexed_verts;
+};
+
+
+struct r200_ioctl {
+   GLuint vertex_offset;
+   GLuint vertex_size;
+};
+
+
+
+#define R200_MAX_PRIMS 64
+
+
+
+struct r200_prim {
+   GLuint start;
+   GLuint end;
+   GLuint prim;
+};
+
+   /* A maximum total of 29 elements per vertex:  3 floats for position, 3
+    * floats for normal, 4 floats for color, 4 bytes for secondary color,
+    * 3 floats for each texture unit (18 floats total).
+    * 
+    * We may need to add 4 more to prevent a segfault if someone specifies
+    * GL_TEXTURE6/GL_TEXTURE7 (esp. for the codegen-path) (FIXME).
+    * 
+    * The position data is never actually stored here, so 3 elements could be
+    * trimmed out of the buffer.
+    */
+
+#define R200_MAX_VERTEX_SIZE ((3*6)+11)
+
+
+struct r200_context {
+   GLcontext *glCtx;			/* Mesa context */
+
+   /* Driver and hardware state management
+    */
+   struct r200_hw_state hw;
+   struct r200_state state;
+   struct r200_vertex_program *curr_vp_hw;
+
+   /* Texture object bookkeeping
+    */
+   unsigned              nr_heaps;
+   driTexHeap          * texture_heaps[ RADEON_NR_TEX_HEAPS ];
+   driTextureObject      swapped;
+   int                   texture_depth;
+   float                 initialMaxAnisotropy;
+
+   /* Rasterization and vertex state:
+    */
+   GLuint TclFallback;
+   GLuint Fallback;
+   GLuint NewGLState;
+   DECLARE_RENDERINPUTS(tnl_index_bitset);	/* index of bits for last tnl_install_attrs */
+
+   /* Vertex buffers
+    */
+   struct r200_ioctl ioctl;
+   struct r200_dma dma;
+   struct r200_store store;
+   /* A full state emit as of the first state emit in the main store, in case
+    * the context is lost.
+    */
+   struct r200_store backup_store;
+
+   /* Page flipping
+    */
+   GLuint doPageFlip;
+
+   /* Busy waiting
+    */
+   GLuint do_usleeps;
+   GLuint do_irqs;
+   GLuint irqsEmitted;
+   drm_radeon_irq_wait_t iw;
+
+   /* Clientdata textures;
+    */
+   GLuint prefer_gart_client_texturing;
+
+   /* Drawable, cliprect and scissor information
+    */
+   GLuint numClipRects;			/* Cliprects for the draw buffer */
+   drm_clip_rect_t *pClipRects;
+   unsigned int lastStamp;
+   GLboolean lost_context;
+   GLboolean save_on_next_emit;
+   radeonScreenPtr r200Screen;	/* Screen private DRI data */
+   drm_radeon_sarea_t *sarea;		/* Private SAREA data */
+
+   /* TCL stuff
+    */
+   GLmatrix TexGenMatrix[R200_MAX_TEXTURE_UNITS];
+   GLboolean recheck_texgen[R200_MAX_TEXTURE_UNITS];
+   GLboolean TexGenNeedNormals[R200_MAX_TEXTURE_UNITS];
+   GLuint TexMatEnabled;
+   GLuint TexMatCompSel;
+   GLuint TexGenEnabled;
+   GLuint TexGenCompSel;
+   GLmatrix tmpmat;
+
+   /* VBI / buffer swap
+    */
+   GLuint vbl_seq;
+   GLuint vblank_flags;
+
+   int64_t swap_ust;
+   int64_t swap_missed_ust;
+
+   GLuint swap_count;
+   GLuint swap_missed_count;
+
+
+   /* r200_tcl.c
+    */
+   struct r200_tcl_info tcl;
+
+   /* r200_swtcl.c
+    */
+   struct r200_swtcl_info swtcl;
+
+   /* Mirrors of some DRI state
+    */
+   struct r200_dri_mirror dri;
+
+   /* Configuration cache
+    */
+   driOptionCache optionCache;
+
+   GLboolean using_hyperz;
+   GLboolean texmicrotile;
+
+  struct ati_fragment_shader *afs_loaded;
+};
+
+#define R200_CONTEXT(ctx)		((r200ContextPtr)((ctx)->DriverCtx))
+
+
+static __inline GLuint r200PackColor( GLuint cpp,
+					GLubyte r, GLubyte g,
+					GLubyte b, GLubyte a )
+{
+   /* cpp selects the packing: 2 -> 565 (a is not used), 4 -> 8888; anything else yields 0. */
+   if ( cpp == 2 )
+      return PACK_COLOR_565( r, g, b );
+
+   if ( cpp == 4 )
+      return PACK_COLOR_8888( a, r, g, b );
+
+   return 0;
+}
+
+
+extern void r200DestroyContext( __DRIcontextPrivate *driContextPriv );
+extern GLboolean r200CreateContext( const __GLcontextModes *glVisual,
+				    __DRIcontextPrivate *driContextPriv,
+				    void *sharedContextPrivate);
+extern void r200SwapBuffers( __DRIdrawablePrivate *dPriv );
+extern void r200CopySubBuffer( __DRIdrawablePrivate * dPriv,
+			       int x, int y, int w, int h );
+extern GLboolean r200MakeCurrent( __DRIcontextPrivate *driContextPriv,
+				  __DRIdrawablePrivate *driDrawPriv,
+				  __DRIdrawablePrivate *driReadPriv );
+extern GLboolean r200UnbindContext( __DRIcontextPrivate *driContextPriv );
+
+/* ================================================================
+ * Debugging:
+ */
+#define DO_DEBUG		1
+
+#if DO_DEBUG
+extern int R200_DEBUG;
+#else
+#define R200_DEBUG		0
+#endif
+
+#define DEBUG_TEXTURE	0x001
+#define DEBUG_STATE	0x002
+#define DEBUG_IOCTL	0x004
+#define DEBUG_PRIMS	0x008
+#define DEBUG_VERTS	0x010
+#define DEBUG_FALLBACKS	0x020
+#define DEBUG_VFMT	0x040
+#define DEBUG_CODEGEN	0x080
+#define DEBUG_VERBOSE	0x100
+#define DEBUG_DRI       0x200
+#define DEBUG_DMA       0x400
+#define DEBUG_SANITY    0x800
+#define DEBUG_SYNC      0x1000
+#define DEBUG_PIXEL     0x2000
+#define DEBUG_MEMORY    0x4000
+
+#endif /* __R200_CONTEXT_H__ */
diff --git a/r200/r200_fragshader.c b/r200/r200_fragshader.c
new file mode 100644
index 0000000..5dd3ada
--- /dev/null
+++ b/r200/r200_fragshader.c
@@ -0,0 +1,548 @@
+/**************************************************************************
+ *
+ * Copyright 2004 David Airlie
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL DAVID AIRLIE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+#include "glheader.h"
+#include "macros.h"
+#include "enums.h"
+
+#include "tnl/t_context.h"
+#include "atifragshader.h"
+#include "program.h"
+#include "r200_context.h"
+#include "r200_ioctl.h"
+#include "r200_tex.h"
+
+#define SET_INST(inst, type) afs_cmd[(((inst)<<2) + ((type)<<1) + 1)]
+#define SET_INST_2(inst, type) afs_cmd[(((inst)<<2) + ((type)<<1) + 2)]
+
+static void r200SetFragShaderArg( GLuint *afs_cmd, GLuint opnum, GLuint optype,
+				const struct atifragshader_src_register srcReg,
+				GLuint argPos, GLuint *tfactor )
+{
+   const GLuint index = srcReg.Index;	/* source enum: GL_REG_n_ATI, GL_CON_n_ATI, ... (tested below) */
+   const GLuint srcmod = srcReg.argMod;	/* GL_*_BIT_ATI modifier flags */
+   const GLuint srcrep = srcReg.argRep;	/* GL_RED/GREEN/BLUE/ALPHA replicate selector */
+   GLuint reg0 = 0;			/* bits destined for SET_INST(opnum, optype) */
+   GLuint reg2 = 0;			/* bits destined for SET_INST_2(opnum, optype) */
+   GLuint useOddSrc = 0;		/* 0 or 1, added to the source selector value */
+   /* Encodes one source operand of instruction opnum at argPos; optype 0 is the colour op, 1 the alpha op. */
+   switch(srcrep) {
+   case GL_RED:
+      reg2 |= R200_TXC_REPL_RED << (R200_TXC_REPL_ARG_A_SHIFT + (2*argPos));
+      if (optype)
+	 useOddSrc = 1;
+      break;
+   case GL_GREEN:
+      reg2 |= R200_TXC_REPL_GREEN << (R200_TXC_REPL_ARG_A_SHIFT + (2*argPos));
+      if (optype)
+	 useOddSrc = 1;
+      break;
+   case GL_BLUE:
+      if (!optype)
+	 reg2 |= R200_TXC_REPL_BLUE << (R200_TXC_REPL_ARG_A_SHIFT + (2*argPos));
+      else
+	 useOddSrc = 1;
+      break;
+   case GL_ALPHA:
+      if (!optype)
+	 useOddSrc = 1;
+      break;
+   }
+
+   if (index >= GL_REG_0_ATI && index <= GL_REG_5_ATI)
+      reg0 |= (((index - GL_REG_0_ATI)*2) + 10 + useOddSrc) << (5*argPos);
+   else if (index >= GL_CON_0_ATI && index <= GL_CON_7_ATI) {
+      if ((*tfactor == 0) || (index == *tfactor)) {
+	 reg0 |= (R200_TXC_ARG_A_TFACTOR_COLOR + useOddSrc) << (5*argPos);
+	 reg2 |= (index - GL_CON_0_ATI) << R200_TXC_TFACTOR_SEL_SHIFT;
+	 *tfactor = index;		/* remember which constant uses the first TFACTOR selector */
+      }
+      else {				/* a different constant already uses TFACTOR: take TFACTOR1 */
+	 reg0 |= (R200_TXC_ARG_A_TFACTOR1_COLOR + useOddSrc) << (5*argPos);
+	 reg2 |= (index - GL_CON_0_ATI) << R200_TXC_TFACTOR1_SEL_SHIFT;
+      }
+   }
+   else if (index == GL_PRIMARY_COLOR_EXT) {
+      reg0 |= (R200_TXC_ARG_A_DIFFUSE_COLOR + useOddSrc) << (5*argPos);
+   }
+   else if (index == GL_SECONDARY_INTERPOLATOR_ATI) {
+      reg0 |= (R200_TXC_ARG_A_SPECULAR_COLOR + useOddSrc) << (5*argPos);
+   }
+   /* GL_ZERO is a noop, for GL_ONE we set the complement */
+   else if (index == GL_ONE) {
+      reg0 |= R200_TXC_COMP_ARG_A << (4*argPos);
+   }
+
+   if (srcmod & GL_COMP_BIT_ATI)
+      reg0 ^= R200_TXC_COMP_ARG_A << (4*argPos);	/* toggles the complement set for GL_ONE above */
+   if (srcmod & GL_BIAS_BIT_ATI)
+      reg0 |= R200_TXC_BIAS_ARG_A << (4*argPos);
+   if (srcmod & GL_2X_BIT_ATI)
+      reg0 |= R200_TXC_SCALE_ARG_A << (4*argPos);
+   SET_INST(opnum, optype) |= reg0;
+   SET_INST_2(opnum, optype) |= reg2;
+   /* Toggle negate in the instruction word itself so a negate pre-set by the caller (GL_SUB_ATI) cancels out. */
+   if (srcmod & GL_NEGATE_BIT_ATI)
+      SET_INST(opnum, optype) ^= R200_TXC_NEG_ARG_A << (4*argPos);
+}
+
+static const GLuint dstmask_table[8] =	/* indexed by DstReg.dstMask: bit 0 = R, bit 1 = G, bit 2 = B */
+{
+   R200_TXC_OUTPUT_MASK_RGB,	/* 0: no component bit set, all of RGB is written */
+   R200_TXC_OUTPUT_MASK_R,	/* 1 */
+   R200_TXC_OUTPUT_MASK_G,	/* 2 */
+   R200_TXC_OUTPUT_MASK_RG,	/* 3 */
+   R200_TXC_OUTPUT_MASK_B,	/* 4 */
+   R200_TXC_OUTPUT_MASK_RB,	/* 5 */
+   R200_TXC_OUTPUT_MASK_GB,	/* 6 */
+   R200_TXC_OUTPUT_MASK_RGB	/* 7 */
+};
+
+static void r200UpdateFSArith( GLcontext *ctx )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLuint *afs_cmd;
+   const struct ati_fragment_shader *shader = ctx->ATIFragmentShader.Current;
+   GLuint pass;
+   /* Translate the arithmetic instructions of the current ATI fragment shader into the afs state atoms. */
+   R200_STATECHANGE( rmesa, afs[0] );
+   R200_STATECHANGE( rmesa, afs[1] );
+   /* A single-pass shader is written to afs[1]; with two passes the first goes to afs[0], the second to afs[1]. */
+   if (shader->NumPasses < 2) {
+      afs_cmd = (GLuint *) rmesa->hw.afs[1].cmd;
+   }
+   else {
+      afs_cmd = (GLuint *) rmesa->hw.afs[0].cmd;
+   }
+   for (pass = 0; pass < shader->NumPasses; pass++) {
+      GLuint opnum = 0;
+      GLuint pc;
+      for (pc = 0; pc < shader->numArithInstr[pass]; pc++) {
+         GLuint optype;
+	 struct atifs_instruction *inst = &shader->Instructions[pass][pc];
+
+	 SET_INST(opnum, 0) = 0;
+	 SET_INST_2(opnum, 0) = 0;
+	 SET_INST(opnum, 1) = 0;
+	 SET_INST_2(opnum, 1) = 0;
+
+	 for (optype = 0; optype < 2; optype++) {
+	    GLuint tfactor = 0;	/* constant bound to the first TFACTOR selector by r200SetFragShaderArg; 0 = none yet */
+
+	    if (inst->Opcode[optype]) {
+	       switch (inst->Opcode[optype]) {
+	       /* these are all MADD in disguise
+		  MADD is A * B + C
+		  so for GL_ADD use arg B/C and make A complement 0
+		  for GL_SUB use arg B/C, negate C and make A complement 0
+		  for GL_MOV use arg C
+		  for GL_MUL use arg A
+		  for GL_MAD all good */
+	       case GL_SUB_ATI:
+		  /* negate C */
+		  SET_INST(opnum, optype) |= R200_TXC_NEG_ARG_C;
+		  /* fallthrough */
+	       case GL_ADD_ATI:
+		  r200SetFragShaderArg(afs_cmd, opnum, optype,
+					inst->SrcReg[optype][0], 1, &tfactor);
+		  r200SetFragShaderArg(afs_cmd, opnum, optype,
+					inst->SrcReg[optype][1], 2, &tfactor);
+		  /* A = complement 0 */
+		  SET_INST(opnum, optype) |= R200_TXC_COMP_ARG_A;
+		  SET_INST(opnum, optype) |= R200_TXC_OP_MADD;
+		  break;
+	       case GL_MOV_ATI:
+		  /* put arg0 in C */
+		  r200SetFragShaderArg(afs_cmd, opnum, optype,
+					inst->SrcReg[optype][0], 2, &tfactor);
+		  SET_INST(opnum, optype) |= R200_TXC_OP_MADD;
+		  break;
+	       case GL_MAD_ATI:
+		  r200SetFragShaderArg(afs_cmd, opnum, optype,
+					inst->SrcReg[optype][2], 2, &tfactor);
+		  /* fallthrough */
+	       case GL_MUL_ATI:
+		  r200SetFragShaderArg(afs_cmd, opnum, optype,
+					inst->SrcReg[optype][0], 0, &tfactor);
+		  r200SetFragShaderArg(afs_cmd, opnum, optype,
+					inst->SrcReg[optype][1], 1, &tfactor);
+		  SET_INST(opnum, optype) |= R200_TXC_OP_MADD;
+		  break;
+	       case GL_LERP_ATI:
+		  /* arg order is not native chip order, swap A and C */
+		  r200SetFragShaderArg(afs_cmd, opnum, optype,
+					inst->SrcReg[optype][0], 2, &tfactor);
+		  r200SetFragShaderArg(afs_cmd, opnum, optype,
+					inst->SrcReg[optype][1], 1, &tfactor);
+		  r200SetFragShaderArg(afs_cmd, opnum, optype,
+					inst->SrcReg[optype][2], 0, &tfactor);
+		  SET_INST(opnum, optype) |= R200_TXC_OP_LERP;
+		  break;
+	       case GL_CND_ATI:
+		  r200SetFragShaderArg(afs_cmd, opnum, optype,
+					inst->SrcReg[optype][0], 0, &tfactor);
+		  r200SetFragShaderArg(afs_cmd, opnum, optype,
+					inst->SrcReg[optype][1], 1, &tfactor);
+		  r200SetFragShaderArg(afs_cmd, opnum, optype,
+					inst->SrcReg[optype][2], 2, &tfactor);
+		  SET_INST(opnum, optype) |= R200_TXC_OP_CONDITIONAL;
+		  break;
+	       case GL_CND0_ATI:
+		  r200SetFragShaderArg(afs_cmd, opnum, optype,
+					inst->SrcReg[optype][0], 0, &tfactor);
+		  r200SetFragShaderArg(afs_cmd, opnum, optype,
+					inst->SrcReg[optype][1], 1, &tfactor);
+		  r200SetFragShaderArg(afs_cmd, opnum, optype,
+					inst->SrcReg[optype][2], 2, &tfactor);
+		  SET_INST(opnum, optype) |= R200_TXC_OP_CND0;
+		  break;
+		  /* cannot specify dot ops as alpha ops directly */
+	       case GL_DOT2_ADD_ATI:
+		  if (optype)
+		     SET_INST_2(opnum, 1) |= R200_TXA_DOT_ALPHA;
+		  else {
+		     r200SetFragShaderArg(afs_cmd, opnum, 0,
+					inst->SrcReg[0][0], 0, &tfactor);
+		     r200SetFragShaderArg(afs_cmd, opnum, 0,
+					inst->SrcReg[0][1], 1, &tfactor);
+		     r200SetFragShaderArg(afs_cmd, opnum, 0,
+					inst->SrcReg[0][2], 2, &tfactor);
+		     SET_INST(opnum, 0) |= R200_TXC_OP_DOT2_ADD;
+		  }
+		  break;
+	       case GL_DOT3_ATI:
+		  if (optype)
+		     SET_INST_2(opnum, 1) |= R200_TXA_DOT_ALPHA;
+		  else {
+		     r200SetFragShaderArg(afs_cmd, opnum, 0,
+					inst->SrcReg[0][0], 0, &tfactor);
+		     r200SetFragShaderArg(afs_cmd, opnum, 0,
+					inst->SrcReg[0][1], 1, &tfactor);
+		     SET_INST(opnum, 0) |= R200_TXC_OP_DOT3;
+		  }
+		  break;
+	       case GL_DOT4_ATI:
+	       /* experimental verification: for dot4 setup of alpha args is needed
+		  (dstmod is ignored, though, so dot2/dot3 should be safe)
+		  the hardware apparently does R1*R2 + G1*G2 + B1*B2 + A3*A4
+		  but the API doesn't allow it */
+		  if (optype)
+		     SET_INST_2(opnum, 1) |= R200_TXA_DOT_ALPHA;
+		  else {
+		     r200SetFragShaderArg(afs_cmd, opnum, 0,
+					inst->SrcReg[0][0], 0, &tfactor);
+		     r200SetFragShaderArg(afs_cmd, opnum, 0,
+					inst->SrcReg[0][1], 1, &tfactor);
+		     r200SetFragShaderArg(afs_cmd, opnum, 1,
+					inst->SrcReg[0][0], 0, &tfactor);
+		     r200SetFragShaderArg(afs_cmd, opnum, 1,
+					inst->SrcReg[0][1], 1, &tfactor);
+		     SET_INST(opnum, optype) |= R200_TXC_OP_DOT4;	/* optype is 0 in this branch */
+		  }
+		  break;
+	       }
+	    }
+
+	    /* destination */
+	    if (inst->DstReg[optype].Index) {
+	       GLuint dstreg = inst->DstReg[optype].Index - GL_REG_0_ATI;
+	       GLuint dstmask = inst->DstReg[optype].dstMask;
+	       GLuint sat = inst->DstReg[optype].dstMod & GL_SATURATE_BIT_ATI;
+	       GLuint dstmod = inst->DstReg[optype].dstMod;
+
+	       dstmod &= ~GL_SATURATE_BIT_ATI;
+
+	       SET_INST_2(opnum, optype) |= (dstreg + 1) << R200_TXC_OUTPUT_REG_SHIFT;
+	       SET_INST_2(opnum, optype) |= dstmask_table[dstmask];	/* NOTE(review): assumes dstmask <= 7 (table has 8 entries) -- confirm */
+
+		/* fglrx does clamp the last instructions to 0_1 it seems */
+		/* this won't necessarily catch the last instruction
+		   which writes to reg0 */
+	       if (sat || (pc == (shader->numArithInstr[pass] - 1) &&
+			((pass == 1) || (shader->NumPasses == 1))))
+		  SET_INST_2(opnum, optype) |= R200_TXC_CLAMP_0_1;
+	       else
+		/* should we clamp or not? spec is vague, I would suppose yes but fglrx doesn't */
+		  SET_INST_2(opnum, optype) |= R200_TXC_CLAMP_8_8;
+/*		  SET_INST_2(opnum, optype) |= R200_TXC_CLAMP_WRAP;*/
+	       switch(dstmod) {
+	       case GL_2X_BIT_ATI:
+		  SET_INST_2(opnum, optype) |= R200_TXC_SCALE_2X;
+		  break;
+	       case GL_4X_BIT_ATI:
+		  SET_INST_2(opnum, optype) |= R200_TXC_SCALE_4X;
+		  break;
+	       case GL_8X_BIT_ATI:
+		  SET_INST_2(opnum, optype) |= R200_TXC_SCALE_8X;
+		  break;
+	       case GL_HALF_BIT_ATI:
+		  SET_INST_2(opnum, optype) |= R200_TXC_SCALE_INV2;
+		  break;
+	       case GL_QUARTER_BIT_ATI:
+		  SET_INST_2(opnum, optype) |= R200_TXC_SCALE_INV4;
+		  break;
+	       case GL_EIGHTH_BIT_ATI:
+		  SET_INST_2(opnum, optype) |= R200_TXC_SCALE_INV8;
+		  break;
+	       default:
+		  break;
+	       }
+	    }
+	 }
+/*	 fprintf(stderr, "pass %d nr %d inst 0x%.8x 0x%.8x 0x%.8x 0x%.8x\n",
+		pass, opnum, SET_INST(opnum, 0), SET_INST_2(opnum, 0),
+		SET_INST(opnum, 1), SET_INST_2(opnum, 1));*/
+         opnum++;
+      }
+      afs_cmd = (GLuint *) rmesa->hw.afs[1].cmd;	/* any following pass is written to afs[1] */
+   }
+   rmesa->afs_loaded = ctx->ATIFragmentShader.Current;	/* record which shader the afs atoms now hold */
+}
+
+/* Program texture-coordinate routing and per-unit enables for the current
+ * ATI fragment shader: r/q selection via set_re_cntl_d3d(), the PP_CNTL and
+ * PP_CNTL_X enable bits, and each unit's TXFORMAT/TXFORMAT_X/TXMULTI_CTL.
+ */
+static void r200UpdateFSRouting( GLcontext *ctx ) {
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   const struct ati_fragment_shader *shader = ctx->ATIFragmentShader.Current;
+   GLuint reg;
+
+   R200_STATECHANGE( rmesa, ctx );
+   R200_STATECHANGE( rmesa, cst );
+
+   /* bit (2 * reg) of swizzlerq set: r coord (1), clear: q coord (0) */
+   for (reg = 0; reg < R200_MAX_TEXTURE_UNITS; reg++)
+      set_re_cntl_d3d( ctx, reg, (shader->swizzlerq & (1 << (2 * reg))) ? 1 : 0 );
+
+   rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~(R200_MULTI_PASS_ENABLE |
+				       R200_TEX_BLEND_ENABLE_MASK |
+				       R200_TEX_ENABLE_MASK);
+   rmesa->hw.cst.cmd[CST_PP_CNTL_X] &= ~(R200_PPX_PFS_INST_ENABLE_MASK |
+					 R200_PPX_TEX_ENABLE_MASK |
+					 R200_PPX_OUTPUT_REG_MASK);
+
+   /* first pass registers use slots 8 - 15
+      but single pass shaders use slots 0 - 7 */
+   if (shader->NumPasses < 2) {
+      rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= shader->numArithInstr[0] == 8 ?
+	 0xff << (R200_TEX_BLEND_0_ENABLE_SHIFT - 1) :
+	 (0xff >> (8 - shader->numArithInstr[0])) << R200_TEX_BLEND_0_ENABLE_SHIFT;
+   } else {
+      rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= R200_MULTI_PASS_ENABLE;
+      rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= shader->numArithInstr[1] == 8 ?
+	 0xff << (R200_TEX_BLEND_0_ENABLE_SHIFT - 1) :
+	 (0xff >> (8 - shader->numArithInstr[1])) << R200_TEX_BLEND_0_ENABLE_SHIFT;
+      rmesa->hw.cst.cmd[CST_PP_CNTL_X] |=
+	 (0xff >> (8 - shader->numArithInstr[0])) << R200_PPX_FPS_INST0_ENABLE_SHIFT;
+   }
+
+   if (shader->NumPasses < 2) {
+      for (reg = 0; reg < R200_MAX_TEXTURE_UNITS; reg++) {
+	 GLbitfield targetbit = ctx->Texture.Unit[reg]._ReallyEnabled;
+         R200_STATECHANGE( rmesa, tex[reg] );
+	 rmesa->hw.tex[reg].cmd[TEX_PP_TXMULTI_CTL] = 0;
+	 if (shader->SetupInst[0][reg].Opcode) {
+	    GLuint txformat = rmesa->hw.tex[reg].cmd[TEX_PP_TXFORMAT]
+		& ~(R200_TXFORMAT_ST_ROUTE_MASK | R200_TXFORMAT_LOOKUP_DISABLE);
+	    GLuint txformat_x = rmesa->hw.tex[reg].cmd[TEX_PP_TXFORMAT_X] & ~R200_TEXCOORD_MASK;
+	    txformat |= (shader->SetupInst[0][reg].src - GL_TEXTURE0_ARB)
+		<< R200_TXFORMAT_ST_ROUTE_SHIFT;
+	    /* fix up texcoords for proj/non-proj 2d (3d and cube are not defined when
+	       using projection so don't have to worry there).
+	       When passing coords, need R200_TEXCOORD_VOLUME, otherwise lose a coord */
+	    /* FIXME: someone might rely on default tex coords r/q, which we unfortunately
+	       don't provide (we have the same problem without shaders) */
+	    if (shader->SetupInst[0][reg].Opcode == ATI_FRAGMENT_SHADER_PASS_OP) {
+	       txformat |= R200_TXFORMAT_LOOKUP_DISABLE;
+	       if (shader->SetupInst[0][reg].swizzle == GL_SWIZZLE_STR_ATI ||
+		  shader->SetupInst[0][reg].swizzle == GL_SWIZZLE_STQ_ATI) {
+		  txformat_x |= R200_TEXCOORD_VOLUME;
+	       }
+	       else {
+		  txformat_x |= R200_TEXCOORD_PROJ;
+	       }
+	       rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= R200_TEX_0_ENABLE << reg;
+	    }
+	    else if (targetbit == TEXTURE_3D_BIT) {
+	       txformat_x |= R200_TEXCOORD_VOLUME;
+	    }
+	    else if (targetbit == TEXTURE_CUBE_BIT) {
+	       txformat_x |= R200_TEXCOORD_CUBIC_ENV;
+	    }
+	    else if (shader->SetupInst[0][reg].swizzle == GL_SWIZZLE_STR_ATI ||
+	       shader->SetupInst[0][reg].swizzle == GL_SWIZZLE_STQ_ATI) {
+	       txformat_x |= R200_TEXCOORD_NONPROJ;
+	    }
+	    else {
+	       txformat_x |= R200_TEXCOORD_PROJ;
+	    }
+	    rmesa->hw.tex[reg].cmd[TEX_PP_TXFORMAT] = txformat;
+	    rmesa->hw.tex[reg].cmd[TEX_PP_TXFORMAT_X] = txformat_x;
+	    /* enabling texturing when unit isn't correctly configured may not be safe */
+	    if (targetbit)
+	       rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= R200_TEX_0_ENABLE << reg;
+	 }
+      }
+
+   } else {
+      /* setup 1st pass */
+      for (reg = 0; reg < R200_MAX_TEXTURE_UNITS; reg++) {
+	 GLbitfield targetbit = ctx->Texture.Unit[reg]._ReallyEnabled;
+	 GLuint txformat_multi = 0;
+	 R200_STATECHANGE( rmesa, tex[reg] );
+	 if (shader->SetupInst[0][reg].Opcode) {
+	    txformat_multi |= (shader->SetupInst[0][reg].src - GL_TEXTURE0_ARB)
+		<< R200_PASS1_ST_ROUTE_SHIFT;
+	    if (shader->SetupInst[0][reg].Opcode == ATI_FRAGMENT_SHADER_PASS_OP) {
+	       txformat_multi |= R200_PASS1_TXFORMAT_LOOKUP_DISABLE;
+	       if (shader->SetupInst[0][reg].swizzle == GL_SWIZZLE_STR_ATI ||
+		  shader->SetupInst[0][reg].swizzle == GL_SWIZZLE_STQ_ATI) {
+		  txformat_multi |= R200_PASS1_TEXCOORD_VOLUME;
+	       }
+	       else {
+		  txformat_multi |= R200_PASS1_TEXCOORD_PROJ;
+	       }
+	       rmesa->hw.cst.cmd[CST_PP_CNTL_X] |= R200_PPX_TEX_0_ENABLE << reg;
+	    }
+	    else if (targetbit == TEXTURE_3D_BIT) {
+	       txformat_multi |= R200_PASS1_TEXCOORD_VOLUME;
+	    }
+	    else if (targetbit == TEXTURE_CUBE_BIT) {
+	       txformat_multi |= R200_PASS1_TEXCOORD_CUBIC_ENV;
+	    }
+	    else if (shader->SetupInst[0][reg].swizzle == GL_SWIZZLE_STR_ATI ||
+		  shader->SetupInst[0][reg].swizzle == GL_SWIZZLE_STQ_ATI) {
+		  txformat_multi |= R200_PASS1_TEXCOORD_NONPROJ;
+	    }
+	    else {
+	       txformat_multi |= R200_PASS1_TEXCOORD_PROJ;
+	    }
+	    if (targetbit)
+	       rmesa->hw.cst.cmd[CST_PP_CNTL_X] |= R200_PPX_TEX_0_ENABLE << reg;
+	 }
+         rmesa->hw.tex[reg].cmd[TEX_PP_TXMULTI_CTL] = txformat_multi;
+      }
+
+      /* setup 2nd pass */
+      for (reg=0; reg < R200_MAX_TEXTURE_UNITS; reg++) {
+	 GLbitfield targetbit = ctx->Texture.Unit[reg]._ReallyEnabled;
+	 if (shader->SetupInst[1][reg].Opcode) {
+	    GLuint coord = shader->SetupInst[1][reg].src;
+	    GLuint txformat = rmesa->hw.tex[reg].cmd[TEX_PP_TXFORMAT]
+		& ~(R200_TXFORMAT_ST_ROUTE_MASK | R200_TXFORMAT_LOOKUP_DISABLE);
+	    GLuint txformat_x = rmesa->hw.tex[reg].cmd[TEX_PP_TXFORMAT_X] & ~R200_TEXCOORD_MASK;
+	    R200_STATECHANGE( rmesa, tex[reg] );
+	    if (shader->SetupInst[1][reg].Opcode == ATI_FRAGMENT_SHADER_PASS_OP) {
+	       txformat |= R200_TXFORMAT_LOOKUP_DISABLE;
+	       /* pick exactly one coord type, as the single-pass path does */
+	       if (shader->SetupInst[1][reg].swizzle == GL_SWIZZLE_STR_ATI ||
+		  shader->SetupInst[1][reg].swizzle == GL_SWIZZLE_STQ_ATI) {
+	          txformat_x |= R200_TEXCOORD_VOLUME;
+	       }
+	       else {
+		  txformat_x |= R200_TEXCOORD_PROJ;
+	       }
+	       rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= R200_TEX_0_ENABLE << reg;
+	    }
+	    else if (targetbit == TEXTURE_3D_BIT) {
+	       txformat_x |= R200_TEXCOORD_VOLUME;
+	    }
+	    else if (targetbit == TEXTURE_CUBE_BIT) {
+	       txformat_x |= R200_TEXCOORD_CUBIC_ENV;
+	    }
+	    else if (shader->SetupInst[1][reg].swizzle == GL_SWIZZLE_STR_ATI ||
+	       shader->SetupInst[1][reg].swizzle == GL_SWIZZLE_STQ_ATI) {
+	       txformat_x |= R200_TEXCOORD_NONPROJ;
+	    }
+	    else {
+	       txformat_x |= R200_TEXCOORD_PROJ;
+	    }
+	    if (coord >= GL_REG_0_ATI) {
+	       GLuint txformat_multi = rmesa->hw.tex[reg].cmd[TEX_PP_TXMULTI_CTL];
+	       txformat_multi |= (coord - GL_REG_0_ATI + 2) << R200_PASS2_COORDS_REG_SHIFT;
+	       rmesa->hw.tex[reg].cmd[TEX_PP_TXMULTI_CTL] = txformat_multi;
+	       rmesa->hw.cst.cmd[CST_PP_CNTL_X] |= 1 <<
+		  (R200_PPX_OUTPUT_REG_0_SHIFT + coord - GL_REG_0_ATI);
+	    } else {
+	       txformat |= (coord - GL_TEXTURE0_ARB) << R200_TXFORMAT_ST_ROUTE_SHIFT;
+	    }
+	    rmesa->hw.tex[reg].cmd[TEX_PP_TXFORMAT_X] = txformat_x;
+	    rmesa->hw.tex[reg].cmd[TEX_PP_TXFORMAT] = txformat;
+	    if (targetbit)
+	       rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= R200_TEX_0_ENABLE << reg;
+	 }
+      }
+   }
+}
+
+/* Upload the eight fragment shader constants to the ATF TFACTOR words.
+ * A constant defined locally by the shader (its bit set in LocalConstDef)
+ * is used in place of the context-wide global constant of the same index.
+ */
+static void r200UpdateFSConstants( GLcontext *ctx )
+{
+   const struct ati_fragment_shader *shader = ctx->ATIFragmentShader.Current;
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLuint idx;
+
+   R200_STATECHANGE(rmesa, atf);
+   for (idx = 0; idx < 8; idx++) {
+      const GLuint use_local = (shader->LocalConstDef >> idx) & 1;
+      GLubyte rgba[4];
+      GLuint comp;
+
+      /* convert each float component to an unsigned byte */
+      for (comp = 0; comp < 4; comp++) {
+         if (use_local) {
+            CLAMPED_FLOAT_TO_UBYTE(rgba[comp], shader->Constants[idx][comp]);
+         } else {
+            CLAMPED_FLOAT_TO_UBYTE(rgba[comp], ctx->ATIFragmentShader.GlobalConstants[idx][comp]);
+         }
+      }
+      rmesa->hw.atf.cmd[ATF_TFACTOR_0 + idx] = r200PackColor( 4, rgba[0], rgba[1], rgba[2], rgba[3] );
+   }
+}
+
+/* Update routing, constants and arithmetic for the bound ATI fragment shader.
+ *
+ * Constants are always re-uploaded: globals can change without any separate
+ * notification.  Routing is always redone too, since non-shader code will
+ * overwrite that state and part of it depends on what sort of texture is
+ * bound.  Both also need refreshing because ati_fs may have been disabled
+ * and re-enabled, which would otherwise have to be tracked.
+ * Arithmetic is only re-encoded when the current shader changes (the data
+ * should probably live in a DriverData object attached to the mesa atifs
+ * object, so that binding a shader wouldn't force a "recompile").
+ */
+void r200UpdateFragmentShader( GLcontext *ctx )
+{
+   r200UpdateFSConstants( ctx );
+   r200UpdateFSRouting( ctx );
+
+   if (ctx->ATIFragmentShader.Current != R200_CONTEXT(ctx)->afs_loaded)
+      r200UpdateFSArith( ctx );
+}
diff --git a/r200/r200_ioctl.c b/r200/r200_ioctl.c
new file mode 100644
index 0000000..463bd64
--- /dev/null
+++ b/r200/r200_ioctl.c
@@ -0,0 +1,992 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_ioctl.c,v 1.4 2002/12/17 00:32:56 dawes Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+ 
+#include <sched.h>
+#include <errno.h>
+
+#include "glheader.h"
+#include "imports.h"
+#include "macros.h"
+#include "context.h"
+#include "swrast/swrast.h"
+
+#include "r200_context.h"
+#include "r200_state.h"
+#include "r200_ioctl.h"
+#include "r200_tcl.h"
+#include "r200_sanity.h"
+#include "radeon_reg.h"
+
+#include "drirenderbuffer.h"
+#include "vblank.h"
+
+#define R200_TIMEOUT             512
+#define R200_IDLE_RETRY           16
+
+
+static void r200WaitForIdle( r200ContextPtr rmesa );
+
+
+/* Called from r200FlushCmdBufLocked once the context has been lost: swap the
+ * command store holding the saved state in for the current one, flush it,
+ * then put the current store back, so that commands at the start of a cmdbuf
+ * can rely on state kept from the previous one.  No-op if the backup is empty.
+ */
+static void r200BackUpAndEmitLostStateLocked( r200ContextPtr rmesa )
+{
+   struct r200_store live_store;
+   GLuint live_released;
+
+   if (rmesa->backup_store.cmd_used == 0)
+      return;
+
+   if (R200_DEBUG & DEBUG_STATE)
+      fprintf(stderr, "Emitting backup state on lost context\n");
+
+   /* cleared first so the nested flush does not come back here */
+   rmesa->lost_context = GL_FALSE;
+   live_store = rmesa->store;
+   live_released = rmesa->dma.nr_released_bufs;
+   rmesa->store = rmesa->backup_store;
+   rmesa->dma.nr_released_bufs = 0;
+   r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
+   rmesa->store = live_store;
+   rmesa->dma.nr_released_bufs = live_released;
+}
+
+/* Submit the accumulated command buffer through DRM_RADEON_CMDBUF.
+ * Callers in this file hold the hardware lock; caller is only used in debug
+ * output.  Returns the failing sanity-check result or else the ioctl result
+ * (callers treat nonzero as failure).  The store is reset in either case.
+ */
+int r200FlushCmdBufLocked( r200ContextPtr rmesa, const char * caller )
+{
+   drm_radeon_cmd_buffer_t cmd;
+   int ret, i;
+
+   if (rmesa->lost_context)
+      r200BackUpAndEmitLostStateLocked( rmesa );
+
+   if (R200_DEBUG & DEBUG_IOCTL) {
+      fprintf(stderr, "%s from %s\n", __FUNCTION__, caller);
+
+      /* dword dump of the buffer; the leading 0 keeps it switched off */
+      if (0 & R200_DEBUG & DEBUG_VERBOSE) {
+         for (i = 0 ; i < rmesa->store.cmd_used ; i += 4 )
+            fprintf(stderr, "%d: %x\n", i/4,
+                    *(int *)(&rmesa->store.cmd_buf[i]));
+      }
+   }
+
+   if (R200_DEBUG & DEBUG_DMA)
+      fprintf(stderr, "%s: Releasing %d buffers\n", __FUNCTION__,
+              rmesa->dma.nr_released_bufs);
+
+   if (R200_DEBUG & DEBUG_SANITY) {
+      if (!rmesa->state.scissor.enabled)
+         ret = r200SanityCmdBuffer( rmesa,
+                                    rmesa->numClipRects,
+                                    rmesa->pClipRects);
+      else
+         ret = r200SanityCmdBuffer( rmesa,
+                                    rmesa->state.scissor.numClipRects,
+                                    rmesa->state.scissor.pClipRects);
+      if (ret) {
+         fprintf(stderr, "drmSanityCommandWrite: %d\n", ret);
+         goto out;
+      }
+   }
+
+   if ((R200_DEBUG & DEBUG_MEMORY) &&
+       !driValidateTextureHeaps( rmesa->texture_heaps, rmesa->nr_heaps,
+                                 & rmesa->swapped )) {
+      fprintf( stderr, "%s: texture memory is inconsistent - expect "
+               "mangled textures\n", __FUNCTION__ );
+   }
+
+   /* hand over the buffer plus the active (scissored or full) cliprects */
+   cmd.buf = rmesa->store.cmd_buf;
+   cmd.bufsz = rmesa->store.cmd_used;
+   cmd.nbox = rmesa->state.scissor.enabled
+      ? rmesa->state.scissor.numClipRects
+      : rmesa->numClipRects;
+   cmd.boxes = rmesa->state.scissor.enabled
+      ? (drm_clip_rect_t *)rmesa->state.scissor.pClipRects
+      : (drm_clip_rect_t *)rmesa->pClipRects;
+
+   ret = drmCommandWrite( rmesa->dri.fd,
+                          DRM_RADEON_CMDBUF,
+                          &cmd, sizeof(cmd) );
+   if (ret)
+      fprintf(stderr, "drmCommandWrite: %d\n", ret);
+
+   if (R200_DEBUG & DEBUG_SYNC) {
+      fprintf(stderr, "\nSyncing in %s\n\n", __FUNCTION__);
+      r200WaitForIdleLocked( rmesa );
+   }
+
+ out:
+   /* reset the store whether or not the submit went through */
+   rmesa->store.primnr = 0;
+   rmesa->store.statenr = 0;
+   rmesa->store.cmd_used = 0;
+   rmesa->dma.nr_released_bufs = 0;
+   rmesa->save_on_next_emit = 1;
+   return ret;
+}
+
+
+/* Locking wrapper around r200FlushCmdBufLocked(); a failed flush is fatal.
+ * Note: does not emit any commands to avoid recursion on
+ * r200AllocCmdBuf.
+ */
+void r200FlushCmdBuf( r200ContextPtr rmesa, const char *caller )
+{
+   int err;
+
+   LOCK_HARDWARE( rmesa );
+   err = r200FlushCmdBufLocked( rmesa, caller );
+   UNLOCK_HARDWARE( rmesa );
+
+   if (!err)
+      return;
+
+   fprintf(stderr, "drmRadeonCmdBuffer: %d (exiting)\n", err);
+   exit(err);
+}
+
+
+/* =============================================================
+ * Hardware vertex buffer handling
+ */
+
+
+/* Replace rmesa->dma.current with a fresh DMA buffer obtained via drmDMA().
+ * The dma.flush callback is run and the old region released first; the
+ * request is retried (flushing released buffers in between) until it
+ * succeeds.  Exits if the tracking struct cannot be allocated.
+ */
+void r200RefillCurrentDmaRegion( r200ContextPtr rmesa )
+{
+   struct r200_dma_buffer *dmabuf;
+   int fd = rmesa->dri.fd;
+   int index = 0, size = 0;
+   drmDMAReq dma;
+
+   if (R200_DEBUG & (DEBUG_IOCTL|DEBUG_DMA))
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+   if (rmesa->dma.flush)
+      rmesa->dma.flush( rmesa );
+
+   if (rmesa->dma.current.buf)
+      r200ReleaseDmaRegion( rmesa, &rmesa->dma.current, __FUNCTION__ );
+
+   if (rmesa->dma.nr_released_bufs > 4)
+      r200FlushCmdBuf( rmesa, __FUNCTION__ );
+
+   dma.context = rmesa->dri.hwContext;
+   dma.send_count = 0;
+   dma.send_list = NULL;
+   dma.send_sizes = NULL;
+   dma.flags = 0;
+   dma.request_count = 1;
+   dma.request_size = RADEON_BUFFER_SIZE;
+   dma.request_list = &index;
+   dma.request_sizes = &size;
+   dma.granted_count = 0;
+
+   LOCK_HARDWARE(rmesa);	/* no need to validate */
+   /* retry until the request succeeds */
+   while (drmDMA( fd, &dma ) != 0) {
+      if (rmesa->dma.nr_released_bufs)
+         r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
+
+      if (rmesa->do_usleeps) {
+         UNLOCK_HARDWARE( rmesa );
+         DO_USLEEP( 1 );
+         LOCK_HARDWARE( rmesa );
+      }
+   }
+
+   UNLOCK_HARDWARE(rmesa);
+
+   if (R200_DEBUG & DEBUG_DMA)
+      fprintf(stderr, "Allocated buffer %d\n", index);
+
+   dmabuf = CALLOC_STRUCT( r200_dma_buffer );
+   if (!dmabuf) {
+      fprintf(stderr, "%s: out of memory (exiting)\n", __FUNCTION__);
+      exit(-1);
+   }
+   dmabuf->buf = &rmesa->r200Screen->buffers->list[index];
+   dmabuf->refcount = 1;
+
+   rmesa->dma.current.buf = dmabuf;
+   rmesa->dma.current.address = dmabuf->buf->address;
+   rmesa->dma.current.end = dmabuf->buf->total;
+   rmesa->dma.current.start = 0;
+   rmesa->dma.current.ptr = 0;
+}
+
+/* Drop one reference on region's DMA buffer and detach the region from it.
+ * On the last reference a RADEON_CMD_DMA_DISCARD command is queued for the
+ * buffer and its tracking struct freed.  caller is used for debug output. */
+void r200ReleaseDmaRegion( r200ContextPtr rmesa,
+                           struct r200_dma_region *region, const char *caller )
+{
+   if (R200_DEBUG & DEBUG_IOCTL)
+      fprintf(stderr, "%s from %s\n", __FUNCTION__, caller);
+
+   if (region->buf == NULL)
+      return;
+
+   if (rmesa->dma.flush)
+      rmesa->dma.flush( rmesa );
+
+   region->buf->refcount--;
+   if (region->buf->refcount == 0) {
+      drm_radeon_cmd_header_t *discard;
+      if (R200_DEBUG & (DEBUG_IOCTL|DEBUG_DMA))
+         fprintf(stderr, "%s -- DISCARD BUF %d\n", __FUNCTION__,
+                 region->buf->buf->idx);
+      discard = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, sizeof(*discard), __FUNCTION__ );
+      discard->dma.cmd_type = RADEON_CMD_DMA_DISCARD;
+      discard->dma.buf_idx = region->buf->buf->idx;
+      FREE(region->buf);
+      rmesa->dma.nr_released_bufs++;
+   }
+
+   region->buf = NULL;
+   region->start = 0;
+}
+
+/* Allocates a region from rmesa->dma.current.  If there isn't enough
+ * space in current, grab a new buffer (and discard what was left of current).
+ * alignment is used as a mask, so it is assumed to be a power of two.
+ */
+void r200AllocDmaRegion( r200ContextPtr rmesa, struct r200_dma_region *region,
+                         int bytes, int alignment )
+{
+   const int align_mask = alignment - 1;
+
+   if (R200_DEBUG & DEBUG_IOCTL)
+      fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
+
+   if (rmesa->dma.flush)
+      rmesa->dma.flush( rmesa );
+
+   if (region->buf)
+      r200ReleaseDmaRegion( rmesa, region, __FUNCTION__ );
+
+   rmesa->dma.current.ptr = (rmesa->dma.current.ptr + align_mask) & ~align_mask;
+   rmesa->dma.current.start = rmesa->dma.current.ptr;
+
+   if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end )
+      r200RefillCurrentDmaRegion( rmesa );
+
+   region->buf = rmesa->dma.current.buf;
+   region->buf->refcount++;
+   region->address = rmesa->dma.current.address;
+   region->start = rmesa->dma.current.start;
+   region->ptr = rmesa->dma.current.start;
+   region->end = rmesa->dma.current.start + bytes;
+
+   /* bug - if alignment > 7 */
+   rmesa->dma.current.ptr += bytes;
+   rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;
+   rmesa->dma.current.start = rmesa->dma.current.ptr;
+   assert( rmesa->dma.current.ptr <= rmesa->dma.current.end );
+}
+
+/* ================================================================
+ * SwapBuffers with client-side throttling
+ */
+
+/* Query RADEON_PARAM_LAST_FRAME from the DRM; exits the process on failure. */
+static u_int32_t r200GetLastFrame(r200ContextPtr rmesa)
+{
+   u_int32_t last_frame;
+   drm_radeon_getparam_t query;
+   int err;
+
+   query.param = RADEON_PARAM_LAST_FRAME;
+   query.value = (int *)&last_frame;
+   err = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_GETPARAM,
+                              &query, sizeof(query) );
+   if ( err != 0 ) {
+      fprintf( stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__, err );
+      exit(1);
+   }
+   return last_frame;
+}
+
+/* Issue DRM_RADEON_IRQ_EMIT, passing &rmesa->iw.irq_seq; exits on failure. */
+static void r200EmitIrqLocked( r200ContextPtr rmesa )
+{
+   drm_radeon_irq_emit_t emit;
+   int err;
+
+   emit.irq_seq = &rmesa->iw.irq_seq;
+   err = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_IRQ_EMIT, &emit, sizeof(emit) );
+   if ( !err )
+      return;
+   fprintf( stderr, "%s: drmRadeonIrqEmit: %d\n", __FUNCTION__, err );
+   exit(1);
+}
+
+
+/* Issue DRM_RADEON_IRQ_WAIT on rmesa->iw, retrying on EINTR/EBUSY; else exit. */
+static void r200WaitIrq( r200ContextPtr rmesa )
+{
+   int err;
+   for (;;) {
+      err = drmCommandWrite( rmesa->dri.fd, DRM_RADEON_IRQ_WAIT, &rmesa->iw, sizeof(rmesa->iw) );
+      if (!err || (errno != EINTR && errno != EBUSY))
+         break;
+   }
+   if ( err ) {
+      fprintf( stderr, "%s: drmRadeonIrqWait: %d\n", __FUNCTION__, err );
+      exit(1);
+   }
+}
+
+
+/* Throttle until the DRM's frame counter catches up with sarea->last_frame.
+ * Called with the hardware lock held; it is dropped while sleeping/waiting. */
+static void r200WaitForFrameCompletion( r200ContextPtr rmesa )
+{
+   drm_radeon_sarea_t *sarea = rmesa->sarea;
+
+   if (!rmesa->do_irqs) {
+      while (r200GetLastFrame (rmesa) < sarea->last_frame) {
+         UNLOCK_HARDWARE( rmesa );
+         if (rmesa->do_usleeps)
+            DO_USLEEP( 1 );
+         LOCK_HARDWARE( rmesa );
+      }
+      return;
+   }
+
+   if (r200GetLastFrame(rmesa) < sarea->last_frame) {
+      if (rmesa->irqsEmitted) {
+         UNLOCK_HARDWARE( rmesa );
+         r200WaitIrq( rmesa );
+         LOCK_HARDWARE( rmesa );
+      } else {
+         while (r200GetLastFrame (rmesa) < sarea->last_frame)
+            ;
+      }
+      rmesa->irqsEmitted = 10;
+   }
+   if (rmesa->irqsEmitted) {
+      r200EmitIrqLocked( rmesa );
+      rmesa->irqsEmitted--;
+   }
+}
+
+
+
+/* Copy the back color buffer to the front color buffer.
+ * rect, when non-NULL, restricts the copy to that sub-rectangle; when NULL
+ * the vblank is waited for and the swap statistics are updated as well.
+ * Only cliprects still non-empty after clipping against rect are counted in
+ * sarea->nbox; a batch in which none survives issues no DRM_RADEON_SWAP.
+ */
+void r200CopyBuffer( const __DRIdrawablePrivate *dPriv,
+		      const drm_clip_rect_t	 *rect)
+{
+   r200ContextPtr rmesa;
+   GLint nbox, i, ret;
+   GLboolean   missed_target;
+   int64_t ust;
+
+   assert(dPriv);
+   assert(dPriv->driContextPriv);
+   assert(dPriv->driContextPriv->driverPrivate);
+
+   rmesa = (r200ContextPtr) dPriv->driContextPriv->driverPrivate;
+
+   if ( R200_DEBUG & DEBUG_IOCTL )
+      fprintf( stderr, "\n%s( %p )\n\n", __FUNCTION__, (void *)rmesa->glCtx );
+
+   R200_FIREVERTICES( rmesa );
+
+   LOCK_HARDWARE( rmesa );
+
+   /* Throttle the frame rate -- only allow one pending swap buffers
+    * request at a time.
+    */
+   r200WaitForFrameCompletion( rmesa );
+   if (!rect) {
+       UNLOCK_HARDWARE( rmesa );
+       driWaitForVBlank( dPriv, & rmesa->vbl_seq, rmesa->vblank_flags, & missed_target );
+       LOCK_HARDWARE( rmesa );
+   }
+
+   nbox = dPriv->numClipRects; /* must be in locked region */
+
+   for ( i = 0 ; i < nbox ; ) {
+      GLint nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS , nbox );
+      drm_clip_rect_t *box = dPriv->pClipRects;
+      drm_clip_rect_t *b = rmesa->sarea->boxes;
+      GLint n = 0;
+
+      for ( ; i < nr ; i++ ) {
+	  *b = box[i];
+
+	  if (rect) {
+	     if (rect->x1 > b->x1)
+		 b->x1 = rect->x1;
+	     if (rect->y1 > b->y1)
+		 b->y1 = rect->y1;
+	     if (rect->x2 < b->x2)
+		 b->x2 = rect->x2;
+	     if (rect->y2 < b->y2)
+		 b->y2 = rect->y2;
+	     /* fully clipped away: the slot is reused and not counted */
+	     if (b->x1 >= b->x2 || b->y1 >= b->y2)
+		 continue;
+	  }
+
+	  b++;
+	  n++;
+      }
+      rmesa->sarea->nbox = n;
+
+      if (n == 0)
+	 continue;	/* nothing left to copy in this batch */
+
+      ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_SWAP );
+
+      if ( ret ) {
+	 fprintf( stderr, "DRM_R200_SWAP_BUFFERS: return = %d\n", ret );
+	 UNLOCK_HARDWARE( rmesa );
+	 exit( 1 );
+      }
+   }
+
+   UNLOCK_HARDWARE( rmesa );
+   if (!rect) {
+       rmesa->hw.all_dirty = GL_TRUE;
+
+       rmesa->swap_count++;
+       (*dri_interface->getUST)( & ust );
+       if ( missed_target ) {
+	   rmesa->swap_missed_count++;
+	   rmesa->swap_missed_ust = ust - rmesa->swap_ust;
+       }
+
+       rmesa->swap_ust = ust;
+
+       sched_yield();
+   }
+}
+
+/* Swap buffers by page flipping (DRM_RADEON_FLIP) instead of copying.
+ * An invisible drawable (no cliprects) just sleeps 10ms.  Otherwise waits
+ * for frame completion and vblank, flips, updates the swap statistics and
+ * refreshes the renderbuffers for the page that is now current.
+ */
+void r200PageFlip( const __DRIdrawablePrivate *dPriv )
+{
+   r200ContextPtr rmesa;
+   GLboolean   missed_target;
+   GLint err;
+
+   assert(dPriv);
+   assert(dPriv->driContextPriv);
+   assert(dPriv->driContextPriv->driverPrivate);
+
+   rmesa = (r200ContextPtr) dPriv->driContextPriv->driverPrivate;
+
+   if ( R200_DEBUG & DEBUG_IOCTL )
+      fprintf(stderr, "%s: pfCurrentPage: %d\n", __FUNCTION__,
+              rmesa->sarea->pfCurrentPage);
+
+   R200_FIREVERTICES( rmesa );
+   LOCK_HARDWARE( rmesa );
+
+   if (dPriv->numClipRects == 0) {
+      UNLOCK_HARDWARE( rmesa );
+      usleep( 10000 );		/* throttle invisible client 10ms */
+      return;
+   }
+
+   /* Need to do this for the perf box placement: */
+   rmesa->sarea->boxes[0] = dPriv->pClipRects[0];
+   rmesa->sarea->nbox = 1;
+
+   /* Throttle the frame rate -- only allow a few pending swap buffers
+    * request at a time.
+    */
+   r200WaitForFrameCompletion( rmesa );
+   UNLOCK_HARDWARE( rmesa );
+   driWaitForVBlank( dPriv, & rmesa->vbl_seq, rmesa->vblank_flags, & missed_target );
+   if ( missed_target ) {
+      rmesa->swap_missed_count++;
+      (void) (*dri_interface->getUST)( & rmesa->swap_missed_ust );
+   }
+   LOCK_HARDWARE( rmesa );
+
+   err = drmCommandNone( rmesa->dri.fd, DRM_RADEON_FLIP );
+
+   UNLOCK_HARDWARE( rmesa );
+
+   if ( err ) {
+      fprintf( stderr, "DRM_RADEON_FLIP: return = %d\n", err );
+      exit( 1 );
+   }
+
+   /* record the completed swap and its timestamp */
+   rmesa->swap_count++;
+   (void) (*dri_interface->getUST)( & rmesa->swap_ust );
+
+#if 000
+   if ( rmesa->sarea->pfCurrentPage == 1 ) {
+	 rmesa->state.color.drawOffset = rmesa->r200Screen->frontOffset;
+	 rmesa->state.color.drawPitch  = rmesa->r200Screen->frontPitch;
+   } else {
+	 rmesa->state.color.drawOffset = rmesa->r200Screen->backOffset;
+	 rmesa->state.color.drawPitch  = rmesa->r200Screen->backPitch;
+   }
+
+   R200_STATECHANGE( rmesa, ctx );
+   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET] = rmesa->state.color.drawOffset
+					   + rmesa->r200Screen->fbLocation;
+   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH]  = rmesa->state.color.drawPitch;
+   if (rmesa->sarea->tiling_enabled) {
+      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= R200_COLOR_TILE_ENABLE;
+   }
+#else
+   /* Get ready for drawing next frame.  Update the renderbuffers'
+    * flippedOffset/Pitch fields so we draw into the right place.
+    */
+   driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
+                        rmesa->sarea->pfCurrentPage);
+
+
+   r200UpdateDrawBuffer(rmesa->glCtx);
+#endif
+}
+
+
+/* ================================================================
+ * Buffer clear
+ */
+/* Clear the buffers selected by mask, limited to ctx->DrawBuffer's
+ * _Xmin/_Ymin/_Xmax/_Ymax bounds.  Front/back color, depth and hardware
+ * stencil are cleared with DRM_RADEON_CLEAR, RADEON_NR_SAREA_CLIPRECTS
+ * cliprects at a time; any bits left in mask go to _swrast_Clear().
+ */
+static void r200Clear( GLcontext *ctx, GLbitfield mask )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+   GLuint flags = 0;
+   GLint ret, i;
+   GLint cx, cy, cw, ch;
+
+   if ( R200_DEBUG & DEBUG_IOCTL )
+      fprintf( stderr, "r200Clear\n");
+
+   {
+      LOCK_HARDWARE( rmesa );	/* NOTE(review): presumably refreshes cliprects -- confirm */
+      UNLOCK_HARDWARE( rmesa );
+      if ( dPriv->numClipRects == 0 ) 
+	 return;
+   }
+
+   r200Flush( ctx );
+
+   if ( mask & BUFFER_BIT_FRONT_LEFT ) {
+      flags |= RADEON_FRONT;
+      mask &= ~BUFFER_BIT_FRONT_LEFT;
+   }
+
+   if ( mask & BUFFER_BIT_BACK_LEFT ) {
+      flags |= RADEON_BACK;
+      mask &= ~BUFFER_BIT_BACK_LEFT;
+   }
+
+   if ( mask & BUFFER_BIT_DEPTH ) {
+      flags |= RADEON_DEPTH;
+      mask &= ~BUFFER_BIT_DEPTH;
+   }
+
+   if ( (mask & BUFFER_BIT_STENCIL) && rmesa->state.stencil.hwBuffer ) {
+      flags |= RADEON_STENCIL;
+      mask &= ~BUFFER_BIT_STENCIL;
+   }
+
+   if ( mask ) {
+      if (R200_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "%s: swrast clear, mask: %x\n", __FUNCTION__, mask);
+      _swrast_Clear( ctx, mask );
+   }
+
+   if ( !flags ) 
+      return;
+
+   if (rmesa->using_hyperz) {
+      flags |= RADEON_USE_COMP_ZBUF;
+/*      if (rmesa->r200Screen->chip_family == CHIP_FAMILY_R200)
+	 flags |= RADEON_USE_HIERZ; */
+      if (!(rmesa->state.stencil.hwBuffer) ||
+	 ((flags & RADEON_DEPTH) && (flags & RADEON_STENCIL) &&
+	    ((rmesa->state.stencil.clear & R200_STENCIL_WRITE_MASK) == R200_STENCIL_WRITE_MASK))) {
+	  flags |= RADEON_CLEAR_FASTZ;
+      }
+   }
+
+   LOCK_HARDWARE( rmesa );
+
+   /* compute region after locking: */
+   cx = ctx->DrawBuffer->_Xmin;
+   cy = ctx->DrawBuffer->_Ymin;
+   cw = ctx->DrawBuffer->_Xmax - cx;
+   ch = ctx->DrawBuffer->_Ymax - cy;
+
+   /* Flip top to bottom */
+   cx += dPriv->x;
+   cy  = dPriv->y + dPriv->h - cy - ch;
+
+   /* Throttle the number of clear ioctls we do.
+    */
+   while ( 1 ) {
+      drm_radeon_getparam_t gp;
+      int clear;
+
+      gp.param = RADEON_PARAM_LAST_CLEAR;
+      gp.value = (int *)&clear;
+      ret = drmCommandWriteRead( rmesa->dri.fd,
+		      DRM_RADEON_GETPARAM, &gp, sizeof(gp) );
+
+      if ( ret ) {
+	 fprintf( stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__, ret );
+	 exit(1);
+      }
+
+      /* Clear throttling needs more thought.
+       */
+      if ( rmesa->sarea->last_clear - clear <= 25 ) {
+	 break;
+      }
+      
+      if (rmesa->do_usleeps) {
+	 UNLOCK_HARDWARE( rmesa );
+	 DO_USLEEP( 1 );
+	 LOCK_HARDWARE( rmesa );
+      }
+   }
+
+   /* Send current state to the hardware */
+   r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
+
+   for ( i = 0 ; i < dPriv->numClipRects ; ) {
+      GLint nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS, dPriv->numClipRects );
+      drm_clip_rect_t *box = dPriv->pClipRects;
+      drm_clip_rect_t *b = rmesa->sarea->boxes;
+      drm_radeon_clear_t clear;
+      drm_radeon_clear_rect_t depth_boxes[RADEON_NR_SAREA_CLIPRECTS];
+      GLint n = 0;
+
+      if (cw != dPriv->w || ch != dPriv->h) {
+         /* clear subregion */
+	 for ( ; i < nr ; i++ ) {
+	    GLint x = box[i].x1;
+	    GLint y = box[i].y1;
+	    GLint w = box[i].x2 - x;
+	    GLint h = box[i].y2 - y;
+
+	    if ( x < cx ) w -= cx - x, x = cx;
+	    if ( y < cy ) h -= cy - y, y = cy;
+	    if ( x + w > cx + cw ) w = cx + cw - x;
+	    if ( y + h > cy + ch ) h = cy + ch - y;
+	    if ( w <= 0 ) continue;
+	    if ( h <= 0 ) continue;
+
+	    b->x1 = x;
+	    b->y1 = y;
+	    b->x2 = x + w;
+	    b->y2 = y + h;
+	    b++;
+	    n++;
+	 }
+      } else {
+         /* clear whole window */
+	 for ( ; i < nr ; i++ ) {
+	    *b++ = box[i];
+	    n++;
+	 }
+      }
+
+      rmesa->sarea->nbox = n;
+
+      clear.flags       = flags;
+      clear.clear_color = rmesa->state.color.clear;
+      clear.clear_depth = rmesa->state.depth.clear;	/* needed for hyperz */
+      clear.color_mask  = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
+      clear.depth_mask  = rmesa->state.stencil.clear;
+      clear.depth_boxes = depth_boxes;
+
+      n--;
+      b = rmesa->sarea->boxes;
+      for ( ; n >= 0 ; n-- ) {
+	 depth_boxes[n].f[CLEAR_X1] = (float)b[n].x1;
+	 depth_boxes[n].f[CLEAR_Y1] = (float)b[n].y1;
+	 depth_boxes[n].f[CLEAR_X2] = (float)b[n].x2;
+	 depth_boxes[n].f[CLEAR_Y2] = (float)b[n].y2;
+	 depth_boxes[n].f[CLEAR_DEPTH] = ctx->Depth.Clear;
+      }
+
+      ret = drmCommandWrite( rmesa->dri.fd, DRM_RADEON_CLEAR,
+			     &clear, sizeof(clear));
+
+
+      if ( ret ) {
+	 UNLOCK_HARDWARE( rmesa );
+	 fprintf( stderr, "DRM_RADEON_CLEAR: return = %d\n", ret );
+	 exit( 1 );
+      }
+   }
+
+   UNLOCK_HARDWARE( rmesa );
+   rmesa->hw.all_dirty = GL_TRUE;
+}
+
+
+/* Poll DRM_RADEON_CP_IDLE, sleeping after each failure, for up to 100 tries;
+ * if the last try returned < 0 the hardware lock is dropped and we exit. */
+void r200WaitForIdleLocked( r200ContextPtr rmesa )
+{
+    int err, attempt = 0;
+    while ((err = drmCommandNone( rmesa->dri.fd, DRM_RADEON_CP_IDLE)) != 0) {
+       DO_USLEEP( 1 );
+       if (++attempt >= 100)
+          break;
+    }
+
+    if ( err < 0 ) {
+       UNLOCK_HARDWARE( rmesa );
+       fprintf( stderr, "Error: R200 timed out... exiting\n" );
+       exit( -1 );
+    }
+}
+
+/* Take the hardware lock, run the locked idle wait, then release the lock. */
+static void r200WaitForIdle( r200ContextPtr rmesa )
+{
+   LOCK_HARDWARE(rmesa);
+   r200WaitForIdleLocked( rmesa );
+   UNLOCK_HARDWARE(rmesa);
+}
+
+
+void r200Flush( GLcontext *ctx )
+{
+   r200ContextPtr rmesa = R200_CONTEXT( ctx );
+
+   if (R200_DEBUG & DEBUG_IOCTL)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+   /* Run the DMA flush callback, if one is currently installed. */
+   if (rmesa->dma.flush)
+      rmesa->dma.flush( rmesa );
+
+   r200EmitState( rmesa );
+   /* Only submit the command buffer when it holds something. */
+   if (rmesa->store.cmd_used)
+      r200FlushCmdBuf( rmesa, __FUNCTION__ );
+}
+
+/* Flush everything queued, then wait for the hardware to complete it:
+ * emit an IRQ and wait on it when do_irqs is set, else wait for idle.
+ */
+void r200Finish( GLcontext *ctx )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   r200Flush( ctx );
+
+   if (rmesa->do_irqs) {
+      LOCK_HARDWARE( rmesa );
+      r200EmitIrqLocked( rmesa );
+      UNLOCK_HARDWARE( rmesa );
+      r200WaitIrq( rmesa );
+   }
+   else 
+      r200WaitForIdle( rmesa );
+}
+
+
+/* This version of AllocateMemoryMESA allocates only GART memory, and
+ * only does so after the point at which the driver has been
+ * initialized.
+ *
+ * Theoretically a valid context isn't required.  However, in this
+ * implementation, it is, as I'm using the hardware lock to protect
+ * the kernel data structures, and the current context to get the
+ * device fd.
+ */
+void *r200AllocateMemoryMESA(__DRInativeDisplay *dpy, int scrn, GLsizei size,
+			     GLfloat readfreq, GLfloat writefreq, 
+			     GLfloat priority)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   r200ContextPtr rmesa;
+   int region_offset;
+   drm_radeon_mem_alloc_t alloc;
+   int ret;
+
+   if (R200_DEBUG & DEBUG_IOCTL)
+      fprintf(stderr, "%s sz %d %f/%f/%f\n", __FUNCTION__, size, readfreq, 
+	      writefreq, priority);
+
+   if (!ctx || !(rmesa = R200_CONTEXT(ctx)) || !rmesa->r200Screen->gartTextures.map)
+      return NULL;
+
+   if (getenv("R200_NO_ALLOC"))
+      return NULL;
+
+   alloc.region = RADEON_MEM_REGION_GART;
+   alloc.alignment = 0;
+   alloc.size = size;
+   alloc.region_offset = &region_offset;
+
+   ret = drmCommandWriteRead( rmesa->r200Screen->driScreen->fd,
+			      DRM_RADEON_ALLOC,
+			      &alloc, sizeof(alloc));
+   /* A nonzero return is reported on stderr and yields NULL. */
+   if (ret) {
+      fprintf(stderr, "%s: DRM_RADEON_ALLOC ret %d\n", __FUNCTION__, ret);
+      return NULL;
+   }
+   /* region_offset is applied relative to the start of the GART texture map. */
+   {
+      char *region_start = (char *)rmesa->r200Screen->gartTextures.map;
+      return (void *)(region_start + region_offset);
+   }
+}
+
+
+/* Called via glXFreeMemoryMESA(): free GART memory given its mapped address. */
+void r200FreeMemoryMESA(__DRInativeDisplay *dpy, int scrn, GLvoid *pointer)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   r200ContextPtr rmesa;
+   ptrdiff_t region_offset;
+   drm_radeon_mem_free_t memfree;
+   int ret;
+
+   if (R200_DEBUG & DEBUG_IOCTL)
+      fprintf(stderr, "%s %p\n", __FUNCTION__, pointer);
+
+   if (!ctx || !(rmesa = R200_CONTEXT(ctx)) || !rmesa->r200Screen->gartTextures.map) {
+      fprintf(stderr, "%s: no context\n", __FUNCTION__);
+      return;
+   }
+
+   region_offset = (char *)pointer - (char *)rmesa->r200Screen->gartTextures.map;
+
+   if (region_offset < 0 || 
+       region_offset > rmesa->r200Screen->gartTextures.size) {
+      fprintf(stderr, "offset %ld outside range 0..%d\n", (long)region_offset,
+	      rmesa->r200Screen->gartTextures.size);
+      return;
+   }
+
+   memfree.region = RADEON_MEM_REGION_GART;
+   memfree.region_offset = region_offset;
+   /* Pass the map-relative offset to the kernel via DRM_RADEON_FREE. */
+   ret = drmCommandWrite( rmesa->r200Screen->driScreen->fd,
+			  DRM_RADEON_FREE,
+			  &memfree, sizeof(memfree));
+   /* Failure is only reported on stderr; the function returns nothing. */
+   if (ret) 
+      fprintf(stderr, "%s: DRM_RADEON_FREE ret %d\n", __FUNCTION__, ret);
+}
+
+/* Called via glXGetMemoryOffsetMESA() */
+GLuint r200GetMemoryOffsetMESA(__DRInativeDisplay *dpy, int scrn, const GLvoid *pointer)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   r200ContextPtr rmesa;
+   GLuint card_offset;
+
+   if (!ctx || !(rmesa = R200_CONTEXT(ctx)) ) {
+      fprintf(stderr, "%s: no context\n", __FUNCTION__);
+      return ~0;
+   }
+   /* Pointers rejected by r200IsGartMemory have no offset: return ~0. */
+   if (!r200IsGartMemory( rmesa, pointer, 0 ))
+      return ~0;
+   /* Convert to a card offset, then express it relative to gart_base. */
+   card_offset = r200GartOffsetFromVirtual( rmesa, pointer );
+
+   return card_offset - rmesa->r200Screen->gart_base;
+}
+
+GLboolean r200IsGartMemory( r200ContextPtr rmesa, const GLvoid *pointer,
+			   GLint size )
+{
+   const char *base = (const char *)rmesa->r200Screen->gartTextures.map;
+   ptrdiff_t delta = (const char *)pointer - base;
+   int inside = 0;
+   /* Accept only a non-negative span that ends before the mapping does. */
+   if (size >= 0 && delta >= 0)
+      inside = delta + size < rmesa->r200Screen->gartTextures.size;
+   if (R200_DEBUG & DEBUG_IOCTL)
+      fprintf(stderr, "r200IsGartMemory( %p ) : %d\n", pointer, inside );
+   return inside;
+}
+
+
+GLuint r200GartOffsetFromVirtual( r200ContextPtr rmesa, const GLvoid *pointer )
+{
+   const char *base = (const char *)rmesa->r200Screen->gartTextures.map;
+   ptrdiff_t delta = (const char *)pointer - base;
+   /* Offsets outside 0..size (both ends inclusive) yield ~0. */
+   if (delta >= 0 && delta <= rmesa->r200Screen->gartTextures.size)
+      return rmesa->r200Screen->gart_texture_offset + delta;
+   return ~0;
+}
+
+
+/* Install this file's Clear, Finish and Flush handlers into `functions`. */
+void r200InitIoctlFuncs( struct dd_function_table *functions )
+{
+    functions->Clear = r200Clear;
+    functions->Finish = r200Finish;
+    functions->Flush = r200Flush;
+}
+
diff --git a/r200/r200_ioctl.h b/r200/r200_ioctl.h
new file mode 100644
index 0000000..f537527
--- /dev/null
+++ b/r200/r200_ioctl.h
@@ -0,0 +1,210 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_ioctl.h,v 1.1 2002/10/30 12:51:52 alanh Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef __R200_IOCTL_H__
+#define __R200_IOCTL_H__
+
+#include "simple_list.h"
+#include "radeon_dri.h"
+#include "r200_lock.h"
+
+#include "xf86drm.h"
+#include "drm.h"
+#include "radeon_drm.h"
+
+extern void r200EmitState( r200ContextPtr rmesa );
+extern void r200EmitVertexAOS( r200ContextPtr rmesa,
+				 GLuint vertex_size,
+				 GLuint offset );
+
+extern void r200EmitVbufPrim( r200ContextPtr rmesa,
+				GLuint primitive,
+				GLuint vertex_nr );
+
+extern void r200FlushElts( r200ContextPtr rmesa );
+
+extern GLushort *r200AllocEltsOpenEnded( r200ContextPtr rmesa,
+					   GLuint primitive,
+					   GLuint min_nr );
+
+extern void r200EmitAOS( r200ContextPtr rmesa,
+			   struct r200_dma_region **regions,
+			   GLuint n,
+			   GLuint offset );
+
+extern void r200EmitBlit( r200ContextPtr rmesa,
+			  GLuint color_fmt,
+			  GLuint src_pitch,
+			  GLuint src_offset,
+			  GLuint dst_pitch,
+			  GLuint dst_offset,
+			  GLint srcx, GLint srcy,
+			  GLint dstx, GLint dsty,
+			  GLuint w, GLuint h );
+
+extern void r200EmitWait( r200ContextPtr rmesa, GLuint flags );
+
+extern void r200FlushCmdBuf( r200ContextPtr rmesa, const char * );
+extern int r200FlushCmdBufLocked( r200ContextPtr rmesa, const char * caller );
+
+extern void r200RefillCurrentDmaRegion( r200ContextPtr rmesa );
+
+extern void r200AllocDmaRegion( r200ContextPtr rmesa,
+				  struct r200_dma_region *region,
+				  int bytes, 
+				  int alignment );
+
+extern void r200ReleaseDmaRegion( r200ContextPtr rmesa,
+				    struct r200_dma_region *region,
+				    const char *caller );
+
+extern void r200CopyBuffer( const __DRIdrawablePrivate *drawable,
+			    const drm_clip_rect_t      *rect);
+extern void r200PageFlip( const __DRIdrawablePrivate *drawable );
+extern void r200Flush( GLcontext *ctx );
+extern void r200Finish( GLcontext *ctx );
+extern void r200WaitForIdleLocked( r200ContextPtr rmesa );
+extern void r200WaitForVBlank( r200ContextPtr rmesa );
+extern void r200InitIoctlFuncs( struct dd_function_table *functions );
+
+extern void *r200AllocateMemoryMESA( __DRInativeDisplay *dpy, int scrn, GLsizei size, GLfloat readfreq,
+				   GLfloat writefreq, GLfloat priority );
+extern void r200FreeMemoryMESA( __DRInativeDisplay *dpy, int scrn, GLvoid *pointer );
+extern GLuint r200GetMemoryOffsetMESA( __DRInativeDisplay *dpy, int scrn, const GLvoid *pointer );
+
+extern GLboolean r200IsGartMemory( r200ContextPtr rmesa, const GLvoid *pointer,
+				   GLint size );
+
+extern GLuint r200GartOffsetFromVirtual( r200ContextPtr rmesa, 
+					 const GLvoid *pointer );
+
+void r200SetUpAtomList( r200ContextPtr rmesa );
+
+/* ================================================================
+ * Helper macros:
+ */
+
+/* Close off the last primitive, if it exists.
+ */
+#define R200_NEWPRIM( rmesa )			\
+do {						\
+   if ( rmesa->dma.flush )			\
+      rmesa->dma.flush( rmesa );	\
+} while (0)
+
+/* Can accommodate several state changes and primitive changes without
+ * actually firing the buffer.
+ */
+#define R200_STATECHANGE( rmesa, ATOM )			\
+do {								\
+   R200_NEWPRIM( rmesa );					\
+   rmesa->hw.ATOM.dirty = GL_TRUE;				\
+   rmesa->hw.is_dirty = GL_TRUE;				\
+} while (0)
+/* Copy hw.ATOM.cmd over hw.ATOM.lastcmd; uses `rmesa` from the caller's scope. */
+#define R200_DB_STATE( ATOM )			        \
+   memcpy( rmesa->hw.ATOM.lastcmd, rmesa->hw.ATOM.cmd,	\
+	   rmesa->hw.ATOM.cmd_size * 4)
+
+static __inline int R200_DB_STATECHANGE( 
+   r200ContextPtr rmesa,
+   struct r200_state_atom *atom )
+{
+   int *pending;
+   /* cmd and lastcmd hold equal contents: report "no change". */
+   if (memcmp(atom->cmd, atom->lastcmd, atom->cmd_size*4) == 0)
+      return 0;
+   /* Close the open primitive, flag the atom, then swap the two buffers. */
+   R200_NEWPRIM( rmesa );
+   atom->dirty = GL_TRUE;
+   rmesa->hw.is_dirty = GL_TRUE;
+   pending = atom->cmd;
+   atom->cmd = atom->lastcmd;
+   atom->lastcmd = pending;
+   return 1;
+}
+
+
+/* Fire the buffered vertices no matter what.
+ */
+#define R200_FIREVERTICES( rmesa )			\
+do {							\
+   if ( rmesa->store.cmd_used || rmesa->dma.flush ) {	\
+      r200Flush( rmesa->glCtx );			\
+   }							\
+} while (0)
+
+/* Command lengths.  Note that any time you ensure ELTS_BUFSZ or VBUF_BUFSZ
+ * are available, you will also be adding an rmesa->state.max_state_size because
+ * r200EmitState is called from within r200EmitVbufPrim and r200FlushElts.
+ * The nr arguments are parenthesized so that expressions expand correctly. */
+#define AOS_BUFSZ(nr)	((3 + (((nr) / 2) * 3) + (((nr) & 1) * 2)) * sizeof(int))
+#define VERT_AOS_BUFSZ	(5 * sizeof(int))
+#define ELTS_BUFSZ(nr)	(12 + (nr) * 2)
+#define VBUF_BUFSZ	(3 * sizeof(int))
+
+/* Ensure that a minimum amount of space is available in the command buffer.
+ * This is used to ensure atomicity of state updates with the rendering requests
+ * that rely on them.
+ *
+ * An alternative would be to implement a "soft lock" such that when the buffer
+ * wraps at an inopportune time, we grab the lock, flush the current buffer,
+ * and hang on to the lock until the critical section is finished and we flush
+ * the buffer again and unlock.
+ */
+static __inline void r200EnsureCmdBufSpace( r200ContextPtr rmesa, int bytes )
+{
+   if (rmesa->store.cmd_used + bytes > R200_CMD_BUF_SZ)
+      r200FlushCmdBuf( rmesa, __FUNCTION__ );	/* request would not fit: flush */
+   assert( bytes <= R200_CMD_BUF_SZ );	/* request may not exceed the whole buffer */
+}
+
+/* Alloc space in the command buffer
+ */
+static __inline char *r200AllocCmdBuf( r200ContextPtr rmesa,
+					 int bytes, const char *where )
+{
+   char * head;
+   /* Flush first when the request would run past R200_CMD_BUF_SZ. */
+   if (rmesa->store.cmd_used + bytes > R200_CMD_BUF_SZ)
+      r200FlushCmdBuf( rmesa, where );
+   /* Reserve `bytes` at the current fill position and return its start. */
+   head = rmesa->store.cmd_buf + rmesa->store.cmd_used;
+   rmesa->store.cmd_used += bytes;
+   assert( rmesa->store.cmd_used <= R200_CMD_BUF_SZ );
+   return head;
+}
+
+#endif /* __R200_IOCTL_H__ */
diff --git a/r200/r200_lock.c b/r200/r200_lock.c
new file mode 100644
index 0000000..b050dd7
--- /dev/null
+++ b/r200/r200_lock.c
@@ -0,0 +1,117 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_lock.c,v 1.1 2002/10/30 12:51:52 alanh Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+ 
+#include "r200_context.h"
+#include "r200_lock.h"
+#include "r200_tex.h"
+#include "r200_state.h"
+#include "r200_ioctl.h"
+
+#include "drirenderbuffer.h"
+
+
+#if DEBUG_LOCKING
+char *prevLockFile = NULL;
+int prevLockLine = 0;
+#endif
+
+/* Copy the sarea page-flip state into the context and, when a window-system
+ * draw buffer exists, flip its renderbuffers to the sarea's current page. */
+static void
+r200UpdatePageFlipping( r200ContextPtr rmesa )
+{
+   rmesa->doPageFlip = rmesa->sarea->pfState;
+   if (rmesa->glCtx->WinSysDrawBuffer) {
+      driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
+                           rmesa->sarea->pfCurrentPage);
+   }
+}
+
+
+
+/* Update the hardware state.  This is called if another context has
+ * grabbed the hardware lock, which includes the X server.  This
+ * function also updates the driver's window state after the X server
+ * moves, resizes or restacks a window -- the change will be reflected
+ * in the drawable position and clip rects.  Since the X server grabs
+ * the hardware lock when it changes the window state, this routine will
+ * automatically be called after such a change.
+ */
+void r200GetLock( r200ContextPtr rmesa, GLuint flags )
+{
+   __DRIdrawablePrivate *drawable = rmesa->dri.drawable;
+   __DRIdrawablePrivate *readable = rmesa->dri.readable;
+   __DRIscreenPrivate *sPriv = rmesa->dri.screen;
+   drm_radeon_sarea_t *sarea = rmesa->sarea;
+   int i;
+
+   drmGetLock( rmesa->dri.fd, rmesa->dri.hwContext, flags );
+
+   /* The window might have moved, so we might need to get new clip
+    * rects.
+    *
+    * NOTE: This releases and regrabs the hw lock to allow the X server
+    * to respond to the DRI protocol request for new drawable info.
+    * Since the hardware state depends on having the latest drawable
+    * clip rects, all state checking must be done _after_ this call.
+    */
+   DRI_VALIDATE_DRAWABLE_INFO( sPriv, drawable );
+   if (drawable != readable) {
+      DRI_VALIDATE_DRAWABLE_INFO( sPriv, readable );
+   }
+   /* Stamp mismatch: refresh page flipping, cliprects, viewport, fb size. */
+   if ( rmesa->lastStamp != drawable->lastStamp ) {
+      r200UpdatePageFlipping( rmesa );
+      r200SetCliprects( rmesa );
+      r200UpdateViewportOffset( rmesa->glCtx );
+      driUpdateFramebufferSize(rmesa->glCtx, drawable);
+   }
+   /* Dirty the ctx atom and mirror the sarea tiling flag into COLORPITCH. */
+   R200_STATECHANGE( rmesa, ctx );
+   if (rmesa->sarea->tiling_enabled) {
+      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= R200_COLOR_TILE_ENABLE;
+   }
+   else rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] &= ~R200_COLOR_TILE_ENABLE;
+   /* Record this context as the sarea's current owner. */
+   if ( sarea->ctx_owner != rmesa->dri.hwContext ) {
+      sarea->ctx_owner = rmesa->dri.hwContext;
+   }
+   /* Run DRI_AGE_TEXTURES over every texture heap of this context. */
+   for ( i = 0 ; i < rmesa->nr_heaps ; i++ ) {
+      DRI_AGE_TEXTURES( rmesa->texture_heaps[ i ] );
+   }
+
+   rmesa->lost_context = GL_TRUE;
+}
diff --git a/r200/r200_lock.h b/r200/r200_lock.h
new file mode 100644
index 0000000..e4c3a7e
--- /dev/null
+++ b/r200/r200_lock.h
@@ -0,0 +1,107 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_lock.h,v 1.1 2002/10/30 12:51:52 alanh Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef __R200_LOCK_H__
+#define __R200_LOCK_H__
+
+extern void r200GetLock( r200ContextPtr rmesa, GLuint flags );
+
+/* Turn DEBUG_LOCKING on to find locking conflicts.
+ */
+#define DEBUG_LOCKING	0
+
+#if DEBUG_LOCKING
+extern char *prevLockFile;
+extern int prevLockLine;
+
+#define DEBUG_LOCK()							\
+   do {									\
+      prevLockFile = (__FILE__);					\
+      prevLockLine = (__LINE__);					\
+   } while (0)
+
+#define DEBUG_RESET()							\
+   do {									\
+      prevLockFile = 0;							\
+      prevLockLine = 0;							\
+   } while (0)
+
+#define DEBUG_CHECK_LOCK()						\
+   do {									\
+      if ( prevLockFile ) {						\
+	 fprintf( stderr,						\
+		  "LOCK SET!\n\tPrevious %s:%d\n\tCurrent: %s:%d\n",	\
+		  prevLockFile, prevLockLine, __FILE__, __LINE__ );	\
+	 exit( 1 );							\
+      }									\
+   } while (0)
+
+#else
+
+#define DEBUG_LOCK()
+#define DEBUG_RESET()
+#define DEBUG_CHECK_LOCK()
+
+#endif
+
+/*
+ * !!! We may want to separate locks from locks with validation.  This
+ * could be used to improve performance for those commands that
+ * do not do any drawing !!!
+ */
+
+
+/* Lock the hardware and validate our state.
+ */
+#define LOCK_HARDWARE( rmesa )					\
+   do {								\
+      char __ret = 0;						\
+      DEBUG_CHECK_LOCK();					\
+      DRM_CAS( rmesa->dri.hwLock, rmesa->dri.hwContext,		\
+	       (DRM_LOCK_HELD | rmesa->dri.hwContext), __ret );	\
+      if ( __ret )						\
+	 r200GetLock( rmesa, 0 );				\
+      DEBUG_LOCK();						\
+   } while (0)
+
+#define UNLOCK_HARDWARE( rmesa )					\
+   do {									\
+      DRM_UNLOCK( rmesa->dri.fd,					\
+		  rmesa->dri.hwLock,					\
+		  rmesa->dri.hwContext );				\
+      DEBUG_RESET();							\
+   } while (0)
+
+#endif /* __R200_LOCK_H__ */
diff --git a/r200/r200_maos.c b/r200/r200_maos.c
new file mode 100644
index 0000000..23e1c96
--- /dev/null
+++ b/r200/r200_maos.c
@@ -0,0 +1,15 @@
+
+
+/* Currently, can only use arrays, verts are not implemented, though
+ * verts are suspected to be faster.
+ * To get an idea how the verts path works, look at the radeon implementation.
+ */
+#include <string.h>
+ 
+#include "r200_context.h"
+#define R200_MAOS_VERTS 0
+#if (R200_MAOS_VERTS)
+#include "r200_maos_verts.c"
+#else
+#include "r200_maos_arrays.c"
+#endif
diff --git a/r200/r200_maos.h b/r200/r200_maos.h
new file mode 100644
index 0000000..4998f67
--- /dev/null
+++ b/r200/r200_maos.h
@@ -0,0 +1,44 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_maos.h,v 1.1 2002/10/30 12:51:52 alanh Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef __R200_MAOS_H__
+#define __R200_MAOS_H__
+
+#include "r200_context.h"
+
+extern void r200EmitArrays( GLcontext *ctx, GLubyte *vimap_rev );
+extern void r200ReleaseArrays( GLcontext *ctx, GLuint newinputs );
+
+#endif
diff --git a/r200/r200_maos_arrays.c b/r200/r200_maos_arrays.c
new file mode 100644
index 0000000..3162b50
--- /dev/null
+++ b/r200/r200_maos_arrays.c
@@ -0,0 +1,513 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_maos_arrays.c,v 1.3 2003/02/23 23:59:01 dawes Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "mtypes.h"
+#include "colormac.h"
+#include "imports.h"
+#include "macros.h"
+
+#include "swrast_setup/swrast_setup.h"
+#include "math/m_translate.h"
+#include "tnl/tnl.h"
+#include "tnl/t_context.h"
+
+#include "r200_context.h"
+#include "r200_ioctl.h"
+#include "r200_state.h"
+#include "r200_swtcl.h"
+#include "r200_maos.h"
+#include "r200_tcl.h"
+
+
+#if 0
+/* Usage:
+ *   - from r200_tcl_render
+ *   - call r200EmitArrays to ensure up-to-date arrays in dma
+ *   - emit primitives (new type?) which reference the data
+ *       -- need to use elts for lineloop, quads, quadstrip/flat
+ *       -- other primitives are all well-formed (need tristrip-1,fake-poly)
+ *
+ */
+static void emit_ubyte_rgba3( GLcontext *ctx,
+		       struct r200_dma_region *rvb,
+		       char *data,
+		       int stride,
+		       int count )
+{
+   int i;
+   r200_color_t *out = (r200_color_t *)(rvb->start + rvb->address);
+
+   if (R200_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s count %d stride %d out %p\n",
+	      __FUNCTION__, count, stride, (void *)out);
+
+   for (i = 0; i < count; i++) {
+      out->red   = *data;
+      out->green = *(data+1);
+      out->blue  = *(data+2);
+      out->alpha = 0xFF;
+      out++;
+      data += stride;
+   }
+}
+
+static void emit_ubyte_rgba4( GLcontext *ctx,
+			      struct r200_dma_region *rvb,
+			      char *data,
+			      int stride,
+			      int count )
+{
+   int i;
+   int *out = (int *)(rvb->address + rvb->start);
+
+   if (R200_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s count %d stride %d\n",
+	      __FUNCTION__, count, stride);
+
+   if (stride == 4) {
+      for (i = 0; i < count; i++)
+	 ((int *)out)[i] = LE32_TO_CPU(((int *)data)[i]);
+   } else {
+      for (i = 0; i < count; i++) {
+	 *(int *)out++ = LE32_TO_CPU(*(int *)data);
+	 data += stride;
+      }
+   }
+}
+
+
+static void emit_ubyte_rgba( GLcontext *ctx,
+			     struct r200_dma_region *rvb,
+			     char *data,
+			     int size,
+			     int stride,
+			     int count )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+
+   if (R200_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s %d/%d\n", __FUNCTION__, count, size);
+
+   assert (!rvb->buf);
+
+   if (stride == 0) {
+      r200AllocDmaRegion( rmesa, rvb, 4, 4 );
+      count = 1;
+      rvb->aos_start = GET_START(rvb);
+      rvb->aos_stride = 0;
+      rvb->aos_size = 1;
+   }
+   else {
+      r200AllocDmaRegion( rmesa, rvb, 4 * count, 4 );	/* alignment? */
+      rvb->aos_start = GET_START(rvb);
+      rvb->aos_stride = 1;
+      rvb->aos_size = 1;
+   }
+
+   /* Emit the data
+    */
+   switch (size) {
+   case 3:
+      emit_ubyte_rgba3( ctx, rvb, data, stride, count );
+      break;
+   case 4:
+      emit_ubyte_rgba4( ctx, rvb, data, stride, count );
+      break;
+   default:
+      assert(0);
+      exit(1);
+      break;
+   }
+}
+#endif
+
+/* Copy nr dwords from src to dst; dst is left pointing past the copied data. */
+#if defined(USE_X86_ASM)
+#define COPY_DWORDS( dst, src, nr )					\
+do {									\
+	int __tmp;							\
+	__asm__ __volatile__( "rep ; movsl"				\
+			      : "=%c" (__tmp), "=D" (dst), "=S" (__tmp)	\
+			      : "0" (nr),				\
+			        "D" ((long)dst),			\
+			        "S" ((long)src) );			\
+} while (0)
+#else
+#define COPY_DWORDS( dst, src, nr )		\
+do {						\
+   int j;					\
+   for ( j = 0 ; j < nr ; j++ )			\
+      dst[j] = ((int *)src)[j];			\
+   dst += nr;					\
+} while (0)
+#endif
+
+
+/* Upload the fog coordinate array: allocate a DMA region sized for the
+ * array, fill in its AOS descriptor fields, and write one computed fog
+ * blend factor per source element.  `data` is advanced by `stride` bytes
+ * between elements.
+ */
+static void emit_vecfog( GLcontext *ctx,
+			 struct r200_dma_region *rvb,
+			 char *data,
+			 int stride,
+			 int count )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   const int constant = (stride == 0);
+   GLfloat *dst;
+   int n;
+
+   if (R200_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s count %d stride %d\n",
+	      __FUNCTION__, count, stride);
+
+   assert (!rvb->buf);
+
+   /* A zero stride collapses the array to a single element, so only
+    * one value is allocated and emitted.
+    */
+   if (constant)
+      count = 1;
+
+   /* One dword per element.  (alignment?) */
+   r200AllocDmaRegion( rmesa, rvb, count * 4, 4 );
+   rvb->aos_start = GET_START(rvb);
+   rvb->aos_stride = constant ? 0 : 1;
+   rvb->aos_size = 1;
+
+   /* Run each source value through r200ComputeFogBlendFactor and store
+    * the result in the freshly allocated region.
+    */
+   dst = (GLfloat *)(rvb->address + rvb->start);
+   for (n = 0; n < count; n++, data += stride)
+      *dst++ = r200ComputeFogBlendFactor( ctx, *(GLfloat *)data );
+}
+
+
+static void emit_vec4( GLcontext *ctx,
+		       struct r200_dma_region *rvb,
+		       char *data,
+		       int stride,
+		       int count )
+{
+   int *dst = (int *)(rvb->address + rvb->start);
+   int n;
+
+   if (R200_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s count %d stride %d\n",
+	      __FUNCTION__, count, stride);
+
+   if (stride == 4) {
+      /* Tightly packed source: one bulk copy. */
+      COPY_DWORDS( dst, data, count );
+      return;
+   }
+   /* Strided source: gather one dword per element. */
+   for (n = count; n > 0; n--, data += stride)
+      *dst++ = *(int *)data;
+}
+
+
+static void emit_vec8( GLcontext *ctx,
+		       struct r200_dma_region *rvb,
+		       char *data,
+		       int stride,
+		       int count )
+{
+   int *dst = (int *)(rvb->address + rvb->start);
+   int n;
+
+   if (R200_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s count %d stride %d\n",
+	      __FUNCTION__, count, stride);
+
+   if (stride == 8) {
+      COPY_DWORDS( dst, data, count*2 );	/* tightly packed: bulk copy */
+      return;
+   }
+   /* Strided source: gather two dwords per element. */
+   for (n = count; n > 0; n--, data += stride, dst += 2) {
+      dst[0] = *(int *)data;
+      dst[1] = *(int *)(data+4);
+   }
+}
+
+static void emit_vec12( GLcontext *ctx,
+		       struct r200_dma_region *rvb,
+		       char *data,
+		       int stride,
+		       int count )
+{
+   int *dst = (int *)(rvb->address + rvb->start);
+   int n;
+
+   if (R200_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s count %d stride %d out %p data %p\n",
+	      __FUNCTION__, count, stride, (void *)dst, (void *)data);
+
+   if (stride == 12) {
+      COPY_DWORDS( dst, data, count*3 );	/* tightly packed: bulk copy */
+      return;
+   }
+   /* Strided source: gather three dwords per element. */
+   for (n = count; n > 0; n--, data += stride, dst += 3) {
+      dst[0] = *(int *)data;
+      dst[1] = *(int *)(data+4);
+      dst[2] = *(int *)(data+8);
+   }
+}
+
+static void emit_vec16( GLcontext *ctx,
+			struct r200_dma_region *rvb,
+			char *data,
+			int stride,
+			int count )
+{
+   int *dst = (int *)(rvb->address + rvb->start);
+   int n;
+
+   if (R200_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s count %d stride %d\n",
+	      __FUNCTION__, count, stride);
+
+   if (stride == 16) {
+      COPY_DWORDS( dst, data, count*4 );	/* tightly packed: bulk copy */
+      return;
+   }
+   /* Strided source: gather four dwords per element. */
+   for (n = count; n > 0; n--, data += stride, dst += 4) {
+      dst[0] = *(int *)data;
+      dst[1] = *(int *)(data+4);
+      dst[2] = *(int *)(data+8);
+      dst[3] = *(int *)(data+12);
+   }
+}
+
+
+/* Upload an array whose elements are `size` dwords wide into a new DMA
+ * region and fill in the region's AOS descriptor fields.
+ */
+static void emit_vector( GLcontext *ctx,
+			 struct r200_dma_region *rvb,
+			 char *data,
+			 int size,
+			 int stride,
+			 int count )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   void (*emit)( GLcontext *, struct r200_dma_region *, char *, int, int );
+
+   if (R200_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s count %d size %d stride %d\n",
+	      __FUNCTION__, count, size, stride);
+
+   assert (!rvb->buf);
+
+   /* A zero stride collapses the array to a single element. */
+   if (stride == 0)
+      count = 1;
+
+   /* size dwords per element.  (alignment?) */
+   r200AllocDmaRegion( rmesa, rvb, size * count * 4, 4 );
+   rvb->aos_start = GET_START(rvb);
+   rvb->aos_stride = (stride == 0) ? 0 : size;
+   rvb->aos_size = size;
+
+   /* Pick the copy routine matching the element width, then emit. */
+   switch (size) {
+   case 1:
+      emit = emit_vec4;
+      break;
+   case 2:
+      emit = emit_vec8;
+      break;
+   case 3:
+      emit = emit_vec12;
+      break;
+   case 4:
+      emit = emit_vec16;
+      break;
+   default:
+      assert(0);
+      exit(1);
+      break;
+   }
+
+   emit( ctx, rvb, data, stride, count );
+}
+
+
+
+/* Emit any changed arrays to new GART memory, re-emit a packet to
+ * update the arrays.  vimap_rev[i] names the attribute for slot i (255: unused).
+ */
+void r200EmitArrays( GLcontext *ctx, GLubyte *vimap_rev )
+{
+   r200ContextPtr rmesa = R200_CONTEXT( ctx );
+   struct vertex_buffer *VB = &TNL_CONTEXT( ctx )->vb;
+   struct r200_dma_region **component = rmesa->tcl.aos_components;
+   GLuint nr = 0;			/* number of components collected so far */
+   GLuint vfmt0 = 0, vfmt1 = 0;	/* vertex format words built up below */
+   GLuint count = VB->Count;
+   GLuint i, emitsize;
+
+   for ( i = 0; i < 15; i++ ) {	/* one pass per array slot */
+      GLubyte attrib = vimap_rev[i];
+      if (attrib != 255) {
+	 switch (i) {		/* pick emitsize and format bits for this slot */
+	 case 0:
+	    emitsize = (VB->AttribPtr[attrib]->size);
+	    switch (emitsize) {
+	    case 4:
+	       vfmt0 |= R200_VTX_W0;
+	       /* fallthrough */
+	    case 3:
+	       vfmt0 |= R200_VTX_Z0;
+	       break;
+	    case 2:
+	       break;
+	    default: assert(0);
+	    }
+	    break;
+	 case 1:
+	    assert(attrib == VERT_ATTRIB_WEIGHT);
+	    emitsize = (VB->AttribPtr[attrib]->size);
+	    vfmt0 |= emitsize << R200_VTX_WEIGHT_COUNT_SHIFT;
+	    break;
+	 case 2:
+	    assert(attrib == VERT_ATTRIB_NORMAL);
+	    emitsize = 3;
+	    vfmt0 |= R200_VTX_N0;
+	    break;
+	 case 3:
+	    /* special handling to fix up fog. Will get us into trouble with vbos...*/
+	    assert(attrib == VERT_ATTRIB_FOG);
+	    if (!rmesa->tcl.vertex_data[i].buf) {
+	       if (ctx->VertexProgram._Enabled)
+		  emit_vector( ctx,
+			 &(rmesa->tcl.vertex_data[i]),
+			 (char *)VB->AttribPtr[attrib]->data,
+			 1,
+			 VB->AttribPtr[attrib]->stride,
+			 count);
+	       else
+		  emit_vecfog( ctx,
+			 &(rmesa->tcl.vertex_data[i]),
+			 (char *)VB->AttribPtr[attrib]->data,
+			 VB->AttribPtr[attrib]->stride,
+			 count);
+	    }
+	    vfmt0 |= R200_VTX_DISCRETE_FOG;
+	    goto after_emit;	/* fog was emitted above; skip the generic emit */
+	    break;
+	 case 4:
+	 case 5:
+	 case 6:
+	 case 7:
+	    if (VB->AttribPtr[attrib]->size == 4 &&
+	       (VB->AttribPtr[attrib]->stride != 0 ||
+		VB->AttribPtr[attrib]->data[0][3] != 1.0)) emitsize = 4;
+	    else emitsize = 3;
+	    if (emitsize == 4)
+	       vfmt0 |= R200_VTX_FP_RGBA << (R200_VTX_COLOR_0_SHIFT + (i - 4) * 2);
+	    else {
+	       vfmt0 |= R200_VTX_FP_RGB << (R200_VTX_COLOR_0_SHIFT + (i - 4) * 2);
+	    }
+	    break;
+	 case 8:
+	 case 9:
+	 case 10:
+	 case 11:
+	 case 12:
+	 case 13:
+	    emitsize = VB->AttribPtr[attrib]->size;
+	    vfmt1 |= emitsize << (R200_VTX_TEX0_COMP_CNT_SHIFT + (i - 8) * 3);
+	    break;
+	 case 14:
+	    emitsize = VB->AttribPtr[attrib]->size >= 2 ? VB->AttribPtr[attrib]->size : 2;
+	    switch (emitsize) {	/* NOTE(review): fallthrough order looks inverted vs. case 0 -- confirm */
+	    case 2:
+	       vfmt0 |= R200_VTX_XY1;
+	       /* fallthrough */
+	    case 3:
+	       vfmt0 |= R200_VTX_Z1;
+	       /* fallthrough */
+	    case 4:
+	       vfmt0 |= R200_VTX_W1;
+	    }
+	    break;	/* leave the outer switch; do not fall into default's assert(0) */
+	 default:
+	    assert(0);
+	 }
+	 if (!rmesa->tcl.vertex_data[i].buf) {	/* emit only if no buffer is attached yet */
+	    emit_vector( ctx,
+			 &(rmesa->tcl.vertex_data[i]),
+			 (char *)VB->AttribPtr[attrib]->data,
+			 emitsize,
+			 VB->AttribPtr[attrib]->stride,
+			 count );
+	 }
+after_emit:
+	 assert(nr < 12);
+	 component[nr++] = &rmesa->tcl.vertex_data[i];
+      }
+   }
+
+   if (vfmt0 != rmesa->hw.vtx.cmd[VTX_VTXFMT_0] ||	/* update hw state only on change */
+       vfmt1 != rmesa->hw.vtx.cmd[VTX_VTXFMT_1]) {
+      R200_STATECHANGE( rmesa, vtx );
+      rmesa->hw.vtx.cmd[VTX_VTXFMT_0] = vfmt0;
+      rmesa->hw.vtx.cmd[VTX_VTXFMT_1] = vfmt1;
+   }
+
+   rmesa->tcl.nr_aos_components = nr;
+}
+
+/* Release the DMA region of every array slot whose bit is set in 'newinputs'. */
+void r200ReleaseArrays( GLcontext *ctx, GLuint newinputs )
+{
+   r200ContextPtr rmesa = R200_CONTEXT( ctx );
+   int slot;
+   /* only do it for changed inputs ? */
+   for (slot = 0; slot < 15; slot++) {
+      if (!(newinputs & (1 << slot)))
+	 continue;
+      r200ReleaseDmaRegion( rmesa,
+			    &rmesa->tcl.vertex_data[slot], __FUNCTION__ );
+   }
+}
diff --git a/r200/r200_pixel.c b/r200/r200_pixel.c
new file mode 100644
index 0000000..7b060f9
--- /dev/null
+++ b/r200/r200_pixel.c
@@ -0,0 +1,491 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_pixel.c,v 1.2 2002/12/16 16:18:54 dawes Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "enums.h"
+#include "mtypes.h"
+#include "macros.h"
+#include "swrast/swrast.h"
+
+#include "r200_context.h"
+#include "r200_ioctl.h"
+#include "r200_pixel.h"
+#include "r200_swtcl.h"
+
+#include "drirenderbuffer.h"
+
+/* Decide whether client pixels in this layout can be blitted as-is. */
+static GLboolean
+check_color( const GLcontext *ctx, GLenum type, GLenum format,
+	     const struct gl_pixelstore_attrib *packing,
+	     const void *pixels, GLint sz, GLint pitch )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   const GLuint cpp = rmesa->r200Screen->cpp;
+   GLboolean layout_ok, format_ok;
+
+   if (R200_DEBUG & DEBUG_PIXEL)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+   /* Needs a pitch that is a multiple of 64 and no transfer ops or swapping. */
+   layout_ok = !((pitch & 63) ||
+		 ctx->_ImageTransferState ||
+		 packing->SwapBytes ||
+		 packing->LsbFirst);
+   if (!layout_ok) {
+      if (R200_DEBUG & DEBUG_PIXEL)
+	 fprintf(stderr, "%s: failed 1\n", __FUNCTION__);
+      return GL_FALSE;
+   }
+
+   /* Only GL_BGRA / GL_UNSIGNED_INT_8_8_8_8_REV with cpp == 4 passes. */
+   format_ok = (type == GL_UNSIGNED_INT_8_8_8_8_REV &&
+		cpp == 4 &&
+		format == GL_BGRA);
+   if (R200_DEBUG & DEBUG_PIXEL)
+      fprintf(stderr, format_ok ? "%s: passed 2\n" : "%s: failed\n",
+	      __FUNCTION__);
+
+   return format_ok;
+}
+/* GL_TRUE if no per-fragment state is active and the raster position is valid. */
+static GLboolean
+check_color_per_fragment_ops( const GLcontext *ctx )
+{
+   /* Any of these enabled states rules the fast path out. */
+   if (ctx->Color.AlphaEnabled ||
+       ctx->Depth.Test ||
+       ctx->Fog.Enabled ||
+       ctx->Scissor.Enabled ||
+       ctx->Stencil.Enabled ||
+       ctx->Color.ColorLogicOpEnabled ||
+       ctx->Texture._EnabledUnits)
+      return GL_FALSE;
+
+   /* So does a color mask that disables any channel. */
+   if (!ctx->Color.ColorMask[0] || !ctx->Color.ColorMask[1] ||
+       !ctx->Color.ColorMask[2] || !ctx->Color.ColorMask[3])
+      return GL_FALSE;
+
+   return ctx->Current.RasterPosValid ? GL_TRUE : GL_FALSE;
+}
+
+
+/* Clip the rectangle to the buffer bounds; GL_FALSE if nothing is left. */
+static GLboolean
+clip_pixelrect( const GLcontext *ctx,
+		const GLframebuffer *buffer,
+		GLint *x, GLint *y,
+		GLsizei *width, GLsizei *height,
+		GLint *size )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLint over;
+
+   /* Horizontal: trim the left edge, then the right one. */
+   if (*x < buffer->_Xmin) {
+      *width += *x - buffer->_Xmin;
+      *x = buffer->_Xmin;
+   }
+   over = *x + *width - buffer->_Xmax;
+   if (over > 0)
+      *width -= over - 1;
+   if (*width <= 0)
+      return GL_FALSE;
+
+   /* Vertical: trim the bottom edge, then the top one. */
+   if (*y < buffer->_Ymin) {
+      *height += *y - buffer->_Ymin;
+      *y = buffer->_Ymin;
+   }
+   over = *y + *height - buffer->_Ymax;
+   if (over > 0)
+      *height -= over - 1;
+   if (*height <= 0)
+      return GL_FALSE;
+
+   /* Offset of the clipped rectangle's last pixel, from the front pitch
+    * and cpp.  NOTE(review): frontPitch units are not visible here.
+    */
+   *size = (*x + *width - 1) * rmesa->r200Screen->cpp +
+	   (*y + *height - 1) * rmesa->r200Screen->frontPitch;
+
+   return GL_TRUE;
+}
+/* Attempt ReadPixels as a hardware blit to GART memory; GL_FALSE = not handled. */
+static GLboolean
+r200TryReadPixels( GLcontext *ctx,
+		  GLint x, GLint y, GLsizei width, GLsizei height,
+		  GLenum format, GLenum type,
+		  const struct gl_pixelstore_attrib *pack,
+		  GLvoid *pixels )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLint pitch = pack->RowLength ? pack->RowLength : width; /* scaled by cpp below */
+   GLint blit_format;
+   GLuint cpp = rmesa->r200Screen->cpp;
+   GLint size = width * height * cpp;
+
+   if (R200_DEBUG & DEBUG_PIXEL)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+   /* Only accelerate reading to GART buffers.
+    */
+   if ( !r200IsGartMemory(rmesa, pixels, 
+			 pitch * height * rmesa->r200Screen->cpp ) ) {
+      if (R200_DEBUG & DEBUG_PIXEL)
+	 fprintf(stderr, "%s: dest not GART\n", __FUNCTION__);
+      return GL_FALSE;
+   }
+
+   /* Need GL_PACK_INVERT_MESA to cope with upside-down results from
+    * blitter:
+    */
+   if (!pack->Invert) {
+      if (R200_DEBUG & DEBUG_PIXEL)
+	 fprintf(stderr, "%s: MESA_PACK_INVERT not set\n", __FUNCTION__);
+      return GL_FALSE;
+   }
+
+   if (!check_color(ctx, type, format, pack, pixels, size, pitch))
+      return GL_FALSE;
+
+   switch ( rmesa->r200Screen->cpp ) {
+   case 4:
+      blit_format = R200_CP_COLOR_FORMAT_ARGB8888;
+      break;
+   default:
+      return GL_FALSE;
+   }
+
+
+   /* Although the blits go on the command buffer, need to do this and
+    * fire with lock held to guarantee cliprects and drawOffset are
+    * correct.
+    *
+    * This is an unusual situation however, as the code which flushes
+    * a full command buffer expects to be called unlocked.  As a
+    * workaround, immediately flush the buffer on acquiring the lock.
+    */
+   LOCK_HARDWARE( rmesa );
+
+   if (rmesa->store.cmd_used)
+      r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
+
+   if (!clip_pixelrect(ctx, ctx->ReadBuffer, &x, &y, &width, &height,
+		       &size)) {
+      UNLOCK_HARDWARE( rmesa );
+      if (R200_DEBUG & DEBUG_PIXEL)
+	 fprintf(stderr, "%s totally clipped -- nothing to do\n",
+		 __FUNCTION__);
+      return GL_TRUE; /* reported as handled: there is nothing to read */
+   }
+
+   {
+      __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+      driRenderbuffer *drb = (driRenderbuffer *) ctx->ReadBuffer->_ColorReadBuffer;
+      int nbox = dPriv->numClipRects;
+      int src_offset = drb->offset
+		     + rmesa->r200Screen->fbLocation;
+      int src_pitch = drb->pitch * drb->cpp;
+      int dst_offset = r200GartOffsetFromVirtual( rmesa, pixels );
+      int dst_pitch = pitch * rmesa->r200Screen->cpp;
+      drm_clip_rect_t *box = dPriv->pClipRects;
+      int i;
+
+      r200EmitWait( rmesa, RADEON_WAIT_3D ); 
+
+      y = dPriv->h - y - height; /* convert from gl to hardware coords */
+      x += dPriv->x;
+      y += dPriv->y;
+
+
+      if (R200_DEBUG & DEBUG_PIXEL)
+	 fprintf(stderr, "readpixel blit src_pitch %d dst_pitch %d\n",
+		 src_pitch, dst_pitch);
+
+      for (i = 0 ; i < nbox ; i++)
+      {
+	 GLint bx = box[i].x1;
+	 GLint by = box[i].y1;
+	 GLint bw = box[i].x2 - bx;
+	 GLint bh = box[i].y2 - by;
+	 /* Clip the cliprect to the requested rectangle; skip empty results. */
+	 if (bx < x) bw -= x - bx, bx = x;
+	 if (by < y) bh -= y - by, by = y;
+	 if (bx + bw > x + width) bw = x + width - bx;
+	 if (by + bh > y + height) bh = y + height - by;
+	 if (bw <= 0) continue;
+	 if (bh <= 0) continue;
+
+	 r200EmitBlit( rmesa,
+		       blit_format,
+		       src_pitch, src_offset,
+		       dst_pitch, dst_offset,
+		       bx, by,
+		       bx - x, by - y,
+		       bw, bh );
+      }
+
+      r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
+   }
+   UNLOCK_HARDWARE( rmesa );
+
+   r200Finish( ctx ); /* required by GL */
+
+   return GL_TRUE;
+}
+/* ReadPixels driver hook: try the blit path first, else use swrast. */
+static void
+r200ReadPixels( GLcontext *ctx,
+		 GLint x, GLint y, GLsizei width, GLsizei height,
+		 GLenum format, GLenum type,
+		 const struct gl_pixelstore_attrib *pack,
+		 GLvoid *pixels )
+{
+   if (R200_DEBUG & DEBUG_PIXEL)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+   if (r200TryReadPixels( ctx, x, y, width, height, format, type, pack,
+			  pixels))
+      return;
+   _swrast_ReadPixels( ctx, x, y, width, height, format, type, pack, pixels);
+}
+
+
+
+/* Blit a client image located in GART memory to the draw buffer, per cliprect. */
+static void do_draw_pix( GLcontext *ctx,
+			 GLint x, GLint y, GLsizei width, GLsizei height,
+			 GLint pitch,
+			 const void *pixels,
+			 GLuint planemask)
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+   drm_clip_rect_t *box = dPriv->pClipRects;
+   /* The destination is the draw buffer, the one clip_pixelrect() is given below. */
+   struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0][0];
+   driRenderbuffer *drb = (driRenderbuffer *) rb;
+   int nbox = dPriv->numClipRects;
+   int i;
+   int blit_format;
+   int size;
+   int src_offset = r200GartOffsetFromVirtual( rmesa, pixels );
+   int src_pitch = pitch * rmesa->r200Screen->cpp;
+
+   if (R200_DEBUG & DEBUG_PIXEL)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+   switch ( rmesa->r200Screen->cpp ) {
+   case 2:
+      blit_format = R200_CP_COLOR_FORMAT_RGB565;
+      break;
+   case 4:
+      blit_format = R200_CP_COLOR_FORMAT_ARGB8888;
+      break;
+   default:
+      return;
+   }
+
+   LOCK_HARDWARE( rmesa );
+
+   if (rmesa->store.cmd_used)
+      r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
+
+   y -= height;			/* cope with pixel zoom */
+   
+   if (!clip_pixelrect(ctx, ctx->DrawBuffer,
+		       &x, &y, &width, &height,
+		       &size)) {
+      UNLOCK_HARDWARE( rmesa );
+      return;
+   }
+
+   y = dPriv->h - y - height; 	/* convert from gl to hardware coords */
+   x += dPriv->x;
+   y += dPriv->y;
+
+   r200EmitWait( rmesa, RADEON_WAIT_3D );
+
+   /* Intersect the target rectangle with each cliprect and blit the overlap. */
+   for (i = 0 ; i < nbox ; i++ )
+   {
+      GLint bx = box[i].x1;
+      GLint by = box[i].y1;
+      GLint bw = box[i].x2 - bx;
+      GLint bh = box[i].y2 - by;
+
+      if (bx < x) bw -= x - bx, bx = x;
+      if (by < y) bh -= y - by, by = y;
+      if (bx + bw > x + width) bw = x + width - bx;
+      if (by + bh > y + height) bh = y + height - by;
+      if (bw <= 0) continue;
+      if (bh <= 0) continue;
+
+      r200EmitBlit( rmesa,
+		    blit_format,
+		    src_pitch, src_offset,
+		    drb->pitch * drb->cpp,
+		    drb->offset + rmesa->r200Screen->fbLocation,
+		    bx - x, by - y,
+		    bx, by,
+		    bw, bh );
+   }
+
+   r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
+   r200WaitForIdleLocked( rmesa ); /* required by GL */
+   UNLOCK_HARDWARE( rmesa );
+}
+
+
+
+/* Try the hardware path for DrawPixels; GL_FALSE means it was not handled. */
+static GLboolean
+r200TryDrawPixels( GLcontext *ctx,
+		  GLint x, GLint y, GLsizei width, GLsizei height,
+		  GLenum format, GLenum type,
+		  const struct gl_pixelstore_attrib *unpack,
+		  const GLvoid *pixels )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLint pitch = unpack->RowLength ? unpack->RowLength : width;
+   GLuint planemask;
+   GLuint cpp = rmesa->r200Screen->cpp;
+   /* Bytes spanned by the source image: 'height' rows of 'pitch' pixels. */
+   GLint size = height * pitch * cpp;
+
+   if (R200_DEBUG & DEBUG_PIXEL)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+   /* check that we're drawing to exactly one color buffer */
+   if (ctx->DrawBuffer->_NumColorDrawBuffers[0] != 1)
+     return GL_FALSE;
+
+   switch (format) {
+   case GL_RGB:
+   case GL_RGBA:
+   case GL_BGRA:
+      planemask = r200PackColor(cpp,
+				ctx->Color.ColorMask[RCOMP],
+				ctx->Color.ColorMask[GCOMP],
+				ctx->Color.ColorMask[BCOMP],
+				ctx->Color.ColorMask[ACOMP]);
+
+      if (cpp == 2)
+	 planemask |= planemask << 16;
+
+      if (planemask != ~0)
+	 return GL_FALSE;	/* fix me -- should be possible */
+
+      /* Can't do conversions on GART reads/draws. 
+       */
+      if ( !r200IsGartMemory( rmesa, pixels, size ) ) {
+	 if (R200_DEBUG & DEBUG_PIXEL)
+	    fprintf(stderr, "%s: not GART memory\n", __FUNCTION__);
+	 return GL_FALSE;
+      }
+
+      if (!check_color(ctx, type, format, unpack, pixels, size, pitch)) {
+	 return GL_FALSE;
+      }
+      if (!check_color_per_fragment_ops(ctx)) {
+	 return GL_FALSE;
+      }
+
+      if (ctx->Pixel.ZoomX != 1.0F ||
+	  ctx->Pixel.ZoomY != -1.0F)
+	 return GL_FALSE;
+      break;
+
+   default:
+      return GL_FALSE;
+   }
+
+   /* Every path has to return a value.  The GART test is repeated here
+    * so this stays correct even if the checks in the switch change.
+    *
+    * TODO: pixels in regular memory could be uploaded through dma
+    * buffers instead of falling back to software.
+    */
+   if ( !r200IsGartMemory(rmesa, pixels, size) )
+      return GL_FALSE;
+
+   /* The plane mask is all ones at this point (checked in the switch). */
+   do_draw_pix( ctx, x, y, width, height, pitch, pixels, planemask );
+   return GL_TRUE;
+}
+/* DrawPixels driver hook: try the blit path first, else use swrast. */
+static void
+r200DrawPixels( GLcontext *ctx,
+		 GLint x, GLint y, GLsizei width, GLsizei height,
+		 GLenum format, GLenum type,
+		 const struct gl_pixelstore_attrib *unpack,
+		 const GLvoid *pixels )
+{
+   if (R200_DEBUG & DEBUG_PIXEL)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+   if (r200TryDrawPixels( ctx, x, y, width, height, format, type,
+			  unpack, pixels ))
+      return;
+   _swrast_DrawPixels( ctx, x, y, width, height, format, type, unpack, pixels );
+}
+
+/* Bitmap hook: swrast while rmesa->Fallback is set, r200PointsBitmap otherwise. */
+static void
+r200Bitmap( GLcontext *ctx, GLint px, GLint py,
+		  GLsizei width, GLsizei height,
+		  const struct gl_pixelstore_attrib *unpack,
+		  const GLubyte *bitmap )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   if (!rmesa->Fallback) {
+      r200PointsBitmap( ctx, px, py, width, height, unpack, bitmap );
+      return;
+   }
+   _swrast_Bitmap( ctx, px, py, width, height, unpack, bitmap );
+}
+
+
+/* Install the pixel hooks unless R200_NO_BLITS is set; Bitmap needs R200_HW_BITMAP. */
+void r200InitPixelFuncs( GLcontext *ctx )
+{
+   if (getenv("R200_NO_BLITS"))
+      return;
+   ctx->Driver.ReadPixels = r200ReadPixels;
+   ctx->Driver.DrawPixels = r200DrawPixels;
+   if (getenv("R200_HW_BITMAP"))
+      ctx->Driver.Bitmap = r200Bitmap;
+}
diff --git a/r200/r200_pixel.h b/r200/r200_pixel.h
new file mode 100644
index 0000000..8f3923b
--- /dev/null
+++ b/r200/r200_pixel.h
@@ -0,0 +1,40 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_pixel.h,v 1.1 2002/10/30 12:51:52 alanh Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef __R200_PIXEL_H__
+#define __R200_PIXEL_H__
+
+extern void r200InitPixelFuncs( GLcontext *ctx ); /* sets pixel hooks in ctx->Driver */
+
+#endif /* __R200_PIXEL_H__ */
diff --git a/r200/r200_reg.h b/r200/r200_reg.h
new file mode 100644
index 0000000..a88ea4c
--- /dev/null
+++ b/r200/r200_reg.h
@@ -0,0 +1,1586 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_reg.h,v 1.2 2002/12/16 16:18:54 dawes Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#ifndef _R200_REG_H_
+#define _R200_REG_H_
+
+#define R200_PP_MISC                      0x1c14 
+#define     R200_REF_ALPHA_MASK        0x000000ff
+#define     R200_ALPHA_TEST_FAIL       (0 << 8)
+#define     R200_ALPHA_TEST_LESS       (1 << 8)
+#define     R200_ALPHA_TEST_LEQUAL     (2 << 8)
+#define     R200_ALPHA_TEST_EQUAL      (3 << 8)
+#define     R200_ALPHA_TEST_GEQUAL     (4 << 8)
+#define     R200_ALPHA_TEST_GREATER    (5 << 8)
+#define     R200_ALPHA_TEST_NEQUAL     (6 << 8)
+#define     R200_ALPHA_TEST_PASS       (7 << 8)
+#define     R200_ALPHA_TEST_OP_MASK    (7 << 8)
+#define     R200_CHROMA_FUNC_FAIL      (0 << 16)
+#define     R200_CHROMA_FUNC_PASS      (1 << 16)
+#define     R200_CHROMA_FUNC_NEQUAL    (2 << 16)
+#define     R200_CHROMA_FUNC_EQUAL     (3 << 16)
+#define     R200_CHROMA_KEY_NEAREST    (0 << 18)
+#define     R200_CHROMA_KEY_ZERO       (1 << 18)
+#define     R200_RIGHT_HAND_CUBE_D3D   (0 << 24)
+#define     R200_RIGHT_HAND_CUBE_OGL   (1 << 24)
+#define R200_PP_FOG_COLOR                 0x1c18 
+#define     R200_FOG_COLOR_MASK        0x00ffffff
+#define     R200_FOG_VERTEX            (0 << 24)
+#define     R200_FOG_TABLE             (1 << 24)
+#define     R200_FOG_USE_DEPTH         (0 << 25)
+#define     R200_FOG_USE_W             (1 << 25)
+#define     R200_FOG_USE_DIFFUSE_ALPHA (2 << 25)
+#define     R200_FOG_USE_SPEC_ALPHA    (3 << 25)
+#define     R200_FOG_USE_VTX_FOG       (4 << 25)
+#define     R200_FOG_USE_MASK          (7 << 25)
+#define R200_RE_SOLID_COLOR               0x1c1c 
+#define R200_RB3D_BLENDCNTL               0x1c20
+#define     R200_COMB_FCN_MASK                    (7  << 12)
+#define     R200_COMB_FCN_ADD_CLAMP               (0  << 12)
+#define     R200_COMB_FCN_ADD_NOCLAMP             (1  << 12)
+#define     R200_COMB_FCN_SUB_CLAMP               (2  << 12)
+#define     R200_COMB_FCN_SUB_NOCLAMP             (3  << 12)
+#define     R200_COMB_FCN_MIN                     (4  << 12)
+#define     R200_COMB_FCN_MAX                     (5  << 12)
+#define     R200_COMB_FCN_RSUB_CLAMP              (6  << 12)
+#define     R200_COMB_FCN_RSUB_NOCLAMP            (7  << 12)
+#define       R200_BLEND_GL_ZERO                  (32)
+#define       R200_BLEND_GL_ONE                   (33)
+#define       R200_BLEND_GL_SRC_COLOR             (34)
+#define       R200_BLEND_GL_ONE_MINUS_SRC_COLOR   (35)
+#define       R200_BLEND_GL_DST_COLOR             (36)
+#define       R200_BLEND_GL_ONE_MINUS_DST_COLOR   (37)
+#define       R200_BLEND_GL_SRC_ALPHA             (38)
+#define       R200_BLEND_GL_ONE_MINUS_SRC_ALPHA   (39)
+#define       R200_BLEND_GL_DST_ALPHA             (40)
+#define       R200_BLEND_GL_ONE_MINUS_DST_ALPHA   (41)
+#define       R200_BLEND_GL_SRC_ALPHA_SATURATE    (42) /* src factor only */
+#define       R200_BLEND_GL_CONST_COLOR           (43)
+#define       R200_BLEND_GL_ONE_MINUS_CONST_COLOR (44)
+#define       R200_BLEND_GL_CONST_ALPHA           (45)
+#define       R200_BLEND_GL_ONE_MINUS_CONST_ALPHA (46)
+#define       R200_BLEND_MASK                     (63)
+#define     R200_SRC_BLEND_SHIFT                  (16)
+#define     R200_DST_BLEND_SHIFT                  (24)
+#define R200_RB3D_DEPTHOFFSET             0x1c24
+#define R200_RB3D_DEPTHPITCH              0x1c28
+#define     R200_DEPTHPITCH_MASK         0x00001ff8
+#define     R200_DEPTH_HYPERZ            (3 << 16)
+#define     R200_DEPTH_ENDIAN_NO_SWAP    (0 << 18)
+#define     R200_DEPTH_ENDIAN_WORD_SWAP  (1 << 18)
+#define     R200_DEPTH_ENDIAN_DWORD_SWAP (2 << 18)
+#define R200_RB3D_ZSTENCILCNTL            0x1c2c 
+#define     R200_DEPTH_FORMAT_MASK          (0xf << 0)
+#define     R200_DEPTH_FORMAT_16BIT_INT_Z   (0  <<  0)
+#define     R200_DEPTH_FORMAT_24BIT_INT_Z   (2  <<  0)
+#define     R200_DEPTH_FORMAT_24BIT_FLOAT_Z (3  <<  0)
+#define     R200_DEPTH_FORMAT_32BIT_INT_Z   (4  <<  0)
+#define     R200_DEPTH_FORMAT_32BIT_FLOAT_Z (5  <<  0)
+#define     R200_DEPTH_FORMAT_24BIT_FLOAT_W (9  <<  0)
+#define     R200_DEPTH_FORMAT_32BIT_FLOAT_W (11 <<  0)
+#define     R200_Z_TEST_NEVER               (0  <<  4)
+#define     R200_Z_TEST_LESS                (1  <<  4)
+#define     R200_Z_TEST_LEQUAL              (2  <<  4)
+#define     R200_Z_TEST_EQUAL               (3  <<  4)
+#define     R200_Z_TEST_GEQUAL              (4  <<  4)
+#define     R200_Z_TEST_GREATER             (5  <<  4)
+#define     R200_Z_TEST_NEQUAL              (6  <<  4)
+#define     R200_Z_TEST_ALWAYS              (7  <<  4)
+#define     R200_Z_TEST_MASK                (7  <<  4)
+#define     R200_Z_HIERARCHY_ENABLE         (1  <<  8)
+#define     R200_STENCIL_TEST_NEVER         (0  << 12)
+#define     R200_STENCIL_TEST_LESS          (1  << 12)
+#define     R200_STENCIL_TEST_LEQUAL        (2  << 12)
+#define     R200_STENCIL_TEST_EQUAL         (3  << 12)
+#define     R200_STENCIL_TEST_GEQUAL        (4  << 12)
+#define     R200_STENCIL_TEST_GREATER       (5  << 12)
+#define     R200_STENCIL_TEST_NEQUAL        (6  << 12)
+#define     R200_STENCIL_TEST_ALWAYS        (7  << 12)
+#define     R200_STENCIL_TEST_MASK          (0x7 << 12)
+#define     R200_STENCIL_FAIL_KEEP          (0  << 16)
+#define     R200_STENCIL_FAIL_ZERO          (1  << 16)
+#define     R200_STENCIL_FAIL_REPLACE       (2  << 16)
+#define     R200_STENCIL_FAIL_INC           (3  << 16)
+#define     R200_STENCIL_FAIL_DEC           (4  << 16)
+#define     R200_STENCIL_FAIL_INVERT        (5  << 16)
+#define     R200_STENCIL_FAIL_INC_WRAP      (6  << 16)
+#define     R200_STENCIL_FAIL_DEC_WRAP      (7  << 16)
+#define     R200_STENCIL_FAIL_MASK          (0x7 << 16)
+#define     R200_STENCIL_ZPASS_KEEP         (0  << 20)
+#define     R200_STENCIL_ZPASS_ZERO         (1  << 20)
+#define     R200_STENCIL_ZPASS_REPLACE      (2  << 20)
+#define     R200_STENCIL_ZPASS_INC          (3  << 20)
+#define     R200_STENCIL_ZPASS_DEC          (4  << 20)
+#define     R200_STENCIL_ZPASS_INVERT       (5  << 20)
+#define     R200_STENCIL_ZPASS_INC_WRAP     (6  << 20)
+#define     R200_STENCIL_ZPASS_DEC_WRAP     (7  << 20)
+#define     R200_STENCIL_ZPASS_MASK         (0x7 << 20)
+#define     R200_STENCIL_ZFAIL_KEEP         (0  << 24)
+#define     R200_STENCIL_ZFAIL_ZERO         (1  << 24)
+#define     R200_STENCIL_ZFAIL_REPLACE      (2  << 24)
+#define     R200_STENCIL_ZFAIL_INC          (3  << 24)
+#define     R200_STENCIL_ZFAIL_DEC          (4  << 24)
+#define     R200_STENCIL_ZFAIL_INVERT       (5  << 24)
+#define     R200_STENCIL_ZFAIL_INC_WRAP     (6  << 24)
+#define     R200_STENCIL_ZFAIL_DEC_WRAP     (7  << 24)
+#define     R200_STENCIL_ZFAIL_MASK         (0x7 << 24)
+#define     R200_Z_COMPRESSION_ENABLE       (1  << 28)
+#define     R200_FORCE_Z_DIRTY              (1  << 29)
+#define     R200_Z_WRITE_ENABLE             (1  << 30)
+#define     R200_Z_DECOMPRESSION_ENABLE     (1  << 31)
+/*gap*/
+#define R200_PP_CNTL                      0x1c38 
+#define     R200_TEX_0_ENABLE                         0x00000010
+#define     R200_TEX_1_ENABLE                         0x00000020
+#define     R200_TEX_2_ENABLE                         0x00000040
+#define     R200_TEX_3_ENABLE                         0x00000080
+#define     R200_TEX_4_ENABLE                         0x00000100
+#define     R200_TEX_5_ENABLE                         0x00000200
+#define     R200_TEX_ENABLE_MASK                      0x000003f0
+#define     R200_FILTER_ROUND_MODE_MASK               0x00000400
+#define     R200_TEX_BLEND_7_ENABLE                   0x00000800
+#define     R200_TEX_BLEND_0_ENABLE                   0x00001000
+#define     R200_TEX_BLEND_1_ENABLE                   0x00002000
+#define     R200_TEX_BLEND_2_ENABLE                   0x00004000
+#define     R200_TEX_BLEND_3_ENABLE                   0x00008000
+#define     R200_TEX_BLEND_4_ENABLE                   0x00010000
+#define     R200_TEX_BLEND_5_ENABLE                   0x00020000
+#define     R200_TEX_BLEND_6_ENABLE                   0x00040000
+#define     R200_TEX_BLEND_ENABLE_MASK                0x0007f800
+#define     R200_TEX_BLEND_0_ENABLE_SHIFT             (12)
+#define     R200_MULTI_PASS_ENABLE                    0x00080000
+#define     R200_SPECULAR_ENABLE                      0x00200000
+#define     R200_FOG_ENABLE                           0x00400000
+#define     R200_ALPHA_TEST_ENABLE                    0x00800000
+#define     R200_ANTI_ALIAS_NONE                       0x00000000
+#define     R200_ANTI_ALIAS_LINE                       0x01000000
+#define     R200_ANTI_ALIAS_POLY                       0x02000000
+#define     R200_ANTI_ALIAS_MASK                       0x03000000
+#define R200_RB3D_CNTL                    0x1c3c 
+#define     R200_ALPHA_BLEND_ENABLE       (1  <<  0)
+#define     R200_PLANE_MASK_ENABLE        (1  <<  1)
+#define     R200_DITHER_ENABLE            (1  <<  2)
+#define     R200_ROUND_ENABLE             (1  <<  3)
+#define     R200_SCALE_DITHER_ENABLE      (1  <<  4)
+#define     R200_DITHER_INIT              (1  <<  5)
+#define     R200_ROP_ENABLE               (1  <<  6)
+#define     R200_STENCIL_ENABLE           (1  <<  7)
+#define     R200_Z_ENABLE                 (1  <<  8)
+#define     R200_DEPTH_XZ_OFFEST_ENABLE   (1  <<  9)
+#define     R200_COLOR_FORMAT_ARGB1555    (3  << 10)
+#define     R200_COLOR_FORMAT_RGB565      (4  << 10)
+#define     R200_COLOR_FORMAT_ARGB8888    (6  << 10)
+#define     R200_COLOR_FORMAT_RGB332      (7  << 10)
+#define     R200_COLOR_FORMAT_Y8          (8  << 10)
+#define     R200_COLOR_FORMAT_RGB8        (9  << 10)
+#define     R200_COLOR_FORMAT_YUV422_VYUY (11 << 10)
+#define     R200_COLOR_FORMAT_YUV422_YVYU (12 << 10)
+#define     R200_COLOR_FORMAT_aYUV444     (14 << 10)
+#define     R200_COLOR_FORMAT_ARGB4444    (15 << 10)
+#define     R200_CLRCMP_FLIP_ENABLE       (1  << 14)
+#define     R200_SEPARATE_ALPHA_ENABLE    (1  << 16)
+#define R200_RB3D_COLOROFFSET             0x1c40 
+#define     R200_COLOROFFSET_MASK      0xfffffff0
+#define R200_RE_WIDTH_HEIGHT              0x1c44 
+#define     R200_RE_WIDTH_SHIFT        0
+#define     R200_RE_HEIGHT_SHIFT       16
+#define R200_RB3D_COLORPITCH              0x1c48 
+#define     R200_COLORPITCH_MASK         0x000001ff8
+#define     R200_COLOR_TILE_ENABLE       (1 << 16)
+#define     R200_COLOR_MICROTILE_ENABLE  (1 << 17)
+#define     R200_COLOR_ENDIAN_NO_SWAP    (0 << 18)
+#define     R200_COLOR_ENDIAN_WORD_SWAP  (1 << 18)
+#define     R200_COLOR_ENDIAN_DWORD_SWAP (2 << 18)
+#define R200_SE_CNTL                      0x1c4c 
+#define     R200_FFACE_CULL_CW          (0 <<  0)
+#define     R200_FFACE_CULL_CCW         (1 <<  0)
+#define     R200_FFACE_CULL_DIR_MASK    (1 <<  0)
+#define     R200_BFACE_CULL             (0 <<  1)
+#define     R200_BFACE_SOLID            (3 <<  1)
+#define     R200_FFACE_CULL             (0 <<  3)
+#define     R200_FFACE_SOLID            (3 <<  3)
+#define     R200_FFACE_CULL_MASK        (3 <<  3)
+#define     R200_FLAT_SHADE_VTX_0       (0 <<  6)
+#define     R200_FLAT_SHADE_VTX_1       (1 <<  6)
+#define     R200_FLAT_SHADE_VTX_2       (2 <<  6)
+#define     R200_FLAT_SHADE_VTX_LAST    (3 <<  6)
+#define     R200_DIFFUSE_SHADE_SOLID    (0 <<  8) /* each *_SHADE_* group is a 2-bit field, see its _MASK */
+#define     R200_DIFFUSE_SHADE_FLAT     (1 <<  8)
+#define     R200_DIFFUSE_SHADE_GOURAUD  (2 <<  8)
+#define     R200_DIFFUSE_SHADE_MASK     (3 <<  8)
+#define     R200_ALPHA_SHADE_SOLID      (0 << 10)
+#define     R200_ALPHA_SHADE_FLAT       (1 << 10)
+#define     R200_ALPHA_SHADE_GOURAUD    (2 << 10)
+#define     R200_ALPHA_SHADE_MASK       (3 << 10)
+#define     R200_SPECULAR_SHADE_SOLID   (0 << 12)
+#define     R200_SPECULAR_SHADE_FLAT    (1 << 12)
+#define     R200_SPECULAR_SHADE_GOURAUD (2 << 12)
+#define     R200_SPECULAR_SHADE_MASK    (3 << 12)
+#define     R200_FOG_SHADE_SOLID        (0 << 14)
+#define     R200_FOG_SHADE_FLAT         (1 << 14)
+#define     R200_FOG_SHADE_GOURAUD      (2 << 14)
+#define     R200_FOG_SHADE_MASK         (3 << 14)
+#define     R200_ZBIAS_ENABLE_POINT     (1 << 16)
+#define     R200_ZBIAS_ENABLE_LINE      (1 << 17)
+#define     R200_ZBIAS_ENABLE_TRI       (1 << 18)
+#define     R200_WIDELINE_ENABLE        (1 << 20)
+#define     R200_DISC_FOG_SHADE_SOLID   (0 << 24)
+#define     R200_DISC_FOG_SHADE_FLAT    (1 << 24)
+#define     R200_DISC_FOG_SHADE_GOURAUD (2 << 24)
+#define     R200_DISC_FOG_SHADE_MASK    (3 << 24)
+#define     R200_VTX_PIX_CENTER_D3D     (0 << 27)
+#define     R200_VTX_PIX_CENTER_OGL     (1 << 27)
+#define     R200_ROUND_MODE_TRUNC       (0 << 28)
+#define     R200_ROUND_MODE_ROUND       (1 << 28)
+#define     R200_ROUND_MODE_ROUND_EVEN  (2 << 28)
+#define     R200_ROUND_MODE_ROUND_ODD   (3 << 28)
+#define     R200_ROUND_PREC_16TH_PIX    (0u << 30) /* bits 30..31: unsigned so the shift never reaches the int sign bit */
+#define     R200_ROUND_PREC_8TH_PIX     (1u << 30)
+#define     R200_ROUND_PREC_4TH_PIX     (2u << 30) /* was (2 << 30): signed overflow */
+#define     R200_ROUND_PREC_HALF_PIX    (3u << 30) /* was (3 << 30): signed overflow */
+#define R200_RE_CNTL                      0x1c50 
+#define     R200_STIPPLE_ENABLE                     0x1
+#define     R200_SCISSOR_ENABLE                     0x2
+#define     R200_PATTERN_ENABLE                     0x4
+#define     R200_PERSPECTIVE_ENABLE                 0x8
+#define     R200_POINT_SMOOTH                       0x20 /* bit 5; bit 4 (0x10) has no name here */
+#define     R200_VTX_STQ0_D3D                       0x00010000 /* bit 16; STQn_D3D flags sit on every second bit, 16..26 */
+#define     R200_VTX_STQ1_D3D                       0x00040000
+#define     R200_VTX_STQ2_D3D                       0x00100000
+#define     R200_VTX_STQ3_D3D                       0x00400000
+#define     R200_VTX_STQ4_D3D                       0x01000000
+#define     R200_VTX_STQ5_D3D                       0x04000000
+/* gap */
+#define R200_RE_STIPPLE_ADDR              0x1cc8
+#define R200_RE_STIPPLE_DATA              0x1ccc
+#define R200_RE_LINE_PATTERN              0x1cd0 
+#define     R200_LINE_PATTERN_MASK             0x0000ffff /* low 16 bits */
+#define     R200_LINE_REPEAT_COUNT_SHIFT       16
+#define     R200_LINE_PATTERN_START_SHIFT      24
+#define     R200_LINE_PATTERN_LITTLE_BIT_ORDER (0 << 28) /* bit 28 clear */
+#define     R200_LINE_PATTERN_BIG_BIT_ORDER    (1 << 28) /* bit 28 set */
+#define     R200_LINE_PATTERN_AUTO_RESET       (1 << 29)
+#define R200_RE_LINE_STATE                0x1cd4 
+#define     R200_LINE_CURRENT_PTR_SHIFT       0
+#define     R200_LINE_CURRENT_COUNT_SHIFT     8
+#define R200_RE_SCISSOR_TL_0              0x1cd8 /* TL_n/BR_n pairs follow at consecutive 4-byte steps */
+#define R200_RE_SCISSOR_BR_0              0x1cdc
+#define R200_RE_SCISSOR_TL_1              0x1ce0
+#define R200_RE_SCISSOR_BR_1              0x1ce4
+#define R200_RE_SCISSOR_TL_2              0x1ce8
+#define R200_RE_SCISSOR_BR_2              0x1cec
+/* gap */
+#define R200_RB3D_DEPTHXY_OFFSET          0x1d60 
+#define     R200_DEPTHX_SHIFT  0  /* X component starts at bit 0 */
+#define     R200_DEPTHY_SHIFT  16 /* Y component starts at bit 16 */
+/* gap */
+#define R200_RB3D_STENCILREFMASK          0x1d7c 
+#define     R200_STENCIL_REF_SHIFT           0
+#define     R200_STENCIL_REF_MASK            (0xffu << 0)  /* bits 0..7 */
+#define     R200_STENCIL_MASK_SHIFT          16
+#define     R200_STENCIL_VALUE_MASK          (0xffu << 16) /* bits 16..23 */
+#define     R200_STENCIL_WRITEMASK_SHIFT     24
+#define     R200_STENCIL_WRITE_MASK          (0xffu << 24) /* bits 24..31; unsigned: 0xff << 24 overflows int */
+#define R200_RB3D_ROPCNTL                 0x1d80 
+#define     R200_ROP_MASK                    (15 << 8) /* 4-bit field, bits 8..11; codes 0..15 below */
+#define     R200_ROP_CLEAR                   (0  << 8)
+#define     R200_ROP_NOR                     (1  << 8)
+#define     R200_ROP_AND_INVERTED            (2  << 8)
+#define     R200_ROP_COPY_INVERTED           (3  << 8)
+#define     R200_ROP_AND_REVERSE             (4  << 8)
+#define     R200_ROP_INVERT                  (5  << 8)
+#define     R200_ROP_XOR                     (6  << 8)
+#define     R200_ROP_NAND                    (7  << 8)
+#define     R200_ROP_AND                     (8  << 8)
+#define     R200_ROP_EQUIV                   (9  << 8)
+#define     R200_ROP_NOOP                    (10 << 8)
+#define     R200_ROP_OR_INVERTED             (11 << 8)
+#define     R200_ROP_COPY                    (12 << 8)
+#define     R200_ROP_OR_REVERSE              (13 << 8)
+#define     R200_ROP_OR                      (14 << 8)
+#define     R200_ROP_SET                     (15 << 8) /* equals R200_ROP_MASK (all four bits set) */
+#define R200_RB3D_PLANEMASK               0x1d84 
+/* gap */
+#define R200_SE_VPORT_XSCALE              0x1d98 /* 0x1d98..0x1db8: nine entries at consecutive 4-byte steps */
+#define R200_SE_VPORT_XOFFSET             0x1d9c 
+#define R200_SE_VPORT_YSCALE              0x1da0 
+#define R200_SE_VPORT_YOFFSET             0x1da4 
+#define R200_SE_VPORT_ZSCALE              0x1da8 
+#define R200_SE_VPORT_ZOFFSET             0x1dac 
+#define R200_SE_ZBIAS_FACTOR              0x1db0 
+#define R200_SE_ZBIAS_CONSTANT            0x1db4 
+#define R200_SE_LINE_WIDTH                0x1db8 
+#define	    R200_LINE_WIDTH_SHIFT                   0x00000000 /* shift count 0, written in hex */
+#define	    R200_MINPOINTSIZE_SHIFT                 0x00000010 /* shift count 16 (0x10), not a bit mask */
+/* gap */
+#define R200_SE_VAP_CNTL                           0x2080
+#define     R200_VAP_TCL_ENABLE                       0x00000001 /* bit 0 */
+#define     R200_VAP_PROG_VTX_SHADER_ENABLE           0x00000004 /* bit 2 */
+#define     R200_VAP_SINGLE_BUF_STATE_ENABLE          0x00000010 /* bit 4 */
+#define     R200_VAP_FORCE_W_TO_ONE                   0x00010000 /* bit 16 */
+#define     R200_VAP_D3D_TEX_DEFAULT                  0x00020000 /* bit 17 */
+#define     R200_VAP_VF_MAX_VTX_NUM__SHIFT            18 /* a shift count, unlike the bit masks around it */
+#define     R200_VAP_DX_CLIP_SPACE_DEF                0x00400000 /* bit 22 */
+#define R200_SE_VF_CNTL                           0x2084
+#define     R200_VF_PRIM_NONE                         0x00000000
+#define     R200_VF_PRIM_POINTS                       0x00000001
+#define     R200_VF_PRIM_LINES                        0x00000002
+#define     R200_VF_PRIM_LINE_STRIP                   0x00000003
+#define     R200_VF_PRIM_TRIANGLES                    0x00000004
+#define     R200_VF_PRIM_TRIANGLE_FAN                 0x00000005
+#define     R200_VF_PRIM_TRIANGLE_STRIP               0x00000006
+#define     R200_VF_PRIM_RECT_LIST                    0x00000008 /* code 0x7 has no name in this list */
+#define     R200_VF_PRIM_3VRT_POINTS                  0x00000009
+#define     R200_VF_PRIM_3VRT_LINES                   0x0000000a
+#define     R200_VF_PRIM_POINT_SPRITES                0x0000000b
+#define     R200_VF_PRIM_LINE_LOOP                    0x0000000c
+#define     R200_VF_PRIM_QUADS                        0x0000000d
+#define     R200_VF_PRIM_QUAD_STRIP                   0x0000000e
+#define     R200_VF_PRIM_POLYGON                      0x0000000f
+#define     R200_VF_PRIM_MASK                         0x0000000f /* primitive code occupies bits 0..3 */
+#define     R200_VF_PRIM_WALK_IND                     0x00000010
+#define     R200_VF_PRIM_WALK_LIST                    0x00000020
+#define     R200_VF_PRIM_WALK_RING                    0x00000030
+#define     R200_VF_PRIM_WALK_MASK                    0x00000030 /* walk mode occupies bits 4..5 */
+#define     R200_VF_COLOR_ORDER_RGBA                  0x00000040
+#define     R200_VF_TCL_OUTPUT_VTX_ENABLE             0x00000200
+#define     R200_VF_INDEX_SZ_4                        0x00000800
+#define     R200_VF_VERTEX_NUMBER_MASK                0xffff0000 /* upper 16 bits */
+#define     R200_VF_VERTEX_NUMBER_SHIFT               16
+#define R200_SE_VTX_FMT_0                 0x2088
+#define     R200_VTX_XY                     0 /* always have xy */
+#define     R200_VTX_Z0                     (1<<0)
+#define     R200_VTX_W0                     (1<<1)
+#define     R200_VTX_WEIGHT_COUNT_SHIFT     (2)
+#define     R200_VTX_PV_MATRIX_SEL          (1<<5)
+#define     R200_VTX_N0                     (1<<6)
+#define     R200_VTX_POINT_SIZE             (1<<7)
+#define     R200_VTX_DISCRETE_FOG           (1<<8)
+#define     R200_VTX_SHININESS_0            (1<<9)
+#define     R200_VTX_SHININESS_1            (1<<10)
+#define       R200_VTX_COLOR_NOT_PRESENT      0 /* unshifted codes; presumably placed with R200_VTX_COLOR_n_SHIFT */
+#define       R200_VTX_PK_RGBA          1
+#define       R200_VTX_FP_RGB           2
+#define       R200_VTX_FP_RGBA          3
+#define       R200_VTX_COLOR_MASK             3 /* unshifted 2-bit mask */
+#define     R200_VTX_COLOR_0_SHIFT          11 /* color shifts step by 2, from bit 11 to bit 25 */
+#define     R200_VTX_COLOR_1_SHIFT          13
+#define     R200_VTX_COLOR_2_SHIFT          15
+#define     R200_VTX_COLOR_3_SHIFT          17
+#define     R200_VTX_COLOR_4_SHIFT          19
+#define     R200_VTX_COLOR_5_SHIFT          21
+#define     R200_VTX_COLOR_6_SHIFT          23
+#define     R200_VTX_COLOR_7_SHIFT          25
+#define     R200_VTX_XY1                    (1<<28)
+#define     R200_VTX_Z1                     (1<<29)
+#define     R200_VTX_W1                     (1<<30)
+#define     R200_VTX_N1                     (1u<<31) /* unsigned: 1<<31 overflows a 32-bit int */
+#define R200_SE_VTX_FMT_1                 0x208c
+#define     R200_VTX_TEX0_COMP_CNT_SHIFT        0  /* per-texture shifts step by 3 bits: 0, 3, ..., 15 */
+#define     R200_VTX_TEX1_COMP_CNT_SHIFT        3
+#define     R200_VTX_TEX2_COMP_CNT_SHIFT        6
+#define     R200_VTX_TEX3_COMP_CNT_SHIFT        9
+#define     R200_VTX_TEX4_COMP_CNT_SHIFT        12
+#define     R200_VTX_TEX5_COMP_CNT_SHIFT        15
+#define R200_SE_TCL_OUTPUT_VTX_FMT_0      0x2090 /* no field names are listed here for the two OUTPUT_VTX_FMT entries */
+#define R200_SE_TCL_OUTPUT_VTX_FMT_1      0x2094 
+/* gap */
+#define R200_SE_VTE_CNTL                  0x20b0
+#define     R200_VPORT_X_SCALE_ENA                0x00000001 /* bits 0..5: scale/offset enables for X, Y, Z */
+#define     R200_VPORT_X_OFFSET_ENA               0x00000002
+#define     R200_VPORT_Y_SCALE_ENA                0x00000004
+#define     R200_VPORT_Y_OFFSET_ENA               0x00000008
+#define     R200_VPORT_Z_SCALE_ENA                0x00000010
+#define     R200_VPORT_Z_OFFSET_ENA               0x00000020
+#define     R200_VTX_XY_FMT                       0x00000100 /* bits 8..12 hold the five VTX_* flags */
+#define     R200_VTX_Z_FMT                        0x00000200
+#define     R200_VTX_W0_FMT                       0x00000400
+#define     R200_VTX_W0_NORMALIZE                 0x00000800
+#define     R200_VTX_ST_DENORMALIZED              0x00001000
+/* gap */
+#define R200_SE_VTX_NUM_ARRAYS            0x20c0 /* 0x20c0..0x2110 are consecutive 4-byte steps */
+#define R200_SE_VTX_AOS_ATTR01            0x20c4 /* pattern: one ATTRnm entry, then ADDRn and ADDRm */
+#define R200_SE_VTX_AOS_ADDR0             0x20c8
+#define R200_SE_VTX_AOS_ADDR1             0x20cc
+#define R200_SE_VTX_AOS_ATTR23            0x20d0
+#define R200_SE_VTX_AOS_ADDR2             0x20d4
+#define R200_SE_VTX_AOS_ADDR3             0x20d8
+#define R200_SE_VTX_AOS_ATTR45            0x20dc
+#define R200_SE_VTX_AOS_ADDR4             0x20e0
+#define R200_SE_VTX_AOS_ADDR5             0x20e4
+#define R200_SE_VTX_AOS_ATTR67            0x20e8
+#define R200_SE_VTX_AOS_ADDR6             0x20ec
+#define R200_SE_VTX_AOS_ADDR7             0x20f0
+#define R200_SE_VTX_AOS_ATTR89            0x20f4
+#define R200_SE_VTX_AOS_ADDR8             0x20f8
+#define R200_SE_VTX_AOS_ADDR9             0x20fc
+#define R200_SE_VTX_AOS_ATTR1011          0x2100
+#define R200_SE_VTX_AOS_ADDR10            0x2104
+#define R200_SE_VTX_AOS_ADDR11            0x2108
+#define R200_SE_VF_MAX_VTX_INDX           0x210c
+#define R200_SE_VF_MIN_VTX_INDX           0x2110
+/* gap */
+#define R200_SE_VAP_CNTL_STATUS           0x2140
+#define     R200_VC_NO_SWAP                  (0 << 0) /* swap selector values 0..2 in the lowest bits */
+#define     R200_VC_16BIT_SWAP               (1 << 0)
+#define     R200_VC_32BIT_SWAP               (2 << 0)
+/* gap */
+#define R200_SE_VTX_STATE_CNTL                     0x2180
+#define     R200_VSC_COLOR_0_ASSEMBLY_CNTL_SHIFT    0x00000000 /* shift counts in hex, stepping by 2: 0, 2, ..., 14 */
+#define     R200_VSC_COLOR_1_ASSEMBLY_CNTL_SHIFT    0x00000002
+#define     R200_VSC_COLOR_2_ASSEMBLY_CNTL_SHIFT    0x00000004
+#define     R200_VSC_COLOR_3_ASSEMBLY_CNTL_SHIFT    0x00000006
+#define     R200_VSC_COLOR_4_ASSEMBLY_CNTL_SHIFT    0x00000008
+#define     R200_VSC_COLOR_5_ASSEMBLY_CNTL_SHIFT    0x0000000a
+#define     R200_VSC_COLOR_6_ASSEMBLY_CNTL_SHIFT    0x0000000c
+#define     R200_VSC_COLOR_7_ASSEMBLY_CNTL_SHIFT    0x0000000e
+#define     R200_VSC_UPDATE_USER_COLOR_0_ENABLE    0x00010000 /* bit 16 (a mask, not a shift) */
+#define     R200_VSC_UPDATE_USER_COLOR_1_ENABLE    0x00020000 /* bit 17 (a mask, not a shift) */
+/* gap */
+#define R200_SE_TCL_VECTOR_INDX_REG                0x2200 /* INDX/DATA pair for vectors: 0x2200, 0x2204 */
+#define R200_SE_TCL_VECTOR_DATA_REG                0x2204
+#define R200_SE_TCL_SCALAR_INDX_REG                0x2208 /* INDX/DATA pair for scalars: 0x2208, 0x220c */
+#define R200_SE_TCL_SCALAR_DATA_REG                0x220c
+/* gap */
+#define R200_SE_TCL_MATRIX_SEL_0                   0x2230 /* every MATRIX_SEL_n uses shifts 0/8/16/24 */
+#define     R200_MODELVIEW_0_SHIFT           (0) 
+#define     R200_MODELVIEW_1_SHIFT           (8) 
+#define     R200_MODELVIEW_2_SHIFT           (16) 
+#define     R200_MODELVIEW_3_SHIFT           (24) 
+#define R200_SE_TCL_MATRIX_SEL_1                   0x2234
+#define     R200_IT_MODELVIEW_0_SHIFT        (0)
+#define     R200_IT_MODELVIEW_1_SHIFT        (8) 
+#define     R200_IT_MODELVIEW_2_SHIFT        (16)
+#define     R200_IT_MODELVIEW_3_SHIFT        (24)
+#define R200_SE_TCL_MATRIX_SEL_2                   0x2238
+#define     R200_MODELPROJECT_0_SHIFT         (0) 
+#define     R200_MODELPROJECT_1_SHIFT         (8) 
+#define     R200_MODELPROJECT_2_SHIFT         (16) 
+#define     R200_MODELPROJECT_3_SHIFT         (24) 
+#define R200_SE_TCL_MATRIX_SEL_3                   0x223c
+#define     R200_TEXMAT_0_SHIFT    0
+#define     R200_TEXMAT_1_SHIFT    8
+#define     R200_TEXMAT_2_SHIFT    16
+#define     R200_TEXMAT_3_SHIFT    24
+#define R200_SE_TCL_MATRIX_SEL_4                   0x2240 /* only two shifts (TEXMAT_4/5) are listed for this entry */
+#define     R200_TEXMAT_4_SHIFT    0
+#define     R200_TEXMAT_5_SHIFT    8
+/* gap */
+#define R200_SE_TCL_OUTPUT_VTX_COMP_SEL     0x2250
+#define     R200_OUTPUT_XYZW                    (1<<0)
+#define     R200_OUTPUT_COLOR_0                 (1<<8)
+#define     R200_OUTPUT_COLOR_1                 (1<<9)
+#define     R200_OUTPUT_TEX_0                   (1<<16)
+#define     R200_OUTPUT_TEX_1                   (1<<17)
+#define     R200_OUTPUT_TEX_2                   (1<<18)
+#define     R200_OUTPUT_TEX_3                   (1<<19)
+#define     R200_OUTPUT_TEX_4                   (1<<20)
+#define     R200_OUTPUT_TEX_5                   (1<<21)
+#define     R200_OUTPUT_TEX_MASK                (0x3f<<16) /* union of R200_OUTPUT_TEX_0..5 (bits 16..21) */
+#define     R200_OUTPUT_DISCRETE_FOG            (1<<24)
+#define     R200_OUTPUT_PT_SIZE                 (1<<25)
+#define     R200_FORCE_INORDER_PROC             (1u<<31) /* unsigned: 1<<31 overflows a 32-bit int */
+#define R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0  0x2254 /* the __SHIFT values below are shift counts written in hex */
+#define	    R200_VERTEX_POSITION_ADDR__SHIFT     0x00000000
+#define R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_1  0x2258
+#define	    R200_VTX_COLOR_0_ADDR__SHIFT         0x00000000
+#define	    R200_VTX_COLOR_1_ADDR__SHIFT         0x00000008 /* 8 */
+#define R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_2  0x225c
+#define	    R200_VTX_TEX_0_ADDR__SHIFT           0x00000000
+#define	    R200_VTX_TEX_1_ADDR__SHIFT           0x00000008 /* 8 */
+#define	    R200_VTX_TEX_2_ADDR__SHIFT           0x00000010 /* 16 */
+#define	    R200_VTX_TEX_3_ADDR__SHIFT           0x00000018 /* 24 */
+#define R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_3  0x2260
+#define	    R200_VTX_TEX_4_ADDR__SHIFT           0x00000000
+#define	    R200_VTX_TEX_5_ADDR__SHIFT           0x00000008 /* 8 */
+
+/* gap */
+#define R200_SE_TCL_LIGHT_MODEL_CTL_0       0x2268 
+#define     R200_LIGHTING_ENABLE                (1<<0) /* single-bit flags occupy bits 0..10 */
+#define     R200_LIGHT_IN_MODELSPACE            (1<<1)
+#define     R200_LOCAL_VIEWER                   (1<<2)
+#define     R200_NORMALIZE_NORMALS              (1<<3)
+#define     R200_RESCALE_NORMALS                (1<<4)
+#define     R200_SPECULAR_LIGHTS                (1<<5)
+#define     R200_DIFFUSE_SPECULAR_COMBINE       (1<<6)
+#define     R200_LIGHT_ALPHA                    (1<<7)
+#define     R200_LOCAL_LIGHT_VEC_GL             (1<<8)
+#define     R200_LIGHT_NO_NORMAL_AMBIENT_ONLY   (1<<9)
+#define     R200_LIGHT_TWOSIDE                  (1<<10)
+#define     R200_FRONT_SHININESS_SOURCE_SHIFT       (0xb) /* shift count 11 */
+#define     R200_BACK_SHININESS_SOURCE_SHIFT        (0xd) /* shift count 13 */
+#define       R200_LM0_SOURCE_MATERIAL_0           (0) /* unshifted codes 0..3; presumably used with the two shifts above */
+#define       R200_LM0_SOURCE_MATERIAL_1           (1)
+#define       R200_LM0_SOURCE_VERTEX_SHININESS_0   (2)
+#define       R200_LM0_SOURCE_VERTEX_SHININESS_1   (3)
+#define R200_SE_TCL_LIGHT_MODEL_CTL_1       0x226c 
+#define       R200_LM1_SOURCE_LIGHT_PREMULT        (0) /* unshifted codes; presumably placed with the *_SOURCE_SHIFT values below */
+#define       R200_LM1_SOURCE_MATERIAL_0           (1)
+#define       R200_LM1_SOURCE_VERTEX_COLOR_0       (2)
+#define       R200_LM1_SOURCE_VERTEX_COLOR_1       (3)
+#define       R200_LM1_SOURCE_VERTEX_COLOR_2       (4)
+#define       R200_LM1_SOURCE_VERTEX_COLOR_3       (5)
+#define       R200_LM1_SOURCE_VERTEX_COLOR_4       (6)
+#define       R200_LM1_SOURCE_VERTEX_COLOR_5       (7)
+#define       R200_LM1_SOURCE_VERTEX_COLOR_6       (8)
+#define       R200_LM1_SOURCE_VERTEX_COLOR_7       (9)
+#define       R200_LM1_SOURCE_MATERIAL_1           (0xf) /* codes 10..14 have no name here */
+#define     R200_FRONT_EMISSIVE_SOURCE_SHIFT        (0) /* eight shifts, 4 bits apart: 0, 4, ..., 28 */
+#define     R200_FRONT_AMBIENT_SOURCE_SHIFT         (4)
+#define     R200_FRONT_DIFFUSE_SOURCE_SHIFT         (8)
+#define     R200_FRONT_SPECULAR_SOURCE_SHIFT        (12)
+#define     R200_BACK_EMISSIVE_SOURCE_SHIFT         (16)
+#define     R200_BACK_AMBIENT_SOURCE_SHIFT          (20)
+#define     R200_BACK_DIFFUSE_SOURCE_SHIFT          (24)
+#define     R200_BACK_SPECULAR_SOURCE_SHIFT         (28)
+#define R200_SE_TCL_PER_LIGHT_CTL_0       0x2270 
+#define     R200_LIGHT_0_ENABLE                    (1<<0) /* LIGHT_0_* flags use bits 0..7 */
+#define     R200_LIGHT_0_ENABLE_AMBIENT            (1<<1)
+#define     R200_LIGHT_0_ENABLE_SPECULAR           (1<<2)
+#define     R200_LIGHT_0_IS_LOCAL                  (1<<3)
+#define     R200_LIGHT_0_IS_SPOT                   (1<<4)
+#define     R200_LIGHT_0_DUAL_CONE                 (1<<5)
+#define     R200_LIGHT_0_ENABLE_RANGE_ATTEN        (1<<6)
+#define     R200_LIGHT_0_CONSTANT_RANGE_ATTEN      (1<<7)
+#define     R200_LIGHT_1_ENABLE                    (1<<16) /* LIGHT_1_* equals the matching LIGHT_0_* flag << 16 */
+#define     R200_LIGHT_1_ENABLE_AMBIENT            (1<<17)
+#define     R200_LIGHT_1_ENABLE_SPECULAR           (1<<18)
+#define     R200_LIGHT_1_IS_LOCAL                  (1<<19)
+#define     R200_LIGHT_1_IS_SPOT                   (1<<20)
+#define     R200_LIGHT_1_DUAL_CONE                 (1<<21)
+#define     R200_LIGHT_1_ENABLE_RANGE_ATTEN        (1<<22)
+#define     R200_LIGHT_1_CONSTANT_RANGE_ATTEN      (1<<23)
+#define     R200_LIGHT_0_SHIFT                   (0)
+#define     R200_LIGHT_1_SHIFT                   (16)
+#define R200_SE_TCL_PER_LIGHT_CTL_1       0x2274 
+#define     R200_LIGHT_2_SHIFT                   (0)  /* CTL_1..3 each list two light shifts: 0 and 16 */
+#define     R200_LIGHT_3_SHIFT                   (16)
+#define R200_SE_TCL_PER_LIGHT_CTL_2       0x2278 
+#define     R200_LIGHT_4_SHIFT                   (0)
+#define     R200_LIGHT_5_SHIFT                   (16)
+#define R200_SE_TCL_PER_LIGHT_CTL_3       0x227c 
+#define     R200_LIGHT_6_SHIFT                   (0)
+#define     R200_LIGHT_7_SHIFT                   (16)
+/* gap */
+#define R200_SE_TCL_TEX_PROC_CTL_2        0x22a8 /* note: CTL_2 and CTL_3 (0x22a8, 0x22ac) precede CTL_0 and CTL_1 (0x22b0, 0x22b4) */
+#define     R200_TEXGEN_COMP_MASK                (0xf) /* unshifted 4-bit mask = S|T|R|Q */
+#define     R200_TEXGEN_COMP_S                   (0x1)
+#define     R200_TEXGEN_COMP_T                   (0x2)
+#define     R200_TEXGEN_COMP_R                   (0x4)
+#define     R200_TEXGEN_COMP_Q                   (0x8)
+#define     R200_TEXGEN_0_COMP_MASK_SHIFT        (0) /* per-unit shifts step by 4 bits: 0, 4, ..., 20 */
+#define     R200_TEXGEN_1_COMP_MASK_SHIFT        (4)
+#define     R200_TEXGEN_2_COMP_MASK_SHIFT        (8)
+#define     R200_TEXGEN_3_COMP_MASK_SHIFT        (12)
+#define     R200_TEXGEN_4_COMP_MASK_SHIFT        (16)
+#define     R200_TEXGEN_5_COMP_MASK_SHIFT        (20)
+#define R200_SE_TCL_TEX_PROC_CTL_3        0x22ac 
+#define     R200_TEXGEN_0_INPUT_TEX_SHIFT        (0)
+#define     R200_TEXGEN_1_INPUT_TEX_SHIFT        (4)
+#define     R200_TEXGEN_2_INPUT_TEX_SHIFT        (8)
+#define     R200_TEXGEN_3_INPUT_TEX_SHIFT        (12)
+#define     R200_TEXGEN_4_INPUT_TEX_SHIFT        (16)
+#define     R200_TEXGEN_5_INPUT_TEX_SHIFT        (20)
+#define R200_SE_TCL_TEX_PROC_CTL_0        0x22b0 
+#define     R200_TEXGEN_TEXMAT_0_ENABLE         (1<<0) /* bits 0..5 */
+#define     R200_TEXGEN_TEXMAT_1_ENABLE         (1<<1)
+#define     R200_TEXGEN_TEXMAT_2_ENABLE         (1<<2)
+#define     R200_TEXGEN_TEXMAT_3_ENABLE         (1<<3)
+#define     R200_TEXGEN_TEXMAT_4_ENABLE         (1<<4)
+#define     R200_TEXGEN_TEXMAT_5_ENABLE         (1<<5)
+#define     R200_TEXMAT_0_ENABLE                (1<<8) /* bits 8..13 */
+#define     R200_TEXMAT_1_ENABLE                (1<<9)
+#define     R200_TEXMAT_2_ENABLE                (1<<10)
+#define     R200_TEXMAT_3_ENABLE                (1<<11)
+#define     R200_TEXMAT_4_ENABLE                (1<<12)
+#define     R200_TEXMAT_5_ENABLE                (1<<13)
+#define     R200_TEXGEN_FORCE_W_TO_ONE          (1<<16)
+#define R200_SE_TCL_TEX_PROC_CTL_1        0x22b4 
+#define       R200_TEXGEN_INPUT_MASK           (0xf) /* unshifted 4-bit mask for the input codes below */
+#define       R200_TEXGEN_INPUT_TEXCOORD_0     (0)
+#define       R200_TEXGEN_INPUT_TEXCOORD_1     (1)
+#define       R200_TEXGEN_INPUT_TEXCOORD_2     (2)
+#define       R200_TEXGEN_INPUT_TEXCOORD_3     (3)
+#define       R200_TEXGEN_INPUT_TEXCOORD_4     (4)
+#define       R200_TEXGEN_INPUT_TEXCOORD_5     (5)
+#define       R200_TEXGEN_INPUT_OBJ            (8) /* codes 6 and 7 have no name here */
+#define       R200_TEXGEN_INPUT_EYE            (9)
+#define       R200_TEXGEN_INPUT_EYE_NORMAL     (0xa)
+#define       R200_TEXGEN_INPUT_EYE_REFLECT    (0xb)
+#define       R200_TEXGEN_INPUT_SPHERE         (0xd) /* code 0xc has no name here */
+#define     R200_TEXGEN_0_INPUT_SHIFT        (0)
+#define     R200_TEXGEN_1_INPUT_SHIFT        (4)
+#define     R200_TEXGEN_2_INPUT_SHIFT        (8)
+#define     R200_TEXGEN_3_INPUT_SHIFT        (12)
+#define     R200_TEXGEN_4_INPUT_SHIFT        (16)
+#define     R200_TEXGEN_5_INPUT_SHIFT        (20)
+#define R200_SE_TC_TEX_CYL_WRAP_CTL       0x22b8 /* NOTE(review): spelled SE_TC_, not SE_TCL_ like its neighbours */
+/* gap */
+#define R200_SE_TCL_UCP_VERT_BLEND_CTL    0x22c0 
+#define     R200_UCP_IN_CLIP_SPACE              (1<<0)
+#define     R200_UCP_IN_MODEL_SPACE             (1<<1)
+#define     R200_UCP_ENABLE_0                   (1<<2) /* UCP_ENABLE_0..5 use bits 2..7 */
+#define     R200_UCP_ENABLE_1                   (1<<3)
+#define     R200_UCP_ENABLE_2                   (1<<4)
+#define     R200_UCP_ENABLE_3                   (1<<5)
+#define     R200_UCP_ENABLE_4                   (1<<6)
+#define     R200_UCP_ENABLE_5                   (1<<7)
+#define     R200_TCL_FOG_MASK                   (3<<8) /* 2-bit fog mode field, bits 8..9 */
+#define     R200_TCL_FOG_DISABLE                (0<<8)
+#define     R200_TCL_FOG_EXP                    (1<<8)
+#define     R200_TCL_FOG_EXP2                   (2<<8)
+#define     R200_TCL_FOG_LINEAR                 (3<<8)
+#define     R200_RNG_BASED_FOG                  (1<<10)
+#define     R200_CLIP_DISABLE                   (1<<11)
+#define     R200_CULL_FRONT_IS_CW               (0<<28) /* bit 28 clear */
+#define     R200_CULL_FRONT_IS_CCW              (1<<28) /* bit 28 set */
+#define     R200_CULL_FRONT                     (1<<29)
+#define     R200_CULL_BACK                      (1<<30)
+#define R200_SE_TCL_POINT_SPRITE_CNTL     0x22c4
+#define     R200_PS_MULT_PVATTENCONST           (0<<0) /* PS_MULT_* codes 0..4 live in the 3-bit field R200_PS_MULT_MASK */
+#define     R200_PS_MULT_PVATTEN                (1<<0)
+#define     R200_PS_MULT_ATTENCONST             (2<<0)
+#define     R200_PS_MULT_PVCONST                (3<<0)
+#define     R200_PS_MULT_CONST                  (4<<0)
+#define     R200_PS_MULT_MASK                   (7<<0)
+#define     R200_PS_LIN_ATT_ZERO                (1<<3)
+#define     R200_PS_USE_MODEL_EYE_VEC           (1<<4)
+#define     R200_PS_ATT_ALPHA                   (1<<5)
+#define     R200_PS_UCP_MODE_MASK               (3<<6) /* bits 6..7; no individual mode values are named here */
+#define     R200_PS_GEN_TEX_0                   (1<<8)
+#define     R200_PS_GEN_TEX_1                   (1<<9)
+#define     R200_PS_GEN_TEX_2                   (1<<10)
+#define     R200_PS_GEN_TEX_3                   (1<<11)
+#define     R200_PS_GEN_TEX_4                   (1<<12)
+#define     R200_PS_GEN_TEX_5                   (1<<13)
+#define     R200_PS_GEN_TEX_0_SHIFT             (8) /* bit position of R200_PS_GEN_TEX_0 */
+#define     R200_PS_GEN_TEX_MASK                (0x3f<<8) /* union of R200_PS_GEN_TEX_0..5 (bits 8..13) */
+#define     R200_PS_SE_SEL_STATE                (1<<16)
+/* gap */
+/* taken from r300, see comments there */
+#define R200_VAP_PVS_CNTL_1                 0x22d0
+#       define R200_PVS_CNTL_1_PROGRAM_START_SHIFT   0  /* three shifts, 10 bits apart: 0, 10, 20 */
+#       define R200_PVS_CNTL_1_POS_END_SHIFT         10
+#       define R200_PVS_CNTL_1_PROGRAM_END_SHIFT     20
+/* Addresses are relative to the vertex program parameters area. */
+#define R200_VAP_PVS_CNTL_2                 0x22d4
+#       define R200_PVS_CNTL_2_PARAM_OFFSET_SHIFT 0
+#       define R200_PVS_CNTL_2_PARAM_COUNT_SHIFT  16
+/* gap */
+
+#define R200_SE_VTX_ST_POS_0_X_4                   0x2300 /* SE_VTX_ST_* run from 0x2300 to 0x249c in 4-byte steps */
+#define R200_SE_VTX_ST_POS_0_Y_4                   0x2304
+#define R200_SE_VTX_ST_POS_0_Z_4                   0x2308
+#define R200_SE_VTX_ST_POS_0_W_4                   0x230c
+#define R200_SE_VTX_ST_NORM_0_X                    0x2310
+#define R200_SE_VTX_ST_NORM_0_Y                    0x2314
+#define R200_SE_VTX_ST_NORM_0_Z                    0x2318
+#define R200_SE_VTX_ST_PVMS                        0x231c
+#define R200_SE_VTX_ST_CLR_0_R                     0x2320 /* CLR_0..CLR_7: four entries (R, G, B, A) each, 0x2320..0x239c */
+#define R200_SE_VTX_ST_CLR_0_G                     0x2324
+#define R200_SE_VTX_ST_CLR_0_B                     0x2328
+#define R200_SE_VTX_ST_CLR_0_A                     0x232c
+#define R200_SE_VTX_ST_CLR_1_R                     0x2330
+#define R200_SE_VTX_ST_CLR_1_G                     0x2334
+#define R200_SE_VTX_ST_CLR_1_B                     0x2338
+#define R200_SE_VTX_ST_CLR_1_A                     0x233c
+#define R200_SE_VTX_ST_CLR_2_R                     0x2340
+#define R200_SE_VTX_ST_CLR_2_G                     0x2344
+#define R200_SE_VTX_ST_CLR_2_B                     0x2348
+#define R200_SE_VTX_ST_CLR_2_A                     0x234c
+#define R200_SE_VTX_ST_CLR_3_R                     0x2350
+#define R200_SE_VTX_ST_CLR_3_G                     0x2354
+#define R200_SE_VTX_ST_CLR_3_B                     0x2358
+#define R200_SE_VTX_ST_CLR_3_A                     0x235c
+#define R200_SE_VTX_ST_CLR_4_R                     0x2360
+#define R200_SE_VTX_ST_CLR_4_G                     0x2364
+#define R200_SE_VTX_ST_CLR_4_B                     0x2368
+#define R200_SE_VTX_ST_CLR_4_A                     0x236c
+#define R200_SE_VTX_ST_CLR_5_R                     0x2370
+#define R200_SE_VTX_ST_CLR_5_G                     0x2374
+#define R200_SE_VTX_ST_CLR_5_B                     0x2378
+#define R200_SE_VTX_ST_CLR_5_A                     0x237c
+#define R200_SE_VTX_ST_CLR_6_R                     0x2380
+#define R200_SE_VTX_ST_CLR_6_G                     0x2384
+#define R200_SE_VTX_ST_CLR_6_B                     0x2388
+#define R200_SE_VTX_ST_CLR_6_A                     0x238c
+#define R200_SE_VTX_ST_CLR_7_R                     0x2390
+#define R200_SE_VTX_ST_CLR_7_G                     0x2394
+#define R200_SE_VTX_ST_CLR_7_B                     0x2398
+#define R200_SE_VTX_ST_CLR_7_A                     0x239c
+#define R200_SE_VTX_ST_TEX_0_S                     0x23a0 /* TEX_0..TEX_5: four entries (S, T, R, Q) each, 0x23a0..0x23fc */
+#define R200_SE_VTX_ST_TEX_0_T                     0x23a4
+#define R200_SE_VTX_ST_TEX_0_R                     0x23a8
+#define R200_SE_VTX_ST_TEX_0_Q                     0x23ac
+#define R200_SE_VTX_ST_TEX_1_S                     0x23b0
+#define R200_SE_VTX_ST_TEX_1_T                     0x23b4
+#define R200_SE_VTX_ST_TEX_1_R                     0x23b8
+#define R200_SE_VTX_ST_TEX_1_Q                     0x23bc
+#define R200_SE_VTX_ST_TEX_2_S                     0x23c0
+#define R200_SE_VTX_ST_TEX_2_T                     0x23c4
+#define R200_SE_VTX_ST_TEX_2_R                     0x23c8
+#define R200_SE_VTX_ST_TEX_2_Q                     0x23cc
+#define R200_SE_VTX_ST_TEX_3_S                     0x23d0
+#define R200_SE_VTX_ST_TEX_3_T                     0x23d4
+#define R200_SE_VTX_ST_TEX_3_R                     0x23d8
+#define R200_SE_VTX_ST_TEX_3_Q                     0x23dc
+#define R200_SE_VTX_ST_TEX_4_S                     0x23e0
+#define R200_SE_VTX_ST_TEX_4_T                     0x23e4
+#define R200_SE_VTX_ST_TEX_4_R                     0x23e8
+#define R200_SE_VTX_ST_TEX_4_Q                     0x23ec
+#define R200_SE_VTX_ST_TEX_5_S                     0x23f0
+#define R200_SE_VTX_ST_TEX_5_T                     0x23f4
+#define R200_SE_VTX_ST_TEX_5_R                     0x23f8
+#define R200_SE_VTX_ST_TEX_5_Q                     0x23fc
+#define R200_SE_VTX_ST_PNT_SPRT_SZ                 0x2400
+#define R200_SE_VTX_ST_DISC_FOG                    0x2404
+#define R200_SE_VTX_ST_SHININESS_0                 0x2408
+#define R200_SE_VTX_ST_SHININESS_1                 0x240c
+#define R200_SE_VTX_ST_BLND_WT_0                   0x2410
+#define R200_SE_VTX_ST_BLND_WT_1                   0x2414
+#define R200_SE_VTX_ST_BLND_WT_2                   0x2418
+#define R200_SE_VTX_ST_BLND_WT_3                   0x241c
+#define R200_SE_VTX_ST_POS_1_X                     0x2420
+#define R200_SE_VTX_ST_POS_1_Y                     0x2424
+#define R200_SE_VTX_ST_POS_1_Z                     0x2428
+#define R200_SE_VTX_ST_POS_1_W                     0x242c
+#define R200_SE_VTX_ST_NORM_1_X                    0x2430
+#define R200_SE_VTX_ST_NORM_1_Y                    0x2434
+#define R200_SE_VTX_ST_NORM_1_Z                    0x2438 /* 0x243c is the only slot in this run with no name */
+#define R200_SE_VTX_ST_USR_CLR_0_R                 0x2440
+#define R200_SE_VTX_ST_USR_CLR_0_G                 0x2444
+#define R200_SE_VTX_ST_USR_CLR_0_B                 0x2448
+#define R200_SE_VTX_ST_USR_CLR_0_A                 0x244c
+#define R200_SE_VTX_ST_USR_CLR_1_R                 0x2450
+#define R200_SE_VTX_ST_USR_CLR_1_G                 0x2454
+#define R200_SE_VTX_ST_USR_CLR_1_B                 0x2458
+#define R200_SE_VTX_ST_USR_CLR_1_A                 0x245c
+#define R200_SE_VTX_ST_CLR_0_PKD                   0x2460 /* one _PKD entry per color, 0x2460..0x247c */
+#define R200_SE_VTX_ST_CLR_1_PKD                   0x2464
+#define R200_SE_VTX_ST_CLR_2_PKD                   0x2468
+#define R200_SE_VTX_ST_CLR_3_PKD                   0x246c
+#define R200_SE_VTX_ST_CLR_4_PKD                   0x2470
+#define R200_SE_VTX_ST_CLR_5_PKD                   0x2474
+#define R200_SE_VTX_ST_CLR_6_PKD                   0x2478
+#define R200_SE_VTX_ST_CLR_7_PKD                   0x247c
+#define R200_SE_VTX_ST_POS_0_X_2                   0x2480
+#define R200_SE_VTX_ST_POS_0_Y_2                   0x2484
+#define R200_SE_VTX_ST_PAR_CLR_LD                  0x2488
+#define R200_SE_VTX_ST_USR_CLR_PKD                 0x248c
+#define R200_SE_VTX_ST_POS_0_X_3                   0x2490
+#define R200_SE_VTX_ST_POS_0_Y_3                   0x2494
+#define R200_SE_VTX_ST_POS_0_Z_3                   0x2498
+#define R200_SE_VTX_ST_END_OF_PKT                  0x249c
+/* gap */
+#define R200_RE_POINTSIZE                          0x2648
+#define     R200_POINTSIZE_SHIFT                       0  /* point size field starts at bit 0 */
+#define     R200_MAXPOINTSIZE_SHIFT                    16 /* max point size field starts at bit 16 */
+/* gap */
+#define R200_RE_TOP_LEFT                  0x26c0 
+#define     R200_RE_LEFT_SHIFT         0
+#define     R200_RE_TOP_SHIFT          16
+#define R200_RE_MISC                      0x26c4 
+#define     R200_STIPPLE_COORD_MASK           0x1f /* unshifted 5-bit mask */
+#define     R200_STIPPLE_X_OFFSET_SHIFT       0
+#define     R200_STIPPLE_X_OFFSET_MASK        (0x1f << 0) /* bits 0..4 */
+#define     R200_STIPPLE_Y_OFFSET_SHIFT       8
+#define     R200_STIPPLE_Y_OFFSET_MASK        (0x1f << 8) /* bits 8..12 */
+#define     R200_STIPPLE_LITTLE_BIT_ORDER     (0 << 16) /* bit 16 clear */
+#define     R200_STIPPLE_BIG_BIT_ORDER        (1 << 16) /* bit 16 set */
+/* gap */
+#define R200_RE_AUX_SCISSOR_CNTL                   0x26f0
+#define     R200_EXCLUSIVE_SCISSOR_0      0x01000000 /* bits 24..26: EXCLUSIVE_SCISSOR_0..2 */
+#define     R200_EXCLUSIVE_SCISSOR_1      0x02000000
+#define     R200_EXCLUSIVE_SCISSOR_2      0x04000000
+#define     R200_SCISSOR_ENABLE_0         0x10000000 /* bits 28..30: SCISSOR_ENABLE_0..2 */
+#define     R200_SCISSOR_ENABLE_1         0x20000000
+#define     R200_SCISSOR_ENABLE_2         0x40000000
+/* gap */
+#define R200_PP_TXFILTER_0                0x2c00 
+#define     R200_MAG_FILTER_NEAREST                   (0  <<  0)
+#define     R200_MAG_FILTER_LINEAR                    (1  <<  0)
+#define     R200_MAG_FILTER_MASK                      (1  <<  0)
+#define     R200_MIN_FILTER_NEAREST                   (0  <<  1)
+#define     R200_MIN_FILTER_LINEAR                    (1  <<  1)
+#define     R200_MIN_FILTER_NEAREST_MIP_NEAREST       (2  <<  1)
+#define     R200_MIN_FILTER_NEAREST_MIP_LINEAR        (3  <<  1)
+#define     R200_MIN_FILTER_LINEAR_MIP_NEAREST        (6  <<  1)
+#define     R200_MIN_FILTER_LINEAR_MIP_LINEAR         (7  <<  1)
+#define     R200_MIN_FILTER_ANISO_NEAREST             (8  <<  1)
+#define     R200_MIN_FILTER_ANISO_LINEAR              (9  <<  1)
+#define     R200_MIN_FILTER_ANISO_NEAREST_MIP_NEAREST (10 <<  1)
+#define     R200_MIN_FILTER_ANISO_NEAREST_MIP_LINEAR  (11 <<  1)
+#define     R200_MIN_FILTER_MASK                      (15 <<  1)
+#define     R200_MAX_ANISO_1_TO_1                     (0  <<  5)
+#define     R200_MAX_ANISO_2_TO_1                     (1  <<  5)
+#define     R200_MAX_ANISO_4_TO_1                     (2  <<  5)
+#define     R200_MAX_ANISO_8_TO_1                     (3  <<  5)
+#define     R200_MAX_ANISO_16_TO_1                    (4  <<  5)
+#define     R200_MAX_ANISO_MASK                       (7  <<  5)
+#define     R200_MAX_MIP_LEVEL_MASK                   (0x0f << 16)
+#define     R200_MAX_MIP_LEVEL_SHIFT                  16
+#define     R200_YUV_TO_RGB                           (1  << 20)
+#define     R200_YUV_TEMPERATURE_COOL                 (0  << 21)
+#define     R200_YUV_TEMPERATURE_HOT                  (1  << 21)
+#define     R200_YUV_TEMPERATURE_MASK                 (1  << 21)
+#define     R200_WRAPEN_S                             (1  << 22)
+#define     R200_CLAMP_S_WRAP                         (0  << 23)
+#define     R200_CLAMP_S_MIRROR                       (1  << 23)
+#define     R200_CLAMP_S_CLAMP_LAST                   (2  << 23)
+#define     R200_CLAMP_S_MIRROR_CLAMP_LAST            (3  << 23)
+#define     R200_CLAMP_S_CLAMP_BORDER                 (4  << 23)
+#define     R200_CLAMP_S_MIRROR_CLAMP_BORDER          (5  << 23)
+#define     R200_CLAMP_S_CLAMP_GL                     (6  << 23)
+#define     R200_CLAMP_S_MIRROR_CLAMP_GL              (7  << 23)
+#define     R200_CLAMP_S_MASK                         (7  << 23)
+#define     R200_WRAPEN_T                             (1  << 26)
+#define     R200_CLAMP_T_WRAP                         (0  << 27)
+#define     R200_CLAMP_T_MIRROR                       (1  << 27)
+#define     R200_CLAMP_T_CLAMP_LAST                   (2  << 27)
+#define     R200_CLAMP_T_MIRROR_CLAMP_LAST            (3  << 27)
+#define     R200_CLAMP_T_CLAMP_BORDER                 (4  << 27)
+#define     R200_CLAMP_T_MIRROR_CLAMP_BORDER          (5  << 27)
+#define     R200_CLAMP_T_CLAMP_GL                     (6  << 27)
+#define     R200_CLAMP_T_MIRROR_CLAMP_GL              (7  << 27)
+#define     R200_CLAMP_T_MASK                         (7  << 27)
+#define     R200_KILL_LT_ZERO                         (1  << 30)
+#define     R200_BORDER_MODE_OGL                      (0  << 31)
+#define     R200_BORDER_MODE_D3D                      (1  << 31)
+#define R200_PP_TXFORMAT_0                0x2c04 /* instance 0 of TXFORMAT; format code, size and routing fields below */
+#define     R200_TXFORMAT_I8                 (0  <<  0)
+#define     R200_TXFORMAT_AI88               (1  <<  0)
+#define     R200_TXFORMAT_RGB332             (2  <<  0)
+#define     R200_TXFORMAT_ARGB1555           (3  <<  0)
+#define     R200_TXFORMAT_RGB565             (4  <<  0)
+#define     R200_TXFORMAT_ARGB4444           (5  <<  0)
+#define     R200_TXFORMAT_ARGB8888           (6  <<  0)
+#define     R200_TXFORMAT_RGBA8888           (7  <<  0)
+#define     R200_TXFORMAT_Y8                 (8  <<  0)
+#define     R200_TXFORMAT_AVYU4444           (9  <<  0)
+#define     R200_TXFORMAT_VYUY422            (10  <<  0)
+#define     R200_TXFORMAT_YVYU422            (11  <<  0)
+#define     R200_TXFORMAT_DXT1               (12  <<  0)
+#define     R200_TXFORMAT_DXT23              (14  <<  0) /* code 13 is not named in this list */
+#define     R200_TXFORMAT_DXT45              (15  <<  0)
+#define     R200_TXFORMAT_DVDU88             (18  <<  0) /* codes 16 and 17 are not named in this list */
+#define     R200_TXFORMAT_LDVDU655           (19  <<  0)
+#define     R200_TXFORMAT_LDVDU8888          (20  <<  0)
+#define     R200_TXFORMAT_GR1616             (21  <<  0)
+#define     R200_TXFORMAT_ABGR8888           (22  <<  0)
+#define     R200_TXFORMAT_BGR111110          (23  <<  0)
+#define     R200_TXFORMAT_FORMAT_MASK        (31 <<  0) /* bits 0-4 */
+#define     R200_TXFORMAT_FORMAT_SHIFT       0
+#define     R200_TXFORMAT_APPLE_YUV          (1  <<  5)
+#define     R200_TXFORMAT_ALPHA_IN_MAP       (1  <<  6)
+#define     R200_TXFORMAT_NON_POWER2         (1  <<  7)
+#define     R200_TXFORMAT_WIDTH_MASK         (15 <<  8) /* bits 8-11 */
+#define     R200_TXFORMAT_WIDTH_SHIFT        8
+#define     R200_TXFORMAT_HEIGHT_MASK        (15 << 12) /* bits 12-15 */
+#define     R200_TXFORMAT_HEIGHT_SHIFT       12
+#define     R200_TXFORMAT_F5_WIDTH_MASK      (15 << 16)	/* cube face 5; bits 16-19 */
+#define     R200_TXFORMAT_F5_WIDTH_SHIFT     16
+#define     R200_TXFORMAT_F5_HEIGHT_MASK     (15 << 20) /* bits 20-23 */
+#define     R200_TXFORMAT_F5_HEIGHT_SHIFT    20
+#define     R200_TXFORMAT_ST_ROUTE_STQ0      (0  << 24)
+#define     R200_TXFORMAT_ST_ROUTE_STQ1      (1  << 24)
+#define     R200_TXFORMAT_ST_ROUTE_STQ2      (2  << 24)
+#define     R200_TXFORMAT_ST_ROUTE_STQ3      (3  << 24)
+#define     R200_TXFORMAT_ST_ROUTE_STQ4      (4  << 24)
+#define     R200_TXFORMAT_ST_ROUTE_STQ5      (5  << 24)
+#define     R200_TXFORMAT_ST_ROUTE_MASK      (7  << 24) /* bits 24-26 */
+#define     R200_TXFORMAT_ST_ROUTE_SHIFT     24
+#define     R200_TXFORMAT_LOOKUP_DISABLE     (1  << 27)
+#define     R200_TXFORMAT_ALPHA_MASK_ENABLE  (1  << 28)
+#define     R200_TXFORMAT_CHROMA_KEY_ENABLE  (1  << 29)
+#define     R200_TXFORMAT_CUBIC_MAP_ENABLE   (1  << 30)
+#define R200_PP_TXFORMAT_X_0              0x2c08 /* instance 0 of TXFORMAT_X; depth, Q clamp, min mip, texcoord mode and LOD bias fields */
+#define     R200_DEPTH_LOG2_MASK                      (0xf << 0) /* bits 0-3 */
+#define     R200_DEPTH_LOG2_SHIFT                     0
+#define     R200_VOLUME_FILTER_SHIFT                  4
+#define     R200_VOLUME_FILTER_MASK                   (1 << 4) /* bit 4 */
+#define     R200_VOLUME_FILTER_NEAREST                (0 << 4)
+#define     R200_VOLUME_FILTER_LINEAR                 (1 << 4)
+#define     R200_WRAPEN_Q                             (1  << 8)
+#define     R200_CLAMP_Q_WRAP                         (0  << 9)
+#define     R200_CLAMP_Q_MIRROR                       (1  << 9)
+#define     R200_CLAMP_Q_CLAMP_LAST                   (2  << 9)
+#define     R200_CLAMP_Q_MIRROR_CLAMP_LAST            (3  << 9)
+#define     R200_CLAMP_Q_CLAMP_BORDER                 (4  << 9)
+#define     R200_CLAMP_Q_MIRROR_CLAMP_BORDER          (5  << 9)
+#define     R200_CLAMP_Q_CLAMP_GL                     (6  << 9)
+#define     R200_CLAMP_Q_MIRROR_CLAMP_GL              (7  << 9)
+#define     R200_CLAMP_Q_MASK                         (7  << 9) /* bits 9-11 */
+#define     R200_MIN_MIP_LEVEL_MASK                   (0x0f << 12) /* bits 12-15; was 0xff, which overlapped R200_TEXCOORD_MASK at bits 16-18 */
+#define     R200_MIN_MIP_LEVEL_SHIFT                  12
+#define     R200_TEXCOORD_NONPROJ                     (0  << 16)
+#define     R200_TEXCOORD_CUBIC_ENV                   (1  << 16)
+#define     R200_TEXCOORD_VOLUME                      (2  << 16)
+#define     R200_TEXCOORD_PROJ                        (3  << 16)
+#define     R200_TEXCOORD_DEPTH                       (4  << 16)
+#define     R200_TEXCOORD_1D_PROJ                     (5  << 16)
+#define     R200_TEXCOORD_1D                          (6  << 16)
+#define     R200_TEXCOORD_ZERO                        (7  << 16)
+#define     R200_TEXCOORD_MASK                        (7  << 16) /* bits 16-18 */
+#define     R200_LOD_BIAS_MASK                        (0xfff80000) /* bits 19-31, matching R200_LOD_BIAS_SHIFT */
+#define     R200_LOD_BIAS_SHIFT                       19
+#define R200_PP_TXSIZE_0                  0x2c0c /* NPOT only */
+#define R200_PP_TXPITCH_0                 0x2c10 /* NPOT only */
+#define R200_PP_BORDER_COLOR_0            0x2c14 /* no sub-fields are defined for this register in this header */
+#define R200_PP_CUBIC_FACES_0             0x2c18
+#define     R200_FACE_WIDTH_1_SHIFT                   0
+#define     R200_FACE_HEIGHT_1_SHIFT                  4
+#define     R200_FACE_WIDTH_1_MASK                   (0xf << 0)
+#define     R200_FACE_HEIGHT_1_MASK                  (0xf << 4)
+#define     R200_FACE_WIDTH_2_SHIFT                   8
+#define     R200_FACE_HEIGHT_2_SHIFT                 12
+#define     R200_FACE_WIDTH_2_MASK                   (0xf << 8)
+#define     R200_FACE_HEIGHT_2_MASK                  (0xf << 12)
+#define     R200_FACE_WIDTH_3_SHIFT                  16
+#define     R200_FACE_HEIGHT_3_SHIFT                 20
+#define     R200_FACE_WIDTH_3_MASK                   (0xf << 16)
+#define     R200_FACE_HEIGHT_3_MASK                  (0xf << 20)
+#define     R200_FACE_WIDTH_4_SHIFT                  24
+#define     R200_FACE_HEIGHT_4_SHIFT                 28
+#define     R200_FACE_WIDTH_4_MASK                   (0xf << 24)
+#define     R200_FACE_HEIGHT_4_MASK                  (0xf << 28)
+#define R200_PP_TXMULTI_CTL_0                  0x2c1c /* name from ddx, rest RE... */
+#define     R200_PASS1_TXFORMAT_LOOKUP_DISABLE (1 << 0)
+#define     R200_PASS1_TEXCOORD_NONPROJ        (0 << 1)
+#define     R200_PASS1_TEXCOORD_CUBIC_ENV      (1 << 1)
+#define     R200_PASS1_TEXCOORD_VOLUME         (2 << 1)
+#define     R200_PASS1_TEXCOORD_PROJ           (3 << 1)
+#define     R200_PASS1_TEXCOORD_DEPTH          (4 << 1)
+#define     R200_PASS1_TEXCOORD_1D_PROJ        (5 << 1)
+#define     R200_PASS1_TEXCOORD_1D             (6 << 1) /* pass1 texcoords only */
+#define     R200_PASS1_TEXCOORD_ZERO           (7 << 1) /* verified for 2d targets! */
+#define     R200_PASS1_TEXCOORD_MASK           (7 << 1) /* assumed same values as for pass2 */
+#define     R200_PASS1_ST_ROUTE_STQ0           (0 << 4)
+#define     R200_PASS1_ST_ROUTE_STQ1           (1 << 4)
+#define     R200_PASS1_ST_ROUTE_STQ2           (2 << 4)
+#define     R200_PASS1_ST_ROUTE_STQ3           (3 << 4)
+#define     R200_PASS1_ST_ROUTE_STQ4           (4 << 4)
+#define     R200_PASS1_ST_ROUTE_STQ5           (5 << 4)
+#define     R200_PASS1_ST_ROUTE_MASK           (7 << 4) /* bits 4-6 */
+#define     R200_PASS1_ST_ROUTE_SHIFT          (4)
+#define     R200_PASS2_COORDS_REG_0            (2 << 24) /* REG_n is encoded as n + 2; codes 0 and 1 are not named here */
+#define     R200_PASS2_COORDS_REG_1            (3 << 24)
+#define     R200_PASS2_COORDS_REG_2            (4 << 24)
+#define     R200_PASS2_COORDS_REG_3            (5 << 24)
+#define     R200_PASS2_COORDS_REG_4            (6 << 24)
+#define     R200_PASS2_COORDS_REG_5            (7 << 24)
+#define     R200_PASS2_COORDS_REG_MASK         (0x7 << 24) /* bits 24-26 */
+#define     R200_PASS2_COORDS_REG_SHIFT        (24)
+#define R200_PP_TXFILTER_1                0x2c20 /* instances 1..5 of the _0 register set; each instance is 0x20 above the previous one */
+#define R200_PP_TXFORMAT_1                0x2c24
+#define R200_PP_TXFORMAT_X_1              0x2c28
+#define R200_PP_TXSIZE_1                  0x2c2c
+#define R200_PP_TXPITCH_1                 0x2c30
+#define R200_PP_BORDER_COLOR_1            0x2c34
+#define R200_PP_CUBIC_FACES_1             0x2c38
+#define R200_PP_TXMULTI_CTL_1             0x2c3c
+#define R200_PP_TXFILTER_2                0x2c40
+#define R200_PP_TXFORMAT_2                0x2c44
+#define R200_PP_TXSIZE_2                  0x2c4c /* listed before TXFORMAT_X_2 although its offset is higher (same for instances 3..5) */
+#define R200_PP_TXFORMAT_X_2              0x2c48
+#define R200_PP_TXPITCH_2                 0x2c50
+#define R200_PP_BORDER_COLOR_2            0x2c54
+#define R200_PP_CUBIC_FACES_2             0x2c58
+#define R200_PP_TXMULTI_CTL_2             0x2c5c
+#define R200_PP_TXFILTER_3                0x2c60
+#define R200_PP_TXFORMAT_3                0x2c64
+#define R200_PP_TXSIZE_3                  0x2c6c
+#define R200_PP_TXFORMAT_X_3              0x2c68
+#define R200_PP_TXPITCH_3                 0x2c70
+#define R200_PP_BORDER_COLOR_3            0x2c74
+#define R200_PP_CUBIC_FACES_3             0x2c78
+#define R200_PP_TXMULTI_CTL_3             0x2c7c
+#define R200_PP_TXFILTER_4                0x2c80
+#define R200_PP_TXFORMAT_4                0x2c84
+#define R200_PP_TXSIZE_4                  0x2c8c
+#define R200_PP_TXFORMAT_X_4              0x2c88
+#define R200_PP_TXPITCH_4                 0x2c90
+#define R200_PP_BORDER_COLOR_4            0x2c94
+#define R200_PP_CUBIC_FACES_4             0x2c98
+#define R200_PP_TXMULTI_CTL_4             0x2c9c
+#define R200_PP_TXFILTER_5                0x2ca0
+#define R200_PP_TXFORMAT_5                0x2ca4
+#define R200_PP_TXSIZE_5                  0x2cac
+#define R200_PP_TXFORMAT_X_5              0x2ca8
+#define R200_PP_TXPITCH_5                 0x2cb0
+#define R200_PP_BORDER_COLOR_5            0x2cb4
+#define R200_PP_CUBIC_FACES_5             0x2cb8
+#define R200_PP_TXMULTI_CTL_5             0x2cbc
+/* gap */
+#define R200_PP_CNTL_X             0x2cc4  /* Reverse engineered from fglrx */
+#define     R200_PPX_TEX_0_ENABLE      (1 <<  0)
+#define     R200_PPX_TEX_1_ENABLE      (1 <<  1)
+#define     R200_PPX_TEX_2_ENABLE      (1 <<  2)
+#define     R200_PPX_TEX_3_ENABLE      (1 <<  3)
+#define     R200_PPX_TEX_4_ENABLE      (1 <<  4)
+#define     R200_PPX_TEX_5_ENABLE      (1 <<  5)
+#define     R200_PPX_TEX_ENABLE_MASK   (0x3f << 0) /* bits 0-5: the six TEX_n_ENABLE bits */
+#define     R200_PPX_OUTPUT_REG_0      (1 <<  6)
+#define     R200_PPX_OUTPUT_REG_1      (1 <<  7)
+#define     R200_PPX_OUTPUT_REG_2      (1 <<  8)
+#define     R200_PPX_OUTPUT_REG_3      (1 <<  9)
+#define     R200_PPX_OUTPUT_REG_4      (1 << 10)
+#define     R200_PPX_OUTPUT_REG_5      (1 << 11)
+#define     R200_PPX_OUTPUT_REG_MASK   (0x3f << 6) /* bits 6-11: the six OUTPUT_REG_n bits */
+#define     R200_PPX_OUTPUT_REG_0_SHIFT (6)
+#define     R200_PPX_PFS_INST0_ENABLE  (1 << 12)
+#define     R200_PPX_PFS_INST1_ENABLE  (1 << 13)
+#define     R200_PPX_PFS_INST2_ENABLE  (1 << 14)
+#define     R200_PPX_PFS_INST3_ENABLE  (1 << 15)
+#define     R200_PPX_PFS_INST4_ENABLE  (1 << 16)
+#define     R200_PPX_PFS_INST5_ENABLE  (1 << 17)
+#define     R200_PPX_PFS_INST6_ENABLE  (1 << 18)
+#define     R200_PPX_PFS_INST7_ENABLE  (1 << 19)
+#define     R200_PPX_PFS_INST_ENABLE_MASK (0xff << 12) /* bits 12-19: the eight PFS_INSTn_ENABLE bits */
+#define     R200_PPX_FPS_INST0_ENABLE_SHIFT (12) /* NOTE(review): "FPS" looks like a transposition of "PFS"; name left unchanged since other files may use it */
+/* gap */
+#define R200_PP_TRI_PERF                  0x2cf8
+#define     R200_TRI_CUTOFF_MASK            (0x1f << 0) /* bits 0-4 */
+#define R200_PP_PERF_CNTL                 0x2cfc /* no sub-fields are defined for this register in this header */
+#define R200_PP_TXOFFSET_0                0x2d00 /* offset in the upper bits, endian-swap and tiling flags in the low bits */
+#define     R200_TXO_ENDIAN_NO_SWAP     (0 << 0) /* the four ENDIAN codes use bits 0-1 */
+#define     R200_TXO_ENDIAN_BYTE_SWAP   (1 << 0)
+#define     R200_TXO_ENDIAN_WORD_SWAP   (2 << 0)
+#define     R200_TXO_ENDIAN_HALFDW_SWAP (3 << 0)
+#define     R200_TXO_MACRO_TILE         (1 << 2)
+#define     R200_TXO_MICRO_TILE         (1 << 3)
+#define     R200_TXO_OFFSET_MASK        0xffffffe0 /* bits 5-31, matching R200_TXO_OFFSET_SHIFT */
+#define     R200_TXO_OFFSET_SHIFT       5
+#define R200_PP_CUBIC_OFFSET_F1_0         0x2d04 /* F1..F5 follow each TXOFFSET_n at 4-byte steps; instances are 0x18 apart */
+#define R200_PP_CUBIC_OFFSET_F2_0         0x2d08
+#define R200_PP_CUBIC_OFFSET_F3_0         0x2d0c
+#define R200_PP_CUBIC_OFFSET_F4_0         0x2d10
+#define R200_PP_CUBIC_OFFSET_F5_0         0x2d14
+#define R200_PP_TXOFFSET_1                0x2d18
+#define R200_PP_CUBIC_OFFSET_F1_1         0x2d1c
+#define R200_PP_CUBIC_OFFSET_F2_1         0x2d20
+#define R200_PP_CUBIC_OFFSET_F3_1         0x2d24
+#define R200_PP_CUBIC_OFFSET_F4_1         0x2d28
+#define R200_PP_CUBIC_OFFSET_F5_1         0x2d2c
+#define R200_PP_TXOFFSET_2                0x2d30
+#define R200_PP_CUBIC_OFFSET_F1_2         0x2d34
+#define R200_PP_CUBIC_OFFSET_F2_2         0x2d38
+#define R200_PP_CUBIC_OFFSET_F3_2         0x2d3c
+#define R200_PP_CUBIC_OFFSET_F4_2         0x2d40
+#define R200_PP_CUBIC_OFFSET_F5_2         0x2d44
+#define R200_PP_TXOFFSET_3                0x2d48
+#define R200_PP_CUBIC_OFFSET_F1_3         0x2d4c
+#define R200_PP_CUBIC_OFFSET_F2_3         0x2d50
+#define R200_PP_CUBIC_OFFSET_F3_3         0x2d54
+#define R200_PP_CUBIC_OFFSET_F4_3         0x2d58
+#define R200_PP_CUBIC_OFFSET_F5_3         0x2d5c
+#define R200_PP_TXOFFSET_4                0x2d60
+#define R200_PP_CUBIC_OFFSET_F1_4         0x2d64
+#define R200_PP_CUBIC_OFFSET_F2_4         0x2d68
+#define R200_PP_CUBIC_OFFSET_F3_4         0x2d6c
+#define R200_PP_CUBIC_OFFSET_F4_4         0x2d70
+#define R200_PP_CUBIC_OFFSET_F5_4         0x2d74
+#define R200_PP_TXOFFSET_5                0x2d78
+#define R200_PP_CUBIC_OFFSET_F1_5         0x2d7c
+#define R200_PP_CUBIC_OFFSET_F2_5         0x2d80
+#define R200_PP_CUBIC_OFFSET_F3_5         0x2d84
+#define R200_PP_CUBIC_OFFSET_F4_5         0x2d88
+#define R200_PP_CUBIC_OFFSET_F5_5         0x2d8c
+/* gap */
+#define R200_PP_TAM_DEBUG3                0x2d9c /* NOTE(review): no fields defined here; purpose is not evident from this header */
+/* gap */
+#define R200_PP_TFACTOR_0                 0x2ee0 /* TFACTOR_0..7: eight consecutive registers, 4 bytes apart */
+#define R200_PP_TFACTOR_1                 0x2ee4
+#define R200_PP_TFACTOR_2                 0x2ee8
+#define R200_PP_TFACTOR_3                 0x2eec
+#define R200_PP_TFACTOR_4                 0x2ef0
+#define R200_PP_TFACTOR_5                 0x2ef4
+#define R200_PP_TFACTOR_6                 0x2ef8
+#define R200_PP_TFACTOR_7                 0x2efc
+#define R200_PP_TXCBLEND_0                0x2f00 /* three 5-bit argument selectors (A, B, C), per-argument modifier bits, and an op code */
+#define     R200_TXC_ARG_A_ZERO                (0) /* ARG_A codes are unshifted because the field starts at bit 0 */
+#define     R200_TXC_ARG_A_CURRENT_COLOR       (2) /* code 1 is not named in this list */
+#define     R200_TXC_ARG_A_CURRENT_ALPHA       (3)
+#define     R200_TXC_ARG_A_DIFFUSE_COLOR       (4)
+#define     R200_TXC_ARG_A_DIFFUSE_ALPHA       (5)
+#define     R200_TXC_ARG_A_SPECULAR_COLOR      (6)
+#define     R200_TXC_ARG_A_SPECULAR_ALPHA      (7)
+#define     R200_TXC_ARG_A_TFACTOR_COLOR       (8)
+#define     R200_TXC_ARG_A_TFACTOR_ALPHA       (9)
+#define     R200_TXC_ARG_A_R0_COLOR            (10)
+#define     R200_TXC_ARG_A_R0_ALPHA            (11)
+#define     R200_TXC_ARG_A_R1_COLOR            (12)
+#define     R200_TXC_ARG_A_R1_ALPHA            (13)
+#define     R200_TXC_ARG_A_R2_COLOR            (14)
+#define     R200_TXC_ARG_A_R2_ALPHA            (15)
+#define     R200_TXC_ARG_A_R3_COLOR            (16)
+#define     R200_TXC_ARG_A_R3_ALPHA            (17)
+#define     R200_TXC_ARG_A_R4_COLOR            (18)
+#define     R200_TXC_ARG_A_R4_ALPHA            (19)
+#define     R200_TXC_ARG_A_R5_COLOR            (20)
+#define     R200_TXC_ARG_A_R5_ALPHA            (21)
+#define     R200_TXC_ARG_A_TFACTOR1_COLOR      (26) /* codes 22-25 are not named in this list */
+#define     R200_TXC_ARG_A_TFACTOR1_ALPHA      (27)
+#define     R200_TXC_ARG_A_MASK			(31 << 0)
+#define     R200_TXC_ARG_A_SHIFT			0
+#define     R200_TXC_ARG_B_ZERO                (0<<5) /* ARG_B: same codes as ARG_A, shifted to bit 5 */
+#define     R200_TXC_ARG_B_CURRENT_COLOR       (2<<5)
+#define     R200_TXC_ARG_B_CURRENT_ALPHA       (3<<5)
+#define     R200_TXC_ARG_B_DIFFUSE_COLOR       (4<<5)
+#define     R200_TXC_ARG_B_DIFFUSE_ALPHA       (5<<5)
+#define     R200_TXC_ARG_B_SPECULAR_COLOR      (6<<5)
+#define     R200_TXC_ARG_B_SPECULAR_ALPHA      (7<<5)
+#define     R200_TXC_ARG_B_TFACTOR_COLOR       (8<<5)
+#define     R200_TXC_ARG_B_TFACTOR_ALPHA       (9<<5)
+#define     R200_TXC_ARG_B_R0_COLOR            (10<<5)
+#define     R200_TXC_ARG_B_R0_ALPHA            (11<<5)
+#define     R200_TXC_ARG_B_R1_COLOR            (12<<5)
+#define     R200_TXC_ARG_B_R1_ALPHA            (13<<5)
+#define     R200_TXC_ARG_B_R2_COLOR            (14<<5)
+#define     R200_TXC_ARG_B_R2_ALPHA            (15<<5)
+#define     R200_TXC_ARG_B_R3_COLOR            (16<<5)
+#define     R200_TXC_ARG_B_R3_ALPHA            (17<<5)
+#define     R200_TXC_ARG_B_R4_COLOR            (18<<5)
+#define     R200_TXC_ARG_B_R4_ALPHA            (19<<5)
+#define     R200_TXC_ARG_B_R5_COLOR            (20<<5)
+#define     R200_TXC_ARG_B_R5_ALPHA            (21<<5)
+#define     R200_TXC_ARG_B_TFACTOR1_COLOR      (26<<5)
+#define     R200_TXC_ARG_B_TFACTOR1_ALPHA      (27<<5)
+#define     R200_TXC_ARG_B_MASK			(31 << 5)
+#define     R200_TXC_ARG_B_SHIFT			5
+#define     R200_TXC_ARG_C_ZERO                (0<<10) /* ARG_C: same codes as ARG_A, shifted to bit 10 */
+#define     R200_TXC_ARG_C_CURRENT_COLOR       (2<<10)
+#define     R200_TXC_ARG_C_CURRENT_ALPHA       (3<<10)
+#define     R200_TXC_ARG_C_DIFFUSE_COLOR       (4<<10)
+#define     R200_TXC_ARG_C_DIFFUSE_ALPHA       (5<<10)
+#define     R200_TXC_ARG_C_SPECULAR_COLOR      (6<<10)
+#define     R200_TXC_ARG_C_SPECULAR_ALPHA      (7<<10)
+#define     R200_TXC_ARG_C_TFACTOR_COLOR       (8<<10)
+#define     R200_TXC_ARG_C_TFACTOR_ALPHA       (9<<10)
+#define     R200_TXC_ARG_C_R0_COLOR            (10<<10)
+#define     R200_TXC_ARG_C_R0_ALPHA            (11<<10)
+#define     R200_TXC_ARG_C_R1_COLOR            (12<<10)
+#define     R200_TXC_ARG_C_R1_ALPHA            (13<<10)
+#define     R200_TXC_ARG_C_R2_COLOR            (14<<10)
+#define     R200_TXC_ARG_C_R2_ALPHA            (15<<10)
+#define     R200_TXC_ARG_C_R3_COLOR            (16<<10)
+#define     R200_TXC_ARG_C_R3_ALPHA            (17<<10)
+#define     R200_TXC_ARG_C_R4_COLOR            (18<<10)
+#define     R200_TXC_ARG_C_R4_ALPHA            (19<<10)
+#define     R200_TXC_ARG_C_R5_COLOR            (20<<10)
+#define     R200_TXC_ARG_C_R5_ALPHA            (21<<10)
+#define     R200_TXC_ARG_C_TFACTOR1_COLOR      (26<<10)
+#define     R200_TXC_ARG_C_TFACTOR1_ALPHA      (27<<10)
+#define     R200_TXC_ARG_C_MASK			(31 << 10)
+#define     R200_TXC_ARG_C_SHIFT			10
+#define     R200_TXC_COMP_ARG_A                    (1 << 16) /* bits 16-19: COMP/BIAS/SCALE/NEG flags for argument A */
+#define     R200_TXC_COMP_ARG_A_SHIFT              (16)
+#define     R200_TXC_BIAS_ARG_A                    (1 << 17)
+#define     R200_TXC_SCALE_ARG_A                   (1 << 18)
+#define     R200_TXC_NEG_ARG_A                     (1 << 19)
+#define     R200_TXC_COMP_ARG_B                    (1 << 20) /* bits 20-23: the same four flags for argument B */
+#define     R200_TXC_COMP_ARG_B_SHIFT              (20)
+#define     R200_TXC_BIAS_ARG_B                    (1 << 21)
+#define     R200_TXC_SCALE_ARG_B                   (1 << 22)
+#define     R200_TXC_NEG_ARG_B                     (1 << 23)
+#define     R200_TXC_COMP_ARG_C                    (1 << 24) /* bits 24-27: the same four flags for argument C */
+#define     R200_TXC_COMP_ARG_C_SHIFT              (24)
+#define     R200_TXC_BIAS_ARG_C                    (1 << 25)
+#define     R200_TXC_SCALE_ARG_C                   (1 << 26)
+#define     R200_TXC_NEG_ARG_C                     (1 << 27)
+#define     R200_TXC_OP_MADD                        (0 << 28)
+#define     R200_TXC_OP_CND0                       (2 << 28) /* op code 1 is not named in this list */
+#define     R200_TXC_OP_LERP                       (3 << 28)
+#define     R200_TXC_OP_DOT3                       (4 << 28)
+#define     R200_TXC_OP_DOT4                       (5 << 28)
+#define     R200_TXC_OP_CONDITIONAL                (6 << 28)
+#define     R200_TXC_OP_DOT2_ADD                   (7 << 28)
+#define     R200_TXC_OP_MASK                       (7 << 28) /* bits 28-30 */
+#define R200_PP_TXCBLEND2_0                0x2f04
+#define     R200_TXC_TFACTOR_SEL_SHIFT             0
+#define     R200_TXC_TFACTOR_SEL_MASK              0x7
+#define     R200_TXC_TFACTOR1_SEL_SHIFT            4
+#define     R200_TXC_TFACTOR1_SEL_MASK             (0x7 << 4)
+#define     R200_TXC_SCALE_SHIFT                   8
+#define     R200_TXC_SCALE_MASK                    (7 << 8)
+#define     R200_TXC_SCALE_1X                      (0 << 8)
+#define     R200_TXC_SCALE_2X                      (1 << 8)
+#define     R200_TXC_SCALE_4X                      (2 << 8)
+#define     R200_TXC_SCALE_8X                      (3 << 8)
+#define     R200_TXC_SCALE_INV2                    (5 << 8)
+#define     R200_TXC_SCALE_INV4                    (6 << 8)
+#define     R200_TXC_SCALE_INV8                    (7 << 8)
+#define     R200_TXC_CLAMP_SHIFT                   12
+#define     R200_TXC_CLAMP_MASK                    (3 << 12)
+#define     R200_TXC_CLAMP_WRAP                    (0 << 12)
+#define     R200_TXC_CLAMP_0_1                     (1 << 12)
+#define     R200_TXC_CLAMP_8_8                     (2 << 12)
+#define     R200_TXC_OUTPUT_REG_SHIFT              16
+#define     R200_TXC_OUTPUT_REG_MASK               (7 << 16)
+#define     R200_TXC_OUTPUT_REG_NONE               (0 << 16)
+#define     R200_TXC_OUTPUT_REG_R0                 (1 << 16)
+#define     R200_TXC_OUTPUT_REG_R1                 (2 << 16)
+#define     R200_TXC_OUTPUT_REG_R2                 (3 << 16)
+#define     R200_TXC_OUTPUT_REG_R3                 (4 << 16)
+#define     R200_TXC_OUTPUT_REG_R4                 (5 << 16)
+#define     R200_TXC_OUTPUT_REG_R5                 (6 << 16)
+#define     R200_TXC_OUTPUT_MASK_MASK              (7 << 20)
+#define     R200_TXC_OUTPUT_MASK_RGB               (0 << 20)
+#define     R200_TXC_OUTPUT_MASK_RG                (1 << 20)
+#define     R200_TXC_OUTPUT_MASK_RB                (2 << 20)
+#define     R200_TXC_OUTPUT_MASK_R                 (3 << 20)
+#define     R200_TXC_OUTPUT_MASK_GB                (4 << 20)
+#define     R200_TXC_OUTPUT_MASK_G                 (5 << 20)
+#define     R200_TXC_OUTPUT_MASK_B                 (6 << 20)
+#define     R200_TXC_OUTPUT_MASK_NONE              (7 << 20)
+#define     R200_TXC_REPL_NORMAL                   0
+#define     R200_TXC_REPL_RED                      1
+#define     R200_TXC_REPL_GREEN                    2
+#define     R200_TXC_REPL_BLUE                     3
+#define     R200_TXC_REPL_ARG_A_SHIFT              26
+#define     R200_TXC_REPL_ARG_A_MASK               (3 << 26)
+#define     R200_TXC_REPL_ARG_B_SHIFT              28
+#define     R200_TXC_REPL_ARG_B_MASK               (3 << 28)
+#define     R200_TXC_REPL_ARG_C_SHIFT              30
+#define     R200_TXC_REPL_ARG_C_MASK               (3 << 30)
+#define R200_PP_TXABLEND_0                0x2f08 /* same field layout as the TXC_* defines, with ALPHA/BLUE argument sources */
+#define     R200_TXA_ARG_A_ZERO              (0) /* ARG_A codes are unshifted because the field starts at bit 0 */
+#define     R200_TXA_ARG_A_CURRENT_ALPHA     (2) /* guess */
+#define     R200_TXA_ARG_A_CURRENT_BLUE      (3) /* guess */
+#define     R200_TXA_ARG_A_DIFFUSE_ALPHA     (4)
+#define     R200_TXA_ARG_A_DIFFUSE_BLUE      (5)
+#define     R200_TXA_ARG_A_SPECULAR_ALPHA    (6)
+#define     R200_TXA_ARG_A_SPECULAR_BLUE     (7)
+#define     R200_TXA_ARG_A_TFACTOR_ALPHA     (8)
+#define     R200_TXA_ARG_A_TFACTOR_BLUE      (9)
+#define     R200_TXA_ARG_A_R0_ALPHA          (10)
+#define     R200_TXA_ARG_A_R0_BLUE           (11)
+#define     R200_TXA_ARG_A_R1_ALPHA          (12)
+#define     R200_TXA_ARG_A_R1_BLUE           (13)
+#define     R200_TXA_ARG_A_R2_ALPHA          (14)
+#define     R200_TXA_ARG_A_R2_BLUE           (15)
+#define     R200_TXA_ARG_A_R3_ALPHA          (16)
+#define     R200_TXA_ARG_A_R3_BLUE           (17)
+#define     R200_TXA_ARG_A_R4_ALPHA          (18)
+#define     R200_TXA_ARG_A_R4_BLUE           (19)
+#define     R200_TXA_ARG_A_R5_ALPHA          (20)
+#define     R200_TXA_ARG_A_R5_BLUE           (21)
+#define     R200_TXA_ARG_A_TFACTOR1_ALPHA    (26) /* codes 22-25 are not named in this list */
+#define     R200_TXA_ARG_A_TFACTOR1_BLUE     (27)
+#define     R200_TXA_ARG_A_MASK			(31 << 0)
+#define     R200_TXA_ARG_A_SHIFT			0
+#define     R200_TXA_ARG_B_ZERO              (0<<5) /* ARG_B: same codes as ARG_A, shifted to bit 5 */
+#define     R200_TXA_ARG_B_CURRENT_ALPHA     (2<<5) /* guess */
+#define     R200_TXA_ARG_B_CURRENT_BLUE      (3<<5) /* guess */
+#define     R200_TXA_ARG_B_DIFFUSE_ALPHA     (4<<5)
+#define     R200_TXA_ARG_B_DIFFUSE_BLUE      (5<<5)
+#define     R200_TXA_ARG_B_SPECULAR_ALPHA    (6<<5)
+#define     R200_TXA_ARG_B_SPECULAR_BLUE     (7<<5)
+#define     R200_TXA_ARG_B_TFACTOR_ALPHA     (8<<5)
+#define     R200_TXA_ARG_B_TFACTOR_BLUE      (9<<5)
+#define     R200_TXA_ARG_B_R0_ALPHA          (10<<5)
+#define     R200_TXA_ARG_B_R0_BLUE           (11<<5)
+#define     R200_TXA_ARG_B_R1_ALPHA          (12<<5)
+#define     R200_TXA_ARG_B_R1_BLUE           (13<<5)
+#define     R200_TXA_ARG_B_R2_ALPHA          (14<<5)
+#define     R200_TXA_ARG_B_R2_BLUE           (15<<5)
+#define     R200_TXA_ARG_B_R3_ALPHA          (16<<5)
+#define     R200_TXA_ARG_B_R3_BLUE           (17<<5)
+#define     R200_TXA_ARG_B_R4_ALPHA          (18<<5)
+#define     R200_TXA_ARG_B_R4_BLUE           (19<<5)
+#define     R200_TXA_ARG_B_R5_ALPHA          (20<<5)
+#define     R200_TXA_ARG_B_R5_BLUE           (21<<5)
+#define     R200_TXA_ARG_B_TFACTOR1_ALPHA    (26<<5)
+#define     R200_TXA_ARG_B_TFACTOR1_BLUE     (27<<5)
+#define     R200_TXA_ARG_B_MASK			(31 << 5)
+#define     R200_TXA_ARG_B_SHIFT			5
+#define     R200_TXA_ARG_C_ZERO              (0<<10) /* ARG_C: same codes as ARG_A, shifted to bit 10 */
+#define     R200_TXA_ARG_C_CURRENT_ALPHA     (2<<10) /* guess */
+#define     R200_TXA_ARG_C_CURRENT_BLUE      (3<<10) /* guess */
+#define     R200_TXA_ARG_C_DIFFUSE_ALPHA     (4<<10)
+#define     R200_TXA_ARG_C_DIFFUSE_BLUE      (5<<10)
+#define     R200_TXA_ARG_C_SPECULAR_ALPHA    (6<<10)
+#define     R200_TXA_ARG_C_SPECULAR_BLUE     (7<<10)
+#define     R200_TXA_ARG_C_TFACTOR_ALPHA     (8<<10)
+#define     R200_TXA_ARG_C_TFACTOR_BLUE      (9<<10)
+#define     R200_TXA_ARG_C_R0_ALPHA          (10<<10)
+#define     R200_TXA_ARG_C_R0_BLUE           (11<<10)
+#define     R200_TXA_ARG_C_R1_ALPHA          (12<<10)
+#define     R200_TXA_ARG_C_R1_BLUE           (13<<10)
+#define     R200_TXA_ARG_C_R2_ALPHA          (14<<10)
+#define     R200_TXA_ARG_C_R2_BLUE           (15<<10)
+#define     R200_TXA_ARG_C_R3_ALPHA          (16<<10)
+#define     R200_TXA_ARG_C_R3_BLUE           (17<<10)
+#define     R200_TXA_ARG_C_R4_ALPHA          (18<<10)
+#define     R200_TXA_ARG_C_R4_BLUE           (19<<10)
+#define     R200_TXA_ARG_C_R5_ALPHA          (20<<10)
+#define     R200_TXA_ARG_C_R5_BLUE           (21<<10)
+#define     R200_TXA_ARG_C_TFACTOR1_ALPHA    (26<<10)
+#define     R200_TXA_ARG_C_TFACTOR1_BLUE     (27<<10)
+#define     R200_TXA_ARG_C_MASK			(31 << 10)
+#define     R200_TXA_ARG_C_SHIFT			10
+#define     R200_TXA_COMP_ARG_A                    (1 << 16) /* bits 16-19: COMP/BIAS/SCALE/NEG flags for argument A */
+#define     R200_TXA_COMP_ARG_A_SHIFT              (16)
+#define     R200_TXA_BIAS_ARG_A                    (1 << 17)
+#define     R200_TXA_SCALE_ARG_A                   (1 << 18)
+#define     R200_TXA_NEG_ARG_A                     (1 << 19)
+#define     R200_TXA_COMP_ARG_B                    (1 << 20) /* bits 20-23: the same four flags for argument B */
+#define     R200_TXA_COMP_ARG_B_SHIFT              (20)
+#define     R200_TXA_BIAS_ARG_B                    (1 << 21)
+#define     R200_TXA_SCALE_ARG_B                   (1 << 22)
+#define     R200_TXA_NEG_ARG_B                     (1 << 23)
+#define     R200_TXA_COMP_ARG_C                    (1 << 24) /* bits 24-27: the same four flags for argument C */
+#define     R200_TXA_COMP_ARG_C_SHIFT              (24)
+#define     R200_TXA_BIAS_ARG_C                    (1 << 25)
+#define     R200_TXA_SCALE_ARG_C                   (1 << 26)
+#define     R200_TXA_NEG_ARG_C                     (1 << 27)
+#define     R200_TXA_OP_MADD                       (0 << 28)
+#define     R200_TXA_OP_CND0                       (2 << 28)
+#define     R200_TXA_OP_LERP                       (3 << 28)
+#define     R200_TXA_OP_CONDITIONAL                (6 << 28) /* no TXA counterparts of the TXC DOT3/DOT4/DOT2_ADD ops are defined */
+#define     R200_TXA_OP_MASK                       (7 << 28) /* bits 28-30 */
+#define R200_PP_TXABLEND2_0                0x2f0c
+#define     R200_TXA_TFACTOR_SEL_SHIFT             0
+#define     R200_TXA_TFACTOR_SEL_MASK              0x7
+#define     R200_TXA_TFACTOR1_SEL_SHIFT            4
+#define     R200_TXA_TFACTOR1_SEL_MASK             (0x7 << 4)
+#define     R200_TXA_SCALE_SHIFT                   8
+#define     R200_TXA_SCALE_MASK                    (7 << 8)
+#define     R200_TXA_SCALE_1X                      (0 << 8)
+#define     R200_TXA_SCALE_2X                      (1 << 8)
+#define     R200_TXA_SCALE_4X                      (2 << 8)
+#define     R200_TXA_SCALE_8X                      (3 << 8)
+#define     R200_TXA_SCALE_INV2                    (5 << 8)
+#define     R200_TXA_SCALE_INV4                    (6 << 8)
+#define     R200_TXA_SCALE_INV8                    (7 << 8)
+#define     R200_TXA_CLAMP_SHIFT                   12
+#define     R200_TXA_CLAMP_MASK                    (3 << 12)
+#define     R200_TXA_CLAMP_WRAP                    (0 << 12)
+#define     R200_TXA_CLAMP_0_1                     (1 << 12)
+#define     R200_TXA_CLAMP_8_8                     (2 << 12)
+#define     R200_TXA_OUTPUT_REG_SHIFT              16
+#define     R200_TXA_OUTPUT_REG_MASK               (7 << 16)
+#define     R200_TXA_OUTPUT_REG_NONE               (0 << 16)
+#define     R200_TXA_OUTPUT_REG_R0                 (1 << 16)
+#define     R200_TXA_OUTPUT_REG_R1                 (2 << 16)
+#define     R200_TXA_OUTPUT_REG_R2                 (3 << 16)
+#define     R200_TXA_OUTPUT_REG_R3                 (4 << 16)
+#define     R200_TXA_OUTPUT_REG_R4                 (5 << 16)
+#define     R200_TXA_OUTPUT_REG_R5                 (6 << 16)
+#define     R200_TXA_DOT_ALPHA                     (1 << 20)
+#define     R200_TXA_REPL_NORMAL                   0  /* unshifted selector values; presumably placed with the *_SHIFT macros below */
+#define     R200_TXA_REPL_RED                      1
+#define     R200_TXA_REPL_GREEN                    2
+#define     R200_TXA_REPL_ARG_A_SHIFT              26
+#define     R200_TXA_REPL_ARG_A_MASK               (3 << 26)
+#define     R200_TXA_REPL_ARG_B_SHIFT              28
+#define     R200_TXA_REPL_ARG_B_MASK               (3 << 28)
+#define     R200_TXA_REPL_ARG_C_SHIFT              30
+#define     R200_TXA_REPL_ARG_C_MASK               (3U << 30) /* unsigned: 3 << 30 does not fit in a signed 32-bit int */
+#define R200_PP_TXCBLEND_1                0x2f10
+#define R200_PP_TXCBLEND2_1               0x2f14
+#define R200_PP_TXABLEND_1                0x2f18
+#define R200_PP_TXABLEND2_1               0x2f1c
+#define R200_PP_TXCBLEND_2                0x2f20
+#define R200_PP_TXCBLEND2_2               0x2f24
+#define R200_PP_TXABLEND_2                0x2f28
+#define R200_PP_TXABLEND2_2               0x2f2c
+#define R200_PP_TXCBLEND_3                0x2f30
+#define R200_PP_TXCBLEND2_3               0x2f34
+#define R200_PP_TXABLEND_3                0x2f38
+#define R200_PP_TXABLEND2_3               0x2f3c
+#define R200_PP_TXCBLEND_4                0x2f40
+#define R200_PP_TXCBLEND2_4               0x2f44
+#define R200_PP_TXABLEND_4                0x2f48
+#define R200_PP_TXABLEND2_4               0x2f4c
+#define R200_PP_TXCBLEND_5                0x2f50
+#define R200_PP_TXCBLEND2_5               0x2f54
+#define R200_PP_TXABLEND_5                0x2f58
+#define R200_PP_TXABLEND2_5               0x2f5c
+#define R200_PP_TXCBLEND_6                0x2f60
+#define R200_PP_TXCBLEND2_6               0x2f64
+#define R200_PP_TXABLEND_6                0x2f68
+#define R200_PP_TXABLEND2_6               0x2f6c
+#define R200_PP_TXCBLEND_7                0x2f70
+#define R200_PP_TXCBLEND2_7               0x2f74
+#define R200_PP_TXABLEND_7                0x2f78
+#define R200_PP_TXABLEND2_7               0x2f7c
+#define R200_PP_TXCBLEND_8                0x2f80
+#define R200_PP_TXCBLEND2_8               0x2f84
+#define R200_PP_TXABLEND_8                0x2f88
+#define R200_PP_TXABLEND2_8               0x2f8c
+#define R200_PP_TXCBLEND_9                0x2f90
+#define R200_PP_TXCBLEND2_9               0x2f94
+#define R200_PP_TXABLEND_9                0x2f98
+#define R200_PP_TXABLEND2_9               0x2f9c
+#define R200_PP_TXCBLEND_10               0x2fa0
+#define R200_PP_TXCBLEND2_10              0x2fa4
+#define R200_PP_TXABLEND_10               0x2fa8
+#define R200_PP_TXABLEND2_10              0x2fac
+#define R200_PP_TXCBLEND_11               0x2fb0
+#define R200_PP_TXCBLEND2_11              0x2fb4
+#define R200_PP_TXABLEND_11               0x2fb8
+#define R200_PP_TXABLEND2_11              0x2fbc
+#define R200_PP_TXCBLEND_12               0x2fc0
+#define R200_PP_TXCBLEND2_12              0x2fc4
+#define R200_PP_TXABLEND_12               0x2fc8
+#define R200_PP_TXABLEND2_12              0x2fcc
+#define R200_PP_TXCBLEND_13               0x2fd0
+#define R200_PP_TXCBLEND2_13              0x2fd4
+#define R200_PP_TXABLEND_13               0x2fd8
+#define R200_PP_TXABLEND2_13              0x2fdc
+#define R200_PP_TXCBLEND_14               0x2fe0
+#define R200_PP_TXCBLEND2_14              0x2fe4
+#define R200_PP_TXABLEND_14               0x2fe8
+#define R200_PP_TXABLEND2_14              0x2fec
+#define R200_PP_TXCBLEND_15               0x2ff0
+#define R200_PP_TXCBLEND2_15              0x2ff4
+#define R200_PP_TXABLEND_15               0x2ff8
+#define R200_PP_TXABLEND2_15              0x2ffc
+/* gap */
+#define R200_RB3D_BLENDCOLOR               0x3218 /* ARGB 8888 */
+#define R200_RB3D_ABLENDCNTL               0x321C /* see BLENDCTL */
+#define R200_RB3D_CBLENDCNTL               0x3220 /* see BLENDCTL */
+
+
+/*
+ * Offsets in TCL vector state.  NOTE: Hardwiring matrix positions.
+ * Multiple contexts could collaborate to eliminate state bouncing.
+ */
+#define R200_VS_LIGHT_AMBIENT_ADDR          0x00000028
+#define R200_VS_LIGHT_DIFFUSE_ADDR          0x00000030
+#define R200_VS_LIGHT_SPECULAR_ADDR         0x00000038
+#define R200_VS_LIGHT_DIRPOS_ADDR           0x00000040
+#define R200_VS_LIGHT_HWVSPOT_ADDR          0x00000048
+#define R200_VS_LIGHT_ATTENUATION_ADDR      0x00000050
+#define R200_VS_SPOT_DUAL_CONE              0x00000058
+#define R200_VS_GLOBAL_AMBIENT_ADDR         0x0000005C
+#define R200_VS_FOG_PARAM_ADDR              0x0000005D
+#define R200_VS_EYE_VECTOR_ADDR             0x0000005E
+#define R200_VS_UCP_ADDR                    0x00000060
+#define R200_VS_PNT_SPRITE_VPORT_SCALE      0x00000068
+#define R200_VS_MATRIX_0_MV                 0x00000080
+#define R200_VS_MATRIX_1_INV_MV        	    0x00000084
+#define R200_VS_MATRIX_2_MVP        	    0x00000088
+#define R200_VS_MATRIX_3_TEX0        	    0x0000008C
+#define R200_VS_MATRIX_4_TEX1        	    0x00000090
+#define R200_VS_MATRIX_5_TEX2        	    0x00000094
+#define R200_VS_MATRIX_6_TEX3        	    0x00000098
+#define R200_VS_MATRIX_7_TEX4        	    0x0000009C
+#define R200_VS_MATRIX_8_TEX5        	    0x000000A0
+#define R200_VS_MAT_0_EMISS                 0x000000B0
+#define R200_VS_MAT_0_AMB                   0x000000B1
+#define R200_VS_MAT_0_DIF                   0x000000B2
+#define R200_VS_MAT_0_SPEC                  0x000000B3
+#define R200_VS_MAT_1_EMISS                 0x000000B4
+#define R200_VS_MAT_1_AMB                   0x000000B5
+#define R200_VS_MAT_1_DIF                   0x000000B6
+#define R200_VS_MAT_1_SPEC                  0x000000B7
+#define R200_VS_EYE2CLIP_MTX                0x000000B8
+#define R200_VS_PNT_SPRITE_ATT_CONST        0x000000BC
+#define R200_VS_PNT_SPRITE_EYE_IN_MODEL     0x000000BD
+#define R200_VS_PNT_SPRITE_CLAMP            0x000000BE
+#define R200_VS_MAX                         0x000001C0
+
+#define R200_PVS_PROG0                      0x00000080
+#define R200_PVS_PROG1                      0x00000180
+#define R200_PVS_PARAM0                     0x00000000
+#define R200_PVS_PARAM1                     0x00000100
+
+/*
+ * Offsets in TCL scalar state
+ */
+#define R200_SS_LIGHT_DCD_ADDR              0x00000000
+#define R200_SS_LIGHT_DCM_ADDR              0x00000008
+#define R200_SS_LIGHT_SPOT_EXPONENT_ADDR    0x00000010
+#define R200_SS_LIGHT_SPOT_CUTOFF_ADDR      0x00000018
+#define R200_SS_LIGHT_SPECULAR_THRESH_ADDR  0x00000020
+#define R200_SS_LIGHT_RANGE_CUTOFF_SQRD     0x00000028
+#define R200_SS_LIGHT_RANGE_ATT_CONST       0x00000030
+#define R200_SS_VERT_GUARD_CLIP_ADJ_ADDR    0x00000080
+#define R200_SS_VERT_GUARD_DISCARD_ADJ_ADDR 0x00000081
+#define R200_SS_HORZ_GUARD_CLIP_ADJ_ADDR    0x00000082
+#define R200_SS_HORZ_GUARD_DISCARD_ADJ_ADDR 0x00000083
+#define R200_SS_MAT_0_SHININESS             0x00000100
+#define R200_SS_MAT_1_SHININESS             0x00000101
+
+
+/*
+ * Matrix indices
+ */
+#define R200_MTX_MV                        0
+#define R200_MTX_IMV                       1
+#define R200_MTX_MVP                       2
+#define R200_MTX_TEX0                      3
+#define R200_MTX_TEX1                      4
+#define R200_MTX_TEX2                      5
+#define R200_MTX_TEX3                      6
+#define R200_MTX_TEX4                      7
+#define R200_MTX_TEX5                      8
+
+/* Color formats for 2d packets
+ */
+#define R200_CP_COLOR_FORMAT_CI8	2
+#define R200_CP_COLOR_FORMAT_ARGB1555	3
+#define R200_CP_COLOR_FORMAT_RGB565	4
+#define R200_CP_COLOR_FORMAT_ARGB8888	6
+#define R200_CP_COLOR_FORMAT_RGB332	7
+#define R200_CP_COLOR_FORMAT_RGB8	9
+#define R200_CP_COLOR_FORMAT_ARGB4444	15
+
+
+/*
+ * CP type-3 packets
+ */
+#define R200_CP_CMD_NOP                 0xC0001000
+#define R200_CP_CMD_NEXT_CHAR           0xC0001900
+#define R200_CP_CMD_PLY_NEXTSCAN        0xC0001D00
+#define R200_CP_CMD_SET_SCISSORS        0xC0001E00
+#define R200_CP_CMD_LOAD_MICROCODE      0xC0002400
+#define R200_CP_CMD_WAIT_FOR_IDLE       0xC0002600
+#define R200_CP_CMD_3D_DRAW_VBUF        0xC0002800
+#define R200_CP_CMD_3D_DRAW_IMMD        0xC0002900
+#define R200_CP_CMD_3D_DRAW_INDX        0xC0002A00
+#define R200_CP_CMD_LOAD_PALETTE        0xC0002C00
+#define R200_CP_CMD_3D_LOAD_VBPNTR      0xC0002F00
+#define R200_CP_CMD_INDX_BUFFER         0xC0003300
+#define R200_CP_CMD_3D_DRAW_VBUF_2      0xC0003400
+#define R200_CP_CMD_3D_DRAW_IMMD_2      0xC0003500
+#define R200_CP_CMD_3D_DRAW_INDX_2      0xC0003600
+#define R200_CP_CMD_PAINT		0xC0009100
+#define R200_CP_CMD_BITBLT		0xC0009200
+#define R200_CP_CMD_SMALLTEXT		0xC0009300
+#define R200_CP_CMD_HOSTDATA_BLT	0xC0009400
+#define R200_CP_CMD_POLYLINE		0xC0009500
+#define R200_CP_CMD_POLYSCANLINES	0xC0009800
+#define R200_CP_CMD_PAINT_MULTI		0xC0009A00
+#define R200_CP_CMD_BITBLT_MULTI	0xC0009B00
+#define R200_CP_CMD_TRANS_BITBLT	0xC0009C00
+
+#endif
+
diff --git a/r200/r200_sanity.c b/r200/r200_sanity.c
new file mode 100644
index 0000000..3f2a866
--- /dev/null
+++ b/r200/r200_sanity.c
@@ -0,0 +1,1458 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_sanity.c,v 1.1 2002/10/30 12:51:52 alanh Exp $ */
+/**************************************************************************
+
+Copyright 2002 ATI Technologies Inc., Ontario, Canada, and
+                     Tungsten Graphics Inc, Cedar Park, TX.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ATI, TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *
+ */
+ 
+#include <errno.h> 
+
+#include "glheader.h"
+#include "imports.h"
+
+#include "r200_context.h"
+#include "r200_ioctl.h"
+#include "r200_sanity.h"
+#include "radeon_reg.h"
+#include "r200_reg.h"
+
+/* Set this to '1' to get more verbiage.
+ */
+#define MORE_VERBOSE 1
+
+#if MORE_VERBOSE
+#define VERBOSE (R200_DEBUG & DEBUG_VERBOSE) /* follows the DEBUG_VERBOSE bit of R200_DEBUG */
+#define NORMAL  (1)                          /* always true */
+#else
+#define VERBOSE 0                            /* always false */
+#define NORMAL  (R200_DEBUG & DEBUG_VERBOSE) /* follows the DEBUG_VERBOSE bit of R200_DEBUG */
+#endif
+
+
+/* New (1.3) state mechanism.  3 commands (packet, scalar, vector) in
+ * 1.3 cmdbuffers allow all previous state to be updated as well as
+ * the tcl scalar and vector areas.  
+ */
+static struct {
+   int start;        /* offset of the first register covered by the packet */
+   int len;          /* number of consecutive registers, counted from start */
+   const char *name; /* human-readable label for the packet */
+} packet[RADEON_MAX_STATE_PACKETS] = { /* positional table (see index notes): keep entry order */
+   { RADEON_PP_MISC,7,"RADEON_PP_MISC" },
+   { RADEON_PP_CNTL,3,"RADEON_PP_CNTL" },
+   { RADEON_RB3D_COLORPITCH,1,"RADEON_RB3D_COLORPITCH" },
+   { RADEON_RE_LINE_PATTERN,2,"RADEON_RE_LINE_PATTERN" },
+   { RADEON_SE_LINE_WIDTH,1,"RADEON_SE_LINE_WIDTH" },
+   { RADEON_PP_LUM_MATRIX,1,"RADEON_PP_LUM_MATRIX" },
+   { RADEON_PP_ROT_MATRIX_0,2,"RADEON_PP_ROT_MATRIX_0" },
+   { RADEON_RB3D_STENCILREFMASK,3,"RADEON_RB3D_STENCILREFMASK" },
+   { RADEON_SE_VPORT_XSCALE,6,"RADEON_SE_VPORT_XSCALE" },
+   { RADEON_SE_CNTL,2,"RADEON_SE_CNTL" },
+   { RADEON_SE_CNTL_STATUS,1,"RADEON_SE_CNTL_STATUS" },
+   { RADEON_RE_MISC,1,"RADEON_RE_MISC" },
+   { RADEON_PP_TXFILTER_0,6,"RADEON_PP_TXFILTER_0" },
+   { RADEON_PP_BORDER_COLOR_0,1,"RADEON_PP_BORDER_COLOR_0" },
+   { RADEON_PP_TXFILTER_1,6,"RADEON_PP_TXFILTER_1" },
+   { RADEON_PP_BORDER_COLOR_1,1,"RADEON_PP_BORDER_COLOR_1" },
+   { RADEON_PP_TXFILTER_2,6,"RADEON_PP_TXFILTER_2" },
+   { RADEON_PP_BORDER_COLOR_2,1,"RADEON_PP_BORDER_COLOR_2" },
+   { RADEON_SE_ZBIAS_FACTOR,2,"RADEON_SE_ZBIAS_FACTOR" },
+   { RADEON_SE_TCL_OUTPUT_VTX_FMT,11,"RADEON_SE_TCL_OUTPUT_VTX_FMT" },
+   { RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED,17,"RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED" },
+   { R200_PP_TXCBLEND_0, 4, "R200_PP_TXCBLEND_0" },
+   { R200_PP_TXCBLEND_1, 4, "R200_PP_TXCBLEND_1" },
+   { R200_PP_TXCBLEND_2, 4, "R200_PP_TXCBLEND_2" },
+   { R200_PP_TXCBLEND_3, 4, "R200_PP_TXCBLEND_3" },
+   { R200_PP_TXCBLEND_4, 4, "R200_PP_TXCBLEND_4" },
+   { R200_PP_TXCBLEND_5, 4, "R200_PP_TXCBLEND_5" },
+   { R200_PP_TXCBLEND_6, 4, "R200_PP_TXCBLEND_6" },
+   { R200_PP_TXCBLEND_7, 4, "R200_PP_TXCBLEND_7" },
+   { R200_SE_TCL_LIGHT_MODEL_CTL_0, 6, "R200_SE_TCL_LIGHT_MODEL_CTL_0" },
+   { R200_PP_TFACTOR_0, 6, "R200_PP_TFACTOR_0" },
+   { R200_SE_VTX_FMT_0, 4, "R200_SE_VTX_FMT_0" },
+   { R200_SE_VAP_CNTL, 1, "R200_SE_VAP_CNTL" },
+   { R200_SE_TCL_MATRIX_SEL_0, 5, "R200_SE_TCL_MATRIX_SEL_0" },
+   { R200_SE_TCL_TEX_PROC_CTL_2, 5, "R200_SE_TCL_TEX_PROC_CTL_2" },
+   { R200_SE_TCL_UCP_VERT_BLEND_CTL, 1, "R200_SE_TCL_UCP_VERT_BLEND_CTL" },
+   { R200_PP_TXFILTER_0, 6, "R200_PP_TXFILTER_0" },
+   { R200_PP_TXFILTER_1, 6, "R200_PP_TXFILTER_1" },
+   { R200_PP_TXFILTER_2, 6, "R200_PP_TXFILTER_2" },
+   { R200_PP_TXFILTER_3, 6, "R200_PP_TXFILTER_3" },
+   { R200_PP_TXFILTER_4, 6, "R200_PP_TXFILTER_4" },
+   { R200_PP_TXFILTER_5, 6, "R200_PP_TXFILTER_5" },
+   { R200_PP_TXOFFSET_0, 1, "R200_PP_TXOFFSET_0" },
+   { R200_PP_TXOFFSET_1, 1, "R200_PP_TXOFFSET_1" },
+   { R200_PP_TXOFFSET_2, 1, "R200_PP_TXOFFSET_2" },
+   { R200_PP_TXOFFSET_3, 1, "R200_PP_TXOFFSET_3" },
+   { R200_PP_TXOFFSET_4, 1, "R200_PP_TXOFFSET_4" },
+   { R200_PP_TXOFFSET_5, 1, "R200_PP_TXOFFSET_5" },
+   { R200_SE_VTE_CNTL, 1, "R200_SE_VTE_CNTL" },
+   { R200_SE_TCL_OUTPUT_VTX_COMP_SEL, 1, "R200_SE_TCL_OUTPUT_VTX_COMP_SEL" },
+   { R200_PP_TAM_DEBUG3, 1, "R200_PP_TAM_DEBUG3" },
+   { R200_PP_CNTL_X, 1, "R200_PP_CNTL_X" },
+   { R200_RB3D_DEPTHXY_OFFSET, 1, "R200_RB3D_DEPTHXY_OFFSET" },
+   { R200_RE_AUX_SCISSOR_CNTL, 1, "R200_RE_AUX_SCISSOR_CNTL" },
+   { R200_RE_SCISSOR_TL_0, 2, "R200_RE_SCISSOR_TL_0" },
+   { R200_RE_SCISSOR_TL_1, 2, "R200_RE_SCISSOR_TL_1" },
+   { R200_RE_SCISSOR_TL_2, 2, "R200_RE_SCISSOR_TL_2" },
+   { R200_SE_VAP_CNTL_STATUS, 1, "R200_SE_VAP_CNTL_STATUS" },
+   { R200_SE_VTX_STATE_CNTL, 1, "R200_SE_VTX_STATE_CNTL" },
+   { R200_RE_POINTSIZE, 1, "R200_RE_POINTSIZE" },
+   { R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0, 4, "R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0" },
+   { R200_PP_CUBIC_FACES_0, 1, "R200_PP_CUBIC_FACES_0" }, /* 61 */
+   { R200_PP_CUBIC_OFFSET_F1_0, 5, "R200_PP_CUBIC_OFFSET_F1_0" }, /* 62 */
+   { R200_PP_CUBIC_FACES_1, 1, "R200_PP_CUBIC_FACES_1" },
+   { R200_PP_CUBIC_OFFSET_F1_1, 5, "R200_PP_CUBIC_OFFSET_F1_1" },
+   { R200_PP_CUBIC_FACES_2, 1, "R200_PP_CUBIC_FACES_2" },
+   { R200_PP_CUBIC_OFFSET_F1_2, 5, "R200_PP_CUBIC_OFFSET_F1_2" },
+   { R200_PP_CUBIC_FACES_3, 1, "R200_PP_CUBIC_FACES_3" },
+   { R200_PP_CUBIC_OFFSET_F1_3, 5, "R200_PP_CUBIC_OFFSET_F1_3" },
+   { R200_PP_CUBIC_FACES_4, 1, "R200_PP_CUBIC_FACES_4" },
+   { R200_PP_CUBIC_OFFSET_F1_4, 5, "R200_PP_CUBIC_OFFSET_F1_4" },
+   { R200_PP_CUBIC_FACES_5, 1, "R200_PP_CUBIC_FACES_5" },
+   { R200_PP_CUBIC_OFFSET_F1_5, 5, "R200_PP_CUBIC_OFFSET_F1_5" },
+   { RADEON_PP_TEX_SIZE_0, 2, "RADEON_PP_TEX_SIZE_0" },
+   { RADEON_PP_TEX_SIZE_1, 2, "RADEON_PP_TEX_SIZE_1" },
+   { RADEON_PP_TEX_SIZE_2, 2, "RADEON_PP_TEX_SIZE_2" },
+   { R200_RB3D_BLENDCOLOR, 3, "R200_RB3D_BLENDCOLOR" },
+   { R200_SE_TCL_POINT_SPRITE_CNTL, 1, "R200_SE_TCL_POINT_SPRITE_CNTL" },
+   { RADEON_PP_CUBIC_FACES_0, 1, "RADEON_PP_CUBIC_FACES_0" },
+   { RADEON_PP_CUBIC_OFFSET_T0_0, 5, "RADEON_PP_CUBIC_OFFSET_T0_0" },
+   { RADEON_PP_CUBIC_FACES_1, 1, "RADEON_PP_CUBIC_FACES_1" },
+   { RADEON_PP_CUBIC_OFFSET_T1_0, 5, "RADEON_PP_CUBIC_OFFSET_T1_0" },
+   { RADEON_PP_CUBIC_FACES_2, 1, "RADEON_PP_CUBIC_FACES_2" },
+   { RADEON_PP_CUBIC_OFFSET_T2_0, 5, "RADEON_PP_CUBIC_OFFSET_T2_0" },
+   { R200_PP_TRI_PERF, 2, "R200_PP_TRI_PERF" },
+   { R200_PP_TXCBLEND_8, 32, "R200_PP_AFS_0"},   /* 85 */
+   { R200_PP_TXCBLEND_0, 32, "R200_PP_AFS_1"},
+   { R200_PP_TFACTOR_0, 8, "R200_ATF_TFACTOR"},
+   { R200_PP_TXFILTER_0, 8, "R200_PP_TXCTLALL_0"},
+   { R200_PP_TXFILTER_1, 8, "R200_PP_TXCTLALL_1"},
+   { R200_PP_TXFILTER_2, 8, "R200_PP_TXCTLALL_2"},
+   { R200_PP_TXFILTER_3, 8, "R200_PP_TXCTLALL_3"},
+   { R200_PP_TXFILTER_4, 8, "R200_PP_TXCTLALL_4"},
+   { R200_PP_TXFILTER_5, 8, "R200_PP_TXCTLALL_5"},
+   { R200_VAP_PVS_CNTL_1, 2, "R200_VAP_PVS_CNTL"},
+};
+
+struct reg_names {
+   int idx;           /* register offset, e.g. R200_PP_MISC */
+   const char *name;  /* the register macro's name spelled as a string */
+};
+
+static struct reg_names reg_names[] = {
+   { R200_PP_MISC, "R200_PP_MISC" },
+   { R200_PP_FOG_COLOR, "R200_PP_FOG_COLOR" },
+   { R200_RE_SOLID_COLOR, "R200_RE_SOLID_COLOR" },
+   { R200_RB3D_BLENDCNTL, "R200_RB3D_BLENDCNTL" },
+   { R200_RB3D_DEPTHOFFSET, "R200_RB3D_DEPTHOFFSET" },
+   { R200_RB3D_DEPTHPITCH, "R200_RB3D_DEPTHPITCH" },
+   { R200_RB3D_ZSTENCILCNTL, "R200_RB3D_ZSTENCILCNTL" },
+   { R200_PP_CNTL, "R200_PP_CNTL" },
+   { R200_RB3D_CNTL, "R200_RB3D_CNTL" },
+   { R200_RB3D_COLOROFFSET, "R200_RB3D_COLOROFFSET" },
+   { R200_RE_WIDTH_HEIGHT, "R200_RE_WIDTH_HEIGHT" },
+   { R200_RB3D_COLORPITCH, "R200_RB3D_COLORPITCH" },
+   { R200_SE_CNTL, "R200_SE_CNTL" },
+   { R200_RE_CNTL, "R200_RE_CNTL" },
+   { R200_RE_MISC, "R200_RE_MISC" },
+   { R200_RE_STIPPLE_ADDR, "R200_RE_STIPPLE_ADDR" },
+   { R200_RE_STIPPLE_DATA, "R200_RE_STIPPLE_DATA" },
+   { R200_RE_LINE_PATTERN, "R200_RE_LINE_PATTERN" },
+   { R200_RE_LINE_STATE, "R200_RE_LINE_STATE" },
+   { R200_RE_SCISSOR_TL_0, "R200_RE_SCISSOR_TL_0" },
+   { R200_RE_SCISSOR_BR_0, "R200_RE_SCISSOR_BR_0" },
+   { R200_RE_SCISSOR_TL_1, "R200_RE_SCISSOR_TL_1" },
+   { R200_RE_SCISSOR_BR_1, "R200_RE_SCISSOR_BR_1" },
+   { R200_RE_SCISSOR_TL_2, "R200_RE_SCISSOR_TL_2" },
+   { R200_RE_SCISSOR_BR_2, "R200_RE_SCISSOR_BR_2" },
+   { R200_RB3D_DEPTHXY_OFFSET, "R200_RB3D_DEPTHXY_OFFSET" },
+   { R200_RB3D_STENCILREFMASK, "R200_RB3D_STENCILREFMASK" },
+   { R200_RB3D_ROPCNTL, "R200_RB3D_ROPCNTL" },
+   { R200_RB3D_PLANEMASK, "R200_RB3D_PLANEMASK" },
+   { R200_SE_VPORT_XSCALE, "R200_SE_VPORT_XSCALE" },
+   { R200_SE_VPORT_XOFFSET, "R200_SE_VPORT_XOFFSET" },
+   { R200_SE_VPORT_YSCALE, "R200_SE_VPORT_YSCALE" },
+   { R200_SE_VPORT_YOFFSET, "R200_SE_VPORT_YOFFSET" },
+   { R200_SE_VPORT_ZSCALE, "R200_SE_VPORT_ZSCALE" },
+   { R200_SE_VPORT_ZOFFSET, "R200_SE_VPORT_ZOFFSET" },
+   { R200_SE_ZBIAS_FACTOR, "R200_SE_ZBIAS_FACTOR" },
+   { R200_SE_ZBIAS_CONSTANT, "R200_SE_ZBIAS_CONSTANT" },
+   { R200_SE_LINE_WIDTH, "R200_SE_LINE_WIDTH" },
+   { R200_SE_VAP_CNTL, "R200_SE_VAP_CNTL" },
+   { R200_SE_VF_CNTL, "R200_SE_VF_CNTL" },
+   { R200_SE_VTX_FMT_0, "R200_SE_VTX_FMT_0" },
+   { R200_SE_VTX_FMT_1, "R200_SE_VTX_FMT_1" },
+   { R200_SE_TCL_OUTPUT_VTX_FMT_0, "R200_SE_TCL_OUTPUT_VTX_FMT_0" },
+   { R200_SE_TCL_OUTPUT_VTX_FMT_1, "R200_SE_TCL_OUTPUT_VTX_FMT_1" },
+   { R200_SE_VTE_CNTL, "R200_SE_VTE_CNTL" },
+   { R200_SE_VTX_NUM_ARRAYS, "R200_SE_VTX_NUM_ARRAYS" },
+   { R200_SE_VTX_AOS_ATTR01, "R200_SE_VTX_AOS_ATTR01" },
+   { R200_SE_VTX_AOS_ADDR0, "R200_SE_VTX_AOS_ADDR0" },
+   { R200_SE_VTX_AOS_ADDR1, "R200_SE_VTX_AOS_ADDR1" },
+   { R200_SE_VTX_AOS_ATTR23, "R200_SE_VTX_AOS_ATTR23" },
+   { R200_SE_VTX_AOS_ADDR2, "R200_SE_VTX_AOS_ADDR2" },
+   { R200_SE_VTX_AOS_ADDR3, "R200_SE_VTX_AOS_ADDR3" },
+   { R200_SE_VTX_AOS_ATTR45, "R200_SE_VTX_AOS_ATTR45" },
+   { R200_SE_VTX_AOS_ADDR4, "R200_SE_VTX_AOS_ADDR4" },
+   { R200_SE_VTX_AOS_ADDR5, "R200_SE_VTX_AOS_ADDR5" },
+   { R200_SE_VTX_AOS_ATTR67, "R200_SE_VTX_AOS_ATTR67" },
+   { R200_SE_VTX_AOS_ADDR6, "R200_SE_VTX_AOS_ADDR6" },
+   { R200_SE_VTX_AOS_ADDR7, "R200_SE_VTX_AOS_ADDR7" },
+   { R200_SE_VTX_AOS_ATTR89, "R200_SE_VTX_AOS_ATTR89" },
+   { R200_SE_VTX_AOS_ADDR8, "R200_SE_VTX_AOS_ADDR8" },
+   { R200_SE_VTX_AOS_ADDR9, "R200_SE_VTX_AOS_ADDR9" },
+   { R200_SE_VTX_AOS_ATTR1011, "R200_SE_VTX_AOS_ATTR1011" },
+   { R200_SE_VTX_AOS_ADDR10, "R200_SE_VTX_AOS_ADDR10" },
+   { R200_SE_VTX_AOS_ADDR11, "R200_SE_VTX_AOS_ADDR11" },
+   { R200_SE_VF_MAX_VTX_INDX, "R200_SE_VF_MAX_VTX_INDX" },
+   { R200_SE_VF_MIN_VTX_INDX, "R200_SE_VF_MIN_VTX_INDX" },
+   { R200_SE_VTX_STATE_CNTL, "R200_SE_VTX_STATE_CNTL" },
+   { R200_SE_TCL_VECTOR_INDX_REG, "R200_SE_TCL_VECTOR_INDX_REG" },
+   { R200_SE_TCL_VECTOR_DATA_REG, "R200_SE_TCL_VECTOR_DATA_REG" },
+   { R200_SE_TCL_SCALAR_INDX_REG, "R200_SE_TCL_SCALAR_INDX_REG" },
+   { R200_SE_TCL_SCALAR_DATA_REG, "R200_SE_TCL_SCALAR_DATA_REG" },
+   { R200_SE_TCL_MATRIX_SEL_0, "R200_SE_TCL_MATRIX_SEL_0" },
+   { R200_SE_TCL_MATRIX_SEL_1, "R200_SE_TCL_MATRIX_SEL_1" },
+   { R200_SE_TCL_MATRIX_SEL_2, "R200_SE_TCL_MATRIX_SEL_2" },
+   { R200_SE_TCL_MATRIX_SEL_3, "R200_SE_TCL_MATRIX_SEL_3" },
+   { R200_SE_TCL_MATRIX_SEL_4, "R200_SE_TCL_MATRIX_SEL_4" },
+   { R200_SE_TCL_LIGHT_MODEL_CTL_0, "R200_SE_TCL_LIGHT_MODEL_CTL_0" },
+   { R200_SE_TCL_LIGHT_MODEL_CTL_1, "R200_SE_TCL_LIGHT_MODEL_CTL_1" },
+   { R200_SE_TCL_PER_LIGHT_CTL_0, "R200_SE_TCL_PER_LIGHT_CTL_0" },
+   { R200_SE_TCL_PER_LIGHT_CTL_1, "R200_SE_TCL_PER_LIGHT_CTL_1" },
+   { R200_SE_TCL_PER_LIGHT_CTL_2, "R200_SE_TCL_PER_LIGHT_CTL_2" },
+   { R200_SE_TCL_PER_LIGHT_CTL_3, "R200_SE_TCL_PER_LIGHT_CTL_3" },
+   { R200_SE_TCL_TEX_PROC_CTL_2, "R200_SE_TCL_TEX_PROC_CTL_2" },
+   { R200_SE_TCL_TEX_PROC_CTL_3, "R200_SE_TCL_TEX_PROC_CTL_3" },
+   { R200_SE_TCL_TEX_PROC_CTL_0, "R200_SE_TCL_TEX_PROC_CTL_0" },
+   { R200_SE_TCL_TEX_PROC_CTL_1, "R200_SE_TCL_TEX_PROC_CTL_1" },
+   { R200_SE_TC_TEX_CYL_WRAP_CTL, "R200_SE_TC_TEX_CYL_WRAP_CTL" },
+   { R200_SE_TCL_UCP_VERT_BLEND_CTL, "R200_SE_TCL_UCP_VERT_BLEND_CTL" },
+   { R200_SE_TCL_POINT_SPRITE_CNTL, "R200_SE_TCL_POINT_SPRITE_CNTL" },
+   { R200_SE_VTX_ST_POS_0_X_4, "R200_SE_VTX_ST_POS_0_X_4" },
+   { R200_SE_VTX_ST_POS_0_Y_4, "R200_SE_VTX_ST_POS_0_Y_4" },
+   { R200_SE_VTX_ST_POS_0_Z_4, "R200_SE_VTX_ST_POS_0_Z_4" },
+   { R200_SE_VTX_ST_POS_0_W_4, "R200_SE_VTX_ST_POS_0_W_4" },
+   { R200_SE_VTX_ST_NORM_0_X, "R200_SE_VTX_ST_NORM_0_X" },
+   { R200_SE_VTX_ST_NORM_0_Y, "R200_SE_VTX_ST_NORM_0_Y" },
+   { R200_SE_VTX_ST_NORM_0_Z, "R200_SE_VTX_ST_NORM_0_Z" },
+   { R200_SE_VTX_ST_PVMS, "R200_SE_VTX_ST_PVMS" },
+   { R200_SE_VTX_ST_CLR_0_R, "R200_SE_VTX_ST_CLR_0_R" },
+   { R200_SE_VTX_ST_CLR_0_G, "R200_SE_VTX_ST_CLR_0_G" },
+   { R200_SE_VTX_ST_CLR_0_B, "R200_SE_VTX_ST_CLR_0_B" },
+   { R200_SE_VTX_ST_CLR_0_A, "R200_SE_VTX_ST_CLR_0_A" },
+   { R200_SE_VTX_ST_CLR_1_R, "R200_SE_VTX_ST_CLR_1_R" },
+   { R200_SE_VTX_ST_CLR_1_G, "R200_SE_VTX_ST_CLR_1_G" },
+   { R200_SE_VTX_ST_CLR_1_B, "R200_SE_VTX_ST_CLR_1_B" },
+   { R200_SE_VTX_ST_CLR_1_A, "R200_SE_VTX_ST_CLR_1_A" },
+   { R200_SE_VTX_ST_CLR_2_R, "R200_SE_VTX_ST_CLR_2_R" },
+   { R200_SE_VTX_ST_CLR_2_G, "R200_SE_VTX_ST_CLR_2_G" },
+   { R200_SE_VTX_ST_CLR_2_B, "R200_SE_VTX_ST_CLR_2_B" },
+   { R200_SE_VTX_ST_CLR_2_A, "R200_SE_VTX_ST_CLR_2_A" },
+   { R200_SE_VTX_ST_CLR_3_R, "R200_SE_VTX_ST_CLR_3_R" },
+   { R200_SE_VTX_ST_CLR_3_G, "R200_SE_VTX_ST_CLR_3_G" },
+   { R200_SE_VTX_ST_CLR_3_B, "R200_SE_VTX_ST_CLR_3_B" },
+   { R200_SE_VTX_ST_CLR_3_A, "R200_SE_VTX_ST_CLR_3_A" },
+   { R200_SE_VTX_ST_CLR_4_R, "R200_SE_VTX_ST_CLR_4_R" },
+   { R200_SE_VTX_ST_CLR_4_G, "R200_SE_VTX_ST_CLR_4_G" },
+   { R200_SE_VTX_ST_CLR_4_B, "R200_SE_VTX_ST_CLR_4_B" },
+   { R200_SE_VTX_ST_CLR_4_A, "R200_SE_VTX_ST_CLR_4_A" },
+   { R200_SE_VTX_ST_CLR_5_R, "R200_SE_VTX_ST_CLR_5_R" },
+   { R200_SE_VTX_ST_CLR_5_G, "R200_SE_VTX_ST_CLR_5_G" },
+   { R200_SE_VTX_ST_CLR_5_B, "R200_SE_VTX_ST_CLR_5_B" },
+   { R200_SE_VTX_ST_CLR_5_A, "R200_SE_VTX_ST_CLR_5_A" },
+   { R200_SE_VTX_ST_CLR_6_R, "R200_SE_VTX_ST_CLR_6_R" },
+   { R200_SE_VTX_ST_CLR_6_G, "R200_SE_VTX_ST_CLR_6_G" },
+   { R200_SE_VTX_ST_CLR_6_B, "R200_SE_VTX_ST_CLR_6_B" },
+   { R200_SE_VTX_ST_CLR_6_A, "R200_SE_VTX_ST_CLR_6_A" },
+   { R200_SE_VTX_ST_CLR_7_R, "R200_SE_VTX_ST_CLR_7_R" },
+   { R200_SE_VTX_ST_CLR_7_G, "R200_SE_VTX_ST_CLR_7_G" },
+   { R200_SE_VTX_ST_CLR_7_B, "R200_SE_VTX_ST_CLR_7_B" },
+   { R200_SE_VTX_ST_CLR_7_A, "R200_SE_VTX_ST_CLR_7_A" },
+   { R200_SE_VTX_ST_TEX_0_S, "R200_SE_VTX_ST_TEX_0_S" },
+   { R200_SE_VTX_ST_TEX_0_T, "R200_SE_VTX_ST_TEX_0_T" },
+   { R200_SE_VTX_ST_TEX_0_R, "R200_SE_VTX_ST_TEX_0_R" },
+   { R200_SE_VTX_ST_TEX_0_Q, "R200_SE_VTX_ST_TEX_0_Q" },
+   { R200_SE_VTX_ST_TEX_1_S, "R200_SE_VTX_ST_TEX_1_S" },
+   { R200_SE_VTX_ST_TEX_1_T, "R200_SE_VTX_ST_TEX_1_T" },
+   { R200_SE_VTX_ST_TEX_1_R, "R200_SE_VTX_ST_TEX_1_R" },
+   { R200_SE_VTX_ST_TEX_1_Q, "R200_SE_VTX_ST_TEX_1_Q" },
+   { R200_SE_VTX_ST_TEX_2_S, "R200_SE_VTX_ST_TEX_2_S" },
+   { R200_SE_VTX_ST_TEX_2_T, "R200_SE_VTX_ST_TEX_2_T" },
+   { R200_SE_VTX_ST_TEX_2_R, "R200_SE_VTX_ST_TEX_2_R" },
+   { R200_SE_VTX_ST_TEX_2_Q, "R200_SE_VTX_ST_TEX_2_Q" },
+   { R200_SE_VTX_ST_TEX_3_S, "R200_SE_VTX_ST_TEX_3_S" },
+   { R200_SE_VTX_ST_TEX_3_T, "R200_SE_VTX_ST_TEX_3_T" },
+   { R200_SE_VTX_ST_TEX_3_R, "R200_SE_VTX_ST_TEX_3_R" },
+   { R200_SE_VTX_ST_TEX_3_Q, "R200_SE_VTX_ST_TEX_3_Q" },
+   { R200_SE_VTX_ST_TEX_4_S, "R200_SE_VTX_ST_TEX_4_S" },
+   { R200_SE_VTX_ST_TEX_4_T, "R200_SE_VTX_ST_TEX_4_T" },
+   { R200_SE_VTX_ST_TEX_4_R, "R200_SE_VTX_ST_TEX_4_R" },
+   { R200_SE_VTX_ST_TEX_4_Q, "R200_SE_VTX_ST_TEX_4_Q" },
+   { R200_SE_VTX_ST_TEX_5_S, "R200_SE_VTX_ST_TEX_5_S" },
+   { R200_SE_VTX_ST_TEX_5_T, "R200_SE_VTX_ST_TEX_5_T" },
+   { R200_SE_VTX_ST_TEX_5_R, "R200_SE_VTX_ST_TEX_5_R" },
+   { R200_SE_VTX_ST_TEX_5_Q, "R200_SE_VTX_ST_TEX_5_Q" },
+   { R200_SE_VTX_ST_PNT_SPRT_SZ, "R200_SE_VTX_ST_PNT_SPRT_SZ" },
+   { R200_SE_VTX_ST_DISC_FOG, "R200_SE_VTX_ST_DISC_FOG" },
+   { R200_SE_VTX_ST_SHININESS_0, "R200_SE_VTX_ST_SHININESS_0" },
+   { R200_SE_VTX_ST_SHININESS_1, "R200_SE_VTX_ST_SHININESS_1" },
+   { R200_SE_VTX_ST_BLND_WT_0, "R200_SE_VTX_ST_BLND_WT_0" },
+   { R200_SE_VTX_ST_BLND_WT_1, "R200_SE_VTX_ST_BLND_WT_1" },
+   { R200_SE_VTX_ST_BLND_WT_2, "R200_SE_VTX_ST_BLND_WT_2" },
+   { R200_SE_VTX_ST_BLND_WT_3, "R200_SE_VTX_ST_BLND_WT_3" },
+   { R200_SE_VTX_ST_POS_1_X, "R200_SE_VTX_ST_POS_1_X" },
+   { R200_SE_VTX_ST_POS_1_Y, "R200_SE_VTX_ST_POS_1_Y" },
+   { R200_SE_VTX_ST_POS_1_Z, "R200_SE_VTX_ST_POS_1_Z" },
+   { R200_SE_VTX_ST_POS_1_W, "R200_SE_VTX_ST_POS_1_W" },
+   { R200_SE_VTX_ST_NORM_1_X, "R200_SE_VTX_ST_NORM_1_X" },
+   { R200_SE_VTX_ST_NORM_1_Y, "R200_SE_VTX_ST_NORM_1_Y" },
+   { R200_SE_VTX_ST_NORM_1_Z, "R200_SE_VTX_ST_NORM_1_Z" },
+   { R200_SE_VTX_ST_USR_CLR_0_R, "R200_SE_VTX_ST_USR_CLR_0_R" },
+   { R200_SE_VTX_ST_USR_CLR_0_G, "R200_SE_VTX_ST_USR_CLR_0_G" },
+   { R200_SE_VTX_ST_USR_CLR_0_B, "R200_SE_VTX_ST_USR_CLR_0_B" },
+   { R200_SE_VTX_ST_USR_CLR_0_A, "R200_SE_VTX_ST_USR_CLR_0_A" },
+   { R200_SE_VTX_ST_USR_CLR_1_R, "R200_SE_VTX_ST_USR_CLR_1_R" },
+   { R200_SE_VTX_ST_USR_CLR_1_G, "R200_SE_VTX_ST_USR_CLR_1_G" },
+   { R200_SE_VTX_ST_USR_CLR_1_B, "R200_SE_VTX_ST_USR_CLR_1_B" },
+   { R200_SE_VTX_ST_USR_CLR_1_A, "R200_SE_VTX_ST_USR_CLR_1_A" },
+   { R200_SE_VTX_ST_CLR_0_PKD, "R200_SE_VTX_ST_CLR_0_PKD" },
+   { R200_SE_VTX_ST_CLR_1_PKD, "R200_SE_VTX_ST_CLR_1_PKD" },
+   { R200_SE_VTX_ST_CLR_2_PKD, "R200_SE_VTX_ST_CLR_2_PKD" },
+   { R200_SE_VTX_ST_CLR_3_PKD, "R200_SE_VTX_ST_CLR_3_PKD" },
+   { R200_SE_VTX_ST_CLR_4_PKD, "R200_SE_VTX_ST_CLR_4_PKD" },
+   { R200_SE_VTX_ST_CLR_5_PKD, "R200_SE_VTX_ST_CLR_5_PKD" },
+   { R200_SE_VTX_ST_CLR_6_PKD, "R200_SE_VTX_ST_CLR_6_PKD" },
+   { R200_SE_VTX_ST_CLR_7_PKD, "R200_SE_VTX_ST_CLR_7_PKD" },
+   { R200_SE_VTX_ST_POS_0_X_2, "R200_SE_VTX_ST_POS_0_X_2" },
+   { R200_SE_VTX_ST_POS_0_Y_2, "R200_SE_VTX_ST_POS_0_Y_2" },
+   { R200_SE_VTX_ST_PAR_CLR_LD, "R200_SE_VTX_ST_PAR_CLR_LD" },
+   { R200_SE_VTX_ST_USR_CLR_PKD, "R200_SE_VTX_ST_USR_CLR_PKD" },
+   { R200_SE_VTX_ST_POS_0_X_3, "R200_SE_VTX_ST_POS_0_X_3" },
+   { R200_SE_VTX_ST_POS_0_Y_3, "R200_SE_VTX_ST_POS_0_Y_3" },
+   { R200_SE_VTX_ST_POS_0_Z_3, "R200_SE_VTX_ST_POS_0_Z_3" },
+   { R200_SE_VTX_ST_END_OF_PKT, "R200_SE_VTX_ST_END_OF_PKT" },
+   { R200_RE_POINTSIZE, "R200_RE_POINTSIZE" },
+   { R200_RE_TOP_LEFT, "R200_RE_TOP_LEFT" },
+   { R200_RE_AUX_SCISSOR_CNTL, "R200_RE_AUX_SCISSOR_CNTL" },
+   { R200_PP_TXFILTER_0, "R200_PP_TXFILTER_0" },
+   { R200_PP_TXFORMAT_0, "R200_PP_TXFORMAT_0" },
+   { R200_PP_TXSIZE_0, "R200_PP_TXSIZE_0" },
+   { R200_PP_TXFORMAT_X_0, "R200_PP_TXFORMAT_X_0" },
+   { R200_PP_TXPITCH_0, "R200_PP_TXPITCH_0" },
+   { R200_PP_BORDER_COLOR_0, "R200_PP_BORDER_COLOR_0" },
+   { R200_PP_CUBIC_FACES_0, "R200_PP_CUBIC_FACES_0" },
+   { R200_PP_TXMULTI_CTL_0, "R200_PP_TXMULTI_CTL_0" },
+   { R200_PP_TXFILTER_1, "R200_PP_TXFILTER_1" },
+   { R200_PP_TXFORMAT_1, "R200_PP_TXFORMAT_1" },
+   { R200_PP_TXSIZE_1, "R200_PP_TXSIZE_1" },
+   { R200_PP_TXFORMAT_X_1, "R200_PP_TXFORMAT_X_1" },
+   { R200_PP_TXPITCH_1, "R200_PP_TXPITCH_1" },
+   { R200_PP_BORDER_COLOR_1, "R200_PP_BORDER_COLOR_1" },
+   { R200_PP_CUBIC_FACES_1, "R200_PP_CUBIC_FACES_1" },
+   { R200_PP_TXMULTI_CTL_1, "R200_PP_TXMULTI_CTL_1" },
+   { R200_PP_TXFILTER_2, "R200_PP_TXFILTER_2" },
+   { R200_PP_TXFORMAT_2, "R200_PP_TXFORMAT_2" },
+   { R200_PP_TXSIZE_2, "R200_PP_TXSIZE_2" },
+   { R200_PP_TXFORMAT_X_2, "R200_PP_TXFORMAT_X_2" },
+   { R200_PP_TXPITCH_2, "R200_PP_TXPITCH_2" },
+   { R200_PP_BORDER_COLOR_2, "R200_PP_BORDER_COLOR_2" },
+   { R200_PP_CUBIC_FACES_2, "R200_PP_CUBIC_FACES_2" },
+   { R200_PP_TXMULTI_CTL_2, "R200_PP_TXMULTI_CTL_2" },
+   { R200_PP_TXFILTER_3, "R200_PP_TXFILTER_3" },
+   { R200_PP_TXFORMAT_3, "R200_PP_TXFORMAT_3" },
+   { R200_PP_TXSIZE_3, "R200_PP_TXSIZE_3" },
+   { R200_PP_TXFORMAT_X_3, "R200_PP_TXFORMAT_X_3" },
+   { R200_PP_TXPITCH_3, "R200_PP_TXPITCH_3" },
+   { R200_PP_BORDER_COLOR_3, "R200_PP_BORDER_COLOR_3" },
+   { R200_PP_CUBIC_FACES_3, "R200_PP_CUBIC_FACES_3" },
+   { R200_PP_TXMULTI_CTL_3, "R200_PP_TXMULTI_CTL_3" },
+   { R200_PP_TXFILTER_4, "R200_PP_TXFILTER_4" },
+   { R200_PP_TXFORMAT_4, "R200_PP_TXFORMAT_4" },
+   { R200_PP_TXSIZE_4, "R200_PP_TXSIZE_4" },
+   { R200_PP_TXFORMAT_X_4, "R200_PP_TXFORMAT_X_4" },
+   { R200_PP_TXPITCH_4, "R200_PP_TXPITCH_4" },
+   { R200_PP_BORDER_COLOR_4, "R200_PP_BORDER_COLOR_4" },
+   { R200_PP_CUBIC_FACES_4, "R200_PP_CUBIC_FACES_4" },
+   { R200_PP_TXMULTI_CTL_4, "R200_PP_TXMULTI_CTL_4" },
+   { R200_PP_TXFILTER_5, "R200_PP_TXFILTER_5" },
+   { R200_PP_TXFORMAT_5, "R200_PP_TXFORMAT_5" },
+   { R200_PP_TXSIZE_5, "R200_PP_TXSIZE_5" },
+   { R200_PP_TXFORMAT_X_5, "R200_PP_TXFORMAT_X_5" },
+   { R200_PP_TXPITCH_5, "R200_PP_TXPITCH_5" },
+   { R200_PP_BORDER_COLOR_5, "R200_PP_BORDER_COLOR_5" },
+   { R200_PP_CUBIC_FACES_5, "R200_PP_CUBIC_FACES_5" },
+   { R200_PP_TXMULTI_CTL_5, "R200_PP_TXMULTI_CTL_5" },
+   { R200_PP_TXOFFSET_0, "R200_PP_TXOFFSET_0" },
+   { R200_PP_CUBIC_OFFSET_F1_0, "R200_PP_CUBIC_OFFSET_F1_0" },
+   { R200_PP_CUBIC_OFFSET_F2_0, "R200_PP_CUBIC_OFFSET_F2_0" },
+   { R200_PP_CUBIC_OFFSET_F3_0, "R200_PP_CUBIC_OFFSET_F3_0" },
+   { R200_PP_CUBIC_OFFSET_F4_0, "R200_PP_CUBIC_OFFSET_F4_0" },
+   { R200_PP_CUBIC_OFFSET_F5_0, "R200_PP_CUBIC_OFFSET_F5_0" },
+   { R200_PP_TXOFFSET_1, "R200_PP_TXOFFSET_1" },
+   { R200_PP_CUBIC_OFFSET_F1_1, "R200_PP_CUBIC_OFFSET_F1_1" },
+   { R200_PP_CUBIC_OFFSET_F2_1, "R200_PP_CUBIC_OFFSET_F2_1" },
+   { R200_PP_CUBIC_OFFSET_F3_1, "R200_PP_CUBIC_OFFSET_F3_1" },
+   { R200_PP_CUBIC_OFFSET_F4_1, "R200_PP_CUBIC_OFFSET_F4_1" },
+   { R200_PP_CUBIC_OFFSET_F5_1, "R200_PP_CUBIC_OFFSET_F5_1" },
+   { R200_PP_TXOFFSET_2, "R200_PP_TXOFFSET_2" },
+   { R200_PP_CUBIC_OFFSET_F1_2, "R200_PP_CUBIC_OFFSET_F1_2" },
+   { R200_PP_CUBIC_OFFSET_F2_2, "R200_PP_CUBIC_OFFSET_F2_2" },
+   { R200_PP_CUBIC_OFFSET_F3_2, "R200_PP_CUBIC_OFFSET_F3_2" },
+   { R200_PP_CUBIC_OFFSET_F4_2, "R200_PP_CUBIC_OFFSET_F4_2" },
+   { R200_PP_CUBIC_OFFSET_F5_2, "R200_PP_CUBIC_OFFSET_F5_2" },
+   { R200_PP_TXOFFSET_3, "R200_PP_TXOFFSET_3" },
+   { R200_PP_CUBIC_OFFSET_F1_3, "R200_PP_CUBIC_OFFSET_F1_3" },
+   { R200_PP_CUBIC_OFFSET_F2_3, "R200_PP_CUBIC_OFFSET_F2_3" },
+   { R200_PP_CUBIC_OFFSET_F3_3, "R200_PP_CUBIC_OFFSET_F3_3" },
+   { R200_PP_CUBIC_OFFSET_F4_3, "R200_PP_CUBIC_OFFSET_F4_3" },
+   { R200_PP_CUBIC_OFFSET_F5_3, "R200_PP_CUBIC_OFFSET_F5_3" },
+   { R200_PP_TXOFFSET_4, "R200_PP_TXOFFSET_4" },
+   { R200_PP_CUBIC_OFFSET_F1_4, "R200_PP_CUBIC_OFFSET_F1_4" },
+   { R200_PP_CUBIC_OFFSET_F2_4, "R200_PP_CUBIC_OFFSET_F2_4" },
+   { R200_PP_CUBIC_OFFSET_F3_4, "R200_PP_CUBIC_OFFSET_F3_4" },
+   { R200_PP_CUBIC_OFFSET_F4_4, "R200_PP_CUBIC_OFFSET_F4_4" },
+   { R200_PP_CUBIC_OFFSET_F5_4, "R200_PP_CUBIC_OFFSET_F5_4" },
+   { R200_PP_TXOFFSET_5, "R200_PP_TXOFFSET_5" },
+   { R200_PP_CUBIC_OFFSET_F1_5, "R200_PP_CUBIC_OFFSET_F1_5" },
+   { R200_PP_CUBIC_OFFSET_F2_5, "R200_PP_CUBIC_OFFSET_F2_5" },
+   { R200_PP_CUBIC_OFFSET_F3_5, "R200_PP_CUBIC_OFFSET_F3_5" },
+   { R200_PP_CUBIC_OFFSET_F4_5, "R200_PP_CUBIC_OFFSET_F4_5" },
+   { R200_PP_CUBIC_OFFSET_F5_5, "R200_PP_CUBIC_OFFSET_F5_5" },
+   { R200_PP_TAM_DEBUG3, "R200_PP_TAM_DEBUG3" },
+   { R200_PP_TFACTOR_0, "R200_PP_TFACTOR_0" },
+   { R200_PP_TFACTOR_1, "R200_PP_TFACTOR_1" },
+   { R200_PP_TFACTOR_2, "R200_PP_TFACTOR_2" },
+   { R200_PP_TFACTOR_3, "R200_PP_TFACTOR_3" },
+   { R200_PP_TFACTOR_4, "R200_PP_TFACTOR_4" },
+   { R200_PP_TFACTOR_5, "R200_PP_TFACTOR_5" },
+   { R200_PP_TFACTOR_6, "R200_PP_TFACTOR_6" },
+   { R200_PP_TFACTOR_7, "R200_PP_TFACTOR_7" },
+   { R200_PP_TXCBLEND_0, "R200_PP_TXCBLEND_0" },
+   { R200_PP_TXCBLEND2_0, "R200_PP_TXCBLEND2_0" },
+   { R200_PP_TXABLEND_0, "R200_PP_TXABLEND_0" },
+   { R200_PP_TXABLEND2_0, "R200_PP_TXABLEND2_0" },
+   { R200_PP_TXCBLEND_1, "R200_PP_TXCBLEND_1" },
+   { R200_PP_TXCBLEND2_1, "R200_PP_TXCBLEND2_1" },
+   { R200_PP_TXABLEND_1, "R200_PP_TXABLEND_1" },
+   { R200_PP_TXABLEND2_1, "R200_PP_TXABLEND2_1" },
+   { R200_PP_TXCBLEND_2, "R200_PP_TXCBLEND_2" },
+   { R200_PP_TXCBLEND2_2, "R200_PP_TXCBLEND2_2" },
+   { R200_PP_TXABLEND_2, "R200_PP_TXABLEND_2" },
+   { R200_PP_TXABLEND2_2, "R200_PP_TXABLEND2_2" },
+   { R200_PP_TXCBLEND_3, "R200_PP_TXCBLEND_3" },
+   { R200_PP_TXCBLEND2_3, "R200_PP_TXCBLEND2_3" },
+   { R200_PP_TXABLEND_3, "R200_PP_TXABLEND_3" },
+   { R200_PP_TXABLEND2_3, "R200_PP_TXABLEND2_3" },
+   { R200_PP_TXCBLEND_4, "R200_PP_TXCBLEND_4" },
+   { R200_PP_TXCBLEND2_4, "R200_PP_TXCBLEND2_4" },
+   { R200_PP_TXABLEND_4, "R200_PP_TXABLEND_4" },
+   { R200_PP_TXABLEND2_4, "R200_PP_TXABLEND2_4" },
+   { R200_PP_TXCBLEND_5, "R200_PP_TXCBLEND_5" },
+   { R200_PP_TXCBLEND2_5, "R200_PP_TXCBLEND2_5" },
+   { R200_PP_TXABLEND_5, "R200_PP_TXABLEND_5" },
+   { R200_PP_TXABLEND2_5, "R200_PP_TXABLEND2_5" },
+   { R200_PP_TXCBLEND_6, "R200_PP_TXCBLEND_6" },
+   { R200_PP_TXCBLEND2_6, "R200_PP_TXCBLEND2_6" },
+   { R200_PP_TXABLEND_6, "R200_PP_TXABLEND_6" },
+   { R200_PP_TXABLEND2_6, "R200_PP_TXABLEND2_6" },
+   { R200_PP_TXCBLEND_7, "R200_PP_TXCBLEND_7" },
+   { R200_PP_TXCBLEND2_7, "R200_PP_TXCBLEND2_7" },
+   { R200_PP_TXABLEND_7, "R200_PP_TXABLEND_7" },
+   { R200_PP_TXABLEND2_7, "R200_PP_TXABLEND2_7" },
+   { R200_RB3D_BLENDCOLOR, "R200_RB3D_BLENDCOLOR" },
+   { R200_RB3D_ABLENDCNTL, "R200_RB3D_ABLENDCNTL" },
+   { R200_RB3D_CBLENDCNTL, "R200_RB3D_CBLENDCNTL" },
+   { R200_SE_TCL_OUTPUT_VTX_COMP_SEL, "R200_SE_TCL_OUTPUT_VTX_COMP_SEL" },
+   { R200_PP_CNTL_X, "R200_PP_CNTL_X" },
+   { R200_SE_VAP_CNTL_STATUS, "R200_SE_VAP_CNTL_STATUS" },
+   { R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0, "R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0" },
+   { R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_1, "R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_1" },
+   { R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_2, "R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_2" },
+   { R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_3, "R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_3" },
+   { R200_PP_TRI_PERF, "R200_PP_TRI_PERF" },
+   { R200_PP_PERF_CNTL, "R200_PP_PERF_CNTL" },
+   { R200_PP_TXCBLEND_8, "R200_PP_TXCBLEND_8" },
+   { R200_PP_TXCBLEND2_8, "R200_PP_TXCBLEND2_8" },
+   { R200_PP_TXABLEND_8, "R200_PP_TXABLEND_8" },
+   { R200_PP_TXABLEND2_8, "R200_PP_TXABLEND2_8" },
+   { R200_PP_TXCBLEND_9, "R200_PP_TXCBLEND_9" },
+   { R200_PP_TXCBLEND2_9, "R200_PP_TXCBLEND2_9" },
+   { R200_PP_TXABLEND_9, "R200_PP_TXABLEND_9" },
+   { R200_PP_TXABLEND2_9, "R200_PP_TXABLEND2_9" },
+   { R200_PP_TXCBLEND_10, "R200_PP_TXCBLEND_10" },
+   { R200_PP_TXCBLEND2_10, "R200_PP_TXCBLEND2_10" },
+   { R200_PP_TXABLEND_10, "R200_PP_TXABLEND_10" },
+   { R200_PP_TXABLEND2_10, "R200_PP_TXABLEND2_10" },
+   { R200_PP_TXCBLEND_11, "R200_PP_TXCBLEND_11" },
+   { R200_PP_TXCBLEND2_11, "R200_PP_TXCBLEND2_11" },
+   { R200_PP_TXABLEND_11, "R200_PP_TXABLEND_11" },
+   { R200_PP_TXABLEND2_11, "R200_PP_TXABLEND2_11" },
+   { R200_PP_TXCBLEND_12, "R200_PP_TXCBLEND_12" },
+   { R200_PP_TXCBLEND2_12, "R200_PP_TXCBLEND2_12" },
+   { R200_PP_TXABLEND_12, "R200_PP_TXABLEND_12" },
+   { R200_PP_TXABLEND2_12, "R200_PP_TXABLEND2_12" },
+   { R200_PP_TXCBLEND_13, "R200_PP_TXCBLEND_13" },
+   { R200_PP_TXCBLEND2_13, "R200_PP_TXCBLEND2_13" },
+   { R200_PP_TXABLEND_13, "R200_PP_TXABLEND_13" },
+   { R200_PP_TXABLEND2_13, "R200_PP_TXABLEND2_13" },
+   { R200_PP_TXCBLEND_14, "R200_PP_TXCBLEND_14" },
+   { R200_PP_TXCBLEND2_14, "R200_PP_TXCBLEND2_14" },
+   { R200_PP_TXABLEND_14, "R200_PP_TXABLEND_14" },
+   { R200_PP_TXABLEND2_14, "R200_PP_TXABLEND2_14" },
+   { R200_PP_TXCBLEND_15, "R200_PP_TXCBLEND_15" },
+   { R200_PP_TXCBLEND2_15, "R200_PP_TXCBLEND2_15" },
+   { R200_PP_TXABLEND_15, "R200_PP_TXABLEND_15" },
+   { R200_PP_TXABLEND2_15, "R200_PP_TXABLEND2_15" },
+   { R200_VAP_PVS_CNTL_1, "R200_VAP_PVS_CNTL_1" },
+   { R200_VAP_PVS_CNTL_2, "R200_VAP_PVS_CNTL_2" },
+};
+
+/* Names for slots of the scalars[] table, keyed by slot index.
+ * init_regs() walks this table front to back to pick, for every slot, the
+ * last entry whose idx has been reached; that only works if the entries
+ * are in ascending idx order (the R200_SS_* values are defined elsewhere --
+ * not verifiable here).  The { 1000, "" } entry is an end marker whose idx
+ * lies beyond the 513 slots init_regs() visits.
+ */
+static struct reg_names scalar_names[] = {
+   { R200_SS_LIGHT_DCD_ADDR, "R200_SS_LIGHT_DCD_ADDR" },
+   { R200_SS_LIGHT_DCM_ADDR, "R200_SS_LIGHT_DCM_ADDR" },
+   { R200_SS_LIGHT_SPOT_EXPONENT_ADDR, "R200_SS_LIGHT_SPOT_EXPONENT_ADDR" },
+   { R200_SS_LIGHT_SPOT_CUTOFF_ADDR, "R200_SS_LIGHT_SPOT_CUTOFF_ADDR" },
+   { R200_SS_LIGHT_SPECULAR_THRESH_ADDR, "R200_SS_LIGHT_SPECULAR_THRESH_ADDR" },
+   { R200_SS_LIGHT_RANGE_CUTOFF_SQRD, "R200_SS_LIGHT_RANGE_CUTOFF_SQRD" },
+   { R200_SS_LIGHT_RANGE_ATT_CONST, "R200_SS_LIGHT_RANGE_ATT_CONST" },
+   { R200_SS_VERT_GUARD_CLIP_ADJ_ADDR, "R200_SS_VERT_GUARD_CLIP_ADJ_ADDR" },
+   { R200_SS_VERT_GUARD_DISCARD_ADJ_ADDR, "R200_SS_VERT_GUARD_DISCARD_ADJ_ADDR" },
+   { R200_SS_HORZ_GUARD_CLIP_ADJ_ADDR, "R200_SS_HORZ_GUARD_CLIP_ADJ_ADDR" },
+   { R200_SS_HORZ_GUARD_DISCARD_ADJ_ADDR, "R200_SS_HORZ_GUARD_DISCARD_ADJ_ADDR" },
+   { R200_SS_MAT_0_SHININESS, "R200_SS_MAT_0_SHININESS" },
+   { R200_SS_MAT_1_SHININESS, "R200_SS_MAT_1_SHININESS" },
+   { 1000, "" },
+};
+
+/* Names for vector state, keyed by vector index.
+ *
+ * init_regs() puffs these out to make them look like normal (dword)
+ * registers: each idx here is multiplied by 4, so every vector occupies
+ * four consecutive slots of vectors[], one per component.  The
+ * { 0, "start" } entry names the slots that precede the first real entry,
+ * and { 1000, "" } is an end marker beyond the range init_regs() visits.
+ */
+static struct reg_names vector_names[] = {
+   { 0, "start" },
+   { R200_VS_LIGHT_AMBIENT_ADDR, "R200_VS_LIGHT_AMBIENT_ADDR" },
+   { R200_VS_LIGHT_DIFFUSE_ADDR, "R200_VS_LIGHT_DIFFUSE_ADDR" },
+   { R200_VS_LIGHT_SPECULAR_ADDR, "R200_VS_LIGHT_SPECULAR_ADDR" },
+   { R200_VS_LIGHT_DIRPOS_ADDR, "R200_VS_LIGHT_DIRPOS_ADDR" },
+   { R200_VS_LIGHT_HWVSPOT_ADDR, "R200_VS_LIGHT_HWVSPOT_ADDR" },
+   { R200_VS_LIGHT_ATTENUATION_ADDR, "R200_VS_LIGHT_ATTENUATION_ADDR" },
+   { R200_VS_SPOT_DUAL_CONE, "R200_VS_SPOT_DUAL_CONE" },
+   { R200_VS_GLOBAL_AMBIENT_ADDR, "R200_VS_GLOBAL_AMBIENT_ADDR" },
+   { R200_VS_FOG_PARAM_ADDR, "R200_VS_FOG_PARAM_ADDR" },
+   { R200_VS_EYE_VECTOR_ADDR, "R200_VS_EYE_VECTOR_ADDR" },
+   { R200_VS_UCP_ADDR, "R200_VS_UCP_ADDR" },
+   { R200_VS_PNT_SPRITE_VPORT_SCALE, "R200_VS_PNT_SPRITE_VPORT_SCALE" },
+   { R200_VS_MATRIX_0_MV, "R200_VS_MATRIX_0_MV" },
+   { R200_VS_MATRIX_1_INV_MV, "R200_VS_MATRIX_1_INV_MV" },
+   { R200_VS_MATRIX_2_MVP, "R200_VS_MATRIX_2_MVP" },
+   { R200_VS_MATRIX_3_TEX0, "R200_VS_MATRIX_3_TEX0" },
+   { R200_VS_MATRIX_4_TEX1, "R200_VS_MATRIX_4_TEX1" },
+   { R200_VS_MATRIX_5_TEX2, "R200_VS_MATRIX_5_TEX2" },
+   { R200_VS_MATRIX_6_TEX3, "R200_VS_MATRIX_6_TEX3" },
+   { R200_VS_MATRIX_7_TEX4, "R200_VS_MATRIX_7_TEX4" },
+   { R200_VS_MATRIX_8_TEX5, "R200_VS_MATRIX_8_TEX5" },
+   { R200_VS_MAT_0_EMISS, "R200_VS_MAT_0_EMISS" },
+   { R200_VS_MAT_0_AMB, "R200_VS_MAT_0_AMB" },
+   { R200_VS_MAT_0_DIF, "R200_VS_MAT_0_DIF" },
+   { R200_VS_MAT_0_SPEC, "R200_VS_MAT_0_SPEC" },
+   { R200_VS_MAT_1_EMISS, "R200_VS_MAT_1_EMISS" },
+   { R200_VS_MAT_1_AMB, "R200_VS_MAT_1_AMB" },
+   { R200_VS_MAT_1_DIF, "R200_VS_MAT_1_DIF" },
+   { R200_VS_MAT_1_SPEC, "R200_VS_MAT_1_SPEC" },
+   { R200_VS_EYE2CLIP_MTX, "R200_VS_EYE2CLIP_MTX" },
+   { R200_VS_PNT_SPRITE_ATT_CONST, "R200_VS_PNT_SPRITE_ATT_CONST" },
+   { R200_VS_PNT_SPRITE_EYE_IN_MODEL, "R200_VS_PNT_SPRITE_EYE_IN_MODEL" },
+   { R200_VS_PNT_SPRITE_CLAMP, "R200_VS_PNT_SPRITE_CLAMP" },
+   { R200_VS_MAX, "R200_VS_MAX" },
+   { 1000, "" },
+};
+
+/* One stored value, readable either as a float or as an int. */
+union fi { float f; int i; };
+
+/* Bits for struct reg::flags:
+ *   ISVEC   - slot is one component of a vector (set on vectors[] only)
+ *   ISFLOAT - value is compared and printed as a float
+ *   TOUCHED - slot has been assigned at least once (see print_reg_assignment)
+ */
+#define ISVEC   1
+#define ISFLOAT 2
+#define TOUCHED 4
+
+/* Tracking state for one register / scalar slot / vector component. */
+struct reg {
+   /* Register offset (regs[]) or slot number (scalars[], vectors[]);
+    * -1 marks the end of a table for lookup_reg().
+    */
+   int idx; 
+   /* Name-table entry used by get_reg_name() to label this slot. */
+   struct reg_names *closest;
+   /* Combination of ISVEC / ISFLOAT / TOUCHED. */
+   int flags;
+   /* Value most recently assigned. */
+   union fi current;
+   /* Growable list of distinct integer values seen so far
+    * (maintained by find_or_add_value()): nvalues used, nalloc allocated.
+    */
+   union fi *values;
+   int nvalues;
+   int nalloc;
+   /* Smallest / largest float value seen; both start at 0 because the
+    * tables have static storage and are never explicitly initialised.
+    */
+   float vmin, vmax;
+};
+
+
+/* Tracking tables.  Each has one extra trailing slot whose idx is set to
+ * -1 by init_regs() so lookup_reg() knows where to stop.
+ */
+static struct reg regs[Elements(reg_names)+1];
+static struct reg scalars[512+1];
+static struct reg vectors[512*4+1];
+
+/* Statistics: values emitted, values that actually changed, and number of
+ * RADEON_CMD_DMA_DISCARD commands seen.
+ */
+static int total, total_changed, bufs;
+
+/* Fill the regs[], scalars[] and vectors[] tracking tables.
+ *
+ * Each table has one extra trailing slot that is turned into an end marker
+ * (idx == -1) for lookup_reg().  For scalars and vectors, every slot is
+ * labelled with the closest preceding entry of the matching name table.
+ */
+static void init_regs( void )
+{
+   struct reg_names *tmp;
+   int i;
+
+   /* regs[] is one element longer than reg_names[]; iterate over the
+    * name table only so reg_names[] is never read past its end.
+    */
+   for (i = 0 ; i < Elements(reg_names) ; i++) {
+      regs[i].idx = reg_names[i].idx;
+      regs[i].closest = &reg_names[i];
+      regs[i].flags = 0;
+   }
+
+   /* Advance to the next name as soon as its idx is reached. */
+   for (i = 0, tmp = scalar_names ; i < Elements(scalars) ; i++) {
+      if (tmp[1].idx == i) tmp++;
+      scalars[i].idx = i;
+      scalars[i].closest = tmp;
+      scalars[i].flags = ISFLOAT;
+   }
+
+   /* Same walk, but each named vector spans four consecutive slots. */
+   for (i = 0, tmp = vector_names ; i < Elements(vectors) ; i++) {
+      if (tmp[1].idx*4 == i) tmp++;
+      vectors[i].idx = i;
+      vectors[i].closest = tmp;
+      vectors[i].flags = ISFLOAT|ISVEC;
+   }
+
+   /* End markers.  The regs[] marker has no name entry of its own. */
+   regs[Elements(regs)-1].idx = -1;
+   regs[Elements(regs)-1].closest = NULL;
+   regs[Elements(regs)-1].flags = 0;
+   scalars[Elements(scalars)-1].idx = -1;
+   vectors[Elements(vectors)-1].idx = -1;
+}
+
+/* Record that `reg` has held integer value `val`.
+ *
+ * Returns 1 if the value had been seen before, 0 if it is new.  A new
+ * value is appended to reg->values, growing the array when it is full.
+ * If the array cannot be grown the value is reported as new but is not
+ * recorded, and the existing list is left untouched.
+ */
+static int find_or_add_value( struct reg *reg, int val )
+{
+   int j;
+
+   for ( j = 0 ; j < reg->nvalues ; j++)
+      if ( val == reg->values[j].i )
+         return 1;
+
+   if (reg->nvalues == reg->nalloc) {
+      int nalloc = (reg->nalloc + 5) * 2;
+      union fi *values = (union fi *) realloc( reg->values,
+                                               nalloc * sizeof(union fi) );
+
+      /* Keep the old buffer on failure instead of leaking it and
+       * writing through a NULL pointer.
+       */
+      if (!values) {
+         fprintf(stderr, "*** out of memory recording reg value\n");
+         return 0;
+      }
+
+      reg->values = values;
+      reg->nalloc = nalloc;
+   }
+
+   reg->values[reg->nvalues++].i = val;
+   return 0;
+}
+
+/* Find the entry of `tab` whose idx equals `reg`.
+ *
+ * The table must end with an entry whose idx is -1.  Returns the matching
+ * entry, or NULL (after logging the offending number) when none matches.
+ */
+static struct reg *lookup_reg( struct reg *tab, int reg )
+{
+   struct reg *entry = tab;
+
+   while (entry->idx != -1) {
+      if (entry->idx == reg)
+         return entry;
+      entry++;
+   }
+
+   fprintf(stderr, "*** unknown reg 0x%x\n", reg);
+   return NULL;
+}
+
+
+/* Build a printable label for `reg` from its closest name-table entry.
+ *
+ * An exact idx match yields the table name itself.  Otherwise the label
+ * is "NAME+offset" for plain slots, and "NAME[comp]" or
+ * "NAME+offset[comp]" for vector components.  Composed labels live in a
+ * static buffer that is overwritten by the next call.
+ */
+static const char *get_reg_name( struct reg *reg )
+{
+   static char tmp[80];
+   int vec_delta;
+
+   if (reg->idx == reg->closest->idx)
+      return reg->closest->name;
+
+   if (!(reg->flags & ISVEC)) {
+      /* idx differs from the table idx here, so there is always an offset. */
+      sprintf(tmp, "%s+%d", reg->closest->name, reg->idx - reg->closest->idx);
+      return tmp;
+   }
+
+   /* Vector slots: idx/4 is the vector number, idx%4 the component. */
+   vec_delta = (reg->idx/4) - reg->closest->idx;
+   if (vec_delta != 0)
+      sprintf(tmp, "%s+%d[%d]", reg->closest->name, vec_delta, reg->idx%4);
+   else
+      sprintf(tmp, "%s[%d]", reg->closest->name, reg->idx%4);
+
+   return tmp;
+}
+
+/* Log an integer assignment to `reg` and remember the new value.
+ *
+ * The line is printed when VERBOSE, or when NORMAL and the value either
+ * differs from the current one or has never been seen on this register.
+ * Returns non-zero if the value differs from the previous one.
+ */
+static int print_int_reg_assignment( struct reg *reg, int data )
+{
+   const int changed = (reg->current.i != data);
+   const int seen_before = find_or_add_value( reg, data );
+   const int show = VERBOSE || (NORMAL && (changed || !seen_before));
+
+   if (show)
+      fprintf(stderr, "   %s <-- 0x%x", get_reg_name(reg), data);
+
+   if (NORMAL) {
+      if (!seen_before)
+         fprintf(stderr, " *** BRAND NEW VALUE");
+      else if (changed)
+         fprintf(stderr, " *** CHANGED");
+   }
+
+   reg->current.i = data;
+
+   if (show)
+      fprintf(stderr, "\n");
+
+   return changed;
+}
+
+
+/* Log a float assignment to `reg` and update its running min / max.
+ *
+ * The line is printed when VERBOSE, or when NORMAL and the value sets a
+ * new minimum, a new maximum, or differs from the current value.  The
+ * min / max are only updated when NORMAL is set.  Returns non-zero if the
+ * value differs from the previous one.
+ */
+static int print_float_reg_assignment( struct reg *reg, float data )
+{
+   const int changed = (reg->current.f != data);
+   const int below = (data < reg->vmin);
+   const int above = (data > reg->vmax);
+   const int show = VERBOSE || (NORMAL && (below || above || changed));
+
+   if (show)
+      fprintf(stderr, "   %s <-- %.3f", get_reg_name(reg), data);
+
+   if (NORMAL) {
+      if (below) {
+         fprintf(stderr, " *** NEW MIN (prev %.3f)", reg->vmin);
+         reg->vmin = data;
+      }
+      else if (above) {
+         fprintf(stderr, " *** NEW MAX (prev %.3f)", reg->vmax);
+         reg->vmax = data;
+      }
+      else if (changed) {
+         fprintf(stderr, " *** CHANGED");
+      }
+   }
+
+   reg->current.f = data;
+
+   if (show)
+      fprintf(stderr, "\n");
+
+   return changed;
+}
+
+/* Record and log the assignment of raw dword `data` to `reg`.
+ *
+ * The dword is reinterpreted as a float for ISFLOAT slots and treated as
+ * an integer otherwise.  Returns non-zero if the value changed.
+ *
+ * `reg` may be NULL: lookup_reg() returns NULL for registers it does not
+ * know (and has already logged that), in which case nothing is recorded
+ * and 0 is returned.
+ */
+static int print_reg_assignment( struct reg *reg, int data )
+{
+   float_ui32_type datau;
+
+   if (!reg)
+      return 0;
+
+   datau.ui32 = data;
+   reg->flags |= TOUCHED;
+   if (reg->flags & ISFLOAT)
+      return print_float_reg_assignment( reg, datau.f );
+   else
+      return print_int_reg_assignment( reg, data );
+}
+
+/* Print the current value of `reg` if it has ever been assigned. */
+static void print_reg( struct reg *reg )
+{
+   if (!(reg->flags & TOUCHED))
+      return;
+
+   if (reg->flags & ISFLOAT)
+      fprintf(stderr, "   %s == %f\n", get_reg_name(reg), reg->current.f);
+   else
+      fprintf(stderr, "   %s == 0x%x\n", get_reg_name(reg), reg->current.i);
+}
+
+
+/* Print every touched slot of regs[], scalars[] and vectors[], in that
+ * order.
+ */
+static void dump_state( void )
+{
+   struct reg *r;
+
+   for (r = regs ; r < regs + Elements(regs) ; r++)
+      print_reg( r );
+
+   for (r = scalars ; r < scalars + Elements(scalars) ; r++)
+      print_reg( r );
+
+   for (r = vectors ; r < vectors + Elements(vectors) ; r++)
+      print_reg( r );
+}
+
+
+
+/* Replay one RADEON_CMD_PACKET command: log an assignment for each
+ * register the packet writes and consume its payload from `cmdbuf`.
+ *
+ * Returns 0 on success, or -EINVAL if the payload would run past the end
+ * of the buffer or the packet id has no name in packet[].
+ */
+static int radeon_emit_packets(
+   drm_radeon_cmd_header_t header,
+   drm_radeon_cmd_buffer_t *cmdbuf )
+{
+   const int id = (int)header.packet.packet_id;
+   const int ndwords = packet[id].len;
+   int *payload = (int *)cmdbuf->buf;
+   int n;
+
+   if (ndwords * sizeof(int) > cmdbuf->bufsz) {
+      fprintf(stderr, "Packet overflows cmdbuf\n");      
+      return -EINVAL;
+   }
+
+   if (!packet[id].name) {
+      fprintf(stderr, "*** Unknown packet 0 nr %d\n", id );
+      return -EINVAL;
+   }
+
+   if (VERBOSE)
+      fprintf(stderr, "Packet 0 reg %s nr %d\n", packet[id].name, ndwords );
+
+   /* Consecutive dwords go to consecutive registers, 4 bytes apart. */
+   for ( n = 0 ; n < ndwords ; n++) {
+      struct reg *target = lookup_reg( regs, packet[id].start + n*4 );
+
+      if (print_reg_assignment( target, payload[n] ))
+         total_changed++;
+      total++;
+   }
+
+   cmdbuf->buf += ndwords * sizeof(int);
+   cmdbuf->bufsz -= ndwords * sizeof(int);
+   return 0;
+}
+
+
+/* Replay one RADEON_CMD_SCALARS command: log an assignment for each
+ * scalar slot it writes (offset, offset+stride, ...) and consume its
+ * payload from `cmdbuf`.
+ *
+ * Returns 0 on success, or -EINVAL if the payload would run past the end
+ * of the buffer (the same check radeon_emit_packets() performs).
+ */
+static int radeon_emit_scalars(
+   drm_radeon_cmd_header_t header,
+   drm_radeon_cmd_buffer_t *cmdbuf )
+{
+   int sz = header.scalars.count;
+   int *data = (int *)cmdbuf->buf;
+   int start = header.scalars.offset;
+   int stride = header.scalars.stride;
+   int i;
+
+   if (VERBOSE)
+      fprintf(stderr, "emit scalars, start %d stride %d nr %d (end %d)\n",
+              start, stride, sz, start + stride * sz);
+
+   /* Do not read payload dwords that are not in the buffer. */
+   if (sz * (int)sizeof(int) > cmdbuf->bufsz) {
+      fprintf(stderr, "Scalars overflow cmdbuf\n");
+      return -EINVAL;
+   }
+
+   for (i = 0 ; i < sz ; i++, start += stride) {
+      struct reg *reg = lookup_reg( scalars, start );
+      if (print_reg_assignment( reg, data[i] ))
+         total_changed++;
+      total++;
+   }
+
+   cmdbuf->buf += sz * sizeof(int);
+   cmdbuf->bufsz -= sz * sizeof(int);
+   return 0;
+}
+
+
+/* Replay one RADEON_CMD_SCALARS2 command.  Identical to
+ * radeon_emit_scalars() except that the slot offset is biased by 0x100
+ * and the end of the written range is limited to 258.
+ *
+ * Returns 0 on success, -1 if the range check fails, or -EINVAL if the
+ * payload would run past the end of the buffer.
+ */
+static int radeon_emit_scalars2(
+   drm_radeon_cmd_header_t header,
+   drm_radeon_cmd_buffer_t *cmdbuf )
+{
+   int sz = header.scalars.count;
+   int *data = (int *)cmdbuf->buf;
+   int start = header.scalars.offset + 0x100;
+   int stride = header.scalars.stride;
+   int i;
+
+   if (VERBOSE)
+      fprintf(stderr, "emit scalars2, start %d stride %d nr %d (end %d)\n",
+              start, stride, sz, start + stride * sz);
+
+   if (start + stride * sz > 258) {
+      fprintf(stderr, "emit scalars OVERFLOW %d/%d/%d\n", start, stride, sz);
+      return -1;
+   }
+
+   /* Do not read payload dwords that are not in the buffer. */
+   if (sz * (int)sizeof(int) > cmdbuf->bufsz) {
+      fprintf(stderr, "Scalars2 overflow cmdbuf\n");
+      return -EINVAL;
+   }
+
+   for (i = 0 ; i < sz ; i++, start += stride) {
+      struct reg *reg = lookup_reg( scalars, start );
+      if (print_reg_assignment( reg, data[i] ))
+         total_changed++;
+      total++;
+   }
+
+   cmdbuf->buf += sz * sizeof(int);
+   cmdbuf->bufsz -= sz * sizeof(int);
+   return 0;
+}
+
+/* Replay one RADEON_CMD_VECTORS command: the payload is `count` dwords,
+ * consumed four at a time, each group written to the four component
+ * slots of one vector (offset, offset+stride, ...).
+ *
+ * Returns 0 on success, or -EINVAL if the payload would run past the end
+ * of the buffer.
+ *
+ * Check: inf/nan/extreme-size?
+ * Check: table start, end, nr, etc.
+ */
+static int radeon_emit_vectors(
+   drm_radeon_cmd_header_t header,
+   drm_radeon_cmd_buffer_t *cmdbuf )
+{
+   int sz = header.vectors.count;
+   int *data = (int *)cmdbuf->buf;
+   int start = header.vectors.offset;
+   int stride = header.vectors.stride;
+   int i,j;
+
+   if (VERBOSE)
+      fprintf(stderr, "emit vectors, start %d stride %d nr %d (end %d) (0x%x)\n",
+              start, stride, sz, start + stride * sz, header.i);
+
+/*    if (start + stride * (sz/4) > 128) { */
+/*       fprintf(stderr, "emit vectors OVERFLOW %d/%d/%d\n", start, stride, sz); */
+/*       return -1; */
+/*    } */
+
+   /* Do not read payload dwords that are not in the buffer. */
+   if (sz * (int)sizeof(int) > cmdbuf->bufsz) {
+      fprintf(stderr, "Vectors overflow cmdbuf\n");
+      return -EINVAL;
+   }
+
+   for (i = 0 ; i < sz ;  start += stride) {
+      int changed = 0;
+      /* "i < sz" stops a trailing partial vector from reading dwords
+       * beyond the declared count.
+       */
+      for (j = 0 ; j < 4 && i < sz ; i++,j++) {
+         struct reg *reg = lookup_reg( vectors, start*4+j );
+         if (print_reg_assignment( reg, data[i] ))
+            changed = 1;
+      }
+      if (changed)
+         total_changed += 4;
+      total += 4;
+   }
+
+   cmdbuf->buf += sz * sizeof(int);
+   cmdbuf->bufsz -= sz * sizeof(int);
+   return 0;
+}
+
+/* Replay one RADEON_CMD_VECLINEAR command: `count` four-dword vectors
+ * written linearly starting at vector address `start`.
+ *
+ * Depending on the address range, the vectors are printed as vertex
+ * shader parameters (floats) or program instructions (hex dwords); other
+ * ranges are reported as unknown.  The payload is consumed in every case.
+ *
+ * Returns 0 on success, or -EINVAL if the payload would run past the end
+ * of the buffer.
+ */
+static int radeon_emit_veclinear(
+   drm_radeon_cmd_header_t header,
+   drm_radeon_cmd_buffer_t *cmdbuf )
+{
+   int sz = header.veclinear.count * 4;
+   int *data = (int *)cmdbuf->buf;
+   float *fdata =(float *)cmdbuf->buf;
+   int start = header.veclinear.addr_lo | (header.veclinear.addr_hi << 8);
+   int known = 1;     /* cleared when start is in no recognised range */
+   int is_prog = 0;   /* program instructions rather than parameters */
+   int base = 0;      /* index printed for the first vector */
+   int i;
+
+   if (1||VERBOSE)
+      fprintf(stderr, "emit vectors linear, start %d nr %d (end %d) (0x%x)\n",
+              start, sz >> 2, start + (sz >> 2), header.i);
+
+   /* Do not read payload dwords that are not in the buffer. */
+   if (sz * (int)sizeof(int) > cmdbuf->bufsz) {
+      fprintf(stderr, "Veclinear overflows cmdbuf\n");
+      return -EINVAL;
+   }
+
+   if (start < 0x60) {
+      base = start;
+   }
+   else if ((start >= 0x100) && (start < 0x160)) {
+      base = start - 0x100 + 0x60;
+   }
+   else if ((start >= 0x80) && (start < 0xc0)) {
+      is_prog = 1;
+      base = start - 0x80;
+   }
+   else if ((start >= 0x180) && (start < 0x1c0)) {
+      is_prog = 1;
+      base = start - 0x180 + 0x40;
+   }
+   else {
+      known = 0;
+      fprintf(stderr, "write to unknown vector area\n");
+   }
+
+   if (known) {
+      for (i = 0 ; i < sz ;  i += 4) {
+         int idx = (i >> 2) + base;
+
+         if (is_prog) {
+            fprintf(stderr, "R200_VS_PROG %d OPDST %08x\n", idx, data[i]);
+            fprintf(stderr, "R200_VS_PROG %d SRC1  %08x\n", idx, data[i+1]);
+            fprintf(stderr, "R200_VS_PROG %d SRC2  %08x\n", idx, data[i+2]);
+            fprintf(stderr, "R200_VS_PROG %d SRC3  %08x\n", idx, data[i+3]);
+         }
+         else {
+            fprintf(stderr, "R200_VS_PARAM %d 0 %f\n", idx, fdata[i]);
+            fprintf(stderr, "R200_VS_PARAM %d 1 %f\n", idx, fdata[i+1]);
+            fprintf(stderr, "R200_VS_PARAM %d 2 %f\n", idx, fdata[i+2]);
+            fprintf(stderr, "R200_VS_PARAM %d 3 %f\n", idx, fdata[i+3]);
+         }
+      }
+   }
+
+   cmdbuf->buf += sz * sizeof(int);
+   cmdbuf->bufsz -= sz * sizeof(int);
+   return 0;
+}
+
+/* Disabled helper that decodes a vertex-format word into the names of the
+ * components whose bits are set.
+ * NOTE(review): it records values in others[V_VTXFMT], which is not
+ * defined in the code visible here, so it presumably will not build if
+ * simply re-enabled -- confirm first.  The calls to it in
+ * radeon_emit_packet3() are commented out.
+ */
+#if 0
+static int print_vertex_format( int vfmt )
+{
+   if (NORMAL) {
+      fprintf(stderr, "   %s(%x): %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+	      "vertex format",
+	      vfmt,
+	      "xy,",
+	      (vfmt & R200_VTX_Z0) ? "z," : "",
+	      (vfmt & R200_VTX_W0) ? "w0," : "",
+	      (vfmt & R200_VTX_FPCOLOR) ? "fpcolor," : "",
+	      (vfmt & R200_VTX_FPALPHA) ? "fpalpha," : "",
+	      (vfmt & R200_VTX_PKCOLOR) ? "pkcolor," : "",
+	      (vfmt & R200_VTX_FPSPEC) ? "fpspec," : "",
+	      (vfmt & R200_VTX_FPFOG) ? "fpfog," : "",
+	      (vfmt & R200_VTX_PKSPEC) ? "pkspec," : "",
+	      (vfmt & R200_VTX_ST0) ? "st0," : "",
+	      (vfmt & R200_VTX_ST1) ? "st1," : "",
+	      (vfmt & R200_VTX_Q1) ? "q1," : "",
+	      (vfmt & R200_VTX_ST2) ? "st2," : "",
+	      (vfmt & R200_VTX_Q2) ? "q2," : "",
+	      (vfmt & R200_VTX_ST3) ? "st3," : "",
+	      (vfmt & R200_VTX_Q3) ? "q3," : "",
+	      (vfmt & R200_VTX_Q0) ? "q0," : "",
+	      (vfmt & R200_VTX_N0) ? "n0," : "",
+	      (vfmt & R200_VTX_XY1) ? "xy1," : "",
+	      (vfmt & R200_VTX_Z1) ? "z1," : "",
+	      (vfmt & R200_VTX_W1) ? "w1," : "",
+	      (vfmt & R200_VTX_N1) ? "n1," : "");
+
+   
+      if (!find_or_add_value( &others[V_VTXFMT], vfmt ))
+	 fprintf(stderr, " *** NEW VALUE");
+
+      fprintf(stderr, "\n");
+   }
+
+   return 0;
+}
+#endif
+
+/* Primitive names, indexed by the low four bits of the prim word (see
+ * print_prim_and_flags()).  Index 8 has no name and is NULL.
+ */
+static char *primname[0x10] = {
+   "NONE",
+   "POINTS",
+   "LINES",
+   "LINE_STRIP",
+   "TRIANGLES",
+   "TRIANGLE_FAN",
+   "TRIANGLE_STRIP",
+   "RECT_LIST",
+   NULL,
+   "3VRT_POINTS",
+   "3VRT_LINES",
+   "POINT_SPRITES",
+   "LINE_LOOP",
+   "QUADS",
+   "QUAD_STRIP",
+   "POLYGON",
+};
+
+/* Decode and sanity-check a primitive word.
+ *
+ * The low four bits select the primitive type, bits 4-5 the vertex walk
+ * mode, and the upper 16 bits hold the vertex count.  When NORMAL is set
+ * the decoded flags and primitive name are logged.
+ *
+ * Returns 0 if the vertex count is plausible for the primitive type, or
+ * -1 (after logging the reason) if it is not or the type is unknown.
+ */
+static int print_prim_and_flags( int prim )
+{
+   const char *name = primname[prim & 0xf];
+   int numverts;
+
+   if (NORMAL)
+      fprintf(stderr, "   %s(%x): %s%s%s%s%s%s\n",
+              "prim flags",
+              prim,
+              ((prim & 0x30) == R200_VF_PRIM_WALK_IND) ? "IND," : "",
+              ((prim & 0x30) == R200_VF_PRIM_WALK_LIST) ? "LIST," : "",
+              ((prim & 0x30) == R200_VF_PRIM_WALK_RING) ? "RING," : "",
+              (prim & R200_VF_COLOR_ORDER_RGBA) ? "RGBA," : "BGRA, ",
+              (prim & R200_VF_INDEX_SZ_4) ? "INDX-32," : "",
+              (prim & R200_VF_TCL_OUTPUT_VTX_ENABLE) ? "TCL_OUT_VTX," : "");
+
+   /* Shift as unsigned: the count occupies the top 16 bits, and a signed
+    * shift would turn counts with bit 31 set into negative numbers.
+    */
+   numverts = (int)(((unsigned int)prim) >> 16);
+
+   /* primname[] has a NULL slot; never hand NULL to %s. */
+   if (NORMAL)
+      fprintf(stderr, "   prim: %s numverts %d\n",
+              name ? name : "(unknown)", numverts);
+
+   switch (prim & 0xf) {
+   case R200_VF_PRIM_NONE:
+   case R200_VF_PRIM_POINTS:
+      if (numverts < 1) {
+         fprintf(stderr, "Bad nr verts for point %d\n", numverts);
+         return -1;
+      }
+      break;
+   case R200_VF_PRIM_LINES:
+   case R200_VF_PRIM_POINT_SPRITES:
+      if ((numverts & 1) || numverts == 0) {
+         fprintf(stderr, "Bad nr verts for line %d\n", numverts);
+         return -1;
+      }
+      break;
+   case R200_VF_PRIM_LINE_STRIP:
+   case R200_VF_PRIM_LINE_LOOP:
+      if (numverts < 2) {
+         fprintf(stderr, "Bad nr verts for line_strip %d\n", numverts);
+         return -1;
+      }
+      break;
+   case R200_VF_PRIM_TRIANGLES:
+   case R200_VF_PRIM_3VRT_POINTS:
+   case R200_VF_PRIM_3VRT_LINES:
+   case R200_VF_PRIM_RECT_LIST:
+      if (numverts % 3 || numverts == 0) {
+         fprintf(stderr, "Bad nr verts for tri %d\n", numverts);
+         return -1;
+      }
+      break;
+   case R200_VF_PRIM_TRIANGLE_FAN:
+   case R200_VF_PRIM_TRIANGLE_STRIP:
+   case R200_VF_PRIM_POLYGON:
+      if (numverts < 3) {
+         fprintf(stderr, "Bad nr verts for strip/fan %d\n", numverts);
+         return -1;
+      }
+      break;
+   case R200_VF_PRIM_QUADS:
+      if (numverts % 4 || numverts == 0) {
+         fprintf(stderr, "Bad nr verts for quad %d\n", numverts);
+         return -1;
+      }
+      break;
+   case R200_VF_PRIM_QUAD_STRIP:
+      if (numverts % 2 || numverts < 4) {
+         fprintf(stderr, "Bad nr verts for quadstrip %d\n", numverts);
+         return -1;
+      }
+      break;
+   default:
+      fprintf(stderr, "Bad primitive\n");
+      return -1;
+   }
+   return 0;
+}
+
+/* Log name for packet3 opcodes that are only reported, never inspected.
+ * Returns NULL for opcodes that radeon_emit_packet3() handles itself and
+ * for opcodes that are not known at all.
+ */
+static const char *packet3_plain_name( int op )
+{
+   switch (op) {
+   case R200_CP_CMD_NOP:            return "PACKET3_NOP";
+   case R200_CP_CMD_NEXT_CHAR:      return "PACKET3_NEXT_CHAR";
+   case R200_CP_CMD_PLY_NEXTSCAN:   return "PACKET3_PLY_NEXTSCAN";
+   case R200_CP_CMD_SET_SCISSORS:   return "PACKET3_SET_SCISSORS";
+   case R200_CP_CMD_LOAD_MICROCODE: return "PACKET3_LOAD_MICROCODE";
+   case R200_CP_CMD_WAIT_FOR_IDLE:  return "PACKET3_WAIT_FOR_IDLE";
+   case R200_CP_CMD_3D_DRAW_IMMD:   return "PACKET3_3D_DRAW_IMMD";
+   case R200_CP_CMD_LOAD_PALETTE:   return "PACKET3_LOAD_PALETTE";
+   case R200_CP_CMD_PAINT:          return "PACKET3_CNTL_PAINT";
+   case R200_CP_CMD_BITBLT:         return "PACKET3_CNTL_BITBLT";
+   case R200_CP_CMD_SMALLTEXT:      return "PACKET3_CNTL_SMALLTEXT";
+   case R200_CP_CMD_HOSTDATA_BLT:   return "PACKET3_CNTL_HOSTDATA_BLT";
+   case R200_CP_CMD_POLYLINE:       return "PACKET3_CNTL_POLYLINE";
+   case R200_CP_CMD_POLYSCANLINES:  return "PACKET3_CNTL_POLYSCANLINES";
+   case R200_CP_CMD_PAINT_MULTI:    return "PACKET3_CNTL_PAINT_MULTI";
+   case R200_CP_CMD_BITBLT_MULTI:   return "PACKET3_CNTL_BITBLT_MULTI";
+   case R200_CP_CMD_TRANS_BITBLT:   return "PACKET3_CNTL_TRANS_BITBLT";
+   default:                         return NULL;
+   }
+}
+
+/* build in knowledge about each packet type
+ *
+ * Validates the packet3 at the front of `cmdbuf`, logs what it is, checks
+ * the payload of the draw and vertex-array packets, and consumes the
+ * packet.  Returns 0 on success or -EINVAL if the packet is malformed.
+ */
+static int radeon_emit_packet3( drm_radeon_cmd_buffer_t *cmdbuf )
+{
+   int cmdsz;
+   int *cmd = (int *)cmdbuf->buf;
+   int *tmp;
+   int i, stride, size, start;
+   int op;
+   const char *name;
+
+   /* The packet header itself must be present before it is read. */
+   if (cmdbuf->bufsz < (int)sizeof(int)) {
+      fprintf(stderr, "Bad packet\n");
+      return -EINVAL;
+   }
+
+   cmdsz = 2 + ((cmd[0] & RADEON_CP_PACKET_COUNT_MASK) >> 16);
+
+   if ((cmd[0] & RADEON_CP_PACKET_MASK) != RADEON_CP_PACKET3 ||
+       cmdsz * 4 > cmdbuf->bufsz ||
+       cmdsz > RADEON_CP_PACKET_MAX_DWORDS) {
+      fprintf(stderr, "Bad packet\n");
+      return -EINVAL;
+   }
+
+   op = cmd[0] & ~RADEON_CP_PACKET_COUNT_MASK;
+
+   switch( op ) {
+   case R200_CP_CMD_3D_DRAW_VBUF:
+      if (NORMAL)
+         fprintf(stderr, "PACKET3_3D_DRAW_VBUF, %d dwords\n", cmdsz);
+/*       print_vertex_format(cmd[1]); */
+      /* cmd[2] only exists when the packet has at least three dwords. */
+      if (cmdsz < 3) {
+         fprintf(stderr, "Packet too short for prim flags\n");
+         return -EINVAL;
+      }
+      if (print_prim_and_flags(cmd[2]))
+         return -EINVAL;
+      break;
+
+   case R200_CP_CMD_3D_DRAW_INDX: {
+      int neltdwords;
+      if (NORMAL)
+         fprintf(stderr, "PACKET3_3D_DRAW_INDX, %d dwords\n", cmdsz);
+/*       print_vertex_format(cmd[1]); */
+      if (cmdsz < 3) {
+         fprintf(stderr, "Packet too short for prim flags\n");
+         return -EINVAL;
+      }
+      if (print_prim_and_flags(cmd[2]))
+         return -EINVAL;
+      /* Two 16-bit indices per dword, rounded up. */
+      neltdwords = (int)(((unsigned int)cmd[2]) >> 16);
+      neltdwords += neltdwords & 1;
+      neltdwords /= 2;
+      if (neltdwords + 3 != cmdsz)
+         fprintf(stderr, "Mismatch in DRAW_INDX, %d vs cmdsz %d\n",
+                 neltdwords + 3, cmdsz);
+      break;
+   }
+
+   case R200_CP_CMD_3D_LOAD_VBPNTR:
+      if (NORMAL) {
+         fprintf(stderr, "PACKET3_3D_LOAD_VBPNTR, %d dwords\n", cmdsz);
+         fprintf(stderr, "   nr arrays: %d\n", cmd[1]);
+      }
+
+      /* Arrays come in pairs of three dwords (one shared attribute
+       * dword plus two addresses); a trailing odd array takes two.
+       */
+      if (((cmd[1]/2)*3) + ((cmd[1]%2)*2) != cmdsz - 2) {
+         fprintf(stderr, "  ****** MISMATCH %d/%d *******\n",
+                 ((cmd[1]/2)*3) + ((cmd[1]%2)*2) + 2, cmdsz);
+         return -EINVAL;
+      }
+
+      if (NORMAL) {
+         tmp = cmd+2;
+         for (i = 0 ; i < cmd[1] ; i++) {
+            if (i & 1) {
+               /* Second array of a pair: high half of the attribute
+                * dword, second address; then step to the next pair.
+                */
+               stride = (tmp[0]>>24) & 0xff;
+               size = (tmp[0]>>16) & 0xff;
+               start = tmp[2];
+               tmp += 3;
+            }
+            else {
+               stride = (tmp[0]>>8) & 0xff;
+               size = (tmp[0]) & 0xff;
+               start = tmp[1];
+            }
+            fprintf(stderr, "   array %d: start 0x%x vsize %d vstride %d\n",
+                    i, start, size, stride );
+         }
+      }
+      break;
+
+   case R200_CP_CMD_3D_DRAW_VBUF_2:
+      if (NORMAL)
+         fprintf(stderr, "R200_CP_CMD_3D_DRAW_VBUF_2, %d dwords\n",
+                 cmdsz);
+      if (print_prim_and_flags(cmd[1]))
+         return -EINVAL;
+      break;
+   case R200_CP_CMD_3D_DRAW_IMMD_2:
+      if (NORMAL)
+         fprintf(stderr, "R200_CP_CMD_3D_DRAW_IMMD_2, %d dwords\n",
+                 cmdsz);
+      if (print_prim_and_flags(cmd[1]))
+         return -EINVAL;
+      break;
+   case R200_CP_CMD_3D_DRAW_INDX_2:
+      if (NORMAL)
+         fprintf(stderr, "R200_CP_CMD_3D_DRAW_INDX_2, %d dwords\n",
+                 cmdsz);
+      if (print_prim_and_flags(cmd[1]))
+         return -EINVAL;
+      break;
+
+   default:
+      /* Everything else is either a known log-only opcode or unknown. */
+      name = packet3_plain_name( op );
+      if (name) {
+         if (NORMAL)
+            fprintf(stderr, "%s, %d dwords\n", name, cmdsz);
+      }
+      else {
+         fprintf(stderr, "UNKNOWN PACKET, %d dwords\n", cmdsz);
+      }
+      break;
+   }
+
+   cmdbuf->buf += cmdsz * 4;
+   cmdbuf->bufsz -= cmdsz * 4;
+   return 0;
+}
+
+
+/* Handle a packet3 that is to be emitted once per cliprect: dump the
+ * tracked state if anything changed (VERBOSE only), log each cliprect
+ * (NORMAL only), then hand the packet to radeon_emit_packet3().
+ *
+ * A box count of exactly 1 is reset to 0 before the packet is processed.
+ */
+static int radeon_emit_packet3_cliprect( drm_radeon_cmd_buffer_t *cmdbuf )
+{
+   drm_clip_rect_t *box = (drm_clip_rect_t *)cmdbuf->boxes;
+   int n;
+
+   if (VERBOSE && total_changed) {
+      dump_state();
+      total_changed = 0;
+   }
+
+   if (NORMAL) {
+      for (n = 0 ; n < cmdbuf->nbox ; n++, box++) {
+         fprintf(stderr, "Emit box %d/%d %d,%d %d,%d\n",
+                 n, cmdbuf->nbox,
+                 box->x1, box->y1, box->x2, box->y2);
+      }
+   }
+
+   if (cmdbuf->nbox == 1)
+      cmdbuf->nbox = 0;
+
+   return radeon_emit_packet3( cmdbuf );
+}
+
+
+/* Walk the command buffer stored in rmesa->store, decoding and logging
+ * every command to stderr and checking it for consistency.
+ *
+ * nbox / boxes are the cliprects that accompany the buffer; they are only
+ * used when logging RADEON_CMD_PACKET3_CLIP commands.
+ *
+ * Returns 0 if the whole buffer was parsed, or -EINVAL as soon as a
+ * command is malformed or of an unknown type.
+ */
+int r200SanityCmdBuffer( r200ContextPtr rmesa,
+                         int nbox,
+                         drm_clip_rect_t *boxes )
+{
+   int idx;
+   drm_radeon_cmd_buffer_t cmdbuf;
+   drm_radeon_cmd_header_t header;
+   static int inited = 0;
+
+   if (!inited) {
+      init_regs();
+      inited = 1;
+   }
+
+
+   cmdbuf.buf = rmesa->store.cmd_buf;
+   cmdbuf.bufsz = rmesa->store.cmd_used;
+   cmdbuf.boxes = (drm_clip_rect_t *)boxes;
+   cmdbuf.nbox = nbox;
+
+   /* Compare as signed: bufsz is an int, and against a size_t a negative
+    * remainder would look like a huge positive one and keep the loop
+    * running past the end of the buffer.
+    */
+   while ( cmdbuf.bufsz >= (int)sizeof(header) ) {
+
+      header.i = *(int *)cmdbuf.buf;
+      cmdbuf.buf += sizeof(header);
+      cmdbuf.bufsz -= sizeof(header);
+
+      switch (header.header.cmd_type) {
+      case RADEON_CMD_PACKET:
+         if (radeon_emit_packets( header, &cmdbuf )) {
+            fprintf(stderr,"radeon_emit_packets failed\n");
+            return -EINVAL;
+         }
+         break;
+
+      case RADEON_CMD_SCALARS:
+         if (radeon_emit_scalars( header, &cmdbuf )) {
+            fprintf(stderr,"radeon_emit_scalars failed\n");
+            return -EINVAL;
+         }
+         break;
+
+      case RADEON_CMD_SCALARS2:
+         if (radeon_emit_scalars2( header, &cmdbuf )) {
+            fprintf(stderr,"radeon_emit_scalars2 failed\n");
+            return -EINVAL;
+         }
+         break;
+
+      case RADEON_CMD_VECTORS:
+         if (radeon_emit_vectors( header, &cmdbuf )) {
+            fprintf(stderr,"radeon_emit_vectors failed\n");
+            return -EINVAL;
+         }
+         break;
+
+      case RADEON_CMD_DMA_DISCARD:
+         idx = header.dma.buf_idx;
+         if (NORMAL)
+            fprintf(stderr, "RADEON_CMD_DMA_DISCARD buf %d\n", idx);
+         bufs++;
+         break;
+
+      case RADEON_CMD_PACKET3:
+         if (radeon_emit_packet3( &cmdbuf )) {
+            fprintf(stderr,"radeon_emit_packet3 failed\n");
+            return -EINVAL;
+         }
+         break;
+
+      case RADEON_CMD_PACKET3_CLIP:
+         if (radeon_emit_packet3_cliprect( &cmdbuf )) {
+            fprintf(stderr,"radeon_emit_packet3_clip failed\n");
+            return -EINVAL;
+         }
+         break;
+
+      case RADEON_CMD_WAIT:
+         break;
+
+      case RADEON_CMD_VECLINEAR:
+         if (radeon_emit_veclinear( header, &cmdbuf )) {
+            fprintf(stderr,"radeon_emit_veclinear failed\n");
+            return -EINVAL;
+         }
+         break;
+
+      default:
+         /* %p requires a void pointer argument. */
+         fprintf(stderr,"bad cmd_type %d at %p\n",
+                 header.header.cmd_type,
+                 (void *)(cmdbuf.buf - sizeof(header)));
+         return -EINVAL;
+      }
+   }
+
+   /* Disabled statistics: every tenth call, report and reset counters. */
+   if (0)
+   {
+      static int n = 0;
+      n++;
+      if (n == 10) {
+         fprintf(stderr, "Bufs %d Total emitted %d real changes %d (%.2f%%)\n",
+                 bufs,
+                 total, total_changed,
+                 ((float)total_changed/(float)total*100.0));
+         fprintf(stderr, "Total emitted per buf: %.2f\n",
+                 (float)total/(float)bufs);
+         fprintf(stderr, "Real changes per buf: %.2f\n",
+                 (float)total_changed/(float)bufs);
+
+         bufs = n = total = total_changed = 0;
+      }
+   }
+
+   fprintf(stderr, "leaving %s\n\n\n", __FUNCTION__);
+
+   return 0;
+}
diff --git a/r200/r200_sanity.h b/r200/r200_sanity.h
new file mode 100644
index 0000000..f4c110d
--- /dev/null
+++ b/r200/r200_sanity.h
@@ -0,0 +1,8 @@
+#ifndef R200_SANITY_H
+#define R200_SANITY_H
+
+extern int r200SanityCmdBuffer( r200ContextPtr rmesa,
+				int nbox,
+				drm_clip_rect_t *boxes );
+
+#endif
diff --git a/r200/r200_span.c b/r200/r200_span.c
new file mode 100644
index 0000000..6e99dfe
--- /dev/null
+++ b/r200/r200_span.c
@@ -0,0 +1,306 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_span.c,v 1.1 2002/10/30 12:51:52 alanh Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "swrast/swrast.h"
+#include "colormac.h"
+
+#include "r200_context.h"
+#include "r200_ioctl.h"
+#include "r200_state.h"
+#include "r200_span.h"
+#include "r200_tex.h"
+
+#define DBG 0
+
+/*
+ * Note that all information needed to access pixels in a renderbuffer
+ * should be obtained through the gl_renderbuffer parameter, not per-context
+ * information.
+ */
+#define LOCAL_VARS						\
+   driRenderbuffer *drb = (driRenderbuffer *) rb;		\
+   const __DRIdrawablePrivate *dPriv = drb->dPriv;		\
+   const GLuint bottom = dPriv->h - 1;				\
+   GLubyte *buf = (GLubyte *) drb->flippedData			\
+      + (dPriv->y * drb->flippedPitch + dPriv->x) * drb->cpp;	\
+   GLuint p;							\
+   (void) p;
+
+#define LOCAL_DEPTH_VARS				\
+   driRenderbuffer *drb = (driRenderbuffer *) rb;	\
+   const __DRIdrawablePrivate *dPriv = drb->dPriv;	\
+   const GLuint bottom = dPriv->h - 1;			\
+   GLuint xo = dPriv->x;				\
+   GLuint yo = dPriv->y;				\
+   GLubyte *buf = (GLubyte *) drb->Base.Data;
+
+#define LOCAL_STENCIL_VARS LOCAL_DEPTH_VARS
+
+#define Y_FLIP(Y) (bottom - (Y))
+/* Per-span locking is compiled out: r200SpanRenderStart takes the lock once instead. */
+#define HW_LOCK() 
+/* The matching unlock is done in r200SpanRenderFinish, so this is empty as well. */
+#define HW_UNLOCK()							
+
+
+
+/* ================================================================
+ * Color buffer
+ */
+
+/* 16 bit, RGB565 color spanline and pixel functions
+ */
+#define SPANTMP_PIXEL_FMT GL_RGB
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
+
+#define TAG(x)    r200##x##_RGB565
+#define TAG2(x,y) r200##x##_RGB565##y
+#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 2)
+#include "spantmp2.h"
+
+/* 32 bit, ARGB8888 color spanline and pixel functions
+ */
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
+
+#define TAG(x)    r200##x##_ARGB8888
+#define TAG2(x,y) r200##x##_ARGB8888##y
+#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 4)
+#include "spantmp2.h"
+
+
+/* ================================================================
+ * Depth buffer
+ */
+
+/* The Radeon family has depth tiling on all the time, so we have to convert
+ * the x,y coordinates into the memory bus address (mba) in the same
+ * manner as the engine.  In each case, the linear block address (ba)
+ * is calculated, and then wired with x and y to produce the final
+ * memory address.
+ * The chip will do address translation on its own if the surface registers
+ * are set up correctly. It is not quite enough to get it working with hyperz too...
+ */
+
+/* extract bit 'b' of x, result is zero or one (arguments parenthesized, so expressions are safe) */
+#define BIT(x,b) (((x) & (1 << (b))) >> (b))
+
+static GLuint
+r200_mba_z32( driRenderbuffer *drb, GLint x, GLint y )
+{
+   const GLuint pitch = drb->pitch;
+   GLuint block, sel;
+
+   /* With a depth surface the buffer is addressed linearly, 4 bytes per pixel. */
+   if (drb->depthHasSurface)
+      return 4 * (x + y * pitch);
+
+   /* Linear block address (ba): y/16 rows of pitch/32 blocks, plus x/32. */
+   block = ((y & 0x7FF) >> 4) * ((pitch & 0xFFF) >> 5) + ((x & 0x7FF) >> 5);
+
+   /* Bit 11 is the block's low bit, xor'ed with y bit 4 unless pitch bit 5 is set. */
+   sel = (pitch & 0x20) ? (block & 0x01) : ((block & 0x01) ^ (BIT(y,4)));
+
+   /* Wire the x/y bits and the block address into the final byte offset. */
+   return (BIT(x,0) << 2) | (BIT(y,0) << 3) |
+          (BIT(x,1) << 4) | (BIT(y,1) << 5) |
+          (BIT(x,3) << 6) | (BIT(x,4) << 7) |
+          (BIT(x,2) << 8) | (BIT(y,2) << 9) |
+          (BIT(y,3) << 10) | (sel << 11) |
+          ((block >> 1) << 12);
+}
+
+static GLuint
+r200_mba_z16( driRenderbuffer *drb, GLint x, GLint y )
+{
+   const GLuint pitch = drb->pitch;
+   GLuint block, sel;
+
+   /* With a depth surface the buffer is addressed linearly, 2 bytes per pixel. */
+   if (drb->depthHasSurface)
+      return 2 * (x + y * pitch);
+
+   /* Linear block address (ba): y/16 rows of pitch/64 blocks, plus x/64. */
+   block = ((y & 0x7FF) >> 4) * ((pitch & 0xFFF) >> 6) + ((x & 0x7FF) >> 6);
+
+   /* Bit 11 is the block's low bit, xor'ed with y bit 4 unless pitch bit 6 is set. */
+   sel = (pitch & 0x40) ? (block & 0x01) : ((block & 0x01) ^ (BIT(y,4)));
+
+   /* Wire the x/y bits and the block address into the final byte offset. */
+   return (BIT(x,0) << 1) | (BIT(y,0) << 2) |
+          (BIT(x,1) << 3) | (BIT(y,1) << 4) |
+          (BIT(x,2) << 5) | (BIT(x,4) << 6) |
+          (BIT(x,5) << 7) | (BIT(x,3) << 8) |
+          (BIT(y,2) << 9) | (BIT(y,3) << 10) |
+          (sel << 11) |
+          ((block >> 1) << 12);
+}
+
+
+/* 16-bit depth buffer functions
+ */
+
+#define WRITE_DEPTH( _x, _y, d )					\
+   *(GLushort *)(buf + r200_mba_z16( drb, _x + xo, _y + yo )) = d;
+
+#define READ_DEPTH( d, _x, _y )						\
+   d = *(GLushort *)(buf + r200_mba_z16( drb, _x + xo, _y + yo ));
+
+#define TAG(x) r200##x##_z16
+#include "depthtmp.h"
+
+
+/* 24 bit depth, 8 bit stencil depthbuffer functions
+ */
+
+#define WRITE_DEPTH( _x, _y, d )					\
+do {									\
+   GLuint offset = r200_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   tmp &= 0xff000000;							\
+   tmp |= ((d) & 0x00ffffff);						\
+   *(GLuint *)(buf + offset) = tmp;					\
+} while (0)
+
+#define READ_DEPTH( d, _x, _y )						\
+   d = *(GLuint *)(buf + r200_mba_z32( drb, _x + xo,			\
+					 _y + yo )) & 0x00ffffff;
+
+#define TAG(x) r200##x##_z24_s8
+#include "depthtmp.h"
+
+
+/* ================================================================
+ * Stencil buffer
+ */
+
+/* 24 bit depth, 8 bit stencil depthbuffer functions
+ */
+#define WRITE_STENCIL( _x, _y, d )					\
+do {									\
+   GLuint offset = r200_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   tmp &= 0x00ffffff;							\
+   tmp |= (((d) & 0xff) << 24);						\
+   *(GLuint *)(buf + offset) = tmp;					\
+} while (0)
+
+#define READ_STENCIL( d, _x, _y )					\
+do {									\
+   GLuint offset = r200_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   tmp &= 0xff000000;							\
+   d = tmp >> 24;							\
+} while (0)
+
+#define TAG(x) r200##x##_z24_s8
+#include "stenciltmp.h"
+
+
+/* Move locking out to get reasonable span performance (10x better
+ * than doing this in HW_LOCK above).  WaitForIdle() is the main
+ * culprit.
+ */
+
+static void r200SpanRenderStart( GLcontext *ctx )
+{
+   r200ContextPtr rmesa = R200_CONTEXT( ctx );
+   /* Fire pending vertices, take the hardware lock and wait for idle; r200SpanRenderFinish unlocks. */
+   R200_FIREVERTICES( rmesa );
+   LOCK_HARDWARE( rmesa );
+   r200WaitForIdleLocked( rmesa );
+
+   /* Read & rewrite the first pixel in the frame buffer.  This should
+    * be a noop, right?  In fact without this conform fails as reading
+    * from the framebuffer sometimes produces old results -- the
+    * on-card read cache gets mixed up and doesn't notice that the
+    * framebuffer has been updated.
+    *
+    * In the worst case this is buggy too as p might get the wrong
+    * value first time, so really need a hidden pixel somewhere for this.
+    */
+   {
+      int p;
+      driRenderbuffer *drb =
+	 (driRenderbuffer *) ctx->WinSysDrawBuffer->_ColorDrawBuffers[0][0];
+      volatile int *buf =	/* volatile so the read and the write-back are both really performed */
+	 (volatile int *)(rmesa->dri.screen->pFB + drb->offset);
+      p = *buf;
+      *buf = p;
+   }
+}
+
+static void r200SpanRenderFinish( GLcontext *ctx )
+{
+   r200ContextPtr rmesa = R200_CONTEXT( ctx );
+   _swrast_flush( ctx );	/* flush swrast while the hardware lock is still held */
+   UNLOCK_HARDWARE( rmesa );	/* pairs with LOCK_HARDWARE in r200SpanRenderStart */
+}
+
+void r200InitSpanFuncs( GLcontext *ctx )
+{
+   struct swrast_device_driver *swdd = _swrast_GetDeviceDriverReference(ctx);
+   swdd->SpanRenderStart          = r200SpanRenderStart;	/* fires vertices, locks, waits for idle */
+   swdd->SpanRenderFinish         = r200SpanRenderFinish; 	/* flushes swrast, unlocks */
+}
+
+
+
+/**
+ * Install the Get/Put routines matching the driRenderbuffer's internal format. */
+void
+radeonSetSpanFunctions(driRenderbuffer *drb, const GLvisual *vis)
+{
+   switch (drb->Base.InternalFormat) {
+   case GL_RGBA:	/* 5/6/5 visual -> RGB565, anything else -> ARGB8888 */
+      if (vis->redBits == 5 && vis->greenBits == 6 && vis->blueBits == 5) {
+         r200InitPointers_RGB565(&drb->Base);
+      } else {
+         r200InitPointers_ARGB8888(&drb->Base);
+      }
+      break;
+   case GL_DEPTH_COMPONENT16:
+      r200InitDepthPointers_z16(&drb->Base);
+      break;
+   case GL_DEPTH_COMPONENT24:
+      r200InitDepthPointers_z24_s8(&drb->Base);
+      break;
+   case GL_STENCIL_INDEX8_EXT:
+      r200InitStencilPointers_z24_s8(&drb->Base);
+      break;
+   }
+}
diff --git a/r200/r200_span.h b/r200/r200_span.h
new file mode 100644
index 0000000..5e7d3e4
--- /dev/null
+++ b/r200/r200_span.h
@@ -0,0 +1,46 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_span.h,v 1.1 2002/10/30 12:51:52 alanh Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef __R200_SPAN_H__
+#define __R200_SPAN_H__
+
+#include "drirenderbuffer.h"
+
+extern void r200InitSpanFuncs( GLcontext *ctx );
+
+extern void
+radeonSetSpanFunctions(driRenderbuffer *rb, const GLvisual *vis);
+
+#endif
diff --git a/r200/r200_state.c b/r200/r200_state.c
new file mode 100644
index 0000000..16726d7
--- /dev/null
+++ b/r200/r200_state.c
@@ -0,0 +1,2651 @@
+/* $XFree86$ */
+/**************************************************************************
+
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "api_arrayelt.h"
+#include "enums.h"
+#include "colormac.h"
+#include "light.h"
+#include "framebuffer.h"
+
+#include "swrast/swrast.h"
+#include "vbo/vbo.h"
+#include "tnl/tnl.h"
+#include "tnl/t_pipeline.h"
+#include "swrast_setup/swrast_setup.h"
+
+#include "r200_context.h"
+#include "r200_ioctl.h"
+#include "r200_state.h"
+#include "r200_tcl.h"
+#include "r200_tex.h"
+#include "r200_swtcl.h"
+#include "r200_vertprog.h"
+
+#include "drirenderbuffer.h"
+
+
+/* =============================================================
+ * Alpha blending
+ */
+
+static void r200AlphaFunc( GLcontext *ctx, GLenum func, GLfloat ref )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   int misc = rmesa->hw.ctx.cmd[CTX_PP_MISC];
+   int test_op = 0;
+   GLubyte ref_ubyte;
+
+   CLAMPED_FLOAT_TO_UBYTE(ref_ubyte, ref);
+
+   R200_STATECHANGE( rmesa, ctx );
+
+   /* Map the GL comparison to its hardware op; unknown values add no op bits. */
+   switch ( func ) {
+   case GL_NEVER:
+      test_op = R200_ALPHA_TEST_FAIL;
+      break;
+   case GL_LESS:
+      test_op = R200_ALPHA_TEST_LESS;
+      break;
+   case GL_EQUAL:
+      test_op = R200_ALPHA_TEST_EQUAL;
+      break;
+   case GL_LEQUAL:
+      test_op = R200_ALPHA_TEST_LEQUAL;
+      break;
+   case GL_GREATER:
+      test_op = R200_ALPHA_TEST_GREATER;
+      break;
+   case GL_NOTEQUAL:
+      test_op = R200_ALPHA_TEST_NEQUAL;
+      break;
+   case GL_GEQUAL:
+      test_op = R200_ALPHA_TEST_GEQUAL;
+      break;
+   case GL_ALWAYS:
+      test_op = R200_ALPHA_TEST_PASS;
+      break;
+   }
+
+   /* Swap in the new op and reference value, leaving the other PP_MISC bits alone. */
+   rmesa->hw.ctx.cmd[CTX_PP_MISC] = (misc & ~(R200_ALPHA_TEST_OP_MASK | R200_REF_ALPHA_MASK)) | (ref_ubyte & R200_REF_ALPHA_MASK) | test_op;
+}
+
+static void r200BlendColor( GLcontext *ctx, const GLfloat cf[4] )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLubyte rgba[4];
+   int chan;
+   R200_STATECHANGE( rmesa, ctx );
+   for (chan = 0; chan < 4; chan++) {
+      CLAMPED_FLOAT_TO_UBYTE(rgba[chan], cf[chan]);
+   }
+   if (rmesa->r200Screen->drmSupportsBlendColor)	/* register is only written when the DRM supports it */
+      rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCOLOR] = r200PackColor( 4, rgba[0], rgba[1], rgba[2], rgba[3] );
+}
+
+/**
+ * Calculate the hardware blend factor setting.  This same function is used
+ * for source and destination of both alpha and RGB.
+ *
+ * \returns
+ * The hardware register value for the specified blend factor.  This value
+ * will need to be shifted into the correct position for either source or
+ * destination factor.
+ *
+ * \todo
+ * Since the two cases where source and destination are handled differently
+ * are essentially error cases, they should never happen.  Determine if these
+ * cases can be removed.
+ */
+static int blend_factor( GLenum factor, GLboolean is_src )
+{
+   /* Every case returns the hardware factor directly.  Only two cases
+    * depend on is_src: GL_SRC_ALPHA_SATURATE and the fallback. */
+   switch ( factor ) {
+   case GL_ZERO:
+      return R200_BLEND_GL_ZERO;
+
+   case GL_ONE:
+      return R200_BLEND_GL_ONE;
+
+   case GL_DST_COLOR:
+      return R200_BLEND_GL_DST_COLOR;
+
+   case GL_ONE_MINUS_DST_COLOR:
+      return R200_BLEND_GL_ONE_MINUS_DST_COLOR;
+
+   case GL_SRC_COLOR:
+      return R200_BLEND_GL_SRC_COLOR;
+
+   case GL_ONE_MINUS_SRC_COLOR:
+      return R200_BLEND_GL_ONE_MINUS_SRC_COLOR;
+
+   case GL_SRC_ALPHA:
+      return R200_BLEND_GL_SRC_ALPHA;
+
+   case GL_ONE_MINUS_SRC_ALPHA:
+      return R200_BLEND_GL_ONE_MINUS_SRC_ALPHA;
+
+   case GL_DST_ALPHA:
+      return R200_BLEND_GL_DST_ALPHA;
+
+   case GL_ONE_MINUS_DST_ALPHA:
+      return R200_BLEND_GL_ONE_MINUS_DST_ALPHA;
+
+   case GL_SRC_ALPHA_SATURATE:
+      return (is_src) ? R200_BLEND_GL_SRC_ALPHA_SATURATE : R200_BLEND_GL_ZERO;
+
+   case GL_CONSTANT_COLOR:
+      return R200_BLEND_GL_CONST_COLOR;
+
+   case GL_ONE_MINUS_CONSTANT_COLOR:
+      return R200_BLEND_GL_ONE_MINUS_CONST_COLOR;
+
+   case GL_CONSTANT_ALPHA:
+      return R200_BLEND_GL_CONST_ALPHA;
+
+   case GL_ONE_MINUS_CONSTANT_ALPHA:
+      return R200_BLEND_GL_ONE_MINUS_CONST_ALPHA;
+
+   default:
+      /* Unknown factor: behave like the GL defaults (src ONE, dst ZERO). */
+      return (is_src) ? R200_BLEND_GL_ONE : R200_BLEND_GL_ZERO;
+   }
+}
+
+/**
+ * Sets both the blend equation and the blend function.
+ * This is done in a single
+ * function because some blend equations (i.e., \c GL_MIN and \c GL_MAX)
+ * change the interpretation of the blend function.
+ * Also, make sure that blend function and blend equation are set to their default
+ * value if color blending is not enabled, since at least blend equations GL_MIN
+ * and GL_FUNC_REVERSE_SUBTRACT will cause wrong results otherwise for
+ * unknown reasons.
+ */
+static void r200_set_blend_state( GLcontext * ctx )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLuint cntl = rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &
+      ~(R200_ROP_ENABLE | R200_ALPHA_BLEND_ENABLE | R200_SEPARATE_ALPHA_ENABLE);
+   /* Defaults (src ONE, dst ZERO, ADD_CLAMP), programmed whenever blending is not active. */
+   int func = (R200_BLEND_GL_ONE << R200_SRC_BLEND_SHIFT) |
+      (R200_BLEND_GL_ZERO << R200_DST_BLEND_SHIFT);
+   int eqn = R200_COMB_FCN_ADD_CLAMP;
+   int funcA = (R200_BLEND_GL_ONE << R200_SRC_BLEND_SHIFT) |
+      (R200_BLEND_GL_ZERO << R200_DST_BLEND_SHIFT);
+   int eqnA = R200_COMB_FCN_ADD_CLAMP;
+
+   R200_STATECHANGE( rmesa, ctx );
+   /* Logic-op and blending-disabled cases write the defaults and return early. */
+   if (rmesa->r200Screen->drmSupportsBlendColor) {
+      if (ctx->Color.ColorLogicOpEnabled) {
+         rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] =  cntl | R200_ROP_ENABLE;
+         rmesa->hw.ctx.cmd[CTX_RB3D_ABLENDCNTL] = eqn | func;
+         rmesa->hw.ctx.cmd[CTX_RB3D_CBLENDCNTL] = eqn | func;
+         return;
+      } else if (ctx->Color.BlendEnabled) {
+         rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] =  cntl | R200_ALPHA_BLEND_ENABLE | R200_SEPARATE_ALPHA_ENABLE;
+      }
+      else {
+         rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] = cntl;
+         rmesa->hw.ctx.cmd[CTX_RB3D_ABLENDCNTL] = eqn | func;
+         rmesa->hw.ctx.cmd[CTX_RB3D_CBLENDCNTL] = eqn | func;
+         return;
+      }
+   }
+   else {
+      if (ctx->Color.ColorLogicOpEnabled) {
+         rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] =  cntl | R200_ROP_ENABLE;
+         rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] = eqn | func;
+         return;
+      } else if (ctx->Color.BlendEnabled) {
+         rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] =  cntl | R200_ALPHA_BLEND_ENABLE;
+      }
+      else {
+         rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] = cntl;
+         rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] = eqn | func;
+         return;
+      }
+   }
+   /* Blending is enabled from here on: translate the RGB factors and equation. */
+   func = (blend_factor( ctx->Color.BlendSrcRGB, GL_TRUE ) << R200_SRC_BLEND_SHIFT) |
+      (blend_factor( ctx->Color.BlendDstRGB, GL_FALSE ) << R200_DST_BLEND_SHIFT);
+
+   switch(ctx->Color.BlendEquationRGB) {
+   case GL_FUNC_ADD:
+      eqn = R200_COMB_FCN_ADD_CLAMP;
+      break;
+
+   case GL_FUNC_SUBTRACT:
+      eqn = R200_COMB_FCN_SUB_CLAMP;
+      break;
+
+   case GL_FUNC_REVERSE_SUBTRACT:
+      eqn = R200_COMB_FCN_RSUB_CLAMP;
+      break;
+   /* GL_MIN and GL_MAX override the factors computed above with ONE/ONE. */
+   case GL_MIN:
+      eqn = R200_COMB_FCN_MIN;
+      func = (R200_BLEND_GL_ONE << R200_SRC_BLEND_SHIFT) |
+         (R200_BLEND_GL_ONE << R200_DST_BLEND_SHIFT);
+      break;
+
+   case GL_MAX:
+      eqn = R200_COMB_FCN_MAX;
+      func = (R200_BLEND_GL_ONE << R200_SRC_BLEND_SHIFT) |
+         (R200_BLEND_GL_ONE << R200_DST_BLEND_SHIFT);
+      break;
+
+   default:
+      fprintf( stderr, "[%s:%u] Invalid RGB blend equation (0x%04x).\n",
+         __FUNCTION__, __LINE__, ctx->Color.BlendEquationRGB );
+      return;
+   }
+   /* Without drmSupportsBlendColor only BLENDCNTL is written, using the RGB setup. */
+   if (!rmesa->r200Screen->drmSupportsBlendColor) {
+      rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] = eqn | func;
+      return;
+   }
+   /* Same translation again for the alpha factors and equation. */
+   funcA = (blend_factor( ctx->Color.BlendSrcA, GL_TRUE ) << R200_SRC_BLEND_SHIFT) |
+      (blend_factor( ctx->Color.BlendDstA, GL_FALSE ) << R200_DST_BLEND_SHIFT);
+
+   switch(ctx->Color.BlendEquationA) {
+   case GL_FUNC_ADD:
+      eqnA = R200_COMB_FCN_ADD_CLAMP;
+      break;
+
+   case GL_FUNC_SUBTRACT:
+      eqnA = R200_COMB_FCN_SUB_CLAMP;
+      break;
+
+   case GL_FUNC_REVERSE_SUBTRACT:
+      eqnA = R200_COMB_FCN_RSUB_CLAMP;
+      break;
+
+   case GL_MIN:
+      eqnA = R200_COMB_FCN_MIN;
+      funcA = (R200_BLEND_GL_ONE << R200_SRC_BLEND_SHIFT) |
+         (R200_BLEND_GL_ONE << R200_DST_BLEND_SHIFT);
+      break;
+
+   case GL_MAX:
+      eqnA = R200_COMB_FCN_MAX;
+      funcA = (R200_BLEND_GL_ONE << R200_SRC_BLEND_SHIFT) |
+         (R200_BLEND_GL_ONE << R200_DST_BLEND_SHIFT);
+      break;
+
+   default:
+      fprintf( stderr, "[%s:%u] Invalid A blend equation (0x%04x).\n",
+         __FUNCTION__, __LINE__, ctx->Color.BlendEquationA );
+      return;
+   }
+   /* Alpha and color setups go to their own blend control registers. */
+   rmesa->hw.ctx.cmd[CTX_RB3D_ABLENDCNTL] = eqnA | funcA;
+   rmesa->hw.ctx.cmd[CTX_RB3D_CBLENDCNTL] = eqn | func;
+
+}
+
+static void r200BlendEquationSeparate( GLcontext *ctx,
+				       GLenum modeRGB, GLenum modeA )
+{
+      r200_set_blend_state( ctx );	/* equations are re-read from ctx->Color; modeRGB/modeA are unused */
+}
+
+static void r200BlendFuncSeparate( GLcontext *ctx,
+				     GLenum sfactorRGB, GLenum dfactorRGB,
+				     GLenum sfactorA, GLenum dfactorA )
+{
+      r200_set_blend_state( ctx );	/* factors are re-read from ctx->Color; the arguments are unused */
+}
+
+
+/* =============================================================
+ * Depth testing
+ */
+
+static void r200DepthFunc( GLcontext *ctx, GLenum func )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLuint ztest = 0;
+   /* The test comes from ctx->Depth.Func (not the func argument); unknown values set no test bits. */
+   switch ( ctx->Depth.Func ) {
+   case GL_NEVER:
+      ztest = R200_Z_TEST_NEVER;
+      break;
+   case GL_LESS:
+      ztest = R200_Z_TEST_LESS;
+      break;
+   case GL_EQUAL:
+      ztest = R200_Z_TEST_EQUAL;
+      break;
+   case GL_LEQUAL:
+      ztest = R200_Z_TEST_LEQUAL;
+      break;
+   case GL_GREATER:
+      ztest = R200_Z_TEST_GREATER;
+      break;
+   case GL_NOTEQUAL:
+      ztest = R200_Z_TEST_NEQUAL;
+      break;
+   case GL_GEQUAL:
+      ztest = R200_Z_TEST_GEQUAL;
+      break;
+   case GL_ALWAYS:
+      ztest = R200_Z_TEST_ALWAYS;
+      break;
+   }
+   R200_STATECHANGE( rmesa, ctx );
+   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] = (rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] & ~R200_Z_TEST_MASK) | ztest;
+}
+
+static void r200ClearDepth( GLcontext *ctx, GLclampd d )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   const GLuint zformat = rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] & R200_DEPTH_FORMAT_MASK;
+
+   /* Scale the clear value to the integer range of the current depth format;
+    * any other format leaves the stored clear value untouched.
+    */
+   if ( zformat == R200_DEPTH_FORMAT_16BIT_INT_Z ) {
+      rmesa->state.depth.clear = d * 0x0000ffff;
+   }
+   else if ( zformat == R200_DEPTH_FORMAT_24BIT_INT_Z ) {
+      rmesa->state.depth.clear = d * 0x00ffffff;
+   }
+}
+
+static void r200DepthMask( GLcontext *ctx, GLboolean flag )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLuint zcntl;
+
+   R200_STATECHANGE( rmesa, ctx );
+   /* The mask is taken from ctx->Depth.Mask; the flag argument is not consulted. */
+   zcntl = rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] & ~R200_Z_WRITE_ENABLE;
+   if ( ctx->Depth.Mask ) zcntl |= R200_Z_WRITE_ENABLE;
+   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] = zcntl;
+}
+
+
+/* =============================================================
+ * Fog
+ */
+
+
+static void r200Fogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   union { int i; float f; } c, d;
+   GLchan col[4];
+   GLuint i;
+   /* Start from the programmed fog constants; they are re-emitted at the end only if changed. */
+   c.i = rmesa->hw.fog.cmd[FOG_C];
+   d.i = rmesa->hw.fog.cmd[FOG_D];
+
+   switch (pname) {
+   case GL_FOG_MODE:
+      if (!ctx->Fog.Enabled)
+	 return;
+      R200_STATECHANGE(rmesa, tcl);
+      rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~R200_TCL_FOG_MASK;
+      switch (ctx->Fog.Mode) {
+      case GL_LINEAR:
+	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= R200_TCL_FOG_LINEAR;
+	 if (ctx->Fog.Start == ctx->Fog.End) {	/* would divide by zero below */
+	    c.f = 1.0F;
+	    d.f = 1.0F;
+	 }
+	 else {
+	    c.f = ctx->Fog.End/(ctx->Fog.End-ctx->Fog.Start);
+	    d.f = -1.0/(ctx->Fog.End-ctx->Fog.Start);
+	 }
+	 break;
+      case GL_EXP:
+	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= R200_TCL_FOG_EXP;
+	 c.f = 0.0;
+	 d.f = -ctx->Fog.Density;
+	 break;
+      case GL_EXP2:
+	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= R200_TCL_FOG_EXP2;
+	 c.f = 0.0;
+	 d.f = -(ctx->Fog.Density * ctx->Fog.Density);
+	 break;
+      default:
+	 return;
+      }
+      break;
+   case GL_FOG_DENSITY:	/* only C/D change; the TCL fog mode bits are left alone */
+      switch (ctx->Fog.Mode) {
+      case GL_EXP:
+	 c.f = 0.0;
+	 d.f = -ctx->Fog.Density;
+	 break;
+      case GL_EXP2:
+	 c.f = 0.0;
+	 d.f = -(ctx->Fog.Density * ctx->Fog.Density);
+	 break;
+      default:
+	 break;
+      }
+      break;
+   case GL_FOG_START:
+   case GL_FOG_END:
+      if (ctx->Fog.Mode == GL_LINEAR) {
+	 if (ctx->Fog.Start == ctx->Fog.End) {	/* would divide by zero below */
+	    c.f = 1.0F;
+	    d.f = 1.0F;
+	 } else {
+	    c.f = ctx->Fog.End/(ctx->Fog.End-ctx->Fog.Start);
+	    d.f = -1.0/(ctx->Fog.End-ctx->Fog.Start);
+	 }
+      }
+      break;
+   case GL_FOG_COLOR: 
+      R200_STATECHANGE( rmesa, ctx );
+      UNCLAMPED_FLOAT_TO_RGB_CHAN( col, ctx->Fog.Color );
+      i = r200PackColor( 4, col[0], col[1], col[2], 0 );
+      rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] &= ~R200_FOG_COLOR_MASK;
+      rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] |= i;
+      break;
+   case GL_FOG_COORD_SRC: {
+      GLuint out_0 = rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0];
+      GLuint fog   = rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR];
+      /* Vertex fog is used for GL_FOG_COORD or when a vertex program is enabled; else spec alpha. */
+      fog &= ~R200_FOG_USE_MASK;
+      if ( ctx->Fog.FogCoordinateSource == GL_FOG_COORD || ctx->VertexProgram.Enabled) {
+	 fog   |= R200_FOG_USE_VTX_FOG;
+	 out_0 |= R200_VTX_DISCRETE_FOG;
+      }
+      else {
+	 fog   |=  R200_FOG_USE_SPEC_ALPHA;
+	 out_0 &= ~R200_VTX_DISCRETE_FOG;
+      }
+
+      if ( fog != rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] ) {
+	 R200_STATECHANGE( rmesa, ctx );
+	 rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] = fog;
+      }
+
+      if (out_0 != rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0]) {
+	 R200_STATECHANGE( rmesa, vtx );
+	 rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] = out_0;	 
+      }
+
+      break;
+   }
+   default:
+      return;
+   }
+   /* Mark the fog atom changed only when C or D differ from what is programmed. */
+   if (c.i != rmesa->hw.fog.cmd[FOG_C] || d.i != rmesa->hw.fog.cmd[FOG_D]) {
+      R200_STATECHANGE( rmesa, fog );
+      rmesa->hw.fog.cmd[FOG_C] = c.i;
+      rmesa->hw.fog.cmd[FOG_D] = d.i;
+   }
+}
+
+
+/* =============================================================
+ * Scissoring
+ */
+
+
+static GLboolean intersect_rect( drm_clip_rect_t *out,
+				 drm_clip_rect_t *a,
+				 drm_clip_rect_t *b )
+{
+   /* Start from a and pull each edge in toward b; the result is empty
+    * (GL_FALSE) when either extent ends up zero or negative. */
+   *out = *a;
+   out->x1 = ( b->x1 > out->x1 ) ? b->x1 : out->x1;
+   out->y1 = ( b->y1 > out->y1 ) ? b->y1 : out->y1;
+   out->x2 = ( b->x2 < out->x2 ) ? b->x2 : out->x2;
+   out->y2 = ( b->y2 < out->y2 ) ? b->y2 : out->y2;
+   return ( out->x1 < out->x2 && out->y1 < out->y2 ) ? GL_TRUE : GL_FALSE;
+}
+
+
+void r200RecalcScissorRects( r200ContextPtr rmesa )
+{
+   drm_clip_rect_t *out;
+   int i;
+
+   /* Grow cliprect store?  Old contents are not kept; the list is rebuilt below.
+    */
+   if (rmesa->state.scissor.numAllocedClipRects < rmesa->numClipRects) {
+      while (rmesa->state.scissor.numAllocedClipRects < rmesa->numClipRects) {
+	 rmesa->state.scissor.numAllocedClipRects += 1;	/* zero case */
+	 rmesa->state.scissor.numAllocedClipRects *= 2;
+      }
+
+      if (rmesa->state.scissor.pClipRects)
+	 FREE(rmesa->state.scissor.pClipRects);
+      rmesa->state.scissor.pClipRects = 
+	 MALLOC( rmesa->state.scissor.numAllocedClipRects * 
+		 sizeof(drm_clip_rect_t) );
+
+      if ( rmesa->state.scissor.pClipRects == NULL ) {
+	 /* Allocation failed: publish an empty list, not a stale count over a NULL store. */
+	 rmesa->state.scissor.numAllocedClipRects = 0;
+	 rmesa->state.scissor.numClipRects = 0;
+	 return;
+      }
+   }
+   
+   out = rmesa->state.scissor.pClipRects;
+   rmesa->state.scissor.numClipRects = 0;
+
+   /* Keep only the non-empty intersections of each cliprect with the scissor rect. */
+   for ( i = 0 ; i < rmesa->numClipRects ;  i++ ) {
+      if ( intersect_rect( out, &rmesa->pClipRects[i], &rmesa->state.scissor.rect ) ) {
+	 rmesa->state.scissor.numClipRects++;
+	 out++;
+      }
+   }
+}
+
+
+static void r200UpdateScissor( GLcontext *ctx )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+   int left, top, right, bottom;
+
+   if ( !dPriv )
+      return;
+
+   /* Scissor.Y is flipped against the drawable height; x2/y2 end up one past the last pixel. */
+   left   = ctx->Scissor.X;
+   top    = dPriv->h - ctx->Scissor.Y - ctx->Scissor.Height;
+   right  = ctx->Scissor.X + ctx->Scissor.Width - 1;
+   bottom = dPriv->h - ctx->Scissor.Y - 1;
+   rmesa->state.scissor.rect.x1 = left + dPriv->x;
+   rmesa->state.scissor.rect.y1 = top + dPriv->y;
+   rmesa->state.scissor.rect.x2 = right + dPriv->x + 1;
+   rmesa->state.scissor.rect.y2 = bottom + dPriv->y + 1;
+   r200RecalcScissorRects( rmesa );
+}
+
+
+static void r200Scissor( GLcontext *ctx,
+			   GLint x, GLint y, GLsizei w, GLsizei h )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   /* The box is re-read from ctx->Scissor (x/y/w/h unused); nothing happens while scissoring is off. */
+   if ( ctx->Scissor.Enabled ) {
+      R200_FIREVERTICES( rmesa );	/* don't pipeline cliprect changes */
+      r200UpdateScissor( ctx );
+   }
+
+}
+
+
+/* =============================================================
+ * Culling
+ */
+
+static void r200CullFace( GLcontext *ctx, GLenum unused )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   /* Baseline: both faces solid, nothing culled by TCL. */
+   GLuint se_cntl = rmesa->hw.set.cmd[SET_SE_CNTL] |
+      (R200_FFACE_SOLID | R200_BFACE_SOLID);
+   GLuint tcl_ctl = rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &
+      ~(R200_CULL_FRONT | R200_CULL_BACK);
+
+   if ( ctx->Polygon.CullFlag ) {
+      const GLenum face = ctx->Polygon.CullFaceMode;
+
+      /* GL_FRONT_AND_BACK takes both branches; any other
+       * unrecognized mode takes neither. */
+      if ( face == GL_FRONT || face == GL_FRONT_AND_BACK ) {
+	 se_cntl &= ~R200_FFACE_SOLID;
+	 tcl_ctl |= R200_CULL_FRONT;
+      }
+      if ( face == GL_BACK || face == GL_FRONT_AND_BACK ) {
+	 se_cntl &= ~R200_BFACE_SOLID;
+	 tcl_ctl |= R200_CULL_BACK;
+      }
+   }
+
+   /* Each atom is only flagged via R200_STATECHANGE when its
+    * register value really differs from what is stored. */
+   if ( rmesa->hw.set.cmd[SET_SE_CNTL] != se_cntl ) {
+      R200_STATECHANGE(rmesa, set );
+      rmesa->hw.set.cmd[SET_SE_CNTL] = se_cntl;
+   }
+
+   if ( rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] != tcl_ctl ) {
+      R200_STATECHANGE(rmesa, tcl );
+      rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] = tcl_ctl;
+   }
+}
+
+static void r200FrontFace( GLcontext *ctx, GLenum mode )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+
+   /* Clear the winding bits in both the setup and the TCL register first. */
+   R200_STATECHANGE( rmesa, set );
+   rmesa->hw.set.cmd[SET_SE_CNTL] &= ~R200_FFACE_CULL_DIR_MASK;
+   R200_STATECHANGE( rmesa, tcl );
+   rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~R200_CULL_FRONT_IS_CCW;
+
+   /* GL_CW touches only the setup register, GL_CCW both; other values add nothing. */
+   if ( mode == GL_CW ) {
+      rmesa->hw.set.cmd[SET_SE_CNTL] |= R200_FFACE_CULL_CW;
+   }
+   else if ( mode == GL_CCW ) {
+      rmesa->hw.set.cmd[SET_SE_CNTL] |= R200_FFACE_CULL_CCW;
+      rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= R200_CULL_FRONT_IS_CCW;
+   }
+
+}
+
+/* =============================================================
+ * Point state
+ */
+static void r200PointSize( GLcontext *ctx, GLfloat size )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLfloat *ptp_f = (GLfloat *)rmesa->hw.ptp.cmd;
+   GLuint fixed_size = (GLuint)(ctx->Point.Size * 16.0); /* from ctx, not 'size' */
+   R200_STATECHANGE( rmesa, cst );
+   R200_STATECHANGE( rmesa, ptp );
+   rmesa->hw.cst.cmd[CST_RE_POINTSIZE] =
+      (rmesa->hw.cst.cmd[CST_RE_POINTSIZE] & ~0xffff) | fixed_size;
+   /* Size term of the point size calculation; the register value written
+      above is not used while that calculation is active. */
+   ptp_f[PTP_VPORT_SCALE_PTSIZE] = ctx->Point.Size;
+}
+
+static void r200PointParameter( GLcontext *ctx, GLenum pname, const GLfloat *params)
+{
+   /* Copy the point parameter selected by pname from ctx->Point into the
+    * hardware state atoms.  Values come from ctx; 'params' is not read. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLfloat *fcmd = (GLfloat *)rmesa->hw.ptp.cmd;
+
+   switch (pname) {
+   case GL_POINT_SIZE_MIN:
+   /* Can clamp both in tcl and setup - just set both (as does fglrx) */
+      R200_STATECHANGE( rmesa, lin );
+      R200_STATECHANGE( rmesa, ptp );
+      rmesa->hw.lin.cmd[LIN_SE_LINE_WIDTH] &= 0xffff;
+      rmesa->hw.lin.cmd[LIN_SE_LINE_WIDTH] |= (GLuint)(ctx->Point.MinSize * 16.0) << 16;
+      fcmd[PTP_CLAMP_MIN] = ctx->Point.MinSize;
+      break;
+   case GL_POINT_SIZE_MAX:
+      R200_STATECHANGE( rmesa, cst );
+      R200_STATECHANGE( rmesa, ptp );
+      rmesa->hw.cst.cmd[CST_RE_POINTSIZE] &= 0xffff;
+      rmesa->hw.cst.cmd[CST_RE_POINTSIZE] |= (GLuint)(ctx->Point.MaxSize * 16.0) << 16;
+      fcmd[PTP_CLAMP_MAX] = ctx->Point.MaxSize;
+      break;
+   case GL_POINT_DISTANCE_ATTENUATION:
+      R200_STATECHANGE( rmesa, vtx );
+      R200_STATECHANGE( rmesa, spr );
+      R200_STATECHANGE( rmesa, ptp );
+      rmesa->hw.spr.cmd[SPR_POINT_SPRITE_CNTL] &=
+         ~(R200_PS_MULT_MASK | R200_PS_LIN_ATT_ZERO | R200_PS_SE_SEL_STATE);
+      /* can't rely on ctx->Point._Attenuated here and test for NEW_POINT in
+         r200ValidateState looks like overkill */
+      if (ctx->Point.Params[0] != 1.0 ||
+          ctx->Point.Params[1] != 0.0 ||
+          ctx->Point.Params[2] != 0.0 ||
+          (ctx->VertexProgram.Enabled && ctx->VertexProgram.PointSizeEnabled)) {
+         /* all we care for vp would be the ps_se_sel_state setting */
+         fcmd[PTP_ATT_CONST_QUAD] = ctx->Point.Params[2];
+         fcmd[PTP_ATT_CONST_LIN] = ctx->Point.Params[1];
+         fcmd[PTP_ATT_CONST_CON] = ctx->Point.Params[0];
+         rmesa->hw.spr.cmd[SPR_POINT_SPRITE_CNTL] |= R200_PS_MULT_ATTENCONST;
+         if (ctx->Point.Params[1] == 0.0)
+            rmesa->hw.spr.cmd[SPR_POINT_SPRITE_CNTL] |= R200_PS_LIN_ATT_ZERO;
+/* FIXME: setting this here doesn't look quite ok - we only want to do
+          that if we're actually drawing points probably */
+         rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] |= R200_OUTPUT_PT_SIZE;
+         rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |= R200_VTX_POINT_SIZE;
+      }
+      else {
+         rmesa->hw.spr.cmd[SPR_POINT_SPRITE_CNTL] |=
+            R200_PS_SE_SEL_STATE | R200_PS_MULT_CONST;
+         rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] &= ~R200_OUTPUT_PT_SIZE;
+         rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] &= ~R200_VTX_POINT_SIZE;
+      }
+      break;
+   case GL_POINT_FADE_THRESHOLD_SIZE:
+      /* don't support multisampling, so doesn't matter. */
+      break;
+   /* can't do these but don't need them:
+      GL_POINT_SPRITE_R_MODE_NV, GL_POINT_SPRITE_COORD_ORIGIN */
+   default:
+      fprintf(stderr, "bad pname parameter in r200PointParameter\n");
+      return;
+   }
+}
+
+/* =============================================================
+ * Line state
+ */
+static void r200LineWidth( GLcontext *ctx, GLfloat widthf )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLuint width_fixed;
+
+   R200_STATECHANGE( rmesa, lin );
+   R200_STATECHANGE( rmesa, set );
+   /* The low 16 bits take the width in U6.4 format, i.e. ctx->Line._Width
+    * scaled by 16. */
+   width_fixed = (GLuint)(ctx->Line._Width * 16.0);
+   rmesa->hw.lin.cmd[LIN_SE_LINE_WIDTH] =
+      (rmesa->hw.lin.cmd[LIN_SE_LINE_WIDTH] & ~0xffff) | width_fixed;
+   /* Wide-line enable is decided from the 'widthf' argument, not _Width. */
+   if ( widthf > 1.0 )
+      rmesa->hw.set.cmd[SET_SE_CNTL] |=  R200_WIDELINE_ENABLE;
+   else
+      rmesa->hw.set.cmd[SET_SE_CNTL] &= ~R200_WIDELINE_ENABLE;
+}
+
+static void r200LineStipple( GLcontext *ctx, GLint factor, GLushort pattern )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   /* Low 8 bits of the repeat factor go at bit 16, above the pattern. */
+   GLuint repeat = ((GLuint)factor & 0xff) << 16;
+   R200_STATECHANGE( rmesa, lin );
+   rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] = (repeat | ((GLuint)pattern));
+}
+
+
+/* =============================================================
+ * Masks
+ */
+static void r200ColorMask( GLcontext *ctx,
+                           GLboolean r, GLboolean g,
+                           GLboolean b, GLboolean a )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   /* The plane mask value is packed from ctx->Color.ColorMask ... */
+   GLuint planemask = r200PackColor( rmesa->r200Screen->cpp,
+                                     ctx->Color.ColorMask[RCOMP],
+                                     ctx->Color.ColorMask[GCOMP],
+                                     ctx->Color.ColorMask[BCOMP],
+                                     ctx->Color.ColorMask[ACOMP] );
+   GLuint rb3d_cntl = rmesa->hw.ctx.cmd[CTX_RB3D_CNTL];
+   /* ... while masking is enabled whenever any argument channel is off. */
+   if ( r && g && b && a )
+      rb3d_cntl &= ~R200_PLANE_MASK_ENABLE;
+   else
+      rb3d_cntl |= R200_PLANE_MASK_ENABLE;
+   if ( rb3d_cntl != rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] ) {
+      R200_STATECHANGE( rmesa, ctx );
+      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] = rb3d_cntl;
+   }
+   if ( planemask != rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK] ) {
+      R200_STATECHANGE( rmesa, msk );
+      rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK] = planemask;
+   }
+}
+
+
+/* =============================================================
+ * Polygon state
+ */
+
+static void r200PolygonOffset( GLcontext *ctx,
+                               GLfloat factor, GLfloat units )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   /* Each value is initialised as a float and written out through .ui32,
+    * i.e. the registers receive the float's raw 32-bit word (presumably
+    * float_ui32_type is a float/uint32 union -- confirm at its definition).
+    * The constant term is 'units' pre-multiplied by the depth scale.
+    */
+   float_ui32_type bias_factor = { factor };
+   float_ui32_type bias_constant = { units * rmesa->state.depth.scale };
+
+   R200_STATECHANGE( rmesa, zbs );
+   rmesa->hw.zbs.cmd[ZBS_SE_ZBIAS_FACTOR]   = bias_factor.ui32;
+   rmesa->hw.zbs.cmd[ZBS_SE_ZBIAS_CONSTANT] = bias_constant.ui32;
+}
+
+static void r200PolygonStipple( GLcontext *ctx, const GLubyte *mask )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   const GLuint *rows = (const GLuint *) mask;
+   drm_radeon_stipple_t stipple;
+   GLuint dst;
+
+   /* Keep a vertically flipped copy of the 32 pattern words in the context. */
+   for ( dst = 0 ; dst < 32 ; dst++ )
+      rmesa->state.stipple.mask[dst] = rows[31 - dst];
+
+   /* TODO: push this into cmd mechanism
+    */
+   R200_FIREVERTICES( rmesa );
+   LOCK_HARDWARE( rmesa );
+
+   /* Pass the flipped copy to the DRM between LOCK/UNLOCK_HARDWARE.
+    * FIXME: Use window x,y offsets into stipple RAM.
+    */
+   stipple.mask = rmesa->state.stipple.mask;
+   drmCommandWrite( rmesa->dri.fd, DRM_RADEON_STIPPLE,
+                    &stipple, sizeof(stipple) );
+   UNLOCK_HARDWARE( rmesa );
+}
+
+static void r200PolygonMode( GLcontext *ctx, GLenum face, GLenum mode )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLboolean unfilled;
+
+   /* Unfilled triangles generally cannot be done via tcl (some good special
+    * cases work), so hand the DD_TRI_UNFILLED state to TCL_FALLBACK. */
+   unfilled = (ctx->_TriangleCaps & DD_TRI_UNFILLED) != 0;
+   TCL_FALLBACK( ctx, R200_TCL_FALLBACK_UNFILLED, unfilled);
+   if (rmesa->TclFallback) {
+      r200ChooseRenderState( ctx );
+      r200ChooseVertexState( ctx );
+   }
+}
+
+
+/* =============================================================
+ * Rendering attributes
+ *
+ * We really don't want to recalculate all this every time we bind a
+ * texture.  These things shouldn't change all that often, so it makes
+ * sense to break them out of the core texture state update routines.
+ */
+
+/* Examine lighting and texture state to determine if separate specular
+ * should be enabled.
+ */
+static void r200UpdateSpecular( GLcontext *ctx )
+{
+   /* Recompute the colour 0/1 vertex formats and TCL outputs, the lighting
+    * enable, the diffuse/specular combine bit and PP_CNTL's specular enable
+    * from the lighting, colour-sum and fog state in ctx.
+    */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   const GLuint color0_fmt = R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT;
+   const GLuint color1_fmt = R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT;
+   u_int32_t pp_cntl = rmesa->hw.ctx.cmd[CTX_PP_CNTL] & ~R200_SPECULAR_ENABLE;
+
+   R200_STATECHANGE( rmesa, tcl );
+   R200_STATECHANGE( rmesa, vtx );
+
+   /* Reset: both colour formats and colour outputs cleared, lighting off,
+    * diffuse/specular combine on.
+    */
+   rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] &=
+      ~((3<<R200_VTX_COLOR_0_SHIFT) | (3<<R200_VTX_COLOR_1_SHIFT));
+   rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] &=
+      ~(R200_OUTPUT_COLOR_0 | R200_OUTPUT_COLOR_1);
+   rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] &= ~R200_LIGHTING_ENABLE;
+   rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] |= R200_DIFFUSE_SPECULAR_COMBINE;
+
+   /* The colour 0 format is set in every configuration. */
+   rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |= color0_fmt;
+
+   if (ctx->Light.Enabled) {
+      /* Lighting on: output colour 0 and enable lighting. */
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] |= R200_OUTPUT_COLOR_0;
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] |= R200_LIGHTING_ENABLE;
+
+      if (ctx->Light.Model.ColorControl == GL_SEPARATE_SPECULAR_COLOR) {
+         /* Separate specular: colour 1 as well, combine off. */
+         rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |= color1_fmt;
+         rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] |= R200_OUTPUT_COLOR_1;
+         rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] &=
+            ~R200_DIFFUSE_SPECULAR_COMBINE;
+         pp_cntl |= R200_SPECULAR_ENABLE;
+      }
+   }
+   else if (ctx->Fog.ColorSumEnabled ) {
+      /* Unlit colour sum: colour 1 format plus specular enable only. */
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |= color1_fmt;
+      pp_cntl |= R200_SPECULAR_ENABLE;
+   }
+
+   /* With fog enabled, colour 1 is both formatted and output. */
+   if (ctx->Fog.Enabled) {
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |= color1_fmt;
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] |= R200_OUTPUT_COLOR_1;
+   }
+
+   if ( pp_cntl != rmesa->hw.ctx.cmd[CTX_PP_CNTL] ) {
+      R200_STATECHANGE( rmesa, ctx );
+      rmesa->hw.ctx.cmd[CTX_PP_CNTL] = pp_cntl;
+   }
+
+   /* Update vertex/render formats
+    */
+   if (rmesa->TclFallback) {
+      r200ChooseRenderState( ctx );
+      r200ChooseVertexState( ctx );
+   }
+}
+
+
+/* =============================================================
+ * Materials
+ */
+
+
+/* Update on colormaterial, material emissive/ambient,
+ * lightmodel.globalambient
+ */
+static void update_global_ambient( GLcontext *ctx )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   float *ambient = (float *)R200_DB_STATE( glt );
+   const GLuint source_fields = (3 << R200_FRONT_EMISSIVE_SOURCE_SHIFT) |
+                                (3 << R200_FRONT_AMBIENT_SOURCE_SHIFT);
+
+   /* If either the front emissive or front ambient source field is set,
+    * upload the plain model ambient.  Only when both are zero is the front
+    * material folded in (emission copied, ambient terms accumulated).  The
+    * original author believed that case never happens with source_material,
+    * so this function currently does not depend on materials.
+    */
+   if ((rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_1] & source_fields) != 0) {
+      COPY_3V( &ambient[GLT_RED], ctx->Light.Model.Ambient );
+   }
+   else {
+      COPY_3V( &ambient[GLT_RED],
+               ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_EMISSION]);
+      ACC_SCALE_3V( &ambient[GLT_RED],
+                    ctx->Light.Model.Ambient,
+                    ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_AMBIENT]);
+   }
+
+   R200_DB_STATECHANGE(rmesa, &rmesa->hw.glt);
+}
+
+/* Update on change to 
+ *    - light[p].colors
+ *    - light[p].enabled
+ */
+static void update_light_colors( GLcontext *ctx, GLuint p )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   const struct gl_light *light = &ctx->Light.Light[p];
+   float *dst;
+
+   /* Disabled lights are left untouched. */
+   if (!light->Enabled)
+      return;
+
+   dst = (float *)R200_DB_STATE( lit[p] );
+   COPY_4V( &dst[LIT_AMBIENT_RED], light->Ambient );
+   COPY_4V( &dst[LIT_DIFFUSE_RED], light->Diffuse );
+   COPY_4V( &dst[LIT_SPECULAR_RED], light->Specular );
+
+   R200_DB_STATECHANGE( rmesa, &rmesa->hw.lit[p] );
+}
+
+static void r200ColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLuint lm1 = rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_1];
+
+   /* Rebuild all eight source fields; 'face' and 'mode' are not read. */
+   lm1 &= ~((0xf << R200_FRONT_EMISSIVE_SOURCE_SHIFT) |
+            (0xf << R200_FRONT_AMBIENT_SOURCE_SHIFT) |
+            (0xf << R200_FRONT_DIFFUSE_SOURCE_SHIFT) |
+            (0xf << R200_FRONT_SPECULAR_SOURCE_SHIFT) |
+            (0xf << R200_BACK_EMISSIVE_SOURCE_SHIFT) |
+            (0xf << R200_BACK_AMBIENT_SOURCE_SHIFT) |
+            (0xf << R200_BACK_DIFFUSE_SOURCE_SHIFT) |
+            (0xf << R200_BACK_SPECULAR_SOURCE_SHIFT));
+
+   if (ctx->Light.ColorMaterialEnabled) {
+      GLuint tracked = ctx->Light.ColorMaterialBitmask;
+      /* Tracked attributes read vertex colour 0, the others their material. */
+      if (tracked & MAT_BIT_FRONT_EMISSION) {
+         lm1 |= (R200_LM1_SOURCE_VERTEX_COLOR_0 <<
+                 R200_FRONT_EMISSIVE_SOURCE_SHIFT);
+      } else {
+         lm1 |= (R200_LM1_SOURCE_MATERIAL_0 <<
+                 R200_FRONT_EMISSIVE_SOURCE_SHIFT);
+      }
+
+      if (tracked & MAT_BIT_FRONT_AMBIENT) {
+         lm1 |= (R200_LM1_SOURCE_VERTEX_COLOR_0 <<
+                 R200_FRONT_AMBIENT_SOURCE_SHIFT);
+      } else {
+         lm1 |= (R200_LM1_SOURCE_MATERIAL_0 <<
+                 R200_FRONT_AMBIENT_SOURCE_SHIFT);
+      }
+
+      if (tracked & MAT_BIT_FRONT_DIFFUSE) {
+         lm1 |= (R200_LM1_SOURCE_VERTEX_COLOR_0 <<
+                 R200_FRONT_DIFFUSE_SOURCE_SHIFT);
+      } else {
+         lm1 |= (R200_LM1_SOURCE_MATERIAL_0 <<
+                 R200_FRONT_DIFFUSE_SOURCE_SHIFT);
+      }
+
+      if (tracked & MAT_BIT_FRONT_SPECULAR) {
+         lm1 |= (R200_LM1_SOURCE_VERTEX_COLOR_0 <<
+                 R200_FRONT_SPECULAR_SOURCE_SHIFT);
+      } else {
+         lm1 |= (R200_LM1_SOURCE_MATERIAL_0 <<
+                 R200_FRONT_SPECULAR_SOURCE_SHIFT);
+      }
+
+      if (tracked & MAT_BIT_BACK_EMISSION) {
+         lm1 |= (R200_LM1_SOURCE_VERTEX_COLOR_0 <<
+                 R200_BACK_EMISSIVE_SOURCE_SHIFT);
+      } else {
+         lm1 |= (R200_LM1_SOURCE_MATERIAL_1 <<
+                 R200_BACK_EMISSIVE_SOURCE_SHIFT);
+      }
+
+      if (tracked & MAT_BIT_BACK_AMBIENT) {
+         lm1 |= (R200_LM1_SOURCE_VERTEX_COLOR_0 <<
+                 R200_BACK_AMBIENT_SOURCE_SHIFT);
+      } else {
+         lm1 |= (R200_LM1_SOURCE_MATERIAL_1 <<
+                 R200_BACK_AMBIENT_SOURCE_SHIFT);
+      }
+
+      if (tracked & MAT_BIT_BACK_DIFFUSE) {
+         lm1 |= (R200_LM1_SOURCE_VERTEX_COLOR_0 <<
+                 R200_BACK_DIFFUSE_SOURCE_SHIFT);
+      } else {
+         lm1 |= (R200_LM1_SOURCE_MATERIAL_1 <<
+                 R200_BACK_DIFFUSE_SOURCE_SHIFT);
+      }
+
+      if (tracked & MAT_BIT_BACK_SPECULAR) {
+         lm1 |= (R200_LM1_SOURCE_VERTEX_COLOR_0 <<
+                 R200_BACK_SPECULAR_SOURCE_SHIFT);
+      } else {
+         lm1 |= (R200_LM1_SOURCE_MATERIAL_1 <<
+                 R200_BACK_SPECULAR_SOURCE_SHIFT);
+      }
+   }
+   else {
+      /* Default to SOURCE_MATERIAL:
+       */
+      lm1 |=
+         (R200_LM1_SOURCE_MATERIAL_0 << R200_FRONT_EMISSIVE_SOURCE_SHIFT) |
+         (R200_LM1_SOURCE_MATERIAL_0 << R200_FRONT_AMBIENT_SOURCE_SHIFT) |
+         (R200_LM1_SOURCE_MATERIAL_0 << R200_FRONT_DIFFUSE_SOURCE_SHIFT) |
+         (R200_LM1_SOURCE_MATERIAL_0 << R200_FRONT_SPECULAR_SOURCE_SHIFT) |
+         (R200_LM1_SOURCE_MATERIAL_1 << R200_BACK_EMISSIVE_SOURCE_SHIFT) |
+         (R200_LM1_SOURCE_MATERIAL_1 << R200_BACK_AMBIENT_SOURCE_SHIFT) |
+         (R200_LM1_SOURCE_MATERIAL_1 << R200_BACK_DIFFUSE_SOURCE_SHIFT) |
+         (R200_LM1_SOURCE_MATERIAL_1 << R200_BACK_SPECULAR_SOURCE_SHIFT);
+   }
+
+   if (lm1 != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_1]) {
+      R200_STATECHANGE( rmesa, tcl );
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_1] = lm1;
+   }
+}
+
+void r200UpdateMaterial( GLcontext *ctx )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLfloat (*attrib)[4] = ctx->Light.Material.Attrib;
+   GLfloat *front = (GLfloat *)R200_DB_STATE( mtl[0] );
+   GLfloat *back = (GLfloat *)R200_DB_STATE( mtl[1] );
+   GLuint pending = ~0;
+   /* Attributes tracked by color material are skipped, the rest copied. */
+   /* Might be possible and faster to update everything unconditionally? */
+   if (ctx->Light.ColorMaterialEnabled)
+      pending &= ~ctx->Light.ColorMaterialBitmask;
+
+   if (R200_DEBUG & DEBUG_STATE)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+   if (pending & MAT_BIT_FRONT_EMISSION) {
+      front[MTL_EMMISSIVE_RED]   = attrib[MAT_ATTRIB_FRONT_EMISSION][0];
+      front[MTL_EMMISSIVE_GREEN] = attrib[MAT_ATTRIB_FRONT_EMISSION][1];
+      front[MTL_EMMISSIVE_BLUE]  = attrib[MAT_ATTRIB_FRONT_EMISSION][2];
+      front[MTL_EMMISSIVE_ALPHA] = attrib[MAT_ATTRIB_FRONT_EMISSION][3];
+   }
+   if (pending & MAT_BIT_FRONT_AMBIENT) {
+      front[MTL_AMBIENT_RED]     = attrib[MAT_ATTRIB_FRONT_AMBIENT][0];
+      front[MTL_AMBIENT_GREEN]   = attrib[MAT_ATTRIB_FRONT_AMBIENT][1];
+      front[MTL_AMBIENT_BLUE]    = attrib[MAT_ATTRIB_FRONT_AMBIENT][2];
+      front[MTL_AMBIENT_ALPHA]   = attrib[MAT_ATTRIB_FRONT_AMBIENT][3];
+   }
+   if (pending & MAT_BIT_FRONT_DIFFUSE) {
+      front[MTL_DIFFUSE_RED]     = attrib[MAT_ATTRIB_FRONT_DIFFUSE][0];
+      front[MTL_DIFFUSE_GREEN]   = attrib[MAT_ATTRIB_FRONT_DIFFUSE][1];
+      front[MTL_DIFFUSE_BLUE]    = attrib[MAT_ATTRIB_FRONT_DIFFUSE][2];
+      front[MTL_DIFFUSE_ALPHA]   = attrib[MAT_ATTRIB_FRONT_DIFFUSE][3];
+   }
+   if (pending & MAT_BIT_FRONT_SPECULAR) {
+      front[MTL_SPECULAR_RED]    = attrib[MAT_ATTRIB_FRONT_SPECULAR][0];
+      front[MTL_SPECULAR_GREEN]  = attrib[MAT_ATTRIB_FRONT_SPECULAR][1];
+      front[MTL_SPECULAR_BLUE]   = attrib[MAT_ATTRIB_FRONT_SPECULAR][2];
+      front[MTL_SPECULAR_ALPHA]  = attrib[MAT_ATTRIB_FRONT_SPECULAR][3];
+   }
+   if (pending & MAT_BIT_FRONT_SHININESS) {
+      front[MTL_SHININESS]       = attrib[MAT_ATTRIB_FRONT_SHININESS][0];
+   }
+
+   if (pending & MAT_BIT_BACK_EMISSION) {
+      back[MTL_EMMISSIVE_RED]   = attrib[MAT_ATTRIB_BACK_EMISSION][0];
+      back[MTL_EMMISSIVE_GREEN] = attrib[MAT_ATTRIB_BACK_EMISSION][1];
+      back[MTL_EMMISSIVE_BLUE]  = attrib[MAT_ATTRIB_BACK_EMISSION][2];
+      back[MTL_EMMISSIVE_ALPHA] = attrib[MAT_ATTRIB_BACK_EMISSION][3];
+   }
+   if (pending & MAT_BIT_BACK_AMBIENT) {
+      back[MTL_AMBIENT_RED]     = attrib[MAT_ATTRIB_BACK_AMBIENT][0];
+      back[MTL_AMBIENT_GREEN]   = attrib[MAT_ATTRIB_BACK_AMBIENT][1];
+      back[MTL_AMBIENT_BLUE]    = attrib[MAT_ATTRIB_BACK_AMBIENT][2];
+      back[MTL_AMBIENT_ALPHA]   = attrib[MAT_ATTRIB_BACK_AMBIENT][3];
+   }
+   if (pending & MAT_BIT_BACK_DIFFUSE) {
+      back[MTL_DIFFUSE_RED]     = attrib[MAT_ATTRIB_BACK_DIFFUSE][0];
+      back[MTL_DIFFUSE_GREEN]   = attrib[MAT_ATTRIB_BACK_DIFFUSE][1];
+      back[MTL_DIFFUSE_BLUE]    = attrib[MAT_ATTRIB_BACK_DIFFUSE][2];
+      back[MTL_DIFFUSE_ALPHA]   = attrib[MAT_ATTRIB_BACK_DIFFUSE][3];
+   }
+   if (pending & MAT_BIT_BACK_SPECULAR) {
+      back[MTL_SPECULAR_RED]    = attrib[MAT_ATTRIB_BACK_SPECULAR][0];
+      back[MTL_SPECULAR_GREEN]  = attrib[MAT_ATTRIB_BACK_SPECULAR][1];
+      back[MTL_SPECULAR_BLUE]   = attrib[MAT_ATTRIB_BACK_SPECULAR][2];
+      back[MTL_SPECULAR_ALPHA]  = attrib[MAT_ATTRIB_BACK_SPECULAR][3];
+   }
+   if (pending & MAT_BIT_BACK_SHININESS) {
+      back[MTL_SHININESS]       = attrib[MAT_ATTRIB_BACK_SHININESS][0];
+   }
+
+   R200_DB_STATECHANGE( rmesa, &rmesa->hw.mtl[0] );
+   R200_DB_STATECHANGE( rmesa, &rmesa->hw.mtl[1] );
+
+   /* Material changes currently cannot trigger a global ambient change (believed
+    * to be correct), so the call stays disabled: update_global_ambient( ctx ); */
+}
+
+/* _NEW_LIGHT
+ * _NEW_MODELVIEW
+ * _MESA_NEW_NEED_EYE_COORDS
+ *
+ * Uses derived state from mesa:
+ *       _VP_inf_norm
+ *       _h_inf_norm
+ *       _Position
+ *       _NormDirection
+ *       _ModelViewInvScale
+ *       _NeedEyeCoords
+ *       _EyeZDir
+ *
+ * which are calculated in light.c and are correct for the current
+ * lighting space (model or eye), hence dependencies on _NEW_MODELVIEW
+ * and _MESA_NEW_NEED_EYE_COORDS.  
+ */
+static void update_light( GLcontext *ctx )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLuint lm0 = rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0];
+   GLfloat *eye_f;
+   GLint i;
+
+   /* Light in model space unless ctx->_NeedEyeCoords is set.  The register
+    * is compared first so that a no-op does not trigger a state change
+    * (there is no automatic short-circuit for that).
+    */
+   if (ctx->_NeedEyeCoords)
+      lm0 &= ~R200_LIGHT_IN_MODELSPACE;
+   else
+      lm0 |= R200_LIGHT_IN_MODELSPACE;
+
+   if (lm0 != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0]) {
+      R200_STATECHANGE( rmesa, tcl );
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] = lm0;
+   }
+
+   /* Eye Z direction (Z component negated) and modelview inverse scale. */
+   eye_f = (GLfloat *)R200_DB_STATE( eye );
+   eye_f[EYE_X] = ctx->_EyeZDir[0];
+   eye_f[EYE_Y] = ctx->_EyeZDir[1];
+   eye_f[EYE_Z] = - ctx->_EyeZDir[2];
+   eye_f[EYE_RESCALE_FACTOR] = ctx->_ModelViewInvScale;
+   R200_DB_STATECHANGE( rmesa, &rmesa->hw.eye );
+
+   if (!ctx->Light.Enabled)
+      return;
+
+   /* Position and direction of every enabled light. */
+   for (i = 0 ; i < MAX_LIGHTS; i++) {
+      struct gl_light *light = &ctx->Light.Light[i];
+      GLfloat *dst;
+
+      if (!light->Enabled)
+         continue;
+
+      dst = (GLfloat *)R200_DB_STATE( lit[i] );
+      if (light->EyePosition[3] == 0.0) {
+         /* EyePosition w == 0: use the derived _VP_inf_norm/_h_inf_norm. */
+         COPY_3FV( &dst[LIT_POSITION_X], light->_VP_inf_norm );
+         COPY_3FV( &dst[LIT_DIRECTION_X], light->_h_inf_norm );
+         dst[LIT_POSITION_W] = 0;
+         dst[LIT_DIRECTION_W] = 0;
+      } else {
+         /* Otherwise: derived _Position and the negated _NormDirection. */
+         COPY_4V( &dst[LIT_POSITION_X], light->_Position );
+         dst[LIT_DIRECTION_X] = -light->_NormDirection[0];
+         dst[LIT_DIRECTION_Y] = -light->_NormDirection[1];
+         dst[LIT_DIRECTION_Z] = -light->_NormDirection[2];
+         dst[LIT_DIRECTION_W] = 0;
+      }
+
+      R200_DB_STATECHANGE( rmesa, &rmesa->hw.lit[i] );
+   }
+}
+
+static void r200Lightfv( GLcontext *ctx, GLenum light,
+			   GLenum pname, const GLfloat *params )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLint p = light - GL_LIGHT0;
+   struct gl_light *l = &ctx->Light.Light[p];
+   GLfloat *fcmd = (GLfloat *)rmesa->hw.lit[p].cmd;
+   /* Apply one parameter of light p to the lit[p] atom and the tcl flags. */
+
+   switch (pname) {
+   case GL_AMBIENT:		
+   case GL_DIFFUSE:
+   case GL_SPECULAR:
+      update_light_colors( ctx, p );
+      break;
+
+   case GL_SPOT_DIRECTION: 
+      /* picked up in update_light */	
+      break;
+
+   case GL_POSITION: {
+      /* positions picked up in update_light, but can do flag here */	
+      GLuint flag = (p&1)? R200_LIGHT_1_IS_LOCAL : R200_LIGHT_0_IS_LOCAL;
+      GLuint idx = TCL_PER_LIGHT_CTL_0 + p/2; /* one control word per two lights */
+
+      R200_STATECHANGE(rmesa, tcl);
+      if (l->EyePosition[3] != 0.0F)
+	 rmesa->hw.tcl.cmd[idx] |= flag;
+      else
+	 rmesa->hw.tcl.cmd[idx] &= ~flag;
+      break;
+   }
+
+   case GL_SPOT_EXPONENT:
+      R200_STATECHANGE(rmesa, lit[p]);
+      fcmd[LIT_SPOT_EXPONENT] = params[0];
+      break;
+
+   case GL_SPOT_CUTOFF: {
+      GLuint flag = (p&1) ? R200_LIGHT_1_IS_SPOT : R200_LIGHT_0_IS_SPOT;
+      GLuint idx = TCL_PER_LIGHT_CTL_0 + p/2;
+
+      R200_STATECHANGE(rmesa, lit[p]);
+      fcmd[LIT_SPOT_CUTOFF] = l->_CosCutoff;
+
+      R200_STATECHANGE(rmesa, tcl);
+      if (l->SpotCutoff != 180.0F) /* a cutoff of 180 clears the spot flag */
+	 rmesa->hw.tcl.cmd[idx] |= flag;
+      else
+	 rmesa->hw.tcl.cmd[idx] &= ~flag;
+
+      break;
+   }
+
+   case GL_CONSTANT_ATTENUATION:
+      R200_STATECHANGE(rmesa, lit[p]);
+      fcmd[LIT_ATTEN_CONST] = params[0];
+      if ( params[0] == 0.0 )
+	 fcmd[LIT_ATTEN_CONST_INV] = FLT_MAX; /* avoid dividing by zero */
+      else
+	 fcmd[LIT_ATTEN_CONST_INV] = 1.0 / params[0];
+      break;
+   case GL_LINEAR_ATTENUATION:
+      R200_STATECHANGE(rmesa, lit[p]);
+      fcmd[LIT_ATTEN_LINEAR] = params[0];
+      break;
+   case GL_QUADRATIC_ATTENUATION:
+      R200_STATECHANGE(rmesa, lit[p]);
+      fcmd[LIT_ATTEN_QUADRATIC] = params[0];
+      break;
+   default:
+      return; /* any other pname: nothing is updated */
+   }
+
+   /* Set RANGE_ATTEN only when needed */
+   switch (pname) {
+   case GL_POSITION:
+   case GL_CONSTANT_ATTENUATION:
+   case GL_LINEAR_ATTENUATION:
+   case GL_QUADRATIC_ATTENUATION: {
+      GLuint *icmd = (GLuint *)R200_DB_STATE( tcl );
+      GLuint idx = TCL_PER_LIGHT_CTL_0 + p/2;
+      GLuint atten_flag = ( p&1 ) ? R200_LIGHT_1_ENABLE_RANGE_ATTEN
+				  : R200_LIGHT_0_ENABLE_RANGE_ATTEN;
+      GLuint atten_const_flag = ( p&1 ) ? R200_LIGHT_1_CONSTANT_RANGE_ATTEN
+				  : R200_LIGHT_0_CONSTANT_RANGE_ATTEN;
+
+      if ( l->EyePosition[3] == 0.0F ||
+	   ( ( fcmd[LIT_ATTEN_CONST] == 0.0 || fcmd[LIT_ATTEN_CONST] == 1.0 ) &&
+	     fcmd[LIT_ATTEN_QUADRATIC] == 0.0 && fcmd[LIT_ATTEN_LINEAR] == 0.0 ) ) {
+	 /* Disable attenuation */
+	 icmd[idx] &= ~atten_flag;
+      } else {
+	 if ( fcmd[LIT_ATTEN_QUADRATIC] == 0.0 && fcmd[LIT_ATTEN_LINEAR] == 0.0 ) {
+	    /* Enable only constant portion of attenuation calculation */
+	    icmd[idx] |= ( atten_flag | atten_const_flag );
+	 } else {
+	    /* Enable full attenuation calculation */
+	    icmd[idx] &= ~atten_const_flag;
+	    icmd[idx] |= atten_flag;
+	 }
+      }
+
+      R200_DB_STATECHANGE( rmesa, &rmesa->hw.tcl );
+      break;
+   }
+   default:
+     break;
+   }
+}
+
+static void r200UpdateLocalViewer ( GLcontext *ctx )
+{
+   /* R200_LOCAL_VIEWER is needed not only for GL_LIGHT_MODEL_LOCAL_VIEWER
+    * but also for the texgen modes GL_SPHERE_MAP, GL_NORMAL_MAP and
+    * GL_REFLECTION_MAP (fglrx sets it for exactly these modes).  Forcing it
+    * on for texgen means specular highlights may be wrong when lighting is
+    * enabled without the local viewer; this seems rare and subtle.  A TCL
+    * fallback might fix it completely, but it is unclear how to identify
+    * the affected cases or whether fglrx treats them specially.
+    */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLboolean want_local = ctx->Light.Model.LocalViewer ||
+                          (ctx->Texture._GenFlags & TEXGEN_NEED_NORMALS);
+   R200_STATECHANGE( rmesa, tcl );
+   if (want_local)
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] |= R200_LOCAL_VIEWER;
+   else
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] &= ~R200_LOCAL_VIEWER;
+}
+
+static void r200LightModelfv( GLcontext *ctx, GLenum pname,
+                              const GLfloat *param )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+
+   /* Dispatch one light model change to the matching hardware update.
+    * 'param' itself is not read here (the two-side setting comes from
+    * ctx->Light.Model), and pnames not listed below are ignored.
+    */
+
+   if (pname == GL_LIGHT_MODEL_AMBIENT) {
+      update_global_ambient( ctx );
+   }
+   else if (pname == GL_LIGHT_MODEL_LOCAL_VIEWER) {
+      r200UpdateLocalViewer( ctx );
+   }
+   else if (pname == GL_LIGHT_MODEL_COLOR_CONTROL) {
+      r200UpdateSpecular(ctx);
+   }
+   else if (pname == GL_LIGHT_MODEL_TWO_SIDE) {
+      R200_STATECHANGE( rmesa, tcl );
+      if (ctx->Light.Model.TwoSide) {
+         rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] |= R200_LIGHT_TWOSIDE;
+      } else {
+         rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] &= ~(R200_LIGHT_TWOSIDE);
+      }
+
+      /* Re-pick render and vertex state while a TCL fallback is active. */
+      if (rmesa->TclFallback) {
+         r200ChooseRenderState( ctx );
+         r200ChooseVertexState( ctx );
+      }
+   }
+}
+
+static void r200ShadeModel( GLcontext *ctx, GLenum mode )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLuint shade_bits;
+   GLuint se_cntl;
+
+   /* All five shade fields of SE_CNTL are switched together; modes other
+    * than GL_FLAT/GL_SMOOTH leave the register untouched. */
+   if ( mode == GL_FLAT ) {
+      shade_bits = (R200_DIFFUSE_SHADE_FLAT |
+                    R200_ALPHA_SHADE_FLAT |
+                    R200_SPECULAR_SHADE_FLAT |
+                    R200_FOG_SHADE_FLAT |
+                    R200_DISC_FOG_SHADE_FLAT);
+   } else if ( mode == GL_SMOOTH ) {
+      shade_bits = (R200_DIFFUSE_SHADE_GOURAUD |
+                    R200_ALPHA_SHADE_GOURAUD |
+                    R200_SPECULAR_SHADE_GOURAUD |
+                    R200_FOG_SHADE_GOURAUD |
+                    R200_DISC_FOG_SHADE_GOURAUD);
+   } else {
+      return;
+   }
+
+   se_cntl = rmesa->hw.set.cmd[SET_SE_CNTL] &
+      ~(R200_DIFFUSE_SHADE_MASK | R200_ALPHA_SHADE_MASK |
+        R200_SPECULAR_SHADE_MASK | R200_FOG_SHADE_MASK |
+        R200_DISC_FOG_SHADE_MASK);
+   se_cntl |= shade_bits;
+
+   if ( se_cntl != rmesa->hw.set.cmd[SET_SE_CNTL] ) {
+      R200_STATECHANGE( rmesa, set );
+      rmesa->hw.set.cmd[SET_SE_CNTL] = se_cntl;
+   }
+}
+
+
+/* =============================================================
+ * User clip planes
+ */
+
+static void r200ClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLint idx = (GLint) plane - (GLint) GL_CLIP_PLANE0;
+   /* Upload ctx's _ClipUserPlane words as-is; the 'eq' argument is unused. */
+   const GLint *words = (const GLint *)ctx->Transform._ClipUserPlane[idx];
+   R200_STATECHANGE( rmesa, ucp[idx] );
+   rmesa->hw.ucp[idx].cmd[UCP_X] = words[0];
+   rmesa->hw.ucp[idx].cmd[UCP_Y] = words[1];
+   rmesa->hw.ucp[idx].cmd[UCP_Z] = words[2];
+   rmesa->hw.ucp[idx].cmd[UCP_W] = words[3];
+}
+
+static void r200UpdateClipPlanes( GLcontext *ctx )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   const GLint *words;
+   GLuint plane;
+   /* Re-upload every enabled user clip plane; disabled ones are skipped. */
+   for (plane = 0; plane < ctx->Const.MaxClipPlanes; plane++) {
+      if (!(ctx->Transform.ClipPlanesEnabled & (1 << plane)))
+         continue;
+      words = (const GLint *)ctx->Transform._ClipUserPlane[plane];
+      R200_STATECHANGE( rmesa, ucp[plane] );
+      rmesa->hw.ucp[plane].cmd[UCP_X] = words[0];
+      rmesa->hw.ucp[plane].cmd[UCP_Y] = words[1];
+      rmesa->hw.ucp[plane].cmd[UCP_Z] = words[2];
+      rmesa->hw.ucp[plane].cmd[UCP_W] = words[3];
+   }
+}
+
+
+/* =============================================================
+ * Stencil
+ */
+
+static void
+r200StencilFuncSeparate( GLcontext *ctx, GLenum face, GLenum func,
+                         GLint ref, GLuint mask )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLuint refmask = (((ctx->Stencil.Ref[0] & 0xff) << R200_STENCIL_REF_SHIFT) |
+                     ((ctx->Stencil.ValueMask[0] & 0xff) << R200_STENCIL_MASK_SHIFT));
+   GLuint test = 0;
+   /* All state is read from index 0 of ctx->Stencil; the face, func, ref and
+    * mask arguments are unused.  Unknown functions set no test bits. */
+   switch ( ctx->Stencil.Function[0] ) {
+   case GL_NEVER:
+      test = R200_STENCIL_TEST_NEVER;
+      break;
+   case GL_LESS:
+      test = R200_STENCIL_TEST_LESS;
+      break;
+   case GL_EQUAL:
+      test = R200_STENCIL_TEST_EQUAL;
+      break;
+   case GL_LEQUAL:
+      test = R200_STENCIL_TEST_LEQUAL;
+      break;
+   case GL_GREATER:
+      test = R200_STENCIL_TEST_GREATER;
+      break;
+   case GL_NOTEQUAL:
+      test = R200_STENCIL_TEST_NEQUAL;
+      break;
+   case GL_GEQUAL:
+      test = R200_STENCIL_TEST_GEQUAL;
+      break;
+   case GL_ALWAYS:
+      test = R200_STENCIL_TEST_ALWAYS;
+      break;
+   }
+   R200_STATECHANGE( rmesa, ctx );
+   R200_STATECHANGE( rmesa, msk );
+   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] &= ~R200_STENCIL_TEST_MASK;
+   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= test;
+   rmesa->hw.msk.cmd[MSK_RB3D_STENCILREFMASK] &= ~(R200_STENCIL_REF_MASK|
+                                                   R200_STENCIL_VALUE_MASK);
+   rmesa->hw.msk.cmd[MSK_RB3D_STENCILREFMASK] |= refmask;
+}
+
+static void
+r200StencilMaskSeparate( GLcontext *ctx, GLenum face, GLuint mask )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   /* Only WriteMask[0] is programmed; 'face' and 'mask' are not used. */
+   GLuint wmask = (ctx->Stencil.WriteMask[0] & 0xff) << R200_STENCIL_WRITEMASK_SHIFT;
+   R200_STATECHANGE( rmesa, msk );
+   rmesa->hw.msk.cmd[MSK_RB3D_STENCILREFMASK] &= ~R200_STENCIL_WRITE_MASK;
+   rmesa->hw.msk.cmd[MSK_RB3D_STENCILREFMASK] |= wmask;
+}
+
+static void
+r200StencilOpSeparate( GLcontext *ctx, GLenum face, GLenum fail,
+                       GLenum zfail, GLenum zpass )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLuint fail_op = 0, zfail_op = 0, zpass_op = 0;
+   /* The three ops are read from index 0 of ctx->Stencil; the arguments are
+    * unused.  An unrecognised op contributes no bits to its field. */
+   switch ( ctx->Stencil.FailFunc[0] ) {
+   case GL_KEEP:
+      fail_op = R200_STENCIL_FAIL_KEEP;
+      break;
+   case GL_ZERO:
+      fail_op = R200_STENCIL_FAIL_ZERO;
+      break;
+   case GL_REPLACE:
+      fail_op = R200_STENCIL_FAIL_REPLACE;
+      break;
+   case GL_INCR:
+      fail_op = R200_STENCIL_FAIL_INC;
+      break;
+   case GL_DECR:
+      fail_op = R200_STENCIL_FAIL_DEC;
+      break;
+   case GL_INCR_WRAP_EXT:
+      fail_op = R200_STENCIL_FAIL_INC_WRAP;
+      break;
+   case GL_DECR_WRAP_EXT:
+      fail_op = R200_STENCIL_FAIL_DEC_WRAP;
+      break;
+   case GL_INVERT:
+      fail_op = R200_STENCIL_FAIL_INVERT;
+      break;
+   }
+   switch ( ctx->Stencil.ZFailFunc[0] ) {
+   case GL_KEEP:
+      zfail_op = R200_STENCIL_ZFAIL_KEEP;
+      break;
+   case GL_ZERO:
+      zfail_op = R200_STENCIL_ZFAIL_ZERO;
+      break;
+   case GL_REPLACE:
+      zfail_op = R200_STENCIL_ZFAIL_REPLACE;
+      break;
+   case GL_INCR:
+      zfail_op = R200_STENCIL_ZFAIL_INC;
+      break;
+   case GL_DECR:
+      zfail_op = R200_STENCIL_ZFAIL_DEC;
+      break;
+   case GL_INCR_WRAP_EXT:
+      zfail_op = R200_STENCIL_ZFAIL_INC_WRAP;
+      break;
+   case GL_DECR_WRAP_EXT:
+      zfail_op = R200_STENCIL_ZFAIL_DEC_WRAP;
+      break;
+   case GL_INVERT:
+      zfail_op = R200_STENCIL_ZFAIL_INVERT;
+      break;
+   }
+   switch ( ctx->Stencil.ZPassFunc[0] ) {
+   case GL_KEEP:
+      zpass_op = R200_STENCIL_ZPASS_KEEP;
+      break;
+   case GL_ZERO:
+      zpass_op = R200_STENCIL_ZPASS_ZERO;
+      break;
+   case GL_REPLACE:
+      zpass_op = R200_STENCIL_ZPASS_REPLACE;
+      break;
+   case GL_INCR:
+      zpass_op = R200_STENCIL_ZPASS_INC;
+      break;
+   case GL_DECR:
+      zpass_op = R200_STENCIL_ZPASS_DEC;
+      break;
+   case GL_INCR_WRAP_EXT:
+      zpass_op = R200_STENCIL_ZPASS_INC_WRAP;
+      break;
+   case GL_DECR_WRAP_EXT:
+      zpass_op = R200_STENCIL_ZPASS_DEC_WRAP;
+      break;
+   case GL_INVERT:
+      zpass_op = R200_STENCIL_ZPASS_INVERT;
+      break;
+   }
+   R200_STATECHANGE( rmesa, ctx );
+   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] &= ~(R200_STENCIL_FAIL_MASK |
+                                                 R200_STENCIL_ZFAIL_MASK |
+                                                 R200_STENCIL_ZPASS_MASK);
+   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= fail_op | zfail_op | zpass_op;
+}
+
+static void r200ClearStencil( GLcontext *ctx, GLint s )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   /* Cached clear word = 8-bit clear value | all-ones mask field | 8-bit write mask ('s' is not read). */
+   const GLuint clear_bits = (GLuint) (ctx->Stencil.Clear & 0xff);
+   const GLuint mask_bits  = (0xff << R200_STENCIL_MASK_SHIFT);
+   const GLuint write_bits = ((ctx->Stencil.WriteMask[0] & 0xff) << R200_STENCIL_WRITEMASK_SHIFT);
+   rmesa->state.stencil.clear = clear_bits | mask_bits | write_bits;
+}
+
+
+/* =============================================================
+ * Window position and viewport transformation
+ */
+
+/*
+ * Sub-pixel bias added to the viewport X/Y offsets to correctly position primitives:
+ */
+#define SUBPIXEL_X 0.125
+#define SUBPIXEL_Y 0.125
+
+
+/**
+ * Called when window size or position changes or viewport or depth range
+ * state is changed.  We update the hardware viewport state here.
+ */
+void r200UpdateWindow( GLcontext *ctx )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+   GLfloat xoffset = (GLfloat)dPriv->x;
+   GLfloat yoffset = (GLfloat)dPriv->y + dPriv->h;   /* drawable y plus its height */
+   const GLfloat *v = ctx->Viewport._WindowMap.m;
+   /* X/Y translations add the drawable offset and a sub-pixel bias; both Y terms are negated. */
+   float_ui32_type sx = { v[MAT_SX] };
+   float_ui32_type tx = { v[MAT_TX] + xoffset + SUBPIXEL_X };
+   float_ui32_type sy = { - v[MAT_SY] };
+   float_ui32_type ty = { (- v[MAT_TY]) + yoffset + SUBPIXEL_Y };
+   float_ui32_type sz = { v[MAT_SZ] * rmesa->state.depth.scale };   /* Z terms scaled by depth.scale */
+   float_ui32_type tz = { v[MAT_TZ] * rmesa->state.depth.scale };
+
+   R200_FIREVERTICES( rmesa );
+   R200_STATECHANGE( rmesa, vpt );
+   /* The vpt atom stores each value through the .ui32 member of float_ui32_type. */
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_XSCALE]  = sx.ui32;
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_XOFFSET] = tx.ui32;
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_YSCALE]  = sy.ui32;
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_YOFFSET] = ty.ui32;
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_ZSCALE]  = sz.ui32;
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_ZOFFSET] = tz.ui32;
+}
+
+
+
+static void r200Viewport( GLcontext *ctx, GLint x, GLint y,
+			    GLsizei width, GLsizei height )
+{  /* x, y, width and height are not read; r200UpdateWindow re-derives everything from ctx. */
+   /* Don't pipeline viewport changes: they conflict with the window
+    * offset folded into the same registers by r200UpdateWindow.  Could
+    * apply deltas to rescue pipelined viewport values, or keep the originals.
+    */
+   r200UpdateWindow( ctx );
+}
+
+static void r200DepthRange( GLcontext *ctx, GLclampd nearval,
+			      GLclampd farval )
+{  /* nearval/farval are not read; the viewport registers are recomputed from ctx. */
+   r200UpdateWindow( ctx );
+}
+
+void r200UpdateViewportOffset( GLcontext *ctx )
+{  /* Refresh the X/Y viewport offsets and stipple offsets from the drawable position, then the scissor. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+   GLfloat xoffset = (GLfloat)dPriv->x;
+   GLfloat yoffset = (GLfloat)dPriv->y + dPriv->h;
+   const GLfloat *v = ctx->Viewport._WindowMap.m;
+
+   float_ui32_type tx;
+   float_ui32_type ty;
+   /* Same offset formulas as r200UpdateWindow. */
+   tx.f = v[MAT_TX] + xoffset + SUBPIXEL_X;
+   ty.f = (- v[MAT_TY]) + yoffset + SUBPIXEL_Y;
+   /* Only touch the state atoms when either offset word actually differs. */
+   if ( rmesa->hw.vpt.cmd[VPT_SE_VPORT_XOFFSET] != tx.ui32 ||
+	rmesa->hw.vpt.cmd[VPT_SE_VPORT_YOFFSET] != ty.ui32 )
+   {
+      /* Note: this should also modify whatever data the context reset
+       * code uses...
+       */
+      R200_STATECHANGE( rmesa, vpt );
+      rmesa->hw.vpt.cmd[VPT_SE_VPORT_XOFFSET] = tx.ui32;
+      rmesa->hw.vpt.cmd[VPT_SE_VPORT_YOFFSET] = ty.ui32;
+
+      /* update polygon stipple x/y screen offset */
+      {
+         GLuint stx, sty;
+         GLuint m = rmesa->hw.msc.cmd[MSC_RE_MISC];
+
+         m &= ~(R200_STIPPLE_X_OFFSET_MASK |
+                R200_STIPPLE_Y_OFFSET_MASK);
+
+         /* add magic offsets, then invert */
+         stx = 31 - ((rmesa->dri.drawable->x - 1) & R200_STIPPLE_COORD_MASK);
+         sty = 31 - ((rmesa->dri.drawable->y + rmesa->dri.drawable->h - 1)
+                     & R200_STIPPLE_COORD_MASK);
+
+         m |= ((stx << R200_STIPPLE_X_OFFSET_SHIFT) |
+               (sty << R200_STIPPLE_Y_OFFSET_SHIFT));
+
+         if ( rmesa->hw.msc.cmd[MSC_RE_MISC] != m ) {   /* skip the msc update if unchanged */
+            R200_STATECHANGE( rmesa, msc );
+	    rmesa->hw.msc.cmd[MSC_RE_MISC] = m;
+         }
+      }
+   }
+   /* Called unconditionally, even when the offsets did not change. */
+   r200UpdateScissor( ctx );
+}
+
+
+
+/* =============================================================
+ * Miscellaneous
+ */
+
+static void r200ClearColor( GLcontext *ctx, const GLfloat c[4] )
+{  /* Convert the float clear color to bytes and cache it packed for the screen's cpp. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLubyte color[4];
+   CLAMPED_FLOAT_TO_UBYTE(color[0], c[0]);
+   CLAMPED_FLOAT_TO_UBYTE(color[1], c[1]);
+   CLAMPED_FLOAT_TO_UBYTE(color[2], c[2]);
+   CLAMPED_FLOAT_TO_UBYTE(color[3], c[3]);
+   rmesa->state.color.clear = r200PackColor( rmesa->r200Screen->cpp,
+                                             color[0], color[1],
+                                             color[2], color[3] );
+}
+
+
+static void r200RenderMode( GLcontext *ctx, GLenum mode )
+{  /* Sets the RENDER_MODE fallback flag for any mode other than GL_RENDER, clears it otherwise. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   FALLBACK( rmesa, R200_FALLBACK_RENDER_MODE, (mode != GL_RENDER) );
+}
+
+
+static GLuint r200_rop_tab[] = {   /* 16 entries, indexed by (opcode - GL_CLEAR) in r200LogicOpCode */
+   R200_ROP_CLEAR,
+   R200_ROP_AND,
+   R200_ROP_AND_REVERSE,
+   R200_ROP_COPY,
+   R200_ROP_AND_INVERTED,
+   R200_ROP_NOOP,
+   R200_ROP_XOR,
+   R200_ROP_OR,
+   R200_ROP_NOR,
+   R200_ROP_EQUIV,
+   R200_ROP_INVERT,
+   R200_ROP_OR_REVERSE,
+   R200_ROP_COPY_INVERTED,
+   R200_ROP_OR_INVERTED,
+   R200_ROP_NAND,
+   R200_ROP_SET,
+};
+
+static void r200LogicOpCode( GLcontext *ctx, GLenum opcode )
+{  /* Translate a GL logic-op enum to its ROP word via r200_rop_tab and store it in the msk atom. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLuint rop = (GLuint)opcode - GL_CLEAR;   /* table index; unsigned, so opcodes below GL_CLEAR wrap high */
+
+   ASSERT( rop < 16 );   /* r200_rop_tab has 16 entries */
+
+   R200_STATECHANGE( rmesa, msk );
+   rmesa->hw.msk.cmd[MSK_RB3D_ROPCNTL] = r200_rop_tab[rop];
+}
+
+
+/*
+ * Select the cliprect list matching the current draw buffer, keep the
+ * Mesa framebuffer sizes in sync with the drawables, refresh scissors.
+ */
+void r200SetCliprects( r200ContextPtr rmesa )
+{
+   __DRIdrawablePrivate *const drawable = rmesa->dri.drawable;
+   __DRIdrawablePrivate *const readable = rmesa->dri.readable;
+   GLframebuffer *const draw_fb = (GLframebuffer*) drawable->driverPrivate;
+   GLframebuffer *const read_fb = (GLframebuffer*) readable->driverPrivate;
+   /* The dedicated back cliprects are used only when drawing solely to
+    * the back buffer, some exist, and we are not page flipping (can't
+    * ignore 2d windows if we are page flipping).
+    */
+   const GLboolean use_back_rects =
+      draw_fb->_ColorDrawBufferMask[0] == BUFFER_BIT_BACK_LEFT &&
+      drawable->numBackClipRects != 0 &&
+      !rmesa->doPageFlip;
+
+   if (use_back_rects) {
+      rmesa->numClipRects = drawable->numBackClipRects;
+      rmesa->pClipRects = drawable->pBackClipRects;
+   }
+   else {
+      /* front buffer, none, multiple buffers, or no usable back rects */
+      rmesa->numClipRects = drawable->numClipRects;
+      rmesa->pClipRects = drawable->pClipRects;
+   }
+
+   if (draw_fb->Width != drawable->w || draw_fb->Height != drawable->h) {
+      _mesa_resize_framebuffer(rmesa->glCtx, draw_fb,
+			       drawable->w, drawable->h);
+      draw_fb->Initialized = GL_TRUE;
+   }
+
+   /* A distinct read drawable gets the same size check. */
+   if (drawable != readable &&
+       (read_fb->Width != readable->w || read_fb->Height != readable->h)) {
+      _mesa_resize_framebuffer(rmesa->glCtx, read_fb,
+			       readable->w, readable->h);
+      read_fb->Initialized = GL_TRUE;
+   }
+
+   if (rmesa->state.scissor.enabled)
+      r200RecalcScissorRects( rmesa );
+
+   /* Record the drawable stamp these settings were taken from. */
+   rmesa->lastStamp = drawable->lastStamp;
+}
+
+
+static void r200DrawBuffer( GLcontext *ctx, GLenum mode )
+{  /* 'mode' is only printed for debugging; the decision uses ctx->DrawBuffer->_ColorDrawBufferMask[0]. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+
+   if (R200_DEBUG & DEBUG_DRI)
+      fprintf(stderr, "%s %s\n", __FUNCTION__,
+	      _mesa_lookup_enum_by_nr( mode ));
+
+   R200_FIREVERTICES(rmesa);	/* don't pipeline cliprect changes */
+
+   /*
+    * _ColorDrawBufferMask is easier to cope with than <mode>.
+    * Check for software fallback, update cliprects.
+    */
+   switch ( ctx->DrawBuffer->_ColorDrawBufferMask[0] ) {
+   case BUFFER_BIT_FRONT_LEFT:
+   case BUFFER_BIT_BACK_LEFT:
+      FALLBACK( rmesa, R200_FALLBACK_DRAW_BUFFER, GL_FALSE );   /* exactly one of front/back: no fallback */
+      break;
+   default:
+      /* 0 (GL_NONE) buffers or multiple color drawing buffers */
+      FALLBACK( rmesa, R200_FALLBACK_DRAW_BUFFER, GL_TRUE );
+      return;   /* cliprects are left untouched in the fallback case */
+   }
+
+   r200SetCliprects( rmesa );
+
+   /* We'll set the drawing engine's offset/pitch parameters later
+    * when we update other state.
+    */
+}
+
+
+static void r200ReadBuffer( GLcontext *ctx, GLenum mode )
+{  /* Intentionally empty: neither argument is used. */
+   /* nothing, until we implement h/w glRead/CopyPixels or CopyTexImage */
+}
+
+/* =============================================================
+ * State enable/disable
+ */
+
+static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
+{  /* Enable hook: apply enabling (state != 0) or disabling of 'cap' to the hw state atoms. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLuint p, flag;
+
+   if ( R200_DEBUG & DEBUG_STATE )
+      fprintf( stderr, "%s( %s = %s )\n", __FUNCTION__,
+	       _mesa_lookup_enum_by_nr( cap ),
+	       state ? "GL_TRUE" : "GL_FALSE" );
+
+   switch ( cap ) {   /* caps without a case fall to 'default' and are ignored */
+      /* Fast track this one...
+       */
+   case GL_TEXTURE_1D:
+   case GL_TEXTURE_2D:
+   case GL_TEXTURE_3D:
+      break;
+
+   case GL_ALPHA_TEST:
+      R200_STATECHANGE( rmesa, ctx );
+      if (state) {
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= R200_ALPHA_TEST_ENABLE;
+      } else {
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~R200_ALPHA_TEST_ENABLE;
+      }
+      break;
+
+   case GL_BLEND:
+   case GL_COLOR_LOGIC_OP:
+      r200_set_blend_state( ctx );
+      break;
+
+   case GL_CLIP_PLANE0:
+   case GL_CLIP_PLANE1:
+   case GL_CLIP_PLANE2:
+   case GL_CLIP_PLANE3:
+   case GL_CLIP_PLANE4:
+   case GL_CLIP_PLANE5:
+      p = cap-GL_CLIP_PLANE0;   /* plane index 0..5 */
+      R200_STATECHANGE( rmesa, tcl );
+      if (state) {
+	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= (R200_UCP_ENABLE_0<<p);
+	 r200ClipPlane( ctx, cap, NULL );
+      }
+      else {
+	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~(R200_UCP_ENABLE_0<<p);
+      }
+      break;
+
+   case GL_COLOR_MATERIAL:
+      r200ColorMaterial( ctx, 0, 0 );
+      r200UpdateMaterial( ctx );
+      break;
+
+   case GL_CULL_FACE:
+      r200CullFace( ctx, 0 );
+      break;
+
+   case GL_DEPTH_TEST:
+      R200_STATECHANGE(rmesa, ctx );
+      if ( state ) {
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  R200_Z_ENABLE;
+      } else {
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~R200_Z_ENABLE;
+      }
+      break;
+
+   case GL_DITHER:   /* dither and roundEnable bits are toggled in opposition */
+      R200_STATECHANGE(rmesa, ctx );
+      if ( state ) {
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  R200_DITHER_ENABLE;
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~rmesa->state.color.roundEnable;
+      } else {
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~R200_DITHER_ENABLE;
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  rmesa->state.color.roundEnable;
+      }
+      break;
+
+   case GL_FOG:
+      R200_STATECHANGE(rmesa, ctx );
+      if ( state ) {
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= R200_FOG_ENABLE;
+	 r200Fogfv( ctx, GL_FOG_MODE, NULL );
+      } else {
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~R200_FOG_ENABLE;
+	 R200_STATECHANGE(rmesa, tcl);
+	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~R200_TCL_FOG_MASK;
+      }
+      r200UpdateSpecular( ctx ); /* for PK_SPEC */
+      if (rmesa->TclFallback)
+	 r200ChooseVertexState( ctx );
+      _mesa_allow_light_in_model( ctx, !state );
+      break;
+
+   case GL_LIGHT0:
+   case GL_LIGHT1:
+   case GL_LIGHT2:
+   case GL_LIGHT3:
+   case GL_LIGHT4:
+   case GL_LIGHT5:
+   case GL_LIGHT6:
+   case GL_LIGHT7:
+      R200_STATECHANGE(rmesa, tcl);
+      p = cap - GL_LIGHT0;
+      if (p&1)
+	 flag = (R200_LIGHT_1_ENABLE |
+		 R200_LIGHT_1_ENABLE_AMBIENT |
+		 R200_LIGHT_1_ENABLE_SPECULAR);
+      else
+	 flag = (R200_LIGHT_0_ENABLE |
+		 R200_LIGHT_0_ENABLE_AMBIENT |
+		 R200_LIGHT_0_ENABLE_SPECULAR);
+
+      if (state)
+	 rmesa->hw.tcl.cmd[p/2 + TCL_PER_LIGHT_CTL_0] |= flag;
+      else
+	 rmesa->hw.tcl.cmd[p/2 + TCL_PER_LIGHT_CTL_0] &= ~flag;
+
+      /* Lights are packed two per TCL_PER_LIGHT_CTL word (index p/2); odd
+       * lights use the LIGHT_1 flags.  Then refresh this light's colors. */
+      update_light_colors( ctx, p );
+      break;
+
+   case GL_LIGHTING:
+      r200UpdateSpecular(ctx);
+      /* for reflection map fixup - might set recheck_texgen for all units too */
+      rmesa->NewGLState |= _NEW_TEXTURE;
+      break;
+
+   case GL_LINE_SMOOTH:
+      R200_STATECHANGE( rmesa, ctx );
+      if ( state ) {
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] |=  R200_ANTI_ALIAS_LINE;
+      } else {
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~R200_ANTI_ALIAS_LINE;
+      }
+      break;
+
+   case GL_LINE_STIPPLE:
+      R200_STATECHANGE( rmesa, set );
+      if ( state ) {
+	 rmesa->hw.set.cmd[SET_RE_CNTL] |=  R200_PATTERN_ENABLE;
+      } else {
+	 rmesa->hw.set.cmd[SET_RE_CNTL] &= ~R200_PATTERN_ENABLE;
+      }
+      break;
+
+   case GL_NORMALIZE:
+      R200_STATECHANGE( rmesa, tcl );
+      if ( state ) {
+	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] |=  R200_NORMALIZE_NORMALS;
+      } else {
+	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] &= ~R200_NORMALIZE_NORMALS;
+      }
+      break;
+
+      /* Pointsize registers on r200 only work for point sprites, and point smooth
+       * doesn't work for point sprites (and isn't needed for 1.0 sized aa points).
+       * In any case, setting pointmin == pointsizemax == 1.0 for aa points
+       * is enough to satisfy conform.
+       */
+   case GL_POINT_SMOOTH:
+      break;
+
+      /* These don't really do anything, as we don't use the 3vtx
+       * primitives yet.
+       */
+#if 0
+   case GL_POLYGON_OFFSET_POINT:
+      R200_STATECHANGE( rmesa, set );
+      if ( state ) {
+	 rmesa->hw.set.cmd[SET_SE_CNTL] |=  R200_ZBIAS_ENABLE_POINT;
+      } else {
+	 rmesa->hw.set.cmd[SET_SE_CNTL] &= ~R200_ZBIAS_ENABLE_POINT;
+      }
+      break;
+
+   case GL_POLYGON_OFFSET_LINE:
+      R200_STATECHANGE( rmesa, set );
+      if ( state ) {
+	 rmesa->hw.set.cmd[SET_SE_CNTL] |=  R200_ZBIAS_ENABLE_LINE;
+      } else {
+	 rmesa->hw.set.cmd[SET_SE_CNTL] &= ~R200_ZBIAS_ENABLE_LINE;
+      }
+      break;
+#endif
+
+   case GL_POINT_SPRITE_ARB:
+      R200_STATECHANGE( rmesa, spr );
+      if ( state ) {
+	 int i;
+	 for (i = 0; i < 6; i++) {   /* one GEN_TEX bit per texture unit 0..5 */
+	    rmesa->hw.spr.cmd[SPR_POINT_SPRITE_CNTL] |=
+		ctx->Point.CoordReplace[i] << (R200_PS_GEN_TEX_0_SHIFT + i);
+	 }
+      } else {
+	 rmesa->hw.spr.cmd[SPR_POINT_SPRITE_CNTL] &= ~R200_PS_GEN_TEX_MASK;
+      }
+      break;
+
+   case GL_POLYGON_OFFSET_FILL:
+      R200_STATECHANGE( rmesa, set );
+      if ( state ) {
+	 rmesa->hw.set.cmd[SET_SE_CNTL] |=  R200_ZBIAS_ENABLE_TRI;
+      } else {
+	 rmesa->hw.set.cmd[SET_SE_CNTL] &= ~R200_ZBIAS_ENABLE_TRI;
+      }
+      break;
+
+   case GL_POLYGON_SMOOTH:
+      R200_STATECHANGE( rmesa, ctx );
+      if ( state ) {
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] |=  R200_ANTI_ALIAS_POLY;
+      } else {
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~R200_ANTI_ALIAS_POLY;
+      }
+      break;
+
+   case GL_POLYGON_STIPPLE:
+      R200_STATECHANGE(rmesa, set );
+      if ( state ) {
+	 rmesa->hw.set.cmd[SET_RE_CNTL] |=  R200_STIPPLE_ENABLE;
+      } else {
+	 rmesa->hw.set.cmd[SET_RE_CNTL] &= ~R200_STIPPLE_ENABLE;
+      }
+      break;
+
+   case GL_RESCALE_NORMAL_EXT: {
+      GLboolean tmp = ctx->_NeedEyeCoords ? state : !state;   /* same rule as r200LightingSpaceChange */
+      R200_STATECHANGE( rmesa, tcl );
+      if ( tmp ) {
+	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] |=  R200_RESCALE_NORMALS;
+      } else {
+	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] &= ~R200_RESCALE_NORMALS;
+      }
+      break;
+   }
+
+   case GL_SCISSOR_TEST:
+      R200_FIREVERTICES( rmesa );
+      rmesa->state.scissor.enabled = state;
+      r200UpdateScissor( ctx );
+      break;
+
+   case GL_STENCIL_TEST:
+      if ( rmesa->state.stencil.hwBuffer ) {
+	 R200_STATECHANGE( rmesa, ctx );
+	 if ( state ) {
+	    rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  R200_STENCIL_ENABLE;
+	 } else {
+	    rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~R200_STENCIL_ENABLE;
+	 }
+      } else {
+	 FALLBACK( rmesa, R200_FALLBACK_STENCIL, state );   /* no hw stencil buffer */
+      }
+      break;
+
+   case GL_TEXTURE_GEN_Q:
+   case GL_TEXTURE_GEN_R:
+   case GL_TEXTURE_GEN_S:
+   case GL_TEXTURE_GEN_T:
+      /* Picked up in r200UpdateTextureState.
+       */
+      rmesa->recheck_texgen[ctx->Texture.CurrentUnit] = GL_TRUE;
+      break;
+
+   case GL_COLOR_SUM_EXT:
+      r200UpdateSpecular ( ctx );
+      break;
+
+   case GL_VERTEX_PROGRAM_ARB:
+      if (!state) {
+	 GLuint i;
+	 rmesa->curr_vp_hw = NULL;
+	 R200_STATECHANGE( rmesa, vap );
+	 rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] &= ~R200_VAP_PROG_VTX_SHADER_ENABLE;
+	 /* mark all tcl atoms (tcl vector state got overwritten) dirty
+	    not sure about tcl scalar state - we need at least grd
+	    with vert progs too.
+	    ucp looks like it doesn't get overwritten (may even work
+	    with vp for pos-invariant progs if we're lucky) */
+	 R200_STATECHANGE( rmesa, mtl[0] );
+	 R200_STATECHANGE( rmesa, mtl[1] );
+	 R200_STATECHANGE( rmesa, fog );
+	 R200_STATECHANGE( rmesa, glt );
+	 R200_STATECHANGE( rmesa, eye );
+	 for (i = R200_MTX_MV; i <= R200_MTX_TEX5; i++) {
+	    R200_STATECHANGE( rmesa, mat[i] );
+	 }
+	 for (i = 0 ; i < 8; i++) {
+	    R200_STATECHANGE( rmesa, lit[i] );
+	 }
+	 R200_STATECHANGE( rmesa, tcl );
+	 for (i = 0; i < ctx->Const.MaxClipPlanes; i++) {   /* valid planes are 0..MaxClipPlanes-1 */
+	    if (ctx->Transform.ClipPlanesEnabled & (1 << i)) {
+	       rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= (R200_UCP_ENABLE_0 << i);
+	    }
+/*	    else {
+	       rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~(R200_UCP_ENABLE_0 << i);
+	    }*/
+	 }
+	 /* ugly. Need to call everything which might change compsel. */
+	 r200UpdateSpecular( ctx );
+#if 0
+	/* shouldn't be necessary, as it's picked up anyway in r200ValidateState (_NEW_PROGRAM),
+	   but without it doom3 locks up at always the same places. Why? */
+	/* FIXME: This can (and should) be replaced by a call to the TCL_STATE_FLUSH reg before
+	   accessing VAP_SE_VAP_CNTL. Requires drm changes (done). Remove after some time... */
+	 r200UpdateTextureState( ctx );
+	 /* if we call r200UpdateTextureState we need the code below because we are calling it with
+	    non-current derived enabled values which may revert the state atoms for frag progs even when
+	    they already got disabled... ugh
+	    Should really figure out why we need to call r200UpdateTextureState in the first place */
+	 GLuint unit;
+	 for (unit = 0; unit < R200_MAX_TEXTURE_UNITS; unit++) {
+	    R200_STATECHANGE( rmesa, pix[unit] );
+	    R200_STATECHANGE( rmesa, tex[unit] );
+	    rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] &=
+		~(R200_TXFORMAT_ST_ROUTE_MASK | R200_TXFORMAT_LOOKUP_DISABLE);
+	    rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] |= unit << R200_TXFORMAT_ST_ROUTE_SHIFT;
+	    /* need to guard this with drmSupportsFragmentShader? Should never get here if
+	       we don't announce ATI_fs, right? */
+	    rmesa->hw.tex[unit].cmd[TEX_PP_TXMULTI_CTL] = 0;
+         }
+	 R200_STATECHANGE( rmesa, cst );
+	 R200_STATECHANGE( rmesa, tf );
+	 rmesa->hw.cst.cmd[CST_PP_CNTL_X] = 0;
+#endif
+      }
+      else {
+	 /* picked up later */
+      }
+      /* call functions which change hw state based on ARB_vp enabled or not. */
+      r200PointParameter( ctx, GL_POINT_DISTANCE_ATTENUATION, NULL );
+      r200Fogfv( ctx, GL_FOG_COORD_SRC, NULL );
+      break;
+
+   case GL_VERTEX_PROGRAM_POINT_SIZE_ARB:
+      r200PointParameter( ctx, GL_POINT_DISTANCE_ATTENUATION, NULL );
+      break;
+
+   case GL_FRAGMENT_SHADER_ATI:
+      if ( !state ) {
+	 /* restore normal tex env colors and make sure tex env combine will get updated
+	    mark env atoms dirty (as their data was overwritten by afs even
+	    if they didn't change) and restore tex coord routing */
+	 GLuint unit;
+	 for (unit = 0; unit < R200_MAX_TEXTURE_UNITS; unit++) {
+	    R200_STATECHANGE( rmesa, pix[unit] );
+	    R200_STATECHANGE( rmesa, tex[unit] );
+	    rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] &=
+		~(R200_TXFORMAT_ST_ROUTE_MASK | R200_TXFORMAT_LOOKUP_DISABLE);
+	    rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] |= unit << R200_TXFORMAT_ST_ROUTE_SHIFT;
+	    /* need to guard this with drmSupportsFragmentShader? Should never get here if
+	       we don't announce ATI_fs, right? */
+	    rmesa->hw.tex[unit].cmd[TEX_PP_TXMULTI_CTL] = 0;
+         }
+	 R200_STATECHANGE( rmesa, cst );
+	 R200_STATECHANGE( rmesa, tf );
+	 rmesa->hw.cst.cmd[CST_PP_CNTL_X] = 0;
+      }
+      else {
+	 /* need to mark this dirty as pix/tf atoms have overwritten the data
+	    even if the data in the atoms didn't change */
+	 R200_STATECHANGE( rmesa, atf );
+	 R200_STATECHANGE( rmesa, afs[1] );
+	 /* everything else picked up in r200UpdateTextureState hopefully */
+      }
+      break;
+   default:
+      return;
+   }
+}
+
+
+void r200LightingSpaceChange( GLcontext *ctx )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLboolean rescale = ctx->Transform.RescaleNormals;
+
+   if (R200_DEBUG & DEBUG_STATE)
+      fprintf(stderr, "%s %d BEFORE %x\n", __FUNCTION__, ctx->_NeedEyeCoords,
+	      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0]);
+
+   /* The hardware bit follows GL's rescale flag only while eye
+    * coordinates are needed; otherwise it is the inverse of the flag.
+    */
+   if (!ctx->_NeedEyeCoords)
+      rescale = !rescale;
+
+   R200_STATECHANGE( rmesa, tcl );
+   if ( rescale )
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] |=  R200_RESCALE_NORMALS;
+   else
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] &= ~R200_RESCALE_NORMALS;
+
+   if (R200_DEBUG & DEBUG_STATE)
+      fprintf(stderr, "%s %d AFTER %x\n", __FUNCTION__, ctx->_NeedEyeCoords,
+	      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0]);
+}
+
+/* =============================================================
+ * Deferred state management - matrices, textures, other?
+ */
+
+
+
+
+/* Write the transpose of the 16-float matrix 'src' at R200_DB_STATE( mat[idx] ) + MAT_ELT_0. */
+static void upload_matrix( r200ContextPtr rmesa, GLfloat *src, int idx )
+{
+   float *dest = ((float *)R200_DB_STATE( mat[idx] ))+MAT_ELT_0;
+   int row, col;
+
+   /* dest element (row, col) comes from src element (col, row) */
+   for (row = 0 ; row < 4 ; row++) {
+      for (col = 0 ; col < 4 ; col++) {
+         dest[4*row + col] = src[4*col + row];
+      }
+   }
+
+   R200_DB_STATECHANGE( rmesa, &rmesa->hw.mat[idx] );
+}
+
+static void upload_matrix_t( r200ContextPtr rmesa, const GLfloat *src, int idx )
+{  /* Like upload_matrix, but the 16 floats are copied in source order (no transpose). */
+   float *dest = ((float *)R200_DB_STATE( mat[idx] ))+MAT_ELT_0;
+   memcpy(dest, src, 16*sizeof(float));
+   R200_DB_STATECHANGE( rmesa, &rmesa->hw.mat[idx] );
+}
+
+
+static void update_texturematrix( GLcontext *ctx )
+{  /* Recompute TexMatEnabled/TexMatCompSel, upload per-unit matrices, sync the tcg and vtx atoms. */
+   r200ContextPtr rmesa = R200_CONTEXT( ctx );
+   GLuint tpc = rmesa->hw.tcg.cmd[TCG_TEX_PROC_CTL_0];   /* overwritten below before it is read */
+   GLuint compsel = rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL];
+   int unit;
+
+   if (R200_DEBUG & DEBUG_STATE) 
+      fprintf(stderr, "%s before COMPSEL: %x\n", __FUNCTION__,
+	      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL]);
+
+   rmesa->TexMatEnabled = 0;
+   rmesa->TexMatCompSel = 0;
+
+   for (unit = 0 ; unit < ctx->Const.MaxTextureUnits; unit++) {
+      if (!ctx->Texture.Unit[unit]._ReallyEnabled)   /* units with _ReallyEnabled == 0 are skipped */
+	 continue;
+
+      if (ctx->TextureMatrixStack[unit].Top->type != MATRIX_IDENTITY) {
+	 rmesa->TexMatEnabled |= (R200_TEXGEN_TEXMAT_0_ENABLE|
+				  R200_TEXMAT_0_ENABLE) << unit;
+
+	 rmesa->TexMatCompSel |= R200_OUTPUT_TEX_0 << unit;
+
+	 if (rmesa->TexGenEnabled & (R200_TEXMAT_0_ENABLE << unit)) {
+	    /* Need to preconcatenate any active texgen 
+	     * obj/eyeplane matrices:
+	     */
+	    _math_matrix_mul_matrix( &rmesa->tmpmat,
+				     ctx->TextureMatrixStack[unit].Top, 
+				     &rmesa->TexGenMatrix[unit] );
+	    upload_matrix( rmesa, rmesa->tmpmat.m, R200_MTX_TEX0+unit );
+	 } 
+	 else {
+	    upload_matrix( rmesa, ctx->TextureMatrixStack[unit].Top->m, 
+			   R200_MTX_TEX0+unit );
+	 }
+      }
+      else if (rmesa->TexGenEnabled & (R200_TEXMAT_0_ENABLE << unit)) {   /* identity texmat: texgen matrix alone */
+	 upload_matrix( rmesa, rmesa->TexGenMatrix[unit].m, 
+			R200_MTX_TEX0+unit );
+      }
+   }
+
+   tpc = (rmesa->TexMatEnabled | rmesa->TexGenEnabled);
+   if (tpc != rmesa->hw.tcg.cmd[TCG_TEX_PROC_CTL_0]) {   /* only update the atom when the word changed */
+      R200_STATECHANGE(rmesa, tcg);
+      rmesa->hw.tcg.cmd[TCG_TEX_PROC_CTL_0] = tpc;
+   }
+
+   compsel &= ~R200_OUTPUT_TEX_MASK;   /* replace only the texture output bits */
+   compsel |= rmesa->TexMatCompSel | rmesa->TexGenCompSel;
+   if (compsel != rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL]) {
+      R200_STATECHANGE(rmesa, vtx);
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] = compsel;
+   }
+}
+
+
+
+/**
+ * Tell the card where to render (offset, pitch).
+ * Affected by glDrawBuffer, etc.  No-op unless exactly front-left or back-left is selected.
+ */
+void
+r200UpdateDrawBuffer(GLcontext *ctx)
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   struct gl_framebuffer *fb = ctx->DrawBuffer;
+   driRenderbuffer *drb;
+
+   if (fb->_ColorDrawBufferMask[0] == BUFFER_BIT_FRONT_LEFT) {
+      /* draw to front */
+      drb = (driRenderbuffer *) fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
+   }
+   else if (fb->_ColorDrawBufferMask[0] == BUFFER_BIT_BACK_LEFT) {
+      /* draw to back */
+      drb = (driRenderbuffer *) fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer;
+   }
+   else {
+      /* drawing to multiple buffers, or none */
+      return;
+   }
+
+   assert(drb);
+   assert(drb->flippedPitch);
+
+   R200_STATECHANGE( rmesa, ctx );
+
+   /* Note: we used the (possibly) page-flipped values */
+   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET]
+     = ((drb->flippedOffset + rmesa->r200Screen->fbLocation)
+	& R200_COLOROFFSET_MASK);
+   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = drb->flippedPitch;
+   if (rmesa->sarea->tiling_enabled) {   /* tile flag is OR-ed into the pitch word */
+      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= R200_COLOR_TILE_ENABLE;
+   }
+}
+
+
+
+void r200ValidateState( GLcontext *ctx )
+{  /* Process the _NEW_* bits accumulated in rmesa->NewGLState, then reset it to 0. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLuint new_state = rmesa->NewGLState;
+
+   if (new_state & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
+     r200UpdateDrawBuffer(ctx);
+   }
+
+   if (new_state & (_NEW_TEXTURE | _NEW_PROGRAM)) {
+      r200UpdateTextureState( ctx );
+      new_state |= rmesa->NewGLState; /* may add TEXTURE_MATRIX */
+      r200UpdateLocalViewer( ctx );
+   }
+
+/* FIXME: don't really need most of these when vertex progs are enabled */
+
+   /* Need an event driven matrix update?
+    */
+   if (new_state & (_NEW_MODELVIEW|_NEW_PROJECTION)) 
+      upload_matrix( rmesa, ctx->_ModelProjectMatrix.m, R200_MTX_MVP );
+
+   /* Need these for lighting (shouldn't upload otherwise)
+    */
+   if (new_state & (_NEW_MODELVIEW)) {
+      upload_matrix( rmesa, ctx->ModelviewMatrixStack.Top->m, R200_MTX_MV );
+      upload_matrix_t( rmesa, ctx->ModelviewMatrixStack.Top->inv, R200_MTX_IMV );   /* inverse: no transpose */
+   }
+
+   /* Does this need to be triggered on eg. modelview for
+    * texgen-derived objplane/eyeplane matrices?
+    */
+   if (new_state & (_NEW_TEXTURE|_NEW_TEXTURE_MATRIX)) {
+      update_texturematrix( ctx );
+   }
+
+   if (new_state & (_NEW_LIGHT|_NEW_MODELVIEW|_MESA_NEW_NEED_EYE_COORDS)) {
+      update_light( ctx );
+   }
+
+   /* emit all active clip planes if projection matrix changes.
+    */
+   if (new_state & (_NEW_PROJECTION)) {
+      if (ctx->Transform.ClipPlanesEnabled) 
+	 r200UpdateClipPlanes( ctx );
+   }
+
+   if (new_state & (_NEW_PROGRAM|
+   /* need to test for pretty much anything due to possible parameter bindings */
+	_NEW_MODELVIEW|_NEW_PROJECTION|_NEW_TRANSFORM|
+	_NEW_LIGHT|_NEW_TEXTURE|_NEW_TEXTURE_MATRIX|
+	_NEW_FOG|_NEW_POINT|_NEW_TRACK_MATRIX)) {
+      if (ctx->VertexProgram._Enabled) {
+	 r200SetupVertexProg( ctx );
+      }
+      else TCL_FALLBACK(ctx, R200_TCL_FALLBACK_VERTEX_PROGRAM, 0);   /* clear the vertex-program fallback bit */
+   }
+
+   rmesa->NewGLState = 0;
+}
+
+
+static void r200InvalidateState( GLcontext *ctx, GLuint new_state )
+{  /* Forward the dirty bits to each Mesa helper module, then accumulate them for r200ValidateState. */
+   _swrast_InvalidateState( ctx, new_state );
+   _swsetup_InvalidateState( ctx, new_state );
+   _vbo_InvalidateState( ctx, new_state );
+   _tnl_InvalidateState( ctx, new_state );
+   _ae_invalidate_state( ctx, new_state );
+   R200_CONTEXT(ctx)->NewGLState |= new_state;
+}
+
+/* Report whether any per-vertex material attribute is present with a
+ * non-zero stride.  This is a hack: the r200 can actually cope just fine
+ * with materials between begin/ends, so this should be fixed by mapping
+ * them to inputs like the generic vertex arrays for vertex progs (there
+ * could in theory still be too many, so a fallback would still be needed). */
+static GLboolean check_material( GLcontext *ctx )
+{
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   GLint attr = _TNL_ATTRIB_MAT_FRONT_AMBIENT;
+
+   /* Scan the material slots up to, but not including, BACK_INDEXES. */
+   while (attr < _TNL_ATTRIB_MAT_BACK_INDEXES) {
+      if (tnl->vb.AttribPtr[attr] != NULL &&
+	  tnl->vb.AttribPtr[attr]->stride != 0)
+	 return GL_TRUE;
+      attr++;
+   }
+   return GL_FALSE;
+}
+
+static void r200WrapRunPipeline( GLcontext *ctx )
+{  /* Validate pending state, then run the tnl pipeline with a temporary material fallback if needed. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLboolean has_material;
+
+   if (0)
+      fprintf(stderr, "%s, newstate: %x\n", __FUNCTION__, rmesa->NewGLState);
+
+   /* Validate state:
+    */
+   if (rmesa->NewGLState)
+      r200ValidateState( ctx );
+   /* Materials only matter with lighting on and no vertex program enabled. */
+   has_material = !ctx->VertexProgram._Enabled && ctx->Light.Enabled && check_material( ctx );
+
+   if (has_material) {
+      TCL_FALLBACK( ctx, R200_TCL_FALLBACK_MATERIAL, GL_TRUE );
+   }
+
+   /* Run the pipeline.
+    */ 
+   _tnl_run_pipeline( ctx );
+
+   if (has_material) {   /* undo the fallback set above */
+      TCL_FALLBACK( ctx, R200_TCL_FALLBACK_MATERIAL, GL_FALSE );
+   }
+}
+
+
+/* Install the r200 state callbacks into the dd_function_table; ClearIndex,
+ * Hint and IndexMask are explicitly set to NULL. */
+void r200InitStateFuncs( struct dd_function_table *functions )
+{
+   functions->UpdateState		= r200InvalidateState;
+   functions->LightingSpaceChange	= r200LightingSpaceChange;
+
+   functions->DrawBuffer		= r200DrawBuffer;
+   functions->ReadBuffer		= r200ReadBuffer;
+
+   functions->AlphaFunc			= r200AlphaFunc;
+   functions->BlendColor		= r200BlendColor;
+   functions->BlendEquationSeparate	= r200BlendEquationSeparate;
+   functions->BlendFuncSeparate		= r200BlendFuncSeparate;
+   functions->ClearColor		= r200ClearColor;
+   functions->ClearDepth		= r200ClearDepth;
+   functions->ClearIndex		= NULL;
+   functions->ClearStencil		= r200ClearStencil;
+   functions->ClipPlane			= r200ClipPlane;
+   functions->ColorMask			= r200ColorMask;
+   functions->CullFace			= r200CullFace;
+   functions->DepthFunc			= r200DepthFunc;
+   functions->DepthMask			= r200DepthMask;
+   functions->DepthRange		= r200DepthRange;
+   functions->Enable			= r200Enable;
+   functions->Fogfv			= r200Fogfv;
+   functions->FrontFace			= r200FrontFace;
+   functions->Hint			= NULL;
+   functions->IndexMask			= NULL;
+   functions->LightModelfv		= r200LightModelfv;
+   functions->Lightfv			= r200Lightfv;
+   functions->LineStipple		= r200LineStipple;
+   functions->LineWidth			= r200LineWidth;
+   functions->LogicOpcode		= r200LogicOpCode;
+   functions->PolygonMode		= r200PolygonMode;
+   functions->PolygonOffset		= r200PolygonOffset;
+   functions->PolygonStipple		= r200PolygonStipple;
+   functions->PointParameterfv		= r200PointParameter;
+   functions->PointSize			= r200PointSize;
+   functions->RenderMode		= r200RenderMode;
+   functions->Scissor			= r200Scissor;
+   functions->ShadeModel		= r200ShadeModel;
+   functions->StencilFuncSeparate	= r200StencilFuncSeparate;
+   functions->StencilMaskSeparate	= r200StencilMaskSeparate;
+   functions->StencilOpSeparate		= r200StencilOpSeparate;
+   functions->Viewport			= r200Viewport;
+}
+
+
+void r200InitTnlFuncs( GLcontext *ctx )
+{  /* Hook the r200 material-change and pipeline-run handlers into the tnl module's driver table. */
+   TNL_CONTEXT(ctx)->Driver.NotifyMaterialChange = r200UpdateMaterial;
+   TNL_CONTEXT(ctx)->Driver.RunPipeline = r200WrapRunPipeline;
+}
diff --git a/r200/r200_state.h b/r200/r200_state.h
new file mode 100644
index 0000000..f34090b
--- /dev/null
+++ b/r200/r200_state.h
@@ -0,0 +1,68 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_state.h,v 1.2 2002/11/05 17:46:08 tsi Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef __R200_STATE_H__
+#define __R200_STATE_H__
+
+#include "r200_context.h"
+
+extern void r200InitState( r200ContextPtr rmesa );
+extern void r200InitStateFuncs( struct dd_function_table *functions );
+extern void r200InitTnlFuncs( GLcontext *ctx );
+
+extern void r200UpdateMaterial( GLcontext *ctx );
+
+extern void r200SetCliprects( r200ContextPtr rmesa );
+extern void r200RecalcScissorRects( r200ContextPtr rmesa );
+extern void r200UpdateViewportOffset( GLcontext *ctx );
+extern void r200UpdateWindow( GLcontext *ctx );
+extern void r200UpdateDrawBuffer(GLcontext *ctx);
+
+extern void r200ValidateState( GLcontext *ctx );
+
+extern void r200PrintDirty( r200ContextPtr rmesa,
+			      const char *msg );
+
+
+extern void r200Fallback( GLcontext *ctx, GLuint bit, GLboolean mode );
+#define FALLBACK( rmesa, bit, mode ) do {  /* wraps r200Fallback() */	\
+   if ( 0 ) fprintf( stderr, "FALLBACK in %s: #%d=%d\n",		\
+		     __FUNCTION__, bit, mode ); /* trace, off via if (0) */ \
+   r200Fallback( (rmesa)->glCtx, bit, mode ); /* rmesa may be any expr */ \
+} while (0)
+
+extern void r200LightingSpaceChange( GLcontext *ctx );
+
+#endif
diff --git a/r200/r200_state_init.c b/r200/r200_state_init.c
new file mode 100644
index 0000000..b40d0bd
--- /dev/null
+++ b/r200/r200_state_init.c
@@ -0,0 +1,972 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_state_init.c,v 1.4 2003/02/22 06:21:11 dawes Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "enums.h"
+#include "colormac.h"
+#include "api_arrayelt.h"
+
+#include "swrast/swrast.h"
+#include "vbo/vbo.h"
+#include "tnl/tnl.h"
+#include "tnl/t_pipeline.h"
+#include "swrast_setup/swrast_setup.h"
+
+#include "r200_context.h"
+#include "r200_ioctl.h"
+#include "r200_state.h"
+#include "r200_tcl.h"
+#include "r200_tex.h"
+#include "r200_swtcl.h"
+
+#include "xmlpool.h"
+
+/* =============================================================
+ * State initialization
+ */
+
+/* Write msg, then the name of every dirty state atom, to stderr. */
+void r200PrintDirty( r200ContextPtr rmesa, const char *msg )
+{
+   struct r200_state_atom *l;
+
+   fprintf(stderr, "%s: ", msg);   /* msg is text, never the format string */
+
+   foreach(l, &rmesa->hw.atomlist) {
+      if (l->dirty || rmesa->hw.all_dirty)   /* all_dirty lists every atom */
+	 fprintf(stderr, "%s, ", l->name);
+   }
+
+   fprintf(stderr, "\n");
+}
+
+static int cmdpkt( int id )
+{
+   drm_radeon_cmd_header_t hdr;   /* filled field by field, returned as int */
+   hdr.i = 0;                     /* zero the int view before setting fields */
+   hdr.packet.cmd_type = RADEON_CMD_PACKET;
+   hdr.packet.packet_id = id;
+   return hdr.i;                  /* RADEON_CMD_PACKET header for packet id */
+}
+
+static int cmdvec( int offset, int stride, int count )
+{
+   drm_radeon_cmd_header_t hdr;   /* RADEON_CMD_VECTORS header being built */
+   hdr.i = 0;                     /* zero the int view before setting fields */
+   hdr.vectors.cmd_type = RADEON_CMD_VECTORS;
+   hdr.vectors.offset = offset;
+   hdr.vectors.stride = stride;
+   hdr.vectors.count = count;
+   return hdr.i;                  /* the header as its int value */
+}
+
+/* Warning: the count here is divided by 4 compared to the other cmds
+   (so that it doesn't exceed the char size)! */
+static int cmdveclinear( int offset, int count )
+{
+   drm_radeon_cmd_header_t hdr;   /* RADEON_CMD_VECLINEAR header being built */
+   hdr.i = 0;                     /* zero the int view before setting fields */
+   hdr.veclinear.cmd_type = RADEON_CMD_VECLINEAR;
+   hdr.veclinear.addr_lo = offset & 0xff;            /* offset bits 0-7 */
+   hdr.veclinear.addr_hi = (offset & 0xff00) >> 8;   /* offset bits 8-15 */
+   hdr.veclinear.count = count;
+   return hdr.i;
+}
+
+static int cmdscl( int offset, int stride, int count )
+{
+   drm_radeon_cmd_header_t hdr;   /* RADEON_CMD_SCALARS header being built */
+   hdr.i = 0;                     /* zero the int view before setting fields */
+   hdr.scalars.cmd_type = RADEON_CMD_SCALARS;
+   hdr.scalars.offset = offset;
+   hdr.scalars.stride = stride;
+   hdr.scalars.count = count;
+   return hdr.i;                  /* the header as its int value */
+}
+
+static int cmdscl2( int offset, int stride, int count )
+{
+   drm_radeon_cmd_header_t hdr;   /* RADEON_CMD_SCALARS2 header being built */
+   hdr.i = 0;                     /* zero the int view before setting fields */
+   hdr.scalars.cmd_type = RADEON_CMD_SCALARS2;
+   hdr.scalars.offset = offset - 0x100;   /* stored relative to 0x100 */
+   hdr.scalars.stride = stride;
+   hdr.scalars.count = count;
+   return hdr.i;                  /* the header as its int value */
+}
+
+#define CHECK( NM, FLAG )				\
+static GLboolean check_##NM( GLcontext *ctx, int idx )	\
+{  /* check_NM(): ungated predicate, FLAG reduced to a GLboolean */	\
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
+   (void) idx;	   /* FLAG may use neither local */	\
+   (void) rmesa;					\
+   return (FLAG) ? GL_TRUE : GL_FALSE; /* no narrowing of wide masks */	\
+}
+
+#define TCL_CHECK( NM, FLAG )				\
+static GLboolean check_##NM( GLcontext *ctx, int idx )	\
+{  /* check_NM(): FLAG, gated on no TclFallback and no enabled vertex program */ \
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
+   (void) idx;	/* FLAG need not use idx */		\
+   return !rmesa->TclFallback && !ctx->VertexProgram._Enabled && (FLAG);	\
+}
+
+#define TCL_OR_VP_CHECK( NM, FLAG )			\
+static GLboolean check_##NM( GLcontext *ctx, int idx )	\
+{  /* check_NM(): FLAG, gated only on TclFallback being clear */	\
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
+   (void) idx;	/* FLAG need not use idx */		\
+   return !rmesa->TclFallback && (FLAG);		\
+}
+
+#define VP_CHECK( NM, FLAG )				\
+static GLboolean check_##NM( GLcontext *ctx, int idx )	\
+{  /* check_NM(): FLAG, gated on no TclFallback and an enabled vertex program */ \
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
+   (void) idx;	/* FLAG need not use idx */		\
+   return !rmesa->TclFallback && ctx->VertexProgram._Enabled && (FLAG);		\
+}
+
+
+CHECK( always, GL_TRUE )   /* each line here defines a static check_<name>() */
+CHECK( never, GL_FALSE )   /* ALLOC_STATE stores one in an atom's .check */
+CHECK( tex_any, ctx->Texture._EnabledUnits )
+CHECK( tf, (ctx->Texture._EnabledUnits && !ctx->ATIFragmentShader._Enabled) )
+CHECK( tex_pair, (rmesa->state.texture.unit[idx].unitneeded | rmesa->state.texture.unit[idx & ~1].unitneeded) )
+CHECK( tex, rmesa->state.texture.unit[idx].unitneeded )
+CHECK( pix_zero, !ctx->ATIFragmentShader._Enabled )
+CHECK( texenv, (rmesa->state.envneeded & (1 << idx) && !ctx->ATIFragmentShader._Enabled) )
+CHECK( afs_pass1, (ctx->ATIFragmentShader._Enabled && (ctx->ATIFragmentShader.Current->NumPasses > 1)) )
+CHECK( afs, ctx->ATIFragmentShader._Enabled )
+CHECK( tex_cube, rmesa->state.texture.unit[idx].unitneeded & TEXTURE_CUBE_BIT )
+TCL_CHECK( tcl_fog, ctx->Fog.Enabled )
+TCL_CHECK( tcl, GL_TRUE )
+TCL_CHECK( tcl_tex, rmesa->state.texture.unit[idx].unitneeded )
+TCL_CHECK( tcl_lighting, ctx->Light.Enabled )
+TCL_CHECK( tcl_light, ctx->Light.Enabled && ctx->Light.Light[idx].Enabled )
+TCL_OR_VP_CHECK( tcl_ucp, (ctx->Transform.ClipPlanesEnabled & (1 << idx)) )
+TCL_OR_VP_CHECK( tcl_or_vp, GL_TRUE )
+VP_CHECK( tcl_vp, GL_TRUE )
+VP_CHECK( tcl_vp_size, ctx->VertexProgram.Current->Base.NumNativeInstructions > 64 )
+VP_CHECK( tcl_vpp_size, ctx->VertexProgram.Current->Base.NumNativeParameters > 96 )
+
+
+/* Initialize the context's hardware state.
+ */
+void r200InitState( r200ContextPtr rmesa )
+{
+   GLcontext *ctx = rmesa->glCtx;
+   GLuint color_fmt, depth_fmt, i;
+   GLint drawPitch, drawOffset;
+
+   switch ( rmesa->r200Screen->cpp ) {
+   case 2:
+      color_fmt = R200_COLOR_FORMAT_RGB565;
+      break;
+   case 4:
+      color_fmt = R200_COLOR_FORMAT_ARGB8888;
+      break;
+   default:
+      fprintf( stderr, "Error: Unsupported pixel depth... exiting\n" );
+      exit( -1 );
+   }
+
+   rmesa->state.color.clear = 0x00000000;
+
+   switch ( ctx->Visual.depthBits ) {
+   case 16:
+      rmesa->state.depth.clear = 0x0000ffff;
+      rmesa->state.depth.scale = 1.0 / (GLfloat)0xffff;
+      depth_fmt = R200_DEPTH_FORMAT_16BIT_INT_Z;
+      rmesa->state.stencil.clear = 0x00000000;
+      break;
+   case 24:
+      rmesa->state.depth.clear = 0x00ffffff;
+      rmesa->state.depth.scale = 1.0 / (GLfloat)0xffffff;
+      depth_fmt = R200_DEPTH_FORMAT_24BIT_INT_Z;
+      rmesa->state.stencil.clear = 0xffff0000;
+      break;
+   default:
+      fprintf( stderr, "Error: Unsupported depth %d... exiting\n",
+	       ctx->Visual.depthBits );
+      exit( -1 );
+   }
+
+   /* Only have hw stencil when depth buffer is 24 bits deep */
+   rmesa->state.stencil.hwBuffer = ( ctx->Visual.stencilBits > 0 &&
+				     ctx->Visual.depthBits == 24 );
+
+   rmesa->Fallback = 0;
+
+   if ( ctx->Visual.doubleBufferMode && rmesa->sarea->pfCurrentPage == 0 ) {
+      drawOffset = rmesa->r200Screen->backOffset;
+      drawPitch  = rmesa->r200Screen->backPitch;
+   } else {
+      drawOffset = rmesa->r200Screen->frontOffset;
+      drawPitch  = rmesa->r200Screen->frontPitch;
+   }
+#if 000
+   if ( ctx->Visual.doubleBufferMode && rmesa->sarea->pfCurrentPage == 0 ) {
+      rmesa->state.color.drawOffset = rmesa->r200Screen->backOffset;
+      rmesa->state.color.drawPitch  = rmesa->r200Screen->backPitch;
+   } else {
+      rmesa->state.color.drawOffset = rmesa->r200Screen->frontOffset;
+      rmesa->state.color.drawPitch  = rmesa->r200Screen->frontPitch;
+   }
+
+   rmesa->state.pixel.readOffset = rmesa->state.color.drawOffset;
+   rmesa->state.pixel.readPitch  = rmesa->state.color.drawPitch;
+#endif
+
+   rmesa->hw.max_state_size = 0;
+
+#define ALLOC_STATE( ATOM, CHK, SZ, NM, IDX )				\
+   do {								\
+      rmesa->hw.ATOM.cmd_size = SZ;				\
+      rmesa->hw.ATOM.cmd = (int *)CALLOC(SZ * sizeof(int));	\
+      rmesa->hw.ATOM.lastcmd = (int *)CALLOC(SZ * sizeof(int));	\
+      rmesa->hw.ATOM.name = NM;					\
+      rmesa->hw.ATOM.idx = IDX;					\
+      rmesa->hw.ATOM.check = check_##CHK;			\
+      rmesa->hw.ATOM.dirty = GL_FALSE;				\
+      rmesa->hw.max_state_size += SZ * sizeof(int);		\
+   } while (0)
+
+
+   /* Allocate state buffers:
+    */
+   if (rmesa->r200Screen->drmSupportsBlendColor)
+      ALLOC_STATE( ctx, always, CTX_STATE_SIZE_NEWDRM, "CTX/context", 0 );
+   else
+      ALLOC_STATE( ctx, always, CTX_STATE_SIZE_OLDDRM, "CTX/context", 0 );
+   ALLOC_STATE( set, always, SET_STATE_SIZE, "SET/setup", 0 );
+   ALLOC_STATE( lin, always, LIN_STATE_SIZE, "LIN/line", 0 );
+   ALLOC_STATE( msk, always, MSK_STATE_SIZE, "MSK/mask", 0 );
+   ALLOC_STATE( vpt, always, VPT_STATE_SIZE, "VPT/viewport", 0 );
+   ALLOC_STATE( vtx, always, VTX_STATE_SIZE, "VTX/vertex", 0 );
+   ALLOC_STATE( vap, always, VAP_STATE_SIZE, "VAP/vap", 0 );
+   ALLOC_STATE( vte, always, VTE_STATE_SIZE, "VTE/vte", 0 );
+   ALLOC_STATE( msc, always, MSC_STATE_SIZE, "MSC/misc", 0 );
+   ALLOC_STATE( cst, always, CST_STATE_SIZE, "CST/constant", 0 );
+   ALLOC_STATE( zbs, always, ZBS_STATE_SIZE, "ZBS/zbias", 0 );
+   ALLOC_STATE( tf, tf, TF_STATE_SIZE, "TF/tfactor", 0 );
+   if (rmesa->r200Screen->drmSupportsFragShader) {
+      if (rmesa->r200Screen->chip_family == CHIP_FAMILY_R200) {
+      /* make sure texture units 0/1 are emitted pair-wise for r200 t0 hang workaround */
+	 ALLOC_STATE( tex[0], tex_pair, TEX_STATE_SIZE_NEWDRM, "TEX/tex-0", 0 );
+	 ALLOC_STATE( tex[1], tex_pair, TEX_STATE_SIZE_NEWDRM, "TEX/tex-1", 1 );
+	 ALLOC_STATE( tam, tex_any, TAM_STATE_SIZE, "TAM/tam", 0 );
+      }
+      else {
+	 ALLOC_STATE( tex[0], tex, TEX_STATE_SIZE_NEWDRM, "TEX/tex-0", 0 );
+	 ALLOC_STATE( tex[1], tex, TEX_STATE_SIZE_NEWDRM, "TEX/tex-1", 1 );
+	 ALLOC_STATE( tam, never, TAM_STATE_SIZE, "TAM/tam", 0 );
+      }
+      ALLOC_STATE( tex[2], tex, TEX_STATE_SIZE_NEWDRM, "TEX/tex-2", 2 );
+      ALLOC_STATE( tex[3], tex, TEX_STATE_SIZE_NEWDRM, "TEX/tex-3", 3 );
+      ALLOC_STATE( tex[4], tex, TEX_STATE_SIZE_NEWDRM, "TEX/tex-4", 4 );
+      ALLOC_STATE( tex[5], tex, TEX_STATE_SIZE_NEWDRM, "TEX/tex-5", 5 );
+      ALLOC_STATE( atf, afs, ATF_STATE_SIZE, "ATF/tfactor", 0 );
+      ALLOC_STATE( afs[0], afs_pass1, AFS_STATE_SIZE, "AFS/afsinst-0", 0 );
+      ALLOC_STATE( afs[1], afs, AFS_STATE_SIZE, "AFS/afsinst-1", 1 );
+   }
+   else {
+      if (rmesa->r200Screen->chip_family == CHIP_FAMILY_R200) {
+	 ALLOC_STATE( tex[0], tex_pair, TEX_STATE_SIZE_OLDDRM, "TEX/tex-0", 0 );
+	 ALLOC_STATE( tex[1], tex_pair, TEX_STATE_SIZE_OLDDRM, "TEX/tex-1", 1 );
+	 ALLOC_STATE( tam, tex_any, TAM_STATE_SIZE, "TAM/tam", 0 );
+      }
+      else {
+	 ALLOC_STATE( tex[0], tex, TEX_STATE_SIZE_OLDDRM, "TEX/tex-0", 0 );
+	 ALLOC_STATE( tex[1], tex, TEX_STATE_SIZE_OLDDRM, "TEX/tex-1", 1 );
+	 ALLOC_STATE( tam, never, TAM_STATE_SIZE, "TAM/tam", 0 );
+      }
+      ALLOC_STATE( tex[2], tex, TEX_STATE_SIZE_OLDDRM, "TEX/tex-2", 2 );
+      ALLOC_STATE( tex[3], tex, TEX_STATE_SIZE_OLDDRM, "TEX/tex-3", 3 );
+      ALLOC_STATE( tex[4], tex, TEX_STATE_SIZE_OLDDRM, "TEX/tex-4", 4 );
+      ALLOC_STATE( tex[5], tex, TEX_STATE_SIZE_OLDDRM, "TEX/tex-5", 5 );
+      ALLOC_STATE( atf, never, ATF_STATE_SIZE, "TF/tfactor", 0 );
+      ALLOC_STATE( afs[0], never, AFS_STATE_SIZE, "AFS/afsinst-0", 0 );
+      ALLOC_STATE( afs[1], never, AFS_STATE_SIZE, "AFS/afsinst-1", 1 );
+   }
+   if (rmesa->r200Screen->drmSupportsCubeMapsR200) {
+      ALLOC_STATE( cube[0], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-0", 0 );
+      ALLOC_STATE( cube[1], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-1", 1 );
+      ALLOC_STATE( cube[2], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-2", 2 );
+      ALLOC_STATE( cube[3], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-3", 3 );
+      ALLOC_STATE( cube[4], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-4", 4 );
+      ALLOC_STATE( cube[5], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-5", 5 );
+   }
+   else {
+      ALLOC_STATE( cube[0], never, CUBE_STATE_SIZE, "CUBE/tex-0", 0 );
+      ALLOC_STATE( cube[1], never, CUBE_STATE_SIZE, "CUBE/tex-1", 1 );
+      ALLOC_STATE( cube[2], never, CUBE_STATE_SIZE, "CUBE/tex-2", 2 );
+      ALLOC_STATE( cube[3], never, CUBE_STATE_SIZE, "CUBE/tex-3", 3 );
+      ALLOC_STATE( cube[4], never, CUBE_STATE_SIZE, "CUBE/tex-4", 4 );
+      ALLOC_STATE( cube[5], never, CUBE_STATE_SIZE, "CUBE/tex-5", 5 );
+   }
+   if (rmesa->r200Screen->drmSupportsVertexProgram) {
+      ALLOC_STATE( pvs, tcl_vp, PVS_STATE_SIZE, "PVS/pvscntl", 0 );
+      ALLOC_STATE( vpi[0], tcl_vp, VPI_STATE_SIZE, "VP/vertexprog-0", 0 );
+      ALLOC_STATE( vpi[1], tcl_vp_size, VPI_STATE_SIZE, "VP/vertexprog-1", 1 );
+      ALLOC_STATE( vpp[0], tcl_vp, VPP_STATE_SIZE, "VPP/vertexparam-0", 0 );
+      ALLOC_STATE( vpp[1], tcl_vpp_size, VPP_STATE_SIZE, "VPP/vertexparam-1", 1 );
+   }
+   else {
+      ALLOC_STATE( pvs, never, PVS_STATE_SIZE, "PVS/pvscntl", 0 );
+      ALLOC_STATE( vpi[0], never, VPI_STATE_SIZE, "VP/vertexprog-0", 0 );
+      ALLOC_STATE( vpi[1], never, VPI_STATE_SIZE, "VP/vertexprog-1", 1 );
+      ALLOC_STATE( vpp[0], never, VPP_STATE_SIZE, "VPP/vertexparam-0", 0 );
+      ALLOC_STATE( vpp[1], never, VPP_STATE_SIZE, "VPP/vertexparam-1", 1 );
+   }
+   /* FIXME: this atom has two commands, we need only one (ucp_vert_blend) for vp */
+   ALLOC_STATE( tcl, tcl_or_vp, TCL_STATE_SIZE, "TCL/tcl", 0 );
+   ALLOC_STATE( msl, tcl, MSL_STATE_SIZE, "MSL/matrix-select", 0 );
+   ALLOC_STATE( tcg, tcl, TCG_STATE_SIZE, "TCG/texcoordgen", 0 );
+   ALLOC_STATE( mtl[0], tcl_lighting, MTL_STATE_SIZE, "MTL0/material0", 0 );
+   ALLOC_STATE( mtl[1], tcl_lighting, MTL_STATE_SIZE, "MTL1/material1", 1 );
+   ALLOC_STATE( grd, tcl_or_vp, GRD_STATE_SIZE, "GRD/guard-band", 0 );
+   ALLOC_STATE( fog, tcl_fog, FOG_STATE_SIZE, "FOG/fog", 0 );
+   ALLOC_STATE( glt, tcl_lighting, GLT_STATE_SIZE, "GLT/light-global", 0 );
+   ALLOC_STATE( eye, tcl_lighting, EYE_STATE_SIZE, "EYE/eye-vector", 0 );
+   ALLOC_STATE( mat[R200_MTX_MV], tcl, MAT_STATE_SIZE, "MAT/modelview", 0 );
+   ALLOC_STATE( mat[R200_MTX_IMV], tcl, MAT_STATE_SIZE, "MAT/it-modelview", 0 );
+   ALLOC_STATE( mat[R200_MTX_MVP], tcl, MAT_STATE_SIZE, "MAT/modelproject", 0 );
+   ALLOC_STATE( mat[R200_MTX_TEX0], tcl_tex, MAT_STATE_SIZE, "MAT/texmat0", 0 );
+   ALLOC_STATE( mat[R200_MTX_TEX1], tcl_tex, MAT_STATE_SIZE, "MAT/texmat1", 1 );
+   ALLOC_STATE( mat[R200_MTX_TEX2], tcl_tex, MAT_STATE_SIZE, "MAT/texmat2", 2 );
+   ALLOC_STATE( mat[R200_MTX_TEX3], tcl_tex, MAT_STATE_SIZE, "MAT/texmat3", 3 );
+   ALLOC_STATE( mat[R200_MTX_TEX4], tcl_tex, MAT_STATE_SIZE, "MAT/texmat4", 4 );
+   ALLOC_STATE( mat[R200_MTX_TEX5], tcl_tex, MAT_STATE_SIZE, "MAT/texmat5", 5 );
+   ALLOC_STATE( ucp[0], tcl_ucp, UCP_STATE_SIZE, "UCP/userclip-0", 0 );
+   ALLOC_STATE( ucp[1], tcl_ucp, UCP_STATE_SIZE, "UCP/userclip-1", 1 );
+   ALLOC_STATE( ucp[2], tcl_ucp, UCP_STATE_SIZE, "UCP/userclip-2", 2 );
+   ALLOC_STATE( ucp[3], tcl_ucp, UCP_STATE_SIZE, "UCP/userclip-3", 3 );
+   ALLOC_STATE( ucp[4], tcl_ucp, UCP_STATE_SIZE, "UCP/userclip-4", 4 );
+   ALLOC_STATE( ucp[5], tcl_ucp, UCP_STATE_SIZE, "UCP/userclip-5", 5 );
+   ALLOC_STATE( lit[0], tcl_light, LIT_STATE_SIZE, "LIT/light-0", 0 );
+   ALLOC_STATE( lit[1], tcl_light, LIT_STATE_SIZE, "LIT/light-1", 1 );
+   ALLOC_STATE( lit[2], tcl_light, LIT_STATE_SIZE, "LIT/light-2", 2 );
+   ALLOC_STATE( lit[3], tcl_light, LIT_STATE_SIZE, "LIT/light-3", 3 );
+   ALLOC_STATE( lit[4], tcl_light, LIT_STATE_SIZE, "LIT/light-4", 4 );
+   ALLOC_STATE( lit[5], tcl_light, LIT_STATE_SIZE, "LIT/light-5", 5 );
+   ALLOC_STATE( lit[6], tcl_light, LIT_STATE_SIZE, "LIT/light-6", 6 );
+   ALLOC_STATE( lit[7], tcl_light, LIT_STATE_SIZE, "LIT/light-7", 7 );
+   ALLOC_STATE( pix[0], pix_zero, PIX_STATE_SIZE, "PIX/pixstage-0", 0 );
+   ALLOC_STATE( pix[1], texenv, PIX_STATE_SIZE, "PIX/pixstage-1", 1 );
+   ALLOC_STATE( pix[2], texenv, PIX_STATE_SIZE, "PIX/pixstage-2", 2 );
+   ALLOC_STATE( pix[3], texenv, PIX_STATE_SIZE, "PIX/pixstage-3", 3 );
+   ALLOC_STATE( pix[4], texenv, PIX_STATE_SIZE, "PIX/pixstage-4", 4 );
+   ALLOC_STATE( pix[5], texenv, PIX_STATE_SIZE, "PIX/pixstage-5", 5 );
+   if (rmesa->r200Screen->drmSupportsTriPerf) {
+      ALLOC_STATE( prf, always, PRF_STATE_SIZE, "PRF/performance-tri", 0 );
+   }
+   else {
+      ALLOC_STATE( prf, never, PRF_STATE_SIZE, "PRF/performance-tri", 0 );
+   }
+   if (rmesa->r200Screen->drmSupportsPointSprites) {
+      ALLOC_STATE( spr, always, SPR_STATE_SIZE, "SPR/pointsprite", 0 );
+      ALLOC_STATE( ptp, tcl, PTP_STATE_SIZE, "PTP/pointparams", 0 );
+   }
+   else {
+      ALLOC_STATE (spr, never, SPR_STATE_SIZE, "SPR/pointsprite", 0 );
+      ALLOC_STATE (ptp, never, PTP_STATE_SIZE, "PTP/pointparams", 0 );
+   }
+
+   r200SetUpAtomList( rmesa );
+
+   /* Fill in the packet headers:
+    */
+   rmesa->hw.ctx.cmd[CTX_CMD_0] = cmdpkt(RADEON_EMIT_PP_MISC);
+   rmesa->hw.ctx.cmd[CTX_CMD_1] = cmdpkt(RADEON_EMIT_PP_CNTL);
+   rmesa->hw.ctx.cmd[CTX_CMD_2] = cmdpkt(RADEON_EMIT_RB3D_COLORPITCH);
+   if (rmesa->r200Screen->drmSupportsBlendColor)
+      rmesa->hw.ctx.cmd[CTX_CMD_3] = cmdpkt(R200_EMIT_RB3D_BLENDCOLOR);
+   rmesa->hw.lin.cmd[LIN_CMD_0] = cmdpkt(RADEON_EMIT_RE_LINE_PATTERN);
+   rmesa->hw.lin.cmd[LIN_CMD_1] = cmdpkt(RADEON_EMIT_SE_LINE_WIDTH);
+   rmesa->hw.msk.cmd[MSK_CMD_0] = cmdpkt(RADEON_EMIT_RB3D_STENCILREFMASK);
+   rmesa->hw.vpt.cmd[VPT_CMD_0] = cmdpkt(RADEON_EMIT_SE_VPORT_XSCALE);
+   rmesa->hw.set.cmd[SET_CMD_0] = cmdpkt(RADEON_EMIT_SE_CNTL);
+   rmesa->hw.msc.cmd[MSC_CMD_0] = cmdpkt(RADEON_EMIT_RE_MISC);
+   rmesa->hw.cst.cmd[CST_CMD_0] = cmdpkt(R200_EMIT_PP_CNTL_X);
+   rmesa->hw.cst.cmd[CST_CMD_1] = cmdpkt(R200_EMIT_RB3D_DEPTHXY_OFFSET);
+   rmesa->hw.cst.cmd[CST_CMD_2] = cmdpkt(R200_EMIT_RE_AUX_SCISSOR_CNTL);
+   rmesa->hw.cst.cmd[CST_CMD_3] = cmdpkt(R200_EMIT_RE_SCISSOR_TL_0);
+   rmesa->hw.cst.cmd[CST_CMD_4] = cmdpkt(R200_EMIT_SE_VAP_CNTL_STATUS);
+   rmesa->hw.cst.cmd[CST_CMD_5] = cmdpkt(R200_EMIT_RE_POINTSIZE);
+   rmesa->hw.cst.cmd[CST_CMD_6] = cmdpkt(R200_EMIT_TCL_INPUT_VTX_VECTOR_ADDR_0);
+   rmesa->hw.tam.cmd[TAM_CMD_0] = cmdpkt(R200_EMIT_PP_TAM_DEBUG3);
+   rmesa->hw.tf.cmd[TF_CMD_0] = cmdpkt(R200_EMIT_TFACTOR_0);
+   if (rmesa->r200Screen->drmSupportsFragShader) {
+      rmesa->hw.atf.cmd[ATF_CMD_0] = cmdpkt(R200_EMIT_ATF_TFACTOR);
+      rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_0);
+      rmesa->hw.tex[0].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_0);
+      rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_1);
+      rmesa->hw.tex[1].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_1);
+      rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_2);
+      rmesa->hw.tex[2].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_2);
+      rmesa->hw.tex[3].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_3);
+      rmesa->hw.tex[3].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_3);
+      rmesa->hw.tex[4].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_4);
+      rmesa->hw.tex[4].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_4);
+      rmesa->hw.tex[5].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_5);
+      rmesa->hw.tex[5].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_5);
+   } else {
+      rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_0);
+      rmesa->hw.tex[0].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_0);
+      rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_1);
+      rmesa->hw.tex[1].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_1);
+      rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_2);
+      rmesa->hw.tex[2].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_2);
+      rmesa->hw.tex[3].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_3);
+      rmesa->hw.tex[3].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_3);
+      rmesa->hw.tex[4].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_4);
+      rmesa->hw.tex[4].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_4);
+      rmesa->hw.tex[5].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_5);
+      rmesa->hw.tex[5].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_5);
+   }
+   rmesa->hw.afs[0].cmd[AFS_CMD_0] = cmdpkt(R200_EMIT_PP_AFS_0);
+   rmesa->hw.afs[1].cmd[AFS_CMD_0] = cmdpkt(R200_EMIT_PP_AFS_1);
+   rmesa->hw.pvs.cmd[PVS_CMD_0] = cmdpkt(R200_EMIT_VAP_PVS_CNTL);
+   rmesa->hw.cube[0].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_0);
+   rmesa->hw.cube[0].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_0);
+   rmesa->hw.cube[1].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_1);
+   rmesa->hw.cube[1].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_1);
+   rmesa->hw.cube[2].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_2);
+   rmesa->hw.cube[2].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_2);
+   rmesa->hw.cube[3].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_3);
+   rmesa->hw.cube[3].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_3);
+   rmesa->hw.cube[4].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_4);
+   rmesa->hw.cube[4].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_4);
+   rmesa->hw.cube[5].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_5);
+   rmesa->hw.cube[5].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_5);
+   rmesa->hw.pix[0].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_0);
+   rmesa->hw.pix[1].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_1);
+   rmesa->hw.pix[2].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_2);
+   rmesa->hw.pix[3].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_3);
+   rmesa->hw.pix[4].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_4);
+   rmesa->hw.pix[5].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_5);
+   rmesa->hw.zbs.cmd[ZBS_CMD_0] = cmdpkt(RADEON_EMIT_SE_ZBIAS_FACTOR);
+   rmesa->hw.tcl.cmd[TCL_CMD_0] = cmdpkt(R200_EMIT_TCL_LIGHT_MODEL_CTL_0);
+   rmesa->hw.tcl.cmd[TCL_CMD_1] = cmdpkt(R200_EMIT_TCL_UCP_VERT_BLEND_CTL);
+   rmesa->hw.tcg.cmd[TCG_CMD_0] = cmdpkt(R200_EMIT_TEX_PROC_CTL_2);
+   rmesa->hw.msl.cmd[MSL_CMD_0] = cmdpkt(R200_EMIT_MATRIX_SELECT_0);
+   rmesa->hw.vap.cmd[VAP_CMD_0] = cmdpkt(R200_EMIT_VAP_CTL);
+   rmesa->hw.vtx.cmd[VTX_CMD_0] = cmdpkt(R200_EMIT_VTX_FMT_0);
+   rmesa->hw.vtx.cmd[VTX_CMD_1] = cmdpkt(R200_EMIT_OUTPUT_VTX_COMP_SEL);
+   rmesa->hw.vtx.cmd[VTX_CMD_2] = cmdpkt(R200_EMIT_SE_VTX_STATE_CNTL);
+   rmesa->hw.vte.cmd[VTE_CMD_0] = cmdpkt(R200_EMIT_VTE_CNTL);
+   rmesa->hw.prf.cmd[PRF_CMD_0] = cmdpkt(R200_EMIT_PP_TRI_PERF_CNTL);
+   rmesa->hw.spr.cmd[SPR_CMD_0] = cmdpkt(R200_EMIT_TCL_POINT_SPRITE_CNTL);
+   rmesa->hw.mtl[0].cmd[MTL_CMD_0] = 
+      cmdvec( R200_VS_MAT_0_EMISS, 1, 16 );
+   rmesa->hw.mtl[0].cmd[MTL_CMD_1] = 
+      cmdscl2( R200_SS_MAT_0_SHININESS, 1, 1 );
+   rmesa->hw.mtl[1].cmd[MTL_CMD_0] =
+      cmdvec( R200_VS_MAT_1_EMISS, 1, 16 );
+   rmesa->hw.mtl[1].cmd[MTL_CMD_1] =
+      cmdscl2( R200_SS_MAT_1_SHININESS, 1, 1 );
+
+   rmesa->hw.vpi[0].cmd[VPI_CMD_0] =
+      cmdveclinear( R200_PVS_PROG0, 64 );
+   rmesa->hw.vpi[1].cmd[VPI_CMD_0] =
+      cmdveclinear( R200_PVS_PROG1, 64 );
+   rmesa->hw.vpp[0].cmd[VPP_CMD_0] =
+      cmdveclinear( R200_PVS_PARAM0, 96 );
+   rmesa->hw.vpp[1].cmd[VPP_CMD_0] =
+      cmdveclinear( R200_PVS_PARAM1, 96 );
+
+   rmesa->hw.grd.cmd[GRD_CMD_0] = 
+      cmdscl( R200_SS_VERT_GUARD_CLIP_ADJ_ADDR, 1, 4 );
+   rmesa->hw.fog.cmd[FOG_CMD_0] = 
+      cmdvec( R200_VS_FOG_PARAM_ADDR, 1, 4 );
+   rmesa->hw.glt.cmd[GLT_CMD_0] = 
+      cmdvec( R200_VS_GLOBAL_AMBIENT_ADDR, 1, 4 );
+   rmesa->hw.eye.cmd[EYE_CMD_0] = 
+      cmdvec( R200_VS_EYE_VECTOR_ADDR, 1, 4 );
+
+   rmesa->hw.mat[R200_MTX_MV].cmd[MAT_CMD_0] = 
+      cmdvec( R200_VS_MATRIX_0_MV, 1, 16);
+   rmesa->hw.mat[R200_MTX_IMV].cmd[MAT_CMD_0] = 
+      cmdvec( R200_VS_MATRIX_1_INV_MV, 1, 16);
+   rmesa->hw.mat[R200_MTX_MVP].cmd[MAT_CMD_0] = 
+      cmdvec( R200_VS_MATRIX_2_MVP, 1, 16);
+   rmesa->hw.mat[R200_MTX_TEX0].cmd[MAT_CMD_0] = 
+      cmdvec( R200_VS_MATRIX_3_TEX0, 1, 16);
+   rmesa->hw.mat[R200_MTX_TEX1].cmd[MAT_CMD_0] = 
+      cmdvec( R200_VS_MATRIX_4_TEX1, 1, 16);
+   rmesa->hw.mat[R200_MTX_TEX2].cmd[MAT_CMD_0] = 
+      cmdvec( R200_VS_MATRIX_5_TEX2, 1, 16);
+   rmesa->hw.mat[R200_MTX_TEX3].cmd[MAT_CMD_0] = 
+      cmdvec( R200_VS_MATRIX_6_TEX3, 1, 16);
+   rmesa->hw.mat[R200_MTX_TEX4].cmd[MAT_CMD_0] = 
+      cmdvec( R200_VS_MATRIX_7_TEX4, 1, 16);
+   rmesa->hw.mat[R200_MTX_TEX5].cmd[MAT_CMD_0] = 
+      cmdvec( R200_VS_MATRIX_8_TEX5, 1, 16);
+
+   for (i = 0 ; i < 8; i++) {
+      rmesa->hw.lit[i].cmd[LIT_CMD_0] = 
+	 cmdvec( R200_VS_LIGHT_AMBIENT_ADDR + i, 8, 24 );
+      rmesa->hw.lit[i].cmd[LIT_CMD_1] = 
+	 cmdscl( R200_SS_LIGHT_DCD_ADDR + i, 8, 7 );
+   }
+
+   for (i = 0 ; i < 6; i++) {
+      rmesa->hw.ucp[i].cmd[UCP_CMD_0] = 
+	 cmdvec( R200_VS_UCP_ADDR + i, 1, 4 );
+   }
+
+   rmesa->hw.ptp.cmd[PTP_CMD_0] =
+      cmdvec( R200_VS_PNT_SPRITE_VPORT_SCALE, 1, 4 );
+   rmesa->hw.ptp.cmd[PTP_CMD_1] =
+      cmdvec( R200_VS_PNT_SPRITE_ATT_CONST, 1, 12 );
+
+   /* Initial Hardware state:
+    */
+   rmesa->hw.ctx.cmd[CTX_PP_MISC] = (R200_ALPHA_TEST_PASS
+				     /* | R200_RIGHT_HAND_CUBE_OGL*/);
+
+   rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] = (R200_FOG_VERTEX |
+					  R200_FOG_USE_SPEC_ALPHA);
+
+   rmesa->hw.ctx.cmd[CTX_RE_SOLID_COLOR] = 0x00000000;
+
+   rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] = (R200_COMB_FCN_ADD_CLAMP |
+				(R200_BLEND_GL_ONE << R200_SRC_BLEND_SHIFT) |
+				(R200_BLEND_GL_ZERO << R200_DST_BLEND_SHIFT));
+
+   if (rmesa->r200Screen->drmSupportsBlendColor) {
+      rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCOLOR] = 0x00000000;
+      rmesa->hw.ctx.cmd[CTX_RB3D_ABLENDCNTL] = (R200_COMB_FCN_ADD_CLAMP |
+				(R200_BLEND_GL_ONE << R200_SRC_BLEND_SHIFT) |
+				(R200_BLEND_GL_ZERO << R200_DST_BLEND_SHIFT));
+      rmesa->hw.ctx.cmd[CTX_RB3D_CBLENDCNTL] = (R200_COMB_FCN_ADD_CLAMP |
+				(R200_BLEND_GL_ONE << R200_SRC_BLEND_SHIFT) |
+				(R200_BLEND_GL_ZERO << R200_DST_BLEND_SHIFT));
+   }
+
+   rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHOFFSET] =
+      rmesa->r200Screen->depthOffset + rmesa->r200Screen->fbLocation;
+
+   rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHPITCH] = 
+      ((rmesa->r200Screen->depthPitch &
+	R200_DEPTHPITCH_MASK) |
+       R200_DEPTH_ENDIAN_NO_SWAP);
+   
+   if (rmesa->using_hyperz)
+      rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHPITCH] |= R200_DEPTH_HYPERZ;
+
+   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] = (depth_fmt |
+					       R200_Z_TEST_LESS |
+					       R200_STENCIL_TEST_ALWAYS |
+					       R200_STENCIL_FAIL_KEEP |
+					       R200_STENCIL_ZPASS_KEEP |
+					       R200_STENCIL_ZFAIL_KEEP |
+					       R200_Z_WRITE_ENABLE);
+
+   if (rmesa->using_hyperz) {
+      rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= R200_Z_COMPRESSION_ENABLE |
+						  R200_Z_DECOMPRESSION_ENABLE;
+/*      if (rmesa->r200Screen->chip_family == CHIP_FAMILY_R200)
+	 rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_HIERARCHY_ENABLE;*/
+   }
+
+   rmesa->hw.ctx.cmd[CTX_PP_CNTL] = (R200_ANTI_ALIAS_NONE 
+ 				     | R200_TEX_BLEND_0_ENABLE);
+
+   rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] = color_fmt;
+   switch ( driQueryOptioni( &rmesa->optionCache, "dither_mode" ) ) {
+   case DRI_CONF_DITHER_XERRORDIFFRESET:
+      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= R200_DITHER_INIT;
+      break;
+   case DRI_CONF_DITHER_ORDERED:
+      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= R200_SCALE_DITHER_ENABLE;
+      break;
+   }
+   if ( driQueryOptioni( &rmesa->optionCache, "round_mode" ) ==
+	DRI_CONF_ROUND_ROUND )
+      rmesa->state.color.roundEnable = R200_ROUND_ENABLE;
+   else
+      rmesa->state.color.roundEnable = 0;
+   if ( driQueryOptioni (&rmesa->optionCache, "color_reduction" ) ==
+	DRI_CONF_COLOR_REDUCTION_DITHER )
+      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= R200_DITHER_ENABLE;
+   else
+      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= rmesa->state.color.roundEnable;
+
+#if 000
+   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET] = ((rmesa->state.color.drawOffset +
+					       rmesa->r200Screen->fbLocation)
+					      & R200_COLOROFFSET_MASK);
+
+   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = ((rmesa->state.color.drawPitch &
+					      R200_COLORPITCH_MASK) |
+					     R200_COLOR_ENDIAN_NO_SWAP);
+#else
+   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET] = ((drawOffset +
+					       rmesa->r200Screen->fbLocation)
+					      & R200_COLOROFFSET_MASK);
+
+   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = ((drawPitch &
+					      R200_COLORPITCH_MASK) |
+					     R200_COLOR_ENDIAN_NO_SWAP);
+#endif
+   /* (fixed size) sarea is initialized to zero afaics so can omit version check. Phew! */
+   if (rmesa->sarea->tiling_enabled) {
+      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= R200_COLOR_TILE_ENABLE;
+   }
+
+   rmesa->hw.prf.cmd[PRF_PP_TRI_PERF] = R200_TRI_CUTOFF_MASK - R200_TRI_CUTOFF_MASK * 
+			driQueryOptionf (&rmesa->optionCache,"texture_blend_quality");
+   rmesa->hw.prf.cmd[PRF_PP_PERF_CNTL] = 0;
+
+   rmesa->hw.set.cmd[SET_SE_CNTL] = (R200_FFACE_CULL_CCW |
+				     R200_BFACE_SOLID |
+				     R200_FFACE_SOLID |
+				     R200_FLAT_SHADE_VTX_LAST |
+				     R200_DIFFUSE_SHADE_GOURAUD |
+				     R200_ALPHA_SHADE_GOURAUD |
+				     R200_SPECULAR_SHADE_GOURAUD |
+				     R200_FOG_SHADE_GOURAUD |
+				     R200_DISC_FOG_SHADE_GOURAUD |
+				     R200_VTX_PIX_CENTER_OGL |
+				     R200_ROUND_MODE_TRUNC |
+				     R200_ROUND_PREC_8TH_PIX);
+
+   rmesa->hw.set.cmd[SET_RE_CNTL] = (R200_PERSPECTIVE_ENABLE |
+				     R200_SCISSOR_ENABLE);
+
+   rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] = ((1 << 16) | 0xffff);
+
+   rmesa->hw.lin.cmd[LIN_RE_LINE_STATE] = 
+      ((0 << R200_LINE_CURRENT_PTR_SHIFT) |
+       (1 << R200_LINE_CURRENT_COUNT_SHIFT));
+
+   rmesa->hw.lin.cmd[LIN_SE_LINE_WIDTH] = (1 << 4);
+
+   rmesa->hw.msk.cmd[MSK_RB3D_STENCILREFMASK] = 
+      ((0x00 << R200_STENCIL_REF_SHIFT) |
+       (0xff << R200_STENCIL_MASK_SHIFT) |
+       (0xff << R200_STENCIL_WRITEMASK_SHIFT));
+
+   rmesa->hw.msk.cmd[MSK_RB3D_ROPCNTL] = R200_ROP_COPY;
+   rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK] = 0xffffffff;
+
+   rmesa->hw.tam.cmd[TAM_DEBUG3] = 0;
+
+   rmesa->hw.msc.cmd[MSC_RE_MISC] = 
+      ((0 << R200_STIPPLE_X_OFFSET_SHIFT) |
+       (0 << R200_STIPPLE_Y_OFFSET_SHIFT) |
+       R200_STIPPLE_BIG_BIT_ORDER);
+
+
+   rmesa->hw.cst.cmd[CST_PP_CNTL_X] = 0;
+   rmesa->hw.cst.cmd[CST_RB3D_DEPTHXY_OFFSET] = 0;
+   rmesa->hw.cst.cmd[CST_RE_AUX_SCISSOR_CNTL] = 0x0;
+   rmesa->hw.cst.cmd[CST_RE_SCISSOR_TL_0] = 0;
+   rmesa->hw.cst.cmd[CST_RE_SCISSOR_BR_0] = 0;
+   rmesa->hw.cst.cmd[CST_SE_VAP_CNTL_STATUS] =
+#ifdef MESA_BIG_ENDIAN
+						R200_VC_32BIT_SWAP;
+#else
+						R200_VC_NO_SWAP;
+#endif
+
+   if (!(rmesa->r200Screen->chip_flags & RADEON_CHIPSET_TCL)) {
+      /* Bypass TCL */
+      rmesa->hw.cst.cmd[CST_SE_VAP_CNTL_STATUS] |= (1<<8);
+   }
+
+   rmesa->hw.cst.cmd[CST_RE_POINTSIZE] =
+      (((GLuint)(ctx->Const.MaxPointSize * 16.0)) << R200_MAXPOINTSIZE_SHIFT) | 0x10;
+   rmesa->hw.cst.cmd[CST_SE_TCL_INPUT_VTX_0] =
+      (0x0 << R200_VERTEX_POSITION_ADDR__SHIFT);
+   rmesa->hw.cst.cmd[CST_SE_TCL_INPUT_VTX_1] =
+      (0x02 << R200_VTX_COLOR_0_ADDR__SHIFT) |
+      (0x03 << R200_VTX_COLOR_1_ADDR__SHIFT);
+   rmesa->hw.cst.cmd[CST_SE_TCL_INPUT_VTX_2] =
+      (0x06 << R200_VTX_TEX_0_ADDR__SHIFT) |
+      (0x07 << R200_VTX_TEX_1_ADDR__SHIFT) |
+      (0x08 << R200_VTX_TEX_2_ADDR__SHIFT) |
+      (0x09 << R200_VTX_TEX_3_ADDR__SHIFT);
+   rmesa->hw.cst.cmd[CST_SE_TCL_INPUT_VTX_3] =
+      (0x0A << R200_VTX_TEX_4_ADDR__SHIFT) |
+      (0x0B << R200_VTX_TEX_5_ADDR__SHIFT);
+  
+
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_XSCALE]  = 0x00000000;
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_XOFFSET] = 0x00000000;
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_YSCALE]  = 0x00000000;
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_YOFFSET] = 0x00000000;
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_ZSCALE]  = 0x00000000;
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_ZOFFSET] = 0x00000000;
+
+   for ( i = 0 ; i < ctx->Const.MaxTextureUnits ; i++ ) {
+      rmesa->hw.tex[i].cmd[TEX_PP_TXFILTER] = R200_BORDER_MODE_OGL;
+      rmesa->hw.tex[i].cmd[TEX_PP_TXFORMAT] = 
+         ((i << R200_TXFORMAT_ST_ROUTE_SHIFT) |  /* <-- note i */
+          (2 << R200_TXFORMAT_WIDTH_SHIFT) |
+          (2 << R200_TXFORMAT_HEIGHT_SHIFT));
+      rmesa->hw.tex[i].cmd[TEX_PP_BORDER_COLOR] = 0;
+      rmesa->hw.tex[i].cmd[TEX_PP_TXFORMAT_X] =
+         (/* R200_TEXCOORD_PROJ | */
+          0x100000);	/* Small default bias */
+      if (rmesa->r200Screen->drmSupportsFragShader) {
+	 rmesa->hw.tex[i].cmd[TEX_PP_TXOFFSET_NEWDRM] =
+	     rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+	 rmesa->hw.tex[i].cmd[TEX_PP_CUBIC_FACES] = 0;
+	 rmesa->hw.tex[i].cmd[TEX_PP_TXMULTI_CTL] = 0;
+      }
+      else {
+	  rmesa->hw.tex[i].cmd[TEX_PP_TXOFFSET_OLDDRM] =
+	     rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+     }
+
+      rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_FACES] = 0;
+      rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_F1] =
+         rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+      rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_F2] =
+         rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+      rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_F3] =
+         rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+      rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_F4] =
+         rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+      rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_F5] =
+         rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+
+      rmesa->hw.pix[i].cmd[PIX_PP_TXCBLEND] =
+         (R200_TXC_ARG_A_ZERO |
+          R200_TXC_ARG_B_ZERO |
+          R200_TXC_ARG_C_DIFFUSE_COLOR |
+          R200_TXC_OP_MADD);
+
+      rmesa->hw.pix[i].cmd[PIX_PP_TXCBLEND2] =
+         ((i << R200_TXC_TFACTOR_SEL_SHIFT) |
+          R200_TXC_SCALE_1X |
+          R200_TXC_CLAMP_0_1 |
+          R200_TXC_OUTPUT_REG_R0);
+
+      rmesa->hw.pix[i].cmd[PIX_PP_TXABLEND] =
+         (R200_TXA_ARG_A_ZERO |
+          R200_TXA_ARG_B_ZERO |
+          R200_TXA_ARG_C_DIFFUSE_ALPHA |
+          R200_TXA_OP_MADD);
+
+      rmesa->hw.pix[i].cmd[PIX_PP_TXABLEND2] =
+         ((i << R200_TXA_TFACTOR_SEL_SHIFT) |
+          R200_TXA_SCALE_1X |
+          R200_TXA_CLAMP_0_1 |
+          R200_TXA_OUTPUT_REG_R0);
+   }
+
+   rmesa->hw.tf.cmd[TF_TFACTOR_0] = 0;
+   rmesa->hw.tf.cmd[TF_TFACTOR_1] = 0;
+   rmesa->hw.tf.cmd[TF_TFACTOR_2] = 0;
+   rmesa->hw.tf.cmd[TF_TFACTOR_3] = 0;
+   rmesa->hw.tf.cmd[TF_TFACTOR_4] = 0;
+   rmesa->hw.tf.cmd[TF_TFACTOR_5] = 0;
+
+   rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] = 
+      (R200_VAP_TCL_ENABLE | 
+       (0x9 << R200_VAP_VF_MAX_VTX_NUM__SHIFT));
+
+   rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL] = 
+      (R200_VPORT_X_SCALE_ENA |
+       R200_VPORT_Y_SCALE_ENA |
+       R200_VPORT_Z_SCALE_ENA |
+       R200_VPORT_X_OFFSET_ENA |
+       R200_VPORT_Y_OFFSET_ENA |
+       R200_VPORT_Z_OFFSET_ENA |
+/* FIXME: Turn on for tex rect only */
+       R200_VTX_ST_DENORMALIZED |  
+       R200_VTX_W0_FMT); 
+
+
+   rmesa->hw.vtx.cmd[VTX_VTXFMT_0] = 0;
+   rmesa->hw.vtx.cmd[VTX_VTXFMT_1] = 0;
+   rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] = 
+      ((R200_VTX_Z0 | R200_VTX_W0 |
+       (R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT)));	
+   rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] = 0;
+   rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] = (R200_OUTPUT_XYZW);
+   rmesa->hw.vtx.cmd[VTX_STATE_CNTL] = R200_VSC_UPDATE_USER_COLOR_0_ENABLE;
+						   
+
+   /* Matrix selection */
+   rmesa->hw.msl.cmd[MSL_MATRIX_SELECT_0] = 
+      (R200_MTX_MV << R200_MODELVIEW_0_SHIFT);
+   
+   rmesa->hw.msl.cmd[MSL_MATRIX_SELECT_1] = 
+       (R200_MTX_IMV << R200_IT_MODELVIEW_0_SHIFT);
+
+   rmesa->hw.msl.cmd[MSL_MATRIX_SELECT_2] = 
+      (R200_MTX_MVP << R200_MODELPROJECT_0_SHIFT);
+
+   rmesa->hw.msl.cmd[MSL_MATRIX_SELECT_3] = 
+      ((R200_MTX_TEX0 << R200_TEXMAT_0_SHIFT) |
+       (R200_MTX_TEX1 << R200_TEXMAT_1_SHIFT) |
+       (R200_MTX_TEX2 << R200_TEXMAT_2_SHIFT) |
+       (R200_MTX_TEX3 << R200_TEXMAT_3_SHIFT));
+
+   rmesa->hw.msl.cmd[MSL_MATRIX_SELECT_4] = 
+      ((R200_MTX_TEX4 << R200_TEXMAT_4_SHIFT) |
+       (R200_MTX_TEX5 << R200_TEXMAT_5_SHIFT));
+
+
+   /* General TCL state */
+   rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] = 
+      (R200_SPECULAR_LIGHTS |
+       R200_DIFFUSE_SPECULAR_COMBINE |
+       R200_LOCAL_LIGHT_VEC_GL |
+       R200_LM0_SOURCE_MATERIAL_0 << R200_FRONT_SHININESS_SOURCE_SHIFT |
+       R200_LM0_SOURCE_MATERIAL_1 << R200_BACK_SHININESS_SOURCE_SHIFT);
+
+   rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_1] = 
+      ((R200_LM1_SOURCE_MATERIAL_0 << R200_FRONT_EMISSIVE_SOURCE_SHIFT) |
+       (R200_LM1_SOURCE_MATERIAL_0 << R200_FRONT_AMBIENT_SOURCE_SHIFT) |
+       (R200_LM1_SOURCE_MATERIAL_0 << R200_FRONT_DIFFUSE_SOURCE_SHIFT) |
+       (R200_LM1_SOURCE_MATERIAL_0 << R200_FRONT_SPECULAR_SOURCE_SHIFT) |
+       (R200_LM1_SOURCE_MATERIAL_1 << R200_BACK_EMISSIVE_SOURCE_SHIFT) |
+       (R200_LM1_SOURCE_MATERIAL_1 << R200_BACK_AMBIENT_SOURCE_SHIFT) |
+       (R200_LM1_SOURCE_MATERIAL_1 << R200_BACK_DIFFUSE_SOURCE_SHIFT) |
+       (R200_LM1_SOURCE_MATERIAL_1 << R200_BACK_SPECULAR_SOURCE_SHIFT)); 
+
+   rmesa->hw.tcl.cmd[TCL_PER_LIGHT_CTL_0] = 0; /* filled in via callbacks */
+   rmesa->hw.tcl.cmd[TCL_PER_LIGHT_CTL_1] = 0;
+   rmesa->hw.tcl.cmd[TCL_PER_LIGHT_CTL_2] = 0;
+   rmesa->hw.tcl.cmd[TCL_PER_LIGHT_CTL_3] = 0;
+   
+   rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] = 
+      (R200_UCP_IN_CLIP_SPACE |
+       R200_CULL_FRONT_IS_CCW);
+
+   /* Texgen/Texmat state */
+   rmesa->hw.tcg.cmd[TCG_TEX_PROC_CTL_2] = 0x00ffffff;
+   rmesa->hw.tcg.cmd[TCG_TEX_PROC_CTL_3] = 
+      ((0 << R200_TEXGEN_0_INPUT_TEX_SHIFT) |
+       (1 << R200_TEXGEN_1_INPUT_TEX_SHIFT) |
+       (2 << R200_TEXGEN_2_INPUT_TEX_SHIFT) |
+       (3 << R200_TEXGEN_3_INPUT_TEX_SHIFT) |
+       (4 << R200_TEXGEN_4_INPUT_TEX_SHIFT) |
+       (5 << R200_TEXGEN_5_INPUT_TEX_SHIFT)); 
+   rmesa->hw.tcg.cmd[TCG_TEX_PROC_CTL_0] = 0; 
+   rmesa->hw.tcg.cmd[TCG_TEX_PROC_CTL_1] =  
+      ((0 << R200_TEXGEN_0_INPUT_SHIFT) |
+       (1 << R200_TEXGEN_1_INPUT_SHIFT) |
+       (2 << R200_TEXGEN_2_INPUT_SHIFT) |
+       (3 << R200_TEXGEN_3_INPUT_SHIFT) |
+       (4 << R200_TEXGEN_4_INPUT_SHIFT) |
+       (5 << R200_TEXGEN_5_INPUT_SHIFT)); 
+   rmesa->hw.tcg.cmd[TCG_TEX_CYL_WRAP_CTL] = 0;
+
+
+   for (i = 0 ; i < 8; i++) {
+      struct gl_light *l = &ctx->Light.Light[i];
+      GLenum p = GL_LIGHT0 + i;
+      *(float *)&(rmesa->hw.lit[i].cmd[LIT_RANGE_CUTOFF]) = FLT_MAX;
+
+      ctx->Driver.Lightfv( ctx, p, GL_AMBIENT, l->Ambient );
+      ctx->Driver.Lightfv( ctx, p, GL_DIFFUSE, l->Diffuse );
+      ctx->Driver.Lightfv( ctx, p, GL_SPECULAR, l->Specular );
+      ctx->Driver.Lightfv( ctx, p, GL_POSITION, NULL );
+      ctx->Driver.Lightfv( ctx, p, GL_SPOT_DIRECTION, NULL );
+      ctx->Driver.Lightfv( ctx, p, GL_SPOT_EXPONENT, &l->SpotExponent );
+      ctx->Driver.Lightfv( ctx, p, GL_SPOT_CUTOFF, &l->SpotCutoff );
+      ctx->Driver.Lightfv( ctx, p, GL_CONSTANT_ATTENUATION,
+			   &l->ConstantAttenuation );
+      ctx->Driver.Lightfv( ctx, p, GL_LINEAR_ATTENUATION, 
+			   &l->LinearAttenuation );
+      ctx->Driver.Lightfv( ctx, p, GL_QUADRATIC_ATTENUATION, 
+			   &l->QuadraticAttenuation );
+      *(float *)&(rmesa->hw.lit[i].cmd[LIT_ATTEN_XXX]) = 0.0;
+   }
+
+   ctx->Driver.LightModelfv( ctx, GL_LIGHT_MODEL_AMBIENT, 
+			     ctx->Light.Model.Ambient );
+
+   TNL_CONTEXT(ctx)->Driver.NotifyMaterialChange( ctx );
+
+   for (i = 0 ; i < 6; i++) {
+      ctx->Driver.ClipPlane( ctx, GL_CLIP_PLANE0 + i, NULL );
+   }
+
+   ctx->Driver.Fogfv( ctx, GL_FOG_MODE, NULL );
+   ctx->Driver.Fogfv( ctx, GL_FOG_DENSITY, &ctx->Fog.Density );
+   ctx->Driver.Fogfv( ctx, GL_FOG_START, &ctx->Fog.Start );
+   ctx->Driver.Fogfv( ctx, GL_FOG_END, &ctx->Fog.End );
+   ctx->Driver.Fogfv( ctx, GL_FOG_COLOR, ctx->Fog.Color );
+   ctx->Driver.Fogfv( ctx, GL_FOG_COORDINATE_SOURCE_EXT, NULL );
+   
+   rmesa->hw.grd.cmd[GRD_VERT_GUARD_CLIP_ADJ] = IEEE_ONE;
+   rmesa->hw.grd.cmd[GRD_VERT_GUARD_DISCARD_ADJ] = IEEE_ONE;
+   rmesa->hw.grd.cmd[GRD_HORZ_GUARD_CLIP_ADJ] = IEEE_ONE;
+   rmesa->hw.grd.cmd[GRD_HORZ_GUARD_DISCARD_ADJ] = IEEE_ONE;
+
+   rmesa->hw.eye.cmd[EYE_X] = 0;
+   rmesa->hw.eye.cmd[EYE_Y] = 0;
+   rmesa->hw.eye.cmd[EYE_Z] = IEEE_ONE;
+   rmesa->hw.eye.cmd[EYE_RESCALE_FACTOR] = IEEE_ONE;
+
+   rmesa->hw.spr.cmd[SPR_POINT_SPRITE_CNTL] =
+      R200_PS_SE_SEL_STATE | R200_PS_MULT_CONST;
+
+   /* ptp_eye is presumably used to calculate the attenuation wrt a different
+      location? In any case, since point attenuation triggers _needeyecoords,
+      it is constant. Probably ignored as long as R200_PS_USE_MODEL_EYE_VEC
+      isn't set */
+   rmesa->hw.ptp.cmd[PTP_EYE_X] = 0;
+   rmesa->hw.ptp.cmd[PTP_EYE_Y] = 0;
+   rmesa->hw.ptp.cmd[PTP_EYE_Z] = IEEE_ONE | 0x80000000; /* -1.0 */
+   rmesa->hw.ptp.cmd[PTP_EYE_3] = 0;
+   /* no idea what the ptp_vport_scale values are good for, except the
+      PTSIZE one - hopefully doesn't matter */
+   rmesa->hw.ptp.cmd[PTP_VPORT_SCALE_0] = IEEE_ONE;
+   rmesa->hw.ptp.cmd[PTP_VPORT_SCALE_1] = IEEE_ONE;
+   rmesa->hw.ptp.cmd[PTP_VPORT_SCALE_PTSIZE] = IEEE_ONE;
+   rmesa->hw.ptp.cmd[PTP_VPORT_SCALE_3] = IEEE_ONE;
+   rmesa->hw.ptp.cmd[PTP_ATT_CONST_QUAD] = 0;
+   rmesa->hw.ptp.cmd[PTP_ATT_CONST_LIN] = 0;
+   rmesa->hw.ptp.cmd[PTP_ATT_CONST_CON] = IEEE_ONE;
+   rmesa->hw.ptp.cmd[PTP_ATT_CONST_3] = 0;
+   rmesa->hw.ptp.cmd[PTP_CLAMP_MIN] = IEEE_ONE;
+   rmesa->hw.ptp.cmd[PTP_CLAMP_MAX] = 0x44ffe000; /* 2047 */
+   rmesa->hw.ptp.cmd[PTP_CLAMP_2] = 0;
+   rmesa->hw.ptp.cmd[PTP_CLAMP_3] = 0;
+
+   r200LightingSpaceChange( ctx );
+
+   rmesa->hw.all_dirty = GL_TRUE;
+}
diff --git a/r200/r200_swtcl.c b/r200/r200_swtcl.c
new file mode 100644
index 0000000..25d229d
--- /dev/null
+++ b/r200/r200_swtcl.c
@@ -0,0 +1,979 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_swtcl.c,v 1.5 2003/05/06 23:52:08 daenzer Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "mtypes.h"
+#include "colormac.h"
+#include "enums.h"
+#include "image.h"
+#include "imports.h"
+#include "macros.h"
+
+#include "swrast/s_context.h"
+#include "swrast/s_fog.h"
+#include "swrast_setup/swrast_setup.h"
+#include "math/m_translate.h"
+#include "tnl/tnl.h"
+#include "tnl/t_context.h"
+#include "tnl/t_pipeline.h"
+
+#include "r200_context.h"
+#include "r200_ioctl.h"
+#include "r200_state.h"
+#include "r200_swtcl.h"
+#include "r200_tcl.h"
+
+
+static void flush_last_swtcl_prim( r200ContextPtr rmesa  );
+
+
+/***********************************************************************
+ *                         Initialization 
+ ***********************************************************************/
+
+/* Append one attribute descriptor to rmesa->swtcl.vertex_attrs (advancing
+ * vertex_attr_count) and OR the hardware vertex-format bits F0 into fmt_0.
+ * Relies on `rmesa` and `fmt_0` being in scope at the expansion site.
+ */
+#define EMIT_ATTR( ATTR, STYLE, F0 )					\
+do {									\
+   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = (ATTR);	\
+   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = (STYLE);	\
+   rmesa->swtcl.vertex_attr_count++;					\
+   fmt_0 |= F0;								\
+} while (0)
+
+/* Append a padding entry: attrib 0, format EMIT_PAD, with N stored in the
+ * entry's offset field (presumably the pad size in bytes -- confirm against
+ * t_vertex.c).  No fmt_0 bits are added.  Relies on `rmesa` being in scope.
+ */
+#define EMIT_PAD( N )							\
+do {									\
+   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = 0;		\
+   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = EMIT_PAD;	\
+   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].offset = (N);		\
+   rmesa->swtcl.vertex_attr_count++;					\
+} while (0)
+
+/**
+ * Build the swtcl vertex layout for the current render inputs.
+ *
+ * Fills rmesa->swtcl.vertex_attrs in emission order, accumulating the
+ * VTXFMT_0 bits in fmt_0 (through EMIT_ATTR) and the per-unit texcoord
+ * sizes in fmt_1.  When the render inputs or either format word differ
+ * from the cached values, the vtx state atom is updated and the layout is
+ * installed with _tnl_install_attrs().
+ *
+ * \param ctx  GL context; the r200 and TNL contexts are derived from it.
+ */
+static void r200SetVertexFormat( GLcontext *ctx )
+{
+   r200ContextPtr rmesa = R200_CONTEXT( ctx );
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   struct vertex_buffer *VB = &tnl->vb;
+   DECLARE_RENDERINPUTS(index_bitset);
+   int fmt_0 = 0;	/* written by EMIT_ATTR */
+   int fmt_1 = 0;
+   int offset = 0;	/* running slot index: a 4-float position advances it by 4, a packed color by 1 */
+
+   RENDERINPUTS_COPY( index_bitset, tnl->render_inputs_bitset );
+
+   /* Important: source the position attribute from the NDC coordinates
+    * when they exist, otherwise from the clip coordinates.
+    */
+   if ( VB->NdcPtr != NULL ) {
+      VB->AttribPtr[VERT_ATTRIB_POS] = VB->NdcPtr;
+   }
+   else {
+      VB->AttribPtr[VERT_ATTRIB_POS] = VB->ClipPtr;
+   }
+
+   assert( VB->AttribPtr[VERT_ATTRIB_POS] != NULL );
+   rmesa->swtcl.vertex_attr_count = 0;
+
+   /* EMIT_ATTR's must be in order as they tell t_vertex.c how to
+    * build up a hardware vertex.
+    */
+   if ( !rmesa->swtcl.needproj ||
+       RENDERINPUTS_TEST_RANGE( index_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) { /* need w coord for projected textures */
+      EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_4F, R200_VTX_XY | R200_VTX_Z0 | R200_VTX_W0 );
+      offset = 4;
+   }
+   else {
+      EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_3F, R200_VTX_XY | R200_VTX_Z0 );
+      offset = 3;
+   }
+
+   if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_POINTSIZE )) {
+      EMIT_ATTR( _TNL_ATTRIB_POINTSIZE, EMIT_1F, R200_VTX_POINT_SIZE );
+      offset += 1;
+   }
+
+   /* Primary color is always emitted; remember which slot it lands in. */
+   rmesa->swtcl.coloroffset = offset;
+#if MESA_LITTLE_ENDIAN 
+   EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_RGBA, (R200_VTX_PK_RGBA << R200_VTX_COLOR_0_SHIFT) );
+#else
+   EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_ABGR, (R200_VTX_PK_RGBA << R200_VTX_COLOR_0_SHIFT) );
+#endif
+   offset += 1;
+
+   /* specoffset stays 0 unless a secondary color is actually emitted. */
+   rmesa->swtcl.specoffset = 0;
+   if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 ) ||
+       RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_FOG )) {
+
+      /* Secondary color (3 ubytes) and fog (1 ubyte) are emitted as a
+       * pair; whichever input is absent is replaced by EMIT_PAD(3) or
+       * EMIT_PAD(1).  The two halves are emitted in opposite order on
+       * little- and big-endian builds.
+       */
+#if MESA_LITTLE_ENDIAN 
+      if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 )) {
+	 rmesa->swtcl.specoffset = offset;
+	 EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_3UB_3F_RGB, (R200_VTX_PK_RGBA << R200_VTX_COLOR_1_SHIFT) );
+      }
+      else {
+	 EMIT_PAD( 3 );
+      }
+
+      if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_FOG )) {
+	 EMIT_ATTR( _TNL_ATTRIB_FOG, EMIT_1UB_1F, (R200_VTX_PK_RGBA << R200_VTX_COLOR_1_SHIFT) );
+      }
+      else {
+	 EMIT_PAD( 1 );
+      }
+#else
+      if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_FOG )) {
+	 EMIT_ATTR( _TNL_ATTRIB_FOG, EMIT_1UB_1F, (R200_VTX_PK_RGBA << R200_VTX_COLOR_1_SHIFT) );
+      }
+      else {
+	 EMIT_PAD( 1 );
+      }
+
+      if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 )) {
+	 rmesa->swtcl.specoffset = offset;
+	 EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_3UB_3F_BGR, (R200_VTX_PK_RGBA << R200_VTX_COLOR_1_SHIFT) );
+      }
+      else {
+	 EMIT_PAD( 3 );
+      }
+#endif
+   }
+
+   if (RENDERINPUTS_TEST_RANGE( index_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) {
+      int i;
+
+      for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+	 if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX(i) )) {
+	    GLuint sz = VB->TexCoordPtr[i]->size;
+
+	    /* Unit i's coordinate count goes into fmt_1 at bit 3*i; the
+	     * emit style is picked as EMIT_1F + (sz - 1).
+	     */
+	    fmt_1 |= sz << (3 * i);
+	    EMIT_ATTR( _TNL_ATTRIB_TEX0+i, EMIT_1F + sz - 1, 0 );
+	 }
+      }
+   }
+
+   /* Switch the fog source select to R200_FOG_USE_SPEC_ALPHA unless it is
+    * already set that way.
+    */
+   if ( (rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] & R200_FOG_USE_MASK)
+      != R200_FOG_USE_SPEC_ALPHA ) {
+      R200_STATECHANGE( rmesa, ctx );
+      rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] &= ~R200_FOG_USE_MASK;
+      rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] |= R200_FOG_USE_SPEC_ALPHA;
+   }
+
+   /* Reinstall the layout only when the inputs or format words changed. */
+   if (!RENDERINPUTS_EQUAL( rmesa->tnl_index_bitset, index_bitset ) ||
+	(rmesa->hw.vtx.cmd[VTX_VTXFMT_0] != fmt_0) ||
+	(rmesa->hw.vtx.cmd[VTX_VTXFMT_1] != fmt_1) ) {
+      R200_NEWPRIM(rmesa);
+      R200_STATECHANGE( rmesa, vtx );
+      rmesa->hw.vtx.cmd[VTX_VTXFMT_0] = fmt_0;
+      rmesa->hw.vtx.cmd[VTX_VTXFMT_1] = fmt_1;
+
+      rmesa->swtcl.vertex_size =
+	  _tnl_install_attrs( ctx,
+			      rmesa->swtcl.vertex_attrs, 
+			      rmesa->swtcl.vertex_attr_count,
+			      NULL, 0 );
+      /* presumably bytes -> dwords; other code multiplies vertex_size by 4 */
+      rmesa->swtcl.vertex_size /= 4;
+      RENDERINPUTS_COPY( rmesa->tnl_index_bitset, index_bitset );
+   }
+}
+
+
+/* Render-start hook: refresh the swtcl vertex format, then run any pending
+ * dma flush callback that is not the swtcl one.
+ */
+static void r200RenderStart( GLcontext *ctx )
+{
+   r200ContextPtr rmesa = R200_CONTEXT( ctx );
+
+   r200SetVertexFormat( ctx );
+
+   /* No callback installed, or it is our own swtcl flush: leave it be. */
+   if (rmesa->dma.flush == 0 || rmesa->dma.flush == flush_last_swtcl_prim)
+      return;
+
+   rmesa->dma.flush( rmesa );
+}
+
+
+/**
+ * Choose the vertex state used for SW TCL.
+ *
+ * Decides ahead of time whether Mesa or the hardware performs the
+ * projection divide, records the choice in swtcl.needproj, and updates
+ * SE_VTE_CNTL / SE_VAP_CNTL when their values change.
+ */
+void r200ChooseVertexState( GLcontext *ctx )
+{
+   r200ContextPtr rmesa = R200_CONTEXT( ctx );
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   GLuint vte_cntl, vap_cntl;
+   GLboolean have_tex, sw_project;
+
+   /* _tnl_need_projected_coords must not be called during a rasterization
+    * fallback.  This function runs again when the fallback ends, so
+    * simply bail out for now.
+    */
+   if (rmesa->Fallback != 0)
+      return;
+
+   vte_cntl = rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL];
+   vap_cntl = rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL];
+
+   have_tex = RENDERINPUTS_TEST_RANGE( tnl->render_inputs_bitset,
+				       _TNL_FIRST_TEX, _TNL_LAST_TEX )
+      ? GL_TRUE : GL_FALSE;
+
+   /* HW perspective divide is a win, but tiny vertex formats are a
+    * bigger one: project in software when there are no textures, or when
+    * two-sided lighting / unfilled triangles are active.
+    */
+   sw_project = (!have_tex ||
+		 (ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE|DD_TRI_UNFILLED)))
+      ? GL_TRUE : GL_FALSE;
+
+   rmesa->swtcl.needproj = sw_project;
+
+   if (sw_project)
+      vte_cntl = (vte_cntl | R200_VTX_XY_FMT | R200_VTX_Z_FMT) & ~R200_VTX_W0_FMT;
+   else
+      vte_cntl = (vte_cntl & ~(R200_VTX_XY_FMT | R200_VTX_Z_FMT)) | R200_VTX_W0_FMT;
+
+   /* W is forced to one only for software projection without textures. */
+   if (sw_project && !have_tex)
+      vap_cntl |= R200_VAP_FORCE_W_TO_ONE;
+   else
+      vap_cntl &= ~R200_VAP_FORCE_W_TO_ONE;
+
+   _tnl_need_projected_coords( ctx, rmesa->swtcl.needproj );
+
+   if (rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL] != vte_cntl) {
+      R200_STATECHANGE( rmesa, vte );
+      rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL] = vte_cntl;
+   }
+
+   if (rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] != vap_cntl) {
+      R200_STATECHANGE( rmesa, vap );
+      rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] = vap_cntl;
+   }
+}
+
+
+/* Emit the swtcl vertices pending in the current dma region, then mark
+ * that part of the region as consumed.  Also clears the dma flush callback.
+ */
+static void flush_last_swtcl_prim( r200ContextPtr rmesa  )
+{
+   struct r200_dma_region *region = &rmesa->dma.current;
+   GLuint gart_offset;
+
+   if (R200_DEBUG & DEBUG_IOCTL)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+   rmesa->dma.flush = NULL;
+
+   /* No buffer attached to the current region: nothing to emit. */
+   if (!region->buf)
+      return;
+
+   gart_offset = (rmesa->r200Screen->gart_buffer_offset +
+		  region->buf->buf->idx * RADEON_BUFFER_SIZE +
+		  region->start);
+
+   assert (!(rmesa->swtcl.hw_primitive & R200_VF_PRIM_WALK_IND));
+
+   assert (region->start +
+	   rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
+	   region->ptr);
+
+   if (region->start != region->ptr) {
+      r200EnsureCmdBufSpace( rmesa, VERT_AOS_BUFSZ +
+			     rmesa->hw.max_state_size + VBUF_BUFSZ );
+      r200EmitVertexAOS( rmesa,
+			 rmesa->swtcl.vertex_size,
+			 gart_offset);
+
+      r200EmitVbufPrim( rmesa,
+			rmesa->swtcl.hw_primitive,
+			rmesa->swtcl.numverts);
+   }
+
+   rmesa->swtcl.numverts = 0;
+   region->start = region->ptr;
+}
+
+
+/* Reserve space for nverts vertices of vsize bytes each in the current dma
+ * region (vsize is asserted to equal swtcl.vertex_size * 4) and return a
+ * pointer to the start of the reserved space.
+ */
+static INLINE void *
+r200AllocDmaLowVerts( r200ContextPtr rmesa, int nverts, int vsize )
+{
+   const GLuint nbytes = vsize * nverts;
+   GLubyte *dest;
+
+   /* Request would run past the end of the region: refill it first. */
+   if ( rmesa->dma.current.end < rmesa->dma.current.ptr + nbytes )
+      r200RefillCurrentDmaRegion( rmesa );
+
+   /* Make sure the stored vertices get flushed later on. */
+   if (rmesa->dma.flush == NULL) {
+      rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
+      rmesa->dma.flush = flush_last_swtcl_prim;
+   }
+
+   ASSERT( vsize == rmesa->swtcl.vertex_size * 4 );
+   ASSERT( rmesa->dma.flush == flush_last_swtcl_prim );
+   ASSERT( rmesa->dma.current.start +
+	   rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
+	   rmesa->dma.current.ptr );
+
+   dest = (GLubyte *) (rmesa->dma.current.address + rmesa->dma.current.ptr);
+   rmesa->swtcl.numverts += nverts;
+   rmesa->dma.current.ptr += nbytes;
+
+   return dest;
+}
+
+
+/**************************************************************************/
+
+
+/* Map a GL primitive type onto the reduced hardware primitive (points or
+ * point sprites, lines, triangles) used to rasterize it.
+ */
+static INLINE GLuint reduced_hw_prim( GLcontext *ctx, GLuint prim)
+{
+   if (prim == GL_POINTS) {
+      /* Point sprites are used when explicitly enabled, or for sized /
+       * attenuated points that are not smoothed.
+       */
+      if (ctx->Point.PointSprite)
+	 return R200_VF_PRIM_POINT_SPRITES;
+
+      if ((ctx->_TriangleCaps & (DD_POINT_SIZE | DD_POINT_ATTEN)) &&
+	  !(ctx->_TriangleCaps & (DD_POINT_SMOOTH)))
+	 return R200_VF_PRIM_POINT_SPRITES;
+
+      return R200_VF_PRIM_POINTS;
+   }
+
+   if (prim == GL_LINES || prim == GL_LINE_LOOP || prim == GL_LINE_STRIP)
+      return R200_VF_PRIM_LINES;
+
+   /* all others reduced to triangles */
+   return R200_VF_PRIM_TRIANGLES;
+}
+
+
+static void r200RasterPrimitive( GLcontext *ctx, GLuint hwprim );
+static void r200RenderPrimitive( GLcontext *ctx, GLenum prim );
+static void r200ResetLineStipple( GLcontext *ctx );
+
+/***********************************************************************
+ *                    Emit primitives as inline vertices               *
+ ***********************************************************************/
+
+/* Capability flags (1 = yes, 0 = no), one per primitive type.
+ * NOTE(review): their exact meaning is defined by the tnl_dd template
+ * headers, which are not part of this file.
+ */
+#define HAVE_POINTS      1
+#define HAVE_LINES       1
+#define HAVE_LINE_STRIPS 1
+#define HAVE_TRIANGLES   1
+#define HAVE_TRI_STRIPS  1
+#define HAVE_TRI_STRIP_1 0
+#define HAVE_TRI_FANS    1
+#define HAVE_QUADS       0
+#define HAVE_QUAD_STRIPS 0
+#define HAVE_POLYGONS    1
+#define HAVE_ELTS        0
+
+/* Parameters for tnl_dd/t_dd_triemit.h.  ALLOC_VERTS multiplies the size
+ * it is given by 4 before calling r200AllocDmaLowVerts, which asserts its
+ * vsize argument equals swtcl.vertex_size * 4.  VERT() indexes the swtcl
+ * vertex store and expects a `vertsize` variable at the expansion site.
+ */
+#undef LOCAL_VARS
+#undef ALLOC_VERTS
+#define CTX_ARG r200ContextPtr rmesa
+#define GET_VERTEX_DWORDS() rmesa->swtcl.vertex_size
+#define ALLOC_VERTS( n, size ) r200AllocDmaLowVerts( rmesa, n, size * 4 )
+#define LOCAL_VARS						\
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
+   const char *r200verts = (char *)rmesa->swtcl.verts;
+#define VERT(x) (r200Vertex *)(r200verts + ((x) * vertsize * sizeof(int)))
+#define VERTEX r200Vertex 
+#define DO_DEBUG_VERTS (1 && (R200_DEBUG & DEBUG_VERTS))
+
+/* Names produced by the template get an r200_ prefix; the code below calls
+ * r200_point, r200_line, r200_triangle and r200_quad.
+ */
+#undef TAG
+#define TAG(x) r200_##x
+#include "tnl_dd/t_dd_triemit.h"
+
+
+/***********************************************************************
+ *          Macros for t_dd_tritmp.h to draw basic primitives          *
+ ***********************************************************************/
+
+/* Each primitive macro forwards to the matching r200_* emit function and
+ * expects `rmesa` to be in scope.
+ */
+#define QUAD( a, b, c, d ) r200_quad( rmesa, a, b, c, d )
+#define TRI( a, b, c )     r200_triangle( rmesa, a, b, c )
+#define LINE( a, b )       r200_line( rmesa, a, b )
+#define POINT( a )         r200_point( rmesa, a )
+
+/***********************************************************************
+ *              Build render functions from dd templates               *
+ ***********************************************************************/
+
+/* Bits that make up an index into rast_tab; R200_MAX_TRIFUNC is the table
+ * size (one entry per combination of the two bits).
+ */
+#define R200_TWOSIDE_BIT	0x01
+#define R200_UNFILLED_BIT	0x02
+#define R200_MAX_TRIFUNC	0x04
+
+
+/* Per-variant rasterization entry points, indexed by the bits above and
+ * read by r200ChooseRenderState().
+ */
+static struct {
+   tnl_points_func	        points;
+   tnl_line_func		line;
+   tnl_triangle_func	triangle;
+   tnl_quad_func		quad;
+} rast_tab[R200_MAX_TRIFUNC];
+
+
+/* Feature switches for t_dd_tritmp.h.  DO_UNFILLED and DO_TWOSIDE depend
+ * on IND, which is defined separately before each inclusion of the
+ * template; the remaining switches are fixed.
+ */
+#define DO_FALLBACK  0
+#define DO_UNFILLED (IND & R200_UNFILLED_BIT)
+#define DO_TWOSIDE  (IND & R200_TWOSIDE_BIT)
+#define DO_FLAT      0
+#define DO_OFFSET     0
+#define DO_TRI       1
+#define DO_QUAD      1
+#define DO_LINE      1
+#define DO_POINTS    1
+#define DO_FULL_QUAD 1
+
+#define HAVE_RGBA   1
+#define HAVE_SPEC   1
+#define HAVE_BACK_COLORS  0
+#define HAVE_HW_FLATSHADE 1
+#define TAB rast_tab
+
+/* Vertex accessors and geometry helpers consumed by tnl_dd/t_dd_tritmp.h.
+ * Every macro argument is parenthesized so that compound expressions
+ * (e.g. VERT_X(p + 1) or GET_VERTEX(i + 1)) expand as intended.
+ * GET_VERTEX expects `rmesa` to be in scope.
+ */
+#define DEPTH_SCALE 1.0
+#define UNFILLED_TRI unfilled_tri
+#define UNFILLED_QUAD unfilled_quad
+#define VERT_X(_v) (_v)->v.x
+#define VERT_Y(_v) (_v)->v.y
+#define VERT_Z(_v) (_v)->v.z
+#define AREA_IS_CCW( a ) ((a) < 0)
+#define GET_VERTEX(e) (rmesa->swtcl.verts + ((e)*rmesa->swtcl.vertex_size*sizeof(int)))
+
+/* Convert the four float components of c to ubytes and store them in the
+ * color slot of vertex v (ui[coloroffset]).  Expects `coloroffset` in scope.
+ */
+#define VERT_SET_RGBA( v, c )  					\
+do {								\
+   r200_color_t *color = (r200_color_t *)&((v)->ui[coloroffset]);	\
+   UNCLAMPED_FLOAT_TO_UBYTE(color->red, (c)[0]);		\
+   UNCLAMPED_FLOAT_TO_UBYTE(color->green, (c)[1]);		\
+   UNCLAMPED_FLOAT_TO_UBYTE(color->blue, (c)[2]);		\
+   UNCLAMPED_FLOAT_TO_UBYTE(color->alpha, (c)[3]);		\
+} while (0)
+
+/* Copy the whole color slot from v1 to v0. */
+#define VERT_COPY_RGBA( v0, v1 ) v0->ui[coloroffset] = v1->ui[coloroffset]
+
+/* Store the red/green/blue components of c in the specular slot of v.
+ * Does nothing when specoffset is 0; the alpha member of the slot is left
+ * untouched.  Expects `specoffset` in scope.
+ */
+#define VERT_SET_SPEC( v, c )					\
+do {								\
+   if (specoffset) {						\
+      r200_color_t *spec = (r200_color_t *)&((v)->ui[specoffset]);	\
+      UNCLAMPED_FLOAT_TO_UBYTE(spec->red, (c)[0]);	\
+      UNCLAMPED_FLOAT_TO_UBYTE(spec->green, (c)[1]);	\
+      UNCLAMPED_FLOAT_TO_UBYTE(spec->blue, (c)[2]);	\
+   }								\
+} while (0)
+/* Copy only red/green/blue of the specular slot from v1 to v0; does
+ * nothing when specoffset is 0.
+ */
+#define VERT_COPY_SPEC( v0, v1 )			\
+do {							\
+   if (specoffset) {					\
+      r200_color_t *spec0 = (r200_color_t *)&((v0)->ui[specoffset]);	\
+      r200_color_t *spec1 = (r200_color_t *)&((v1)->ui[specoffset]);	\
+      spec0->red   = spec1->red;	\
+      spec0->green = spec1->green;	\
+      spec0->blue  = spec1->blue; 	\
+   }							\
+} while (0)
+
+/* These don't need LE32_TO_CPU() as they are only used to save and restore
+ * colors which are already in the correct format.
+ */
+/* Save / restore the raw color and specular slots of vertex v[idx] using
+ * the color[] and spec[] scratch arrays declared by LOCAL_VARS(n) below.
+ * The specular variants do nothing when specoffset is 0.
+ */
+#define VERT_SAVE_RGBA( idx )    color[idx] = v[idx]->ui[coloroffset]
+#define VERT_RESTORE_RGBA( idx ) v[idx]->ui[coloroffset] = color[idx]
+#define VERT_SAVE_SPEC( idx )    if (specoffset) spec[idx] = v[idx]->ui[specoffset]
+#define VERT_RESTORE_SPEC( idx ) if (specoffset) v[idx]->ui[specoffset] = spec[idx]
+
+#undef LOCAL_VARS
+#undef TAG
+#undef INIT
+
+/* Locals for the template-generated functions: the driver context, n-entry
+ * scratch arrays for saved colors, and the cached slot offsets.  The
+ * (void) casts silence unused-variable warnings in variants that do not
+ * use all of them.
+ */
+#define LOCAL_VARS(n)							\
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);			\
+   GLuint color[n], spec[n];						\
+   GLuint coloroffset = rmesa->swtcl.coloroffset;	\
+   GLuint specoffset = rmesa->swtcl.specoffset;			\
+   (void) color; (void) spec; (void) coloroffset; (void) specoffset;
+
+/***********************************************************************
+ *                Helpers for rendering unfilled primitives            *
+ ***********************************************************************/
+
+/* RASTERIZE switches the hardware to the reduced primitive for GL
+ * primitive x; RENDER_PRIMITIVE names the field holding the current GL
+ * primitive.  Both are parameters of tnl_dd/t_dd_unfilled.h.
+ */
+#define RASTERIZE(x) r200RasterPrimitive( ctx, reduced_hw_prim(ctx, x) )
+#define RENDER_PRIMITIVE rmesa->swtcl.render_primitive
+#undef TAG
+#define TAG(x) x
+#include "tnl_dd/t_dd_unfilled.h"
+#undef IND
+
+
+/***********************************************************************
+ *                      Generate GL render functions                   *
+ ***********************************************************************/
+
+
+/* t_dd_tritmp.h is included once per combination of the two-side and
+ * unfilled bits.  IND selects the variant and TAG gives the generated
+ * names a matching suffix (init, init_twoside, init_unfilled,
+ * init_twoside_unfilled are called from init_rast_tab()).
+ */
+#define IND (0)
+#define TAG(x) x
+#include "tnl_dd/t_dd_tritmp.h"
+
+#define IND (R200_TWOSIDE_BIT)
+#define TAG(x) x##_twoside
+#include "tnl_dd/t_dd_tritmp.h"
+
+#define IND (R200_UNFILLED_BIT)
+#define TAG(x) x##_unfilled
+#include "tnl_dd/t_dd_tritmp.h"
+
+#define IND (R200_TWOSIDE_BIT|R200_UNFILLED_BIT)
+#define TAG(x) x##_twoside_unfilled
+#include "tnl_dd/t_dd_tritmp.h"
+
+
+/* Run the init function of each t_dd_tritmp.h instantiation above.
+ * NOTE(review): each one presumably fills its own rast_tab[IND] entry --
+ * confirm against tnl_dd/t_dd_tritmp.h.
+ */
+static void init_rast_tab( void )
+{
+   init();
+   init_twoside();
+   init_unfilled();
+   init_twoside_unfilled();
+}
+
+/**********************************************************************/
+/*               Render unclipped begin/end objects                   */
+/**********************************************************************/
+
+/* Primitive emitters for tnl/t_vb_rendertmp.h: each one looks vertices up
+ * with VERT() and hands them to the r200_* emit functions.
+ */
+#define RENDER_POINTS( start, count )		\
+   for ( ; start < count ; start++)		\
+      r200_point( rmesa, VERT(start) )
+#define RENDER_LINE( v0, v1 ) \
+   r200_line( rmesa, VERT(v0), VERT(v1) )
+#define RENDER_TRI( v0, v1, v2 )  \
+   r200_triangle( rmesa, VERT(v0), VERT(v1), VERT(v2) )
+#define RENDER_QUAD( v0, v1, v2, v3 ) \
+   r200_quad( rmesa, VERT(v0), VERT(v1), VERT(v2), VERT(v3) )
+/* INIT selects GL primitive x through r200RenderPrimitive. */
+#define INIT(x) do {					\
+   r200RenderPrimitive( ctx, x );			\
+} while (0)
+#undef LOCAL_VARS
+/* Locals for the generated render functions; elt and stipple are cast to
+ * void because not every generated function uses them.
+ */
+#define LOCAL_VARS						\
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
+   const GLuint vertsize = rmesa->swtcl.vertex_size;		\
+   const char *r200verts = (char *)rmesa->swtcl.verts;		\
+   const GLuint * const elt = TNL_CONTEXT(ctx)->vb.Elts;	\
+   const GLboolean stipple = ctx->Line.StippleFlag;		\
+   (void) elt; (void) stipple;
+#define RESET_STIPPLE	if ( stipple ) r200ResetLineStipple( ctx );
+#define RESET_OCCLUSION
+#define PRESERVE_VB_DEFS
+/* First instantiation: vertices addressed directly (ELT is the identity);
+ * generated names end in _verts.
+ */
+#define ELT(x) (x)
+#define TAG(x) r200_##x##_verts
+#include "tnl/t_vb_rendertmp.h"
+#undef ELT
+#undef TAG
+/* Second instantiation: vertices addressed through the element array;
+ * generated names end in _elts.
+ */
+#define TAG(x) r200_##x##_elts
+#define ELT(x) elt[x]
+#include "tnl/t_vb_rendertmp.h"
+
+
+
+/**********************************************************************/
+/*                    Choose render functions                         */
+/**********************************************************************/
+
+void r200ChooseRenderState( GLcontext *ctx )
+{
+   /* Pick the rast_tab[] row and primitive tables for the current two-sided / unfilled state. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   const GLuint caps = ctx->_TriangleCaps;
+   GLuint idx = 0;
+
+   /* Leave the hooks alone unless a TCL fallback is set and no rasterization fallback is. */
+   if (rmesa->Fallback || !rmesa->TclFallback)
+      return;
+
+   if (caps & DD_TRI_UNFILLED)      idx |= R200_UNFILLED_BIT;
+   if (caps & DD_TRI_LIGHT_TWOSIDE) idx |= R200_TWOSIDE_BIT;
+   if (idx == rmesa->swtcl.RenderIndex)
+      return;			/* hooks already match this state */
+
+   tnl->Driver.Render.Quad = rast_tab[idx].quad;
+   tnl->Driver.Render.Triangle = rast_tab[idx].triangle;
+   tnl->Driver.Render.ClippedLine = rast_tab[idx].line;
+   tnl->Driver.Render.Line = rast_tab[idx].line;
+   tnl->Driver.Render.Points = rast_tab[idx].points;
+   if (idx != 0) {		/* two-sided and/or unfilled: generic tnl tables */
+      tnl->Driver.Render.ClippedPolygon = _tnl_RenderClippedPolygon;
+      tnl->Driver.Render.PrimTabElts = _tnl_render_tab_elts;
+      tnl->Driver.Render.PrimTabVerts = _tnl_render_tab_verts;
+   } else {			/* plain state: the driver's own tables */
+      tnl->Driver.Render.ClippedPolygon = r200_fast_clipped_poly;
+      tnl->Driver.Render.PrimTabElts = r200_render_tab_elts;
+      tnl->Driver.Render.PrimTabVerts = r200_render_tab_verts;
+   }
+   rmesa->swtcl.RenderIndex = idx;
+}
+
+
+/**********************************************************************/
+/*                 High level hooks for t_vb_render.c                 */
+/**********************************************************************/
+
+
+static void r200RasterPrimitive( GLcontext *ctx, GLuint hwprim )
+{
+   /* Switch the swtcl hw primitive; perspective correction is off only for point sprites. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLboolean sprite, persp;
+
+   if (rmesa->swtcl.hw_primitive == hwprim)
+      return;
+   sprite = (hwprim & 0xf) == R200_VF_PRIM_POINT_SPRITES && ctx->Point.PointSprite;
+   persp = (rmesa->hw.set.cmd[SET_RE_CNTL] & R200_PERSPECTIVE_ENABLE) != 0;
+   if (sprite && persp) {
+      R200_STATECHANGE( rmesa, set );
+      rmesa->hw.set.cmd[SET_RE_CNTL] &= ~R200_PERSPECTIVE_ENABLE;
+   } else if (!sprite && !persp) {
+      R200_STATECHANGE( rmesa, set );
+      rmesa->hw.set.cmd[SET_RE_CNTL] |= R200_PERSPECTIVE_ENABLE;
+   }
+   R200_NEWPRIM( rmesa );
+   rmesa->swtcl.hw_primitive = hwprim;
+}
+
+static void r200RenderPrimitive( GLcontext *ctx, GLenum prim )
+{
+   /* Record the GL primitive; set the hw primitive now unless it is a triangle-class one drawn unfilled. */
+   const GLboolean deferred = (prim >= GL_TRIANGLES) && (ctx->_TriangleCaps & DD_TRI_UNFILLED);
+   R200_CONTEXT(ctx)->swtcl.render_primitive = prim;
+   if (!deferred) r200RasterPrimitive( ctx, reduced_hw_prim(ctx, prim) );
+}
+
+static void r200RenderFinish( GLcontext *ctx )
+{  /* Intentionally empty: installed as tnl->Driver.Render.Finish, with nothing to do at the end of rendering. */
+}
+
+static void r200ResetLineStipple( GLcontext *ctx )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   R200_STATECHANGE( rmesa, lin );	/* flag the "lin" state atom as changed; presumably re-emitting it restarts the stipple pattern -- TODO confirm */
+}
+
+
+/**********************************************************************/
+/*           Transition to/from hardware rasterization.               */
+/**********************************************************************/
+
+static const char * const fallbackStrings[] = {
+   "Texture mode",
+   "glDrawBuffer(GL_FRONT_AND_BACK)",
+   "glEnable(GL_STENCIL) without hw stencil buffer",
+   "glRenderMode(selection or feedback)",
+   "R200_NO_RAST",
+   "Mixing GL_CLAMP_TO_BORDER and GL_CLAMP (or GL_MIRROR_CLAMP_ATI)"
+};
+
+/* Describe a single-bit rasterization fallback mask: bit N maps to
+ * fallbackStrings[N]; a bit past the end of the table yields a placeholder.
+ */
+static const char *getFallbackString(GLuint bit)
+{
+   GLuint i = 0;
+   const GLuint count = sizeof(fallbackStrings) / sizeof(fallbackStrings[0]);
+   while (bit > 1) { i++; bit >>= 1; }
+   return (i < count) ? fallbackStrings[i] : "unknown fallback";
+}
+
+
+void r200Fallback( GLcontext *ctx, GLuint bit, GLboolean mode )
+{
+   /* Set (mode != 0) or clear one rasterization-fallback bit.  Work is
+    * only done when the first bit goes on or the last one goes off.
+    */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   const GLuint before = rmesa->Fallback;
+
+   if (!mode) {
+      rmesa->Fallback = before & ~bit;
+      if (before != bit)
+	 return;
+      /* Last fallback gone: flush swrast and reinstall the driver hooks. */
+      _swrast_flush( ctx );
+      tnl->Driver.Render.ResetLineStipple = r200ResetLineStipple;
+      tnl->Driver.Render.Interp = _tnl_interp;
+      tnl->Driver.Render.CopyPV = _tnl_copy_pv;
+      tnl->Driver.Render.BuildVertices = _tnl_build_vertices;
+      tnl->Driver.Render.Finish = r200RenderFinish;
+      tnl->Driver.Render.PrimitiveNotify = r200RenderPrimitive;
+      tnl->Driver.Render.Start = r200RenderStart;
+      TCL_FALLBACK( ctx, R200_TCL_FALLBACK_RASTER, GL_FALSE );
+      if (rmesa->TclFallback) {
+	 /* TclFallback did not drop to zero above (R200_NO_TCL for
+	  * example?), so the state selection has to be redone here.
+	  */
+	 _tnl_invalidate_vertex_state( ctx, ~0 );
+	 _tnl_invalidate_vertices( ctx, ~0 );
+	 RENDERINPUTS_ZERO( rmesa->tnl_index_bitset );
+	 r200ChooseVertexState( ctx );
+	 r200ChooseRenderState( ctx );
+      }
+      if (R200_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "R200 end rasterization fallback: 0x%x %s\n",
+		 bit, getFallbackString(bit));
+      return;
+   }
+
+   rmesa->Fallback = before | bit;
+   if (before != 0)
+      return;
+
+   /* First fallback: fire pending vertices and wake up swsetup. */
+   R200_FIREVERTICES( rmesa );
+   TCL_FALLBACK( ctx, R200_TCL_FALLBACK_RASTER, GL_TRUE );
+   _swsetup_Wakeup( ctx );
+   rmesa->swtcl.RenderIndex = ~0;
+   if (R200_DEBUG & DEBUG_FALLBACKS)
+      fprintf(stderr, "R200 begin rasterization fallback: 0x%x %s\n",
+	      bit, getFallbackString(bit));
+}
+
+
+
+
+/**
+ * Cope with depth operations by drawing individual pixels as points.
+ * 
+ * \todo
+ * The way the vertex state is set in this routine is hokey.  It seems to
+ * work, but it's very hackish.  This whole routine is pretty hackish.  If
+ * the bitmap is small enough, it seems like it would be faster to copy it
+ * to AGP memory and use it as a non-power-of-two texture (i.e.,
+ * NV_texture_rectangle).
+ */
+void
+r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
+		  GLsizei width, GLsizei height,
+		  const struct gl_pixelstore_attrib *unpack,
+		  const GLubyte *bitmap )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   const GLfloat *rc = ctx->Current.RasterColor; 
+   GLint row, col;
+   r200Vertex vert;
+   GLuint orig_vte;
+   GLuint h;
+
+
+   /* Turn off tcl.  
+    */
+   TCL_FALLBACK( ctx, R200_TCL_FALLBACK_BITMAP, 1 );
+
+   /* Choose tiny vertex format
+    */
+   {
+      const GLuint fmt_0 = R200_VTX_XY | R200_VTX_Z0 | R200_VTX_W0
+	  | (R200_VTX_PK_RGBA << R200_VTX_COLOR_0_SHIFT);
+      const GLuint fmt_1 = 0;
+      GLuint vte = rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL];
+      GLuint vap = rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL];
+
+      vte &= ~(R200_VTX_XY_FMT | R200_VTX_Z_FMT);
+      vte |= R200_VTX_W0_FMT;
+      vap &= ~R200_VAP_FORCE_W_TO_ONE;
+
+      rmesa->swtcl.vertex_size = 5;
+
+      if ( (rmesa->hw.vtx.cmd[VTX_VTXFMT_0] != fmt_0)
+	   || (rmesa->hw.vtx.cmd[VTX_VTXFMT_1] != fmt_1) ) {
+	 R200_NEWPRIM(rmesa);
+	 R200_STATECHANGE( rmesa, vtx );
+	 rmesa->hw.vtx.cmd[VTX_VTXFMT_0] = fmt_0;
+	 rmesa->hw.vtx.cmd[VTX_VTXFMT_1] = fmt_1;
+      }
+
+      if (vte != rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL]) {
+	 R200_STATECHANGE( rmesa, vte );
+	 rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL] = vte;
+      }
+
+      if (vap != rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL]) {
+	 R200_STATECHANGE( rmesa, vap );
+	 rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] = vap;
+      }
+   }
+
+   /* Ready for point primitives:
+    */
+   r200RenderPrimitive( ctx, GL_POINTS );
+
+   /* Turn off the hw viewport transformation:
+    */
+   R200_STATECHANGE( rmesa, vte );
+   orig_vte = rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL];
+   rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL] &= ~(R200_VPORT_X_SCALE_ENA |
+					   R200_VPORT_Y_SCALE_ENA |
+					   R200_VPORT_Z_SCALE_ENA |
+					   R200_VPORT_X_OFFSET_ENA |
+					   R200_VPORT_Y_OFFSET_ENA |
+					   R200_VPORT_Z_OFFSET_ENA); 
+
+   /* Turn off other stuff:  Stipple?, texture?, blending?, etc.
+    */
+
+
+   /* Populate the vertex
+    *
+    * Incorporate FOG into RGBA
+    */
+   if (ctx->Fog.Enabled) {
+      const GLfloat *fc = ctx->Fog.Color;
+      GLfloat color[4];
+      GLfloat f;
+
+      if (ctx->Fog.FogCoordinateSource == GL_FOG_COORDINATE_EXT)
+         f = _swrast_z_to_fogfactor(ctx, ctx->Current.Attrib[VERT_ATTRIB_FOG][0]);
+      else
+         f = _swrast_z_to_fogfactor(ctx, ctx->Current.RasterDistance);
+
+      color[0] = f * rc[0] + (1.F - f) * fc[0];
+      color[1] = f * rc[1] + (1.F - f) * fc[1];
+      color[2] = f * rc[2] + (1.F - f) * fc[2];
+      color[3] = rc[3];
+
+      UNCLAMPED_FLOAT_TO_CHAN(vert.tv.color.red,   color[0]);
+      UNCLAMPED_FLOAT_TO_CHAN(vert.tv.color.green, color[1]);
+      UNCLAMPED_FLOAT_TO_CHAN(vert.tv.color.blue,  color[2]);
+      UNCLAMPED_FLOAT_TO_CHAN(vert.tv.color.alpha, color[3]);
+   }
+   else {
+      UNCLAMPED_FLOAT_TO_CHAN(vert.tv.color.red,   rc[0]);
+      UNCLAMPED_FLOAT_TO_CHAN(vert.tv.color.green, rc[1]);
+      UNCLAMPED_FLOAT_TO_CHAN(vert.tv.color.blue,  rc[2]);
+      UNCLAMPED_FLOAT_TO_CHAN(vert.tv.color.alpha, rc[3]);
+   }
+
+
+   vert.tv.z = ctx->Current.RasterPos[2];
+   /* NOTE(review): the vertex format chosen above enables W0, but nothing here writes a w component into vert -- confirm whether it needs initialising. */
+
+   /* Update window height
+    */
+   LOCK_HARDWARE( rmesa );
+   UNLOCK_HARDWARE( rmesa );
+   h = rmesa->dri.drawable->h + rmesa->dri.drawable->y;
+   px += rmesa->dri.drawable->x;
+
+   /* Clipping handled by existing mechanisms in r200_ioctl.c?
+    */
+   for (row=0; row<height; row++) {
+      const GLubyte *src = (const GLubyte *) 
+	 _mesa_image_address2d(unpack, bitmap, width, height, 
+                               GL_COLOR_INDEX, GL_BITMAP, row, 0 );
+
+      if (unpack->LsbFirst) {
+         /* Lsb first */
+         GLubyte mask = 1U << (unpack->SkipPixels & 0x7);
+         for (col=0; col<width; col++) {
+            if (*src & mask) {
+	       vert.tv.x = px+col;
+	       vert.tv.y = h - (py+row) - 1;
+	       r200_point( rmesa, &vert );
+            }
+	    src += (mask >> 7);	/* step to the next byte once bit 7 has been tested */
+	    mask = ((mask << 1) & 0xff) | (mask >> 7);	/* rotate left by one bit */
+         }
+
+         /* get ready for next row */
+         if (mask != 1)
+            src++;
+      }
+      else {
+         /* Msb first */
+         GLubyte mask = 128U >> (unpack->SkipPixels & 0x7);
+         for (col=0; col<width; col++) {
+            if (*src & mask) {
+	       vert.tv.x = px+col;
+	       vert.tv.y = h - (py+row) - 1;
+	       r200_point( rmesa, &vert );
+            }
+	    src += mask & 1;	/* step to the next byte once bit 0 has been tested */
+	    mask = ((mask << 7) & 0xff) | (mask >> 1);	/* rotate right by one bit */
+         }
+         /* get ready for next row */
+         if (mask != 128)
+            src++;
+      }
+   }
+
+   /* Fire outstanding vertices, restore state
+    */
+   R200_STATECHANGE( rmesa, vte );
+   rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL] = orig_vte;
+
+   /* Unfallback
+    */
+   TCL_FALLBACK( ctx, R200_TCL_FALLBACK_BITMAP, 0 );
+
+   /* Need to restore vertexformat?
+    */
+   if (rmesa->TclFallback)
+      r200ChooseVertexState( ctx );
+}
+
+
+
+/**********************************************************************/
+/*                            Initialization.                         */
+/**********************************************************************/
+
+void r200InitSwtcl( GLcontext *ctx )
+{
+   /* Build the rasterization tables once per process, then install the
+    * swtcl render hooks and reset this context's swtcl bookkeeping. */
+   static GLboolean tables_built = GL_FALSE;
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+
+   if (!tables_built) {
+      init_rast_tab();
+      tables_built = GL_TRUE;
+   }
+
+   tnl->Driver.Render.Interp = _tnl_interp;
+   tnl->Driver.Render.CopyPV = _tnl_copy_pv;
+   tnl->Driver.Render.BuildVertices = _tnl_build_vertices;
+   tnl->Driver.Render.ResetLineStipple = r200ResetLineStipple;
+   tnl->Driver.Render.PrimitiveNotify = r200RenderPrimitive;
+   tnl->Driver.Render.Finish = r200RenderFinish;
+   tnl->Driver.Render.Start = r200RenderStart;
+
+   /* FIXME: what are these numbers? */
+   _tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12, 36 * sizeof(GLfloat) );
+   rmesa->swtcl.hw_primitive = 0;
+   rmesa->swtcl.render_primitive = GL_TRIANGLES;
+   rmesa->swtcl.RenderIndex = ~0;
+   rmesa->swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
+}
+
+
+void r200DestroySwtcl( GLcontext *ctx )
+{
+   /* Hand back the indexed-vertex DMA region if one is still held. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   if (!rmesa->swtcl.indexed_verts.buf) return;
+   r200ReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, __FUNCTION__ );
+}
diff --git a/r200/r200_swtcl.h b/r200/r200_swtcl.h
new file mode 100644
index 0000000..ccf8179
--- /dev/null
+++ b/r200/r200_swtcl.h
@@ -0,0 +1,75 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_swtcl.h,v 1.3 2003/05/06 23:52:08 daenzer Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef __R200_SWTCL_H__
+#define __R200_SWTCL_H__
+
+#include "mtypes.h"
+#include "swrast/swrast.h"
+#include "r200_context.h"
+
+extern void r200InitSwtcl( GLcontext *ctx );
+extern void r200DestroySwtcl( GLcontext *ctx );
+
+extern void r200ChooseRenderState( GLcontext *ctx );
+extern void r200ChooseVertexState( GLcontext *ctx );
+
+extern void r200CheckTexSizes( GLcontext *ctx );
+
+extern void r200BuildVertices( GLcontext *ctx, GLuint start, GLuint count,
+				 GLuint newinputs );
+
+extern void r200PrintSetupFlags(char *msg, GLuint flags );
+
+
+extern void r200_emit_indexed_verts( GLcontext *ctx,
+				       GLuint start,
+				       GLuint count );
+
+extern void r200_translate_vertex( GLcontext *ctx, 
+				     const r200Vertex *src, 
+				     SWvertex *dst );
+
+extern void r200_print_vertex( GLcontext *ctx, const r200Vertex *v );
+
+extern void r200_import_float_colors( GLcontext *ctx );
+extern void r200_import_float_spec_colors( GLcontext *ctx );
+
+extern void r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
+			      GLsizei width, GLsizei height,
+			      const struct gl_pixelstore_attrib *unpack,
+			      const GLubyte *bitmap );
+
+
+#endif
diff --git a/r200/r200_tcl.c b/r200/r200_tcl.c
new file mode 100644
index 0000000..1ff0cf9
--- /dev/null
+++ b/r200/r200_tcl.c
@@ -0,0 +1,655 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_tcl.c,v 1.2 2002/12/16 16:18:55 dawes Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "mtypes.h"
+#include "enums.h"
+#include "colormac.h"
+#include "light.h"
+
+#include "vbo/vbo.h"
+#include "tnl/tnl.h"
+#include "tnl/t_pipeline.h"
+
+#include "r200_context.h"
+#include "r200_state.h"
+#include "r200_ioctl.h"
+#include "r200_tex.h"
+#include "r200_tcl.h"
+#include "r200_swtcl.h"
+#include "r200_maos.h"
+
+
+
+#define HAVE_POINTS      1
+#define HAVE_LINES       1
+#define HAVE_LINE_LOOP   0
+#define HAVE_LINE_STRIPS 1
+#define HAVE_TRIANGLES   1
+#define HAVE_TRI_STRIPS  1
+#define HAVE_TRI_STRIP_1 0
+#define HAVE_TRI_FANS    1
+#define HAVE_QUADS       1
+#define HAVE_QUAD_STRIPS 1
+#define HAVE_POLYGONS    1
+#define HAVE_ELTS        1
+
+
+#define HW_POINTS           ((ctx->Point.PointSprite || \
+				((ctx->_TriangleCaps & (DD_POINT_SIZE | DD_POINT_ATTEN)) && \
+	 			!(ctx->_TriangleCaps & (DD_POINT_SMOOTH)))) ? \
+				R200_VF_PRIM_POINT_SPRITES : R200_VF_PRIM_POINTS)
+#define HW_LINES            R200_VF_PRIM_LINES
+#define HW_LINE_LOOP        0
+#define HW_LINE_STRIP       R200_VF_PRIM_LINE_STRIP
+#define HW_TRIANGLES        R200_VF_PRIM_TRIANGLES
+#define HW_TRIANGLE_STRIP_0 R200_VF_PRIM_TRIANGLE_STRIP
+#define HW_TRIANGLE_STRIP_1 0
+#define HW_TRIANGLE_FAN     R200_VF_PRIM_TRIANGLE_FAN
+#define HW_QUADS            R200_VF_PRIM_QUADS
+#define HW_QUAD_STRIP       R200_VF_PRIM_QUAD_STRIP
+#define HW_POLYGON          R200_VF_PRIM_POLYGON
+
+
+static GLboolean discrete_prim[0x10] = {	/* indexed by (hw_prim & 0xf) in r200TclPrimitive; a zero entry forces a new primitive there even when the type is unchanged */
+   0,				/* 0 none */
+   1,				/* 1 points */
+   1,				/* 2 lines */
+   0,				/* 3 line_strip */
+   1,				/* 4 tri_list */
+   0,				/* 5 tri_fan */
+   0,				/* 6 tri_strip */
+   0,				/* 7 tri_w_flags */
+   1,				/* 8 rect list (unused) */
+   1,				/* 9 3vert point */
+   1,				/* a 3vert line */
+   0,				/* b point sprite */
+   0,				/* c line loop */
+   1,				/* d quads */
+   0,				/* e quad strip */
+   0,				/* f polygon */
+};
+   
+
+#define LOCAL_VARS r200ContextPtr rmesa = R200_CONTEXT(ctx)
+#define ELT_TYPE  GLushort
+
+#define ELT_INIT(prim, hw_prim) \
+   r200TclPrimitive( ctx, prim, hw_prim | R200_VF_PRIM_WALK_IND )
+
+#define GET_MESA_ELTS() rmesa->tcl.Elts
+
+
+/* Don't really know how many elts will fit in what's left of cmdbuf,
+ * as there is state to emit, etc:
+ */
+
+/* Testing on isosurf shows a maximum around here.  Don't know if it's
+ * the card or driver or kernel module that is causing the behaviour.
+ */
+#define GET_MAX_HW_ELTS() 300
+
+#define RESET_STIPPLE() do {			\
+   R200_STATECHANGE( rmesa, lin );		\
+   r200EmitState( rmesa );			\
+} while (0)
+
+#define AUTO_STIPPLE( mode )  do {		\
+   R200_STATECHANGE( rmesa, lin );		\
+   if (mode)					\
+      rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] |=	\
+	 R200_LINE_PATTERN_AUTO_RESET;	\
+   else						\
+      rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] &=	\
+	 ~R200_LINE_PATTERN_AUTO_RESET;	\
+   r200EmitState( rmesa );			\
+} while (0)
+
+
+#define ALLOC_ELTS(nr)	r200AllocElts( rmesa, nr )
+
+static GLushort *r200AllocElts( r200ContextPtr rmesa, GLuint nr )
+{
+   /* Reserve room for nr 16-bit indices.  When the pending flush hook is
+    * r200FlushElts and the indices still fit in the command buffer they
+    * are appended in place; otherwise a new open-ended run is started.
+    */
+   const GLuint bytes = nr*2;
+   GLushort *dest;
+
+   if (rmesa->dma.flush != r200FlushElts ||
+       rmesa->store.cmd_used + bytes >= R200_CMD_BUF_SZ) {
+      /* Slow path: flush pending work, reserve space, re-emit the arrays. */
+      if (rmesa->dma.flush)
+	 rmesa->dma.flush( rmesa );
+
+      r200EnsureCmdBufSpace( rmesa, AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
+			     rmesa->hw.max_state_size + ELTS_BUFSZ(nr) );
+      r200EmitAOS( rmesa, rmesa->tcl.aos_components,
+		   rmesa->tcl.nr_aos_components, 0 );
+      return r200AllocEltsOpenEnded( rmesa, rmesa->tcl.hw_primitive, nr );
+   }
+
+   dest = (GLushort *)(rmesa->store.cmd_buf + rmesa->store.cmd_used);
+   rmesa->store.cmd_used += bytes;
+   return dest;
+}
+
+
+#define CLOSE_ELTS() 				\
+do {						\
+   if (0) R200_NEWPRIM( rmesa );		\
+}						\
+while (0)
+
+
+/* TODO: Try to extend existing primitive if both are identical,
+ * discrete and there are no intervening state changes.  (Somewhat
+ * duplicates changes to DrawArrays code)
+ */
+static void r200EmitPrim( GLcontext *ctx, GLenum prim, GLuint hwprim,
+			  GLuint start, GLuint count )
+{
+   /* Emit vertices start .. count-1 of the current arrays as a single
+    * vertex-buffer primitive of type 'hwprim'.
+    */
+   r200ContextPtr rmesa = R200_CONTEXT( ctx );
+   const GLuint nverts = count - start;
+
+   r200TclPrimitive( ctx, prim, hwprim );
+
+   /* Reserve room for the array pointers, state and vbuf packet. */
+   r200EnsureCmdBufSpace( rmesa,
+			  AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
+			  rmesa->hw.max_state_size + VBUF_BUFSZ );
+
+   /* The vbuf packet takes no offset parameter, so the arrays are
+    * emitted starting at 'start' instead.
+    */
+   r200EmitAOS( rmesa, rmesa->tcl.aos_components,
+		rmesa->tcl.nr_aos_components, start );
+   r200EmitVbufPrim( rmesa, rmesa->tcl.hw_primitive, nverts );
+}
+
+#define EMIT_PRIM(ctx, prim, hwprim, start, count) do {         \
+   r200EmitPrim( ctx, prim, hwprim, start, count );             \
+   (void) rmesa; } while (0)
+
+/* Try & join small primitives
+ */
+#if 0
+#define PREFER_DISCRETE_ELT_PRIM( NR, PRIM ) 0
+#else
+#define PREFER_DISCRETE_ELT_PRIM( NR, PRIM )			\
+  ((NR) < 20 ||							\
+   ((NR) < 40 &&						\
+    rmesa->tcl.hw_primitive == (PRIM|				\
+			    R200_VF_TCL_OUTPUT_VTX_ENABLE|	\
+			        R200_VF_PRIM_WALK_IND)))
+#endif
+
+#ifdef MESA_BIG_ENDIAN
+/* We could do without (most of) this ugliness if dest was always 32 bit word aligned... */
+#define EMIT_ELT(dest, offset, x) do {                          \
+        int off = offset + ( ( (GLuint)dest & 0x2 ) >> 1 );     \
+        GLushort *des = (GLushort *)( (GLuint)dest & ~0x2 );    \
+        (des)[ off + 1 - 2 * ( off & 1 ) ] = (GLushort)(x);	\
+	(void)rmesa; } while (0)
+#else
+#define EMIT_ELT(dest, offset, x) do {				\
+	(dest)[offset] = (GLushort) (x);			\
+	(void)rmesa; } while (0)
+#endif
+
+#define EMIT_TWO_ELTS(dest, offset, x, y)  *(GLuint *)((dest)+offset) = ((y)<<16)|(x);
+
+
+
+#define TAG(x) tcl_##x
+#include "tnl_dd/t_dd_dmatmp2.h"
+
+/**********************************************************************/
+/*                          External entrypoints                     */
+/**********************************************************************/
+
+void r200EmitPrimitive( GLcontext *ctx, GLuint first, GLuint last,
+			GLuint flags )
+{
+   /* Dispatch on the primitive-mode bits of 'flags' into the tcl_ vertex render table. */
+   const GLuint mode = flags & PRIM_MODE_MASK;
+   tcl_render_tab_verts[mode]( ctx, first, last, flags );
+}
+
+void r200EmitEltPrimitive( GLcontext *ctx, GLuint first, GLuint last,
+			   GLuint flags )
+{
+   /* Dispatch on the primitive-mode bits of 'flags' into the tcl_ element render table. */
+   const GLuint mode = flags & PRIM_MODE_MASK;
+   tcl_render_tab_elts[mode]( ctx, first, last, flags );
+}
+
+void r200TclPrimitive( GLcontext *ctx, GLenum prim, int hw_prim )
+{
+   /* Select the TCL hw primitive.  Nothing happens when the type is
+    * unchanged and discrete; otherwise perspective correction is set
+    * (off for GL point sprites, on otherwise) and a new primitive begun. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   const GLuint newprim = hw_prim | R200_VF_TCL_OUTPUT_VTX_ENABLE;
+   GLboolean sprite, persp;
+
+   if (newprim == rmesa->tcl.hw_primitive && discrete_prim[hw_prim&0xf])
+      return;
+
+   sprite = (prim & PRIM_MODE_MASK) == GL_POINTS && ctx->Point.PointSprite;
+   persp = (rmesa->hw.set.cmd[SET_RE_CNTL] & R200_PERSPECTIVE_ENABLE) != 0;
+   if (sprite && persp) {
+      R200_STATECHANGE( rmesa, set );
+      rmesa->hw.set.cmd[SET_RE_CNTL] &= ~R200_PERSPECTIVE_ENABLE;
+   } else if (!sprite && !persp) {
+      R200_STATECHANGE( rmesa, set );
+      rmesa->hw.set.cmd[SET_RE_CNTL] |= R200_PERSPECTIVE_ENABLE;
+   }
+   R200_NEWPRIM( rmesa );
+   rmesa->tcl.hw_primitive = newprim;
+}
+
+
+/**********************************************************************/
+/*             Fog blend factor computation for hw tcl                */
+/*             same calculation used as in t_vb_fog.c                 */
+/**********************************************************************/
+
+#define FOG_EXP_TABLE_SIZE 256
+#define FOG_MAX (10.0)
+#define EXP_FOG_MAX .0006595
+#define FOG_INCR (FOG_MAX/FOG_EXP_TABLE_SIZE)
+static GLfloat exp_table[FOG_EXP_TABLE_SIZE];
+
+/* NEG_EXP: result = exp(-(narg)) interpolated from exp_table[].  The range
+ * test is made on the float, before it becomes an index, so huge, infinite
+ * or NaN arguments give EXP_FOG_MAX; negative ones interpolate from slot 0. */
+#if 1
+#define NEG_EXP( result, narg )						\
+do {									\
+   const GLfloat f = (GLfloat) ((narg) * (1.0/FOG_INCR));		\
+   if (f < (GLfloat) (FOG_EXP_TABLE_SIZE-1)) {				\
+      const GLint k = (f > 0.0F) ? (GLint) f : 0;			\
+      result = exp_table[k] + (f-k)*(exp_table[k+1]-exp_table[k]);	\
+   } else								\
+      result = (GLfloat) EXP_FOG_MAX;					\
+} while (0)
+#else
+#define NEG_EXP( result, narg )	do { result = exp(-(narg)); } while (0)
+#endif
+
+
+/**
+ * Initialize the exp_table[] lookup table for approximating exp().
+ */
+void
+r200InitStaticFogData( void )
+{
+   GLfloat x = 0.0F;
+   GLint slot;
+   /* x advances by FOG_INCR per slot, accumulated in a GLfloat. */
+   for (slot = 0; slot < FOG_EXP_TABLE_SIZE; slot++, x += FOG_INCR)
+      exp_table[slot] = (GLfloat) exp(-x);
+}
+
+
+/**
+ * Compute per-vertex fog blend factors from fog coordinates by
+ * evaluating the GL_LINEAR, GL_EXP or GL_EXP2 fog function.
+ * Fog coordinates are distances from the eye (typically between the
+ * near and far clip plane distances).
+ * Note the fog (eye Z) coords may be negative so we use ABS(z) below.
+ * Fog blend factors are in the range [0,1].
+ */
+float
+r200ComputeFogBlendFactor( GLcontext *ctx, GLfloat fogcoord )
+{
+   const GLfloat dist = FABSF(fogcoord);	/* fog coords may be negative */
+   GLfloat scale, blend;
+
+   switch (ctx->Fog.Mode) {
+   case GL_LINEAR:
+      /* (end - dist) / (end - start), clamped to [0,1]; when start equals
+       * end a scale of 1 is used instead of dividing by zero.
+       */
+      scale = (ctx->Fog.Start == ctx->Fog.End)
+	 ? 1.0F : 1.0F / (ctx->Fog.End - ctx->Fog.Start);
+      blend = (ctx->Fog.End - dist) * scale;
+      return CLAMP(blend, 0.0F, 1.0F);
+
+   case GL_EXP:
+      /* approximately exp(-density * dist) */
+      scale = ctx->Fog.Density;
+      NEG_EXP( blend, scale * dist );
+      return blend;
+   case GL_EXP2:
+      /* approximately exp(-(density * dist)^2) */
+      scale = ctx->Fog.Density*ctx->Fog.Density;
+      NEG_EXP( blend, scale * dist * dist );
+      return blend;
+   default:
+      _mesa_problem(ctx, "Bad fog mode in make_fog_coord");
+      return 0;
+   }
+}
+
+
+/**********************************************************************/
+/*                          Render pipeline stage                     */
+/**********************************************************************/
+
+
+/* TCL render.
+ */
+static GLboolean r200_run_tcl_render( GLcontext *ctx,
+				      struct tnl_pipeline_stage *stage )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   struct vertex_buffer *VB = &tnl->vb;
+   GLuint i;
+   GLubyte *vimap_rev;
+/* use hw fixed order for simplicity, pos 0, weight 1, normal 2, fog 3, 
+   color0 - color3 4-7, texcoord0 - texcoord5 8-13, pos 1 14. Must not use
+   more than 12 of those at the same time. */
+   GLubyte map_rev_fixed[15] = {255, 255, 255, 255, 255, 255, 255, 255,
+			    255, 255, 255, 255, 255, 255, 255};
+   /* Stage run hook (see _r200_tcl_stage): returns GL_TRUE while a TCL fallback is set, otherwise emits the arrays and primitives and returns GL_FALSE. */
+
+   /* TODO: separate this from the swtnl pipeline 
+    */
+   if (rmesa->TclFallback)
+      return GL_TRUE;	/* fallback to software t&l */
+
+   if (R200_DEBUG & DEBUG_PRIMS)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+   if (VB->Count == 0)
+      return GL_FALSE;
+
+   /* Validate state:
+    */
+   if (rmesa->NewGLState)
+      r200ValidateState( ctx );
+
+   if (!ctx->VertexProgram._Enabled) {
+   /* NOTE: inputs != tnl->render_inputs - these are the untransformed
+    * inputs.
+    */
+      map_rev_fixed[0] = VERT_ATTRIB_POS;
+      /* technically there is no reason we always need VA_COLOR0. In theory
+         could disable it depending on lighting, color materials, texturing... */
+      map_rev_fixed[4] = VERT_ATTRIB_COLOR0;
+
+      if (ctx->Light.Enabled) {
+	 map_rev_fixed[2] = VERT_ATTRIB_NORMAL;
+      }
+
+      /* this also enables VA_COLOR1 when using separate specular
+         lighting model, which is unnecessary.
+         FIXME: OTOH, we're missing the case where a ATI_fragment_shader accesses
+         the secondary color (if lighting is disabled). The chip seems
+         misconfigured for that though elsewhere (tcl output, might lock up) */
+      if (ctx->_TriangleCaps & DD_SEPARATE_SPECULAR) {
+	 map_rev_fixed[5] = VERT_ATTRIB_COLOR1;
+      }
+
+      if ( (ctx->Fog.FogCoordinateSource == GL_FOG_COORD) && ctx->Fog.Enabled ) {
+	 map_rev_fixed[3] = VERT_ATTRIB_FOG;
+      }
+
+      for (i = 0 ; i < ctx->Const.MaxTextureUnits; i++) {
+	 if (ctx->Texture.Unit[i]._ReallyEnabled) {
+	    if (rmesa->TexGenNeedNormals[i]) {
+	       map_rev_fixed[2] = VERT_ATTRIB_NORMAL;
+	    }
+	    map_rev_fixed[8 + i] = VERT_ATTRIB_TEX0 + i;
+	 }
+      }
+      vimap_rev = &map_rev_fixed[0];
+   }
+   else {
+      /* vtx_tcl_output_vtxfmt_0/1 need to match configuration of "fragment
+	 part", since using some vertex interpolator later which is not in
+	 out_vtxfmt0/1 will lock up. It seems to be ok to write in vertex
+	 prog to a not enabled output however, so just don't mess with it.
+	 We only need to change compsel. */
+      GLuint out_compsel = 0;
+      GLuint vp_out = rmesa->curr_vp_hw->mesa_program.Base.OutputsWritten;
+
+      vimap_rev = &rmesa->curr_vp_hw->inputmap_rev[0];
+      assert(vp_out & (1 << VERT_RESULT_HPOS));
+      out_compsel = R200_OUTPUT_XYZW;
+      if (vp_out & (1 << VERT_RESULT_COL0)) {
+	 out_compsel |= R200_OUTPUT_COLOR_0;
+      }
+      if (vp_out & (1 << VERT_RESULT_COL1)) {
+	 out_compsel |= R200_OUTPUT_COLOR_1;
+      }
+      if (vp_out & (1 << VERT_RESULT_FOGC)) {
+         out_compsel |= R200_OUTPUT_DISCRETE_FOG;
+      }
+      if (vp_out & (1 << VERT_RESULT_PSIZ)) {
+	 out_compsel |= R200_OUTPUT_PT_SIZE;
+      }
+      for (i = VERT_RESULT_TEX0; i < VERT_RESULT_TEX6; i++) {
+	 if (vp_out & (1 << i)) {
+	    out_compsel |= R200_OUTPUT_TEX_0 << (i - VERT_RESULT_TEX0);
+	 }
+      }
+      if (rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] != out_compsel) {
+	 R200_STATECHANGE( rmesa, vtx );
+	 rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] = out_compsel;
+      }
+   }
+
+   /* Do the actual work:
+    */
+   r200ReleaseArrays( ctx, ~0 /* stage->changed_inputs */ );
+   r200EmitArrays( ctx, vimap_rev );
+
+   rmesa->tcl.Elts = VB->Elts;
+
+   for (i = 0 ; i < VB->PrimitiveCount ; i++)
+   {
+      GLuint prim = _tnl_translate_prim(&VB->Primitive[i]);
+      GLuint start = VB->Primitive[i].start;
+      GLuint length = VB->Primitive[i].count;
+
+      if (!length)
+	 continue;
+
+      if (rmesa->tcl.Elts)
+	 r200EmitEltPrimitive( ctx, start, start+length, prim );
+      else
+	 r200EmitPrimitive( ctx, start, start+length, prim );
+   }
+
+   return GL_FALSE;		/* finished the pipe */
+}
+
+
+
+/* Initial state for tcl stage.  
+ */
+const struct tnl_pipeline_stage _r200_tcl_stage =
+{
+   "r200 render",
+   NULL,			/*  private */
+   NULL,
+   NULL,
+   NULL,
+   r200_run_tcl_render	/* run */
+};
+
+
+
+/**********************************************************************/
+/*                 Validate state at pipeline start                   */
+/**********************************************************************/
+
+
+/*-----------------------------------------------------------------------
+ * Manage TCL fallbacks
+ */
+
+
+static void transition_to_swtnl( GLcontext *ctx )
+{
+   /* Enter software TNL: reselect swtcl state, use the software lighting
+    * tables for material changes and switch hardware TCL off. */
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+
+   R200_NEWPRIM( rmesa );
+   r200ChooseVertexState( ctx );
+   r200ChooseRenderState( ctx );
+
+   _mesa_validate_all_lighting_tables( ctx );
+   tnl->Driver.NotifyMaterialChange = _mesa_validate_all_lighting_tables;
+
+   r200ReleaseArrays( ctx, ~0 );
+
+   /* The rasterizer is still the D3D-style one inherited from the radeon,
+    * so the card has to be put into D3D mode for it to work.
+    */
+   R200_STATECHANGE( rmesa, vap );
+   rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] &=
+      ~(R200_VAP_TCL_ENABLE|R200_VAP_PROG_VTX_SHADER_ENABLE);
+}
+
+static void transition_to_hwtnl( GLcontext *ctx )
+{
+   /* Leave software TNL: restore the TCL material hook, drop pending
+    * swtcl DMA state and re-enable hardware TCL in the VAP/fog/VTE state.
+    */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+
+   _tnl_need_projected_coords( ctx, GL_FALSE );
+   r200UpdateMaterial( ctx );
+   tnl->Driver.NotifyMaterialChange = r200UpdateMaterial;
+
+   /* Flush whatever is pending, then forget the flush hook. */
+   if (rmesa->dma.flush != NULL)
+      rmesa->dma.flush( rmesa );
+   rmesa->dma.flush = NULL;
+   if (rmesa->swtcl.indexed_verts.buf)
+      r200ReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, __FUNCTION__ );
+
+   /* TCL on, W no longer forced to one, vertex shader on if a program is enabled. */
+   R200_STATECHANGE( rmesa, vap );
+   rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] |= R200_VAP_TCL_ENABLE;
+   rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] &= ~R200_VAP_FORCE_W_TO_ONE;
+   if (ctx->VertexProgram._Enabled)
+      rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] |= R200_VAP_PROG_VTX_SHADER_ENABLE;
+
+   /* Fog currently taken from specular alpha switches to the vertex fog
+    * value when the fog coordinate is the fog source.
+    */
+   if ((rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] & R200_FOG_USE_MASK) == R200_FOG_USE_SPEC_ALPHA &&
+       ctx->Fog.FogCoordinateSource == GL_FOG_COORD) {
+      R200_STATECHANGE( rmesa, ctx );
+      rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] &= ~R200_FOG_USE_MASK;
+      rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] |= R200_FOG_USE_VTX_FOG;
+   }
+
+   R200_STATECHANGE( rmesa, vte );
+   rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL] &= ~(R200_VTX_XY_FMT|R200_VTX_Z_FMT);
+   rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL] |= R200_VTX_W0_FMT;
+
+   if (R200_DEBUG & DEBUG_FALLBACKS)
+      fprintf(stderr, "R200 end tcl fallback\n");
+}
+
+
+static const char * const fallbackStrings[] = {
+   "Rasterization fallback",
+   "Unfilled triangles",
+   "Twosided lighting, differing materials",
+   "Materials in VB (maybe between begin/end)",
+   "Texgen unit 0",
+   "Texgen unit 1",
+   "Texgen unit 2",
+   "Texgen unit 3",
+   "Texgen unit 4",
+   "Texgen unit 5",
+   "User disable",
+   "Bitmap as points",
+   "Vertex program"
+};
+
+/* Describe a single-bit TCL fallback mask: bit N maps to
+ * fallbackStrings[N]; a bit past the end of the table yields a placeholder.
+ */
+static const char *getFallbackString(GLuint bit)
+{
+   GLuint i = 0;
+   const GLuint count = sizeof(fallbackStrings) / sizeof(fallbackStrings[0]);
+   while (bit > 1) { i++; bit >>= 1; }
+   return (i < count) ? fallbackStrings[i] : "unknown fallback";
+}
+
+
+
+void r200TclFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
+{
+   /* Set (mode != 0) or clear one TCL-fallback bit; TNL switches when the
+    * first bit goes on (to software) or the last one goes off (to hardware). */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   const GLuint before = rmesa->TclFallback;
+
+   if (!mode) {
+      rmesa->TclFallback = before & ~bit;
+      if (before != bit)
+	 return;
+      if (R200_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "R200 end tcl fallback %s\n", getFallbackString( bit ));
+      transition_to_hwtnl( ctx );
+      return;
+   }
+
+   rmesa->TclFallback = before | bit;
+   if (before != 0)
+      return;
+   if (R200_DEBUG & DEBUG_FALLBACKS)
+      fprintf(stderr, "R200 begin tcl fallback %s\n", getFallbackString( bit ));
+   transition_to_swtnl( ctx );
+}
diff --git a/r200/r200_tcl.h b/r200/r200_tcl.h
new file mode 100644
index 0000000..ac5bc11
--- /dev/null
+++ b/r200/r200_tcl.h
@@ -0,0 +1,69 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_tcl.h,v 1.2 2002/12/16 16:18:55 dawes Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef __R200_TCL_H__
+#define __R200_TCL_H__
+
+#include "r200_context.h"
+
+/* Primitive emission entry points (presumably implemented in r200_tcl.c --
+ * TODO confirm; their definitions are not part of this header).
+ */
+extern void r200TclPrimitive( GLcontext *ctx, GLenum prim, int hw_prim );
+extern void r200EmitEltPrimitive( GLcontext *ctx, GLuint first, GLuint last,
+				    GLuint flags );
+extern void r200EmitPrimitive( GLcontext *ctx, GLuint first, GLuint last,
+				 GLuint flags );
+
+/* Set (mode != 0) or clear one TCL fallback reason.  'bit' is one of the
+ * R200_TCL_FALLBACK_* flags defined below.
+ */
+extern void r200TclFallback( GLcontext *ctx, GLuint bit, GLboolean mode );
+
+extern void r200InitStaticFogData( void );
+
+extern float r200ComputeFogBlendFactor( GLcontext *ctx, GLfloat fogcoord );
+					      
+/* TCL fallback reasons, one bit each.  The bit position is also the index
+ * of the matching debug string in fallbackStrings[] (r200_tcl.c), so the
+ * two lists must be kept in the same order.
+ */
+#define R200_TCL_FALLBACK_RASTER            0x1 /* rasterization */
+#define R200_TCL_FALLBACK_UNFILLED          0x2 /* unfilled tris */
+#define R200_TCL_FALLBACK_LIGHT_TWOSIDE     0x4 /* twoside tris */
+#define R200_TCL_FALLBACK_MATERIAL          0x8 /* material in vb */
+#define R200_TCL_FALLBACK_TEXGEN_0          0x10 /* texgen, unit 0 */
+#define R200_TCL_FALLBACK_TEXGEN_1          0x20 /* texgen, unit 1 */
+#define R200_TCL_FALLBACK_TEXGEN_2          0x40 /* texgen, unit 2 */
+#define R200_TCL_FALLBACK_TEXGEN_3          0x80 /* texgen, unit 3 */
+#define R200_TCL_FALLBACK_TEXGEN_4          0x100 /* texgen, unit 4 */
+#define R200_TCL_FALLBACK_TEXGEN_5          0x200 /* texgen, unit 5 */
+#define R200_TCL_FALLBACK_TCL_DISABLE       0x400 /* user disable */
+#define R200_TCL_FALLBACK_BITMAP            0x800 /* draw bitmap with points */
+#define R200_TCL_FALLBACK_VERTEX_PROGRAM    0x1000/* vertex program active */
+
+/* Shorthand that forwards directly to r200TclFallback(). */
+#define TCL_FALLBACK( ctx, bit, mode )	r200TclFallback( ctx, bit, mode )
+
+#endif
diff --git a/r200/r200_tex.c b/r200/r200_tex.c
new file mode 100644
index 0000000..6c6450c
--- /dev/null
+++ b/r200/r200_tex.c
@@ -0,0 +1,1216 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_tex.c,v 1.2 2002/11/05 17:46:08 tsi Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "colormac.h"
+#include "context.h"
+#include "enums.h"
+#include "image.h"
+#include "simple_list.h"
+#include "texformat.h"
+#include "texstore.h"
+#include "texmem.h"
+#include "teximage.h"
+#include "texobj.h"
+
+#include "r200_context.h"
+#include "r200_state.h"
+#include "r200_ioctl.h"
+#include "r200_swtcl.h"
+#include "r200_tex.h"
+
+#include "xmlpool.h"
+
+
+
+/**
+ * Set the texture wrap modes.
+ *
+ * The S and T modes (plus the D3D border-mode bit) are stored in
+ * \c pp_txfilter; the R mode is stored in the Q clamp field of
+ * \c pp_txformat_x.  \c border_fallback is set when a GL_CLAMP-style mode
+ * and a CLAMP_TO_BORDER-style mode are both present among the three
+ * coordinates.
+ *
+ * \param t Texture object whose wrap modes are to be set
+ * \param swrap Wrap mode for the \a s texture coordinate
+ * \param twrap Wrap mode for the \a t texture coordinate
+ * \param rwrap Wrap mode for the \a r texture coordinate
+ */
+
+static void r200SetTexWrap( r200TexObjPtr t, GLenum swrap, GLenum twrap, GLenum rwrap )
+{
+   GLuint st_bits = 0;              /* merged into pp_txfilter at the end */
+   GLuint r_bits = 0;               /* merged into pp_txformat_x at the end */
+   GLboolean any_clamp = GL_FALSE;  /* GL_CLAMP / GL_MIRROR_CLAMP_EXT seen */
+   GLboolean any_border = GL_FALSE; /* (MIRROR_)CLAMP_TO_BORDER seen */
+
+   switch ( swrap ) {
+   case GL_REPEAT:
+      st_bits |= R200_CLAMP_S_WRAP;
+      break;
+   case GL_CLAMP_TO_EDGE:
+      st_bits |= R200_CLAMP_S_CLAMP_LAST;
+      break;
+   case GL_MIRRORED_REPEAT:
+      st_bits |= R200_CLAMP_S_MIRROR;
+      break;
+   case GL_MIRROR_CLAMP_TO_EDGE_EXT:
+      st_bits |= R200_CLAMP_S_MIRROR_CLAMP_LAST;
+      break;
+   case GL_CLAMP:
+      st_bits |= R200_CLAMP_S_CLAMP_GL;
+      any_clamp = GL_TRUE;
+      break;
+   case GL_MIRROR_CLAMP_EXT:
+      st_bits |= R200_CLAMP_S_MIRROR_CLAMP_GL;
+      any_clamp = GL_TRUE;
+      break;
+   case GL_CLAMP_TO_BORDER:
+      st_bits |= R200_CLAMP_S_CLAMP_GL;
+      any_border = GL_TRUE;
+      break;
+   case GL_MIRROR_CLAMP_TO_BORDER_EXT:
+      st_bits |= R200_CLAMP_S_MIRROR_CLAMP_GL;
+      any_border = GL_TRUE;
+      break;
+   default:
+      _mesa_problem(NULL, "bad S wrap mode in %s", __FUNCTION__);
+   }
+
+   switch ( twrap ) {
+   case GL_REPEAT:
+      st_bits |= R200_CLAMP_T_WRAP;
+      break;
+   case GL_CLAMP_TO_EDGE:
+      st_bits |= R200_CLAMP_T_CLAMP_LAST;
+      break;
+   case GL_MIRRORED_REPEAT:
+      st_bits |= R200_CLAMP_T_MIRROR;
+      break;
+   case GL_MIRROR_CLAMP_TO_EDGE_EXT:
+      st_bits |= R200_CLAMP_T_MIRROR_CLAMP_LAST;
+      break;
+   case GL_CLAMP:
+      st_bits |= R200_CLAMP_T_CLAMP_GL;
+      any_clamp = GL_TRUE;
+      break;
+   case GL_MIRROR_CLAMP_EXT:
+      st_bits |= R200_CLAMP_T_MIRROR_CLAMP_GL;
+      any_clamp = GL_TRUE;
+      break;
+   case GL_CLAMP_TO_BORDER:
+      st_bits |= R200_CLAMP_T_CLAMP_GL;
+      any_border = GL_TRUE;
+      break;
+   case GL_MIRROR_CLAMP_TO_BORDER_EXT:
+      st_bits |= R200_CLAMP_T_MIRROR_CLAMP_GL;
+      any_border = GL_TRUE;
+      break;
+   default:
+      _mesa_problem(NULL, "bad T wrap mode in %s", __FUNCTION__);
+   }
+
+   switch ( rwrap ) {
+   case GL_REPEAT:
+      r_bits |= R200_CLAMP_Q_WRAP;
+      break;
+   case GL_CLAMP_TO_EDGE:
+      r_bits |= R200_CLAMP_Q_CLAMP_LAST;
+      break;
+   case GL_MIRRORED_REPEAT:
+      r_bits |= R200_CLAMP_Q_MIRROR;
+      break;
+   case GL_MIRROR_CLAMP_TO_EDGE_EXT:
+      r_bits |= R200_CLAMP_Q_MIRROR_CLAMP_LAST;
+      break;
+   case GL_CLAMP:
+      r_bits |= R200_CLAMP_Q_CLAMP_GL;
+      any_clamp = GL_TRUE;
+      break;
+   case GL_MIRROR_CLAMP_EXT:
+      r_bits |= R200_CLAMP_Q_MIRROR_CLAMP_GL;
+      any_clamp = GL_TRUE;
+      break;
+   case GL_CLAMP_TO_BORDER:
+      r_bits |= R200_CLAMP_Q_CLAMP_GL;
+      any_border = GL_TRUE;
+      break;
+   case GL_MIRROR_CLAMP_TO_BORDER_EXT:
+      r_bits |= R200_CLAMP_Q_MIRROR_CLAMP_GL;
+      any_border = GL_TRUE;
+      break;
+   default:
+      _mesa_problem(NULL, "bad R wrap mode in %s", __FUNCTION__);
+   }
+
+   /* Any border-style mode switches the border handling bit on. */
+   if ( any_border )
+      st_bits |= R200_BORDER_MODE_D3D;
+
+   /* Replace the old wrap fields with the freshly selected ones; an
+    * unrecognised mode leaves its field cleared.
+    */
+   t->pp_txfilter = (t->pp_txfilter &
+                     ~(R200_CLAMP_S_MASK | R200_CLAMP_T_MASK | R200_BORDER_MODE_D3D))
+                    | st_bits;
+   t->pp_txformat_x = (t->pp_txformat_x & ~R200_CLAMP_Q_MASK) | r_bits;
+
+   t->border_fallback = (any_clamp && any_border);
+}
+
+/**
+ * Program the maximum-anisotropy field of \c pp_txfilter.
+ *
+ * \param t   Texture object to update
+ * \param max Requested maximum anisotropy.  Exactly 1.0 selects 1:1;
+ *            any other value up to 2.0 (including values below 1.0)
+ *            selects 2:1, up to 4.0 selects 4:1, up to 8.0 selects 8:1,
+ *            and everything else selects 16:1.
+ */
+static void r200SetTexMaxAnisotropy( r200TexObjPtr t, GLfloat max )
+{
+   GLuint aniso;
+
+   if ( max == 1.0 )
+      aniso = R200_MAX_ANISO_1_TO_1;
+   else if ( max <= 2.0 )
+      aniso = R200_MAX_ANISO_2_TO_1;
+   else if ( max <= 4.0 )
+      aniso = R200_MAX_ANISO_4_TO_1;
+   else if ( max <= 8.0 )
+      aniso = R200_MAX_ANISO_8_TO_1;
+   else
+      aniso = R200_MAX_ANISO_16_TO_1;
+
+   t->pp_txfilter = (t->pp_txfilter & ~R200_MAX_ANISO_MASK) | aniso;
+}
+
+/**
+ * Set the texture magnification and minification modes.
+ *
+ * The minification encoding depends on the anisotropy field currently
+ * stored in \c pp_txfilter: with anything other than 1:1 the ANISO
+ * variants are used, and for the mipmapped modes only the between-level
+ * part of the GL filter (MIPMAP_NEAREST vs. MIPMAP_LINEAR) is reflected.
+ * Unrecognised \p minf / \p magf values leave the corresponding fields
+ * cleared.
+ *
+ * NOTE(review): in the non-anisotropic case GL_NEAREST_MIPMAP_LINEAR maps
+ * to R200_MIN_FILTER_LINEAR_MIP_NEAREST and GL_LINEAR_MIPMAP_NEAREST to
+ * R200_MIN_FILTER_NEAREST_MIP_LINEAR; the register names appear to put the
+ * mip filter first -- confirm against the register definitions.
+ *
+ * \param t Texture whose filter modes are to be set
+ * \param minf Texture minification mode
+ * \param magf Texture magnification mode
+ */
+
+static void r200SetTexFilter( r200TexObjPtr t, GLenum minf, GLenum magf )
+{
+   const GLboolean aniso_active =
+      (t->pp_txfilter & R200_MAX_ANISO_MASK) != R200_MAX_ANISO_1_TO_1;
+   GLuint filter_bits = 0;   /* min + mag bits for pp_txfilter */
+   GLuint volume_bits = 0;   /* volume filter bits for pp_txformat_x */
+
+   if ( aniso_active ) {
+      switch ( minf ) {
+      case GL_NEAREST:
+         filter_bits |= R200_MIN_FILTER_ANISO_NEAREST;
+         break;
+      case GL_LINEAR:
+         filter_bits |= R200_MIN_FILTER_ANISO_LINEAR;
+         break;
+      case GL_NEAREST_MIPMAP_NEAREST:
+      case GL_LINEAR_MIPMAP_NEAREST:
+         filter_bits |= R200_MIN_FILTER_ANISO_NEAREST_MIP_NEAREST;
+         break;
+      case GL_NEAREST_MIPMAP_LINEAR:
+      case GL_LINEAR_MIPMAP_LINEAR:
+         filter_bits |= R200_MIN_FILTER_ANISO_NEAREST_MIP_LINEAR;
+         break;
+      }
+   }
+   else {
+      switch ( minf ) {
+      case GL_NEAREST:
+         filter_bits |= R200_MIN_FILTER_NEAREST;
+         break;
+      case GL_LINEAR:
+         filter_bits |= R200_MIN_FILTER_LINEAR;
+         break;
+      case GL_NEAREST_MIPMAP_NEAREST:
+         filter_bits |= R200_MIN_FILTER_NEAREST_MIP_NEAREST;
+         break;
+      case GL_NEAREST_MIPMAP_LINEAR:
+         filter_bits |= R200_MIN_FILTER_LINEAR_MIP_NEAREST;
+         break;
+      case GL_LINEAR_MIPMAP_NEAREST:
+         filter_bits |= R200_MIN_FILTER_NEAREST_MIP_LINEAR;
+         break;
+      case GL_LINEAR_MIPMAP_LINEAR:
+         filter_bits |= R200_MIN_FILTER_LINEAR_MIP_LINEAR;
+         break;
+      }
+   }
+
+   /* Note we don't have 3D mipmaps so only use the mag filter setting
+    * to set the 3D texture filter mode.
+    */
+   if ( magf == GL_NEAREST ) {
+      filter_bits |= R200_MAG_FILTER_NEAREST;
+      volume_bits |= R200_VOLUME_FILTER_NEAREST;
+   }
+   else if ( magf == GL_LINEAR ) {
+      filter_bits |= R200_MAG_FILTER_LINEAR;
+      volume_bits |= R200_VOLUME_FILTER_LINEAR;
+   }
+
+   t->pp_txfilter = (t->pp_txfilter &
+                     ~(R200_MIN_FILTER_MASK | R200_MAG_FILTER_MASK))
+                    | filter_bits;
+   t->pp_txformat_x = (t->pp_txformat_x & ~R200_VOLUME_FILTER_MASK)
+                      | volume_bits;
+}
+
+/**
+ * Store the packed border colour of a texture object.
+ *
+ * \param t Texture object to update
+ * \param c Four colour components, passed to r200PackColor() in the
+ *          order c[0]..c[3]
+ */
+static void r200SetTexBorderColor( r200TexObjPtr t, GLubyte c[4] )
+{
+   const GLubyte c0 = c[0];
+   const GLubyte c1 = c[1];
+   const GLubyte c2 = c[2];
+   const GLubyte c3 = c[3];
+
+   /* NOTE(review): the leading 4 is presumably the bytes-per-pixel
+    * selector of r200PackColor() -- confirm against its definition.
+    */
+   t->pp_border_color = r200PackColor( 4, c0, c1, c2, c3 );
+}
+
+
+/**
+ * Allocate the driver-private texture object for a Mesa texture object.
+ *
+ * The new object is zero-initialised, stored in \c texObj->DriverData
+ * (which therefore becomes NULL when the allocation fails), and its wrap,
+ * anisotropy, filter and border-colour state is seeded from the current
+ * values in \p texObj.
+ *
+ * \param texObj Mesa texture object to attach the driver object to
+ * \return the new driver object, or NULL if the allocation failed
+ */
+
+static r200TexObjPtr r200AllocTexObj( struct gl_texture_object *texObj )
+{
+   r200TexObjPtr t = CALLOC_STRUCT( r200_tex_obj );
+
+   texObj->DriverData = t;
+   if ( t == NULL )
+      return NULL;
+
+   if ( R200_DEBUG & DEBUG_TEXTURE ) {
+      fprintf( stderr, "%s( %p, %p )\n", __FUNCTION__, (void *)texObj,
+               (void *)t );
+   }
+
+   /* Initialize non-image-dependent parts of the state:
+    */
+   t->base.tObj = texObj;
+   t->border_fallback = GL_FALSE;
+
+   make_empty_list( & t->base );
+
+   r200SetTexWrap( t, texObj->WrapS, texObj->WrapT, texObj->WrapR );
+   r200SetTexMaxAnisotropy( t, texObj->MaxAnisotropy );
+   r200SetTexFilter( t, texObj->MinFilter, texObj->MagFilter );
+   r200SetTexBorderColor( t, texObj->_BorderChan );
+
+   return t;
+}
+
+/**
+ * Choose a 32-bit RGBA texture format for the given source image layout.
+ *
+ * Tries to find a format which will only need a memcopy: the rgba8888 or
+ * rgba8888_rev Mesa formats are returned for the GL_RGBA / GL_ABGR_EXT
+ * source layouts listed below (taking host byte order into account for
+ * GL_UNSIGNED_BYTE data); everything else gets argb8888.
+ *
+ * \param srcFormat format of the source image
+ * \param srcType data type of the source image
+ */
+static const struct gl_texture_format *
+r200Choose8888TexFormat( GLenum srcFormat, GLenum srcType )
+{
+   /* Run-time byte order probe: the first byte of the integer 1. */
+   const GLuint probe = 1;
+   const GLubyte littleEndian = *((const GLubyte *) &probe);
+
+   const GLboolean is_rgba = (srcFormat == GL_RGBA);
+   const GLboolean is_abgr = (srcFormat == GL_ABGR_EXT);
+   const GLboolean is_8888 = (srcType == GL_UNSIGNED_INT_8_8_8_8);
+   const GLboolean is_8888_rev = (srcType == GL_UNSIGNED_INT_8_8_8_8_REV);
+   const GLboolean is_bytes = (srcType == GL_UNSIGNED_BYTE);
+
+   if ((is_rgba && (is_8888 || (is_bytes && !littleEndian))) ||
+       (is_abgr && (is_8888_rev || (is_bytes && littleEndian))))
+      return &_mesa_texformat_rgba8888;
+
+   if ((is_rgba && (is_8888_rev || (is_bytes && littleEndian))) ||
+       (is_abgr && (is_8888 || (is_bytes && !littleEndian))))
+      return &_mesa_texformat_rgba8888_rev;
+
+   return _dri_texformat_argb8888;
+}
+
+/**
+ * Pick the texture format used to store an image with the requested
+ * internal format.
+ *
+ * The choice depends on the configured texture depth
+ * (\c rmesa->texture_depth): DRI_CONF_TEXTURE_DEPTH_32 prefers 32-bit
+ * formats for the generic RGB/RGBA internal formats, and
+ * DRI_CONF_TEXTURE_DEPTH_FORCE_16 forces 16-bit formats even for the sized
+ * RGB8..RGB16 / RGBA8..RGBA16 internal formats.
+ *
+ * \param ctx GL context
+ * \param internalFormat requested internal format
+ * \param format format of the source image
+ * \param type data type of the source image
+ * \return the selected format, or NULL (after reporting a problem) for an
+ *         unexpected internal format
+ */
+static const struct gl_texture_format *
+r200ChooseTextureFormat( GLcontext *ctx, GLint internalFormat,
+                           GLenum format, GLenum type )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   const GLboolean prefer32 =
+       ( rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_32 );
+   const GLboolean only16 =
+       ( rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FORCE_16 );
+
+   switch ( internalFormat ) {
+   /* Generic RGBA: follow the source data type where it is a packed
+    * 16-bit or 10/10/10/2 type, otherwise follow the depth preference.
+    */
+   case 4:
+   case GL_RGBA:
+   case GL_COMPRESSED_RGBA:
+      if ( type == GL_UNSIGNED_INT_10_10_10_2 ||
+           type == GL_UNSIGNED_INT_2_10_10_10_REV ) {
+         if ( prefer32 )
+            return _dri_texformat_argb8888;
+         return _dri_texformat_argb1555;
+      }
+      if ( type == GL_UNSIGNED_SHORT_4_4_4_4 ||
+           type == GL_UNSIGNED_SHORT_4_4_4_4_REV )
+         return _dri_texformat_argb4444;
+      if ( type == GL_UNSIGNED_SHORT_5_5_5_1 ||
+           type == GL_UNSIGNED_SHORT_1_5_5_5_REV )
+         return _dri_texformat_argb1555;
+      if ( prefer32 )
+         return r200Choose8888TexFormat(format, type);
+      return _dri_texformat_argb4444;
+
+   /* Generic RGB */
+   case 3:
+   case GL_RGB:
+   case GL_COMPRESSED_RGB:
+      if ( type == GL_UNSIGNED_SHORT_4_4_4_4 ||
+           type == GL_UNSIGNED_SHORT_4_4_4_4_REV )
+         return _dri_texformat_argb4444;
+      if ( type == GL_UNSIGNED_SHORT_5_5_5_1 ||
+           type == GL_UNSIGNED_SHORT_1_5_5_5_REV )
+         return _dri_texformat_argb1555;
+      if ( type == GL_UNSIGNED_SHORT_5_6_5 ||
+           type == GL_UNSIGNED_SHORT_5_6_5_REV )
+         return _dri_texformat_rgb565;
+      if ( prefer32 )
+         return _dri_texformat_argb8888;
+      return _dri_texformat_rgb565;
+
+   /* Sized RGBA, 8 bits per channel or more */
+   case GL_RGBA8:
+   case GL_RGB10_A2:
+   case GL_RGBA12:
+   case GL_RGBA16:
+      if ( only16 )
+         return _dri_texformat_argb4444;
+      return r200Choose8888TexFormat(format, type);
+
+   case GL_RGBA4:
+   case GL_RGBA2:
+      return _dri_texformat_argb4444;
+
+   case GL_RGB5_A1:
+      return _dri_texformat_argb1555;
+
+   /* Sized RGB, 8 bits per channel or more */
+   case GL_RGB8:
+   case GL_RGB10:
+   case GL_RGB12:
+   case GL_RGB16:
+      if ( only16 )
+         return _dri_texformat_rgb565;
+      return _dri_texformat_argb8888;
+
+   case GL_RGB5:
+   case GL_RGB4:
+   case GL_R3_G3_B2:
+      return _dri_texformat_rgb565;
+
+   case GL_ALPHA:
+   case GL_ALPHA4:
+   case GL_ALPHA8:
+   case GL_ALPHA12:
+   case GL_ALPHA16:
+   case GL_COMPRESSED_ALPHA:
+      /* can't use a8 format since interpreting hw I8 as a8 would result
+       * in wrong rgb values (same as alpha value instead of 0).
+       */
+      return _dri_texformat_al88;
+
+   case 1:
+   case GL_LUMINANCE:
+   case GL_LUMINANCE4:
+   case GL_LUMINANCE8:
+   case GL_LUMINANCE12:
+   case GL_LUMINANCE16:
+   case GL_COMPRESSED_LUMINANCE:
+      return _dri_texformat_l8;
+
+   case 2:
+   case GL_LUMINANCE_ALPHA:
+   case GL_LUMINANCE4_ALPHA4:
+   case GL_LUMINANCE6_ALPHA2:
+   case GL_LUMINANCE8_ALPHA8:
+   case GL_LUMINANCE12_ALPHA4:
+   case GL_LUMINANCE12_ALPHA12:
+   case GL_LUMINANCE16_ALPHA16:
+   case GL_COMPRESSED_LUMINANCE_ALPHA:
+      return _dri_texformat_al88;
+
+   case GL_INTENSITY:
+   case GL_INTENSITY4:
+   case GL_INTENSITY8:
+   case GL_INTENSITY12:
+   case GL_INTENSITY16:
+   case GL_COMPRESSED_INTENSITY:
+      return _dri_texformat_i8;
+
+   case GL_YCBCR_MESA:
+      if ( type != GL_UNSIGNED_SHORT_8_8_APPLE &&
+           type != GL_UNSIGNED_BYTE )
+         return &_mesa_texformat_ycbcr_rev;
+      return &_mesa_texformat_ycbcr;
+
+   /* S3TC compressed formats */
+   case GL_RGB_S3TC:
+   case GL_RGB4_S3TC:
+   case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
+      return &_mesa_texformat_rgb_dxt1;
+
+   case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
+      return &_mesa_texformat_rgba_dxt1;
+
+   case GL_RGBA_S3TC:
+   case GL_RGBA4_S3TC:
+   case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
+      return &_mesa_texformat_rgba_dxt3;
+
+   case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
+      return &_mesa_texformat_rgba_dxt5;
+
+   default:
+      _mesa_problem(ctx,
+         "unexpected internalFormat 0x%x in r200ChooseTextureFormat",
+         (int) internalFormat);
+      return NULL;
+   }
+
+   return NULL; /* never get here */
+}
+
+
+/**
+ * Decide whether the application's pixel buffer can be used directly as
+ * the storage of a texture image, and set the image up that way if so.
+ *
+ * All of the following must hold: client storage is enabled in
+ * \c ctx->Unpack; no image transfer state is active; the image is not
+ * compressed; mipmap generation is off; the internalFormat/format/type
+ * combination is one of the few handled below; \p packing uses no
+ * SkipPixels, SkipRows, SwapBytes or LsbFirst; r200IsGartMemory() accepts
+ * the buffer; and the source row stride is a multiple of 64.
+ *
+ * On success \c texImage->Data points at \p pixels, \c IsClientData is set
+ * and \c RowStride is expressed in texels.  Note that \c texImage->TexFormat
+ * is assigned as soon as the format combination matches, so it may already
+ * have been changed when a later check makes the function return false.
+ *
+ * \return GL_TRUE (1) if client storage is used, GL_FALSE (0) otherwise
+ */
+static GLboolean
+r200ValidateClientStorage( GLcontext *ctx, GLenum target,
+			   GLint internalFormat,
+			   GLint srcWidth, GLint srcHeight, 
+                           GLenum format, GLenum type,  const void *pixels,
+			   const struct gl_pixelstore_attrib *packing,
+			   struct gl_texture_object *texObj,
+			   struct gl_texture_image *texImage)
+
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLint rowStride;
+
+   if (0)
+      fprintf(stderr, "intformat %s format %s type %s\n",
+              _mesa_lookup_enum_by_nr( internalFormat ),
+              _mesa_lookup_enum_by_nr( format ),
+              _mesa_lookup_enum_by_nr( type ));
+
+   if (!ctx->Unpack.ClientStorage)
+      return GL_FALSE;
+
+   if (ctx->_ImageTransferState)
+      return GL_FALSE;
+   if (texImage->IsCompressed)
+      return GL_FALSE;
+   if (texObj->GenerateMipmap)
+      return GL_FALSE;
+
+   /* This list is incomplete, may be different on ppc???
+    */
+   switch ( internalFormat ) {
+   case GL_RGBA:
+      if ( format != GL_BGRA || type != GL_UNSIGNED_INT_8_8_8_8_REV )
+         return GL_FALSE;
+      texImage->TexFormat = _dri_texformat_argb8888;
+      break;
+
+   case GL_RGB:
+      if ( format != GL_RGB || type != GL_UNSIGNED_SHORT_5_6_5 )
+         return GL_FALSE;
+      texImage->TexFormat = _dri_texformat_rgb565;
+      break;
+
+   case GL_YCBCR_MESA:
+      if ( format != GL_YCBCR_MESA )
+         return GL_FALSE;
+      if ( type == GL_UNSIGNED_SHORT_8_8_REV_APPLE )
+         texImage->TexFormat = &_mesa_texformat_ycbcr_rev;
+      else if ( type == GL_UNSIGNED_SHORT_8_8_APPLE ||
+                type == GL_UNSIGNED_BYTE )
+         texImage->TexFormat = &_mesa_texformat_ycbcr;
+      else
+         return GL_FALSE;
+      break;
+
+   default:
+      return GL_FALSE;
+   }
+
+   /* Could deal with these packing issues, but currently don't:
+    */
+   if (packing->SkipPixels)
+      return GL_FALSE;
+   if (packing->SkipRows)
+      return GL_FALSE;
+   if (packing->SwapBytes)
+      return GL_FALSE;
+   if (packing->LsbFirst)
+      return GL_FALSE;
+
+   rowStride = _mesa_image_row_stride(packing, srcWidth, format, type);
+
+   if (0)
+      fprintf(stderr, "%s: srcRowStride %d/%x\n",
+              __FUNCTION__, rowStride, rowStride);
+
+   /* Could check this later in upload, pitch restrictions could be
+    * relaxed, but would need to store the image pitch somewhere,
+    * as packing details might change before image is uploaded:
+    */
+   if (!r200IsGartMemory( rmesa, pixels, srcHeight * rowStride ))
+      return GL_FALSE;
+   if (rowStride & 63)
+      return GL_FALSE;
+
+   /* Have validated that _mesa_transfer_teximage would be a straight
+    * memcpy at this point.  NOTE: future calls to TexSubImage will
+    * overwrite the client data.  This is explicitly mentioned in the
+    * extension spec.
+    */
+   texImage->Data = (void *)pixels;
+   texImage->IsClientData = GL_TRUE;
+   texImage->RowStride = rowStride / texImage->TexFormat->TexelBytes;
+
+   return GL_TRUE;
+}
+
+
+/**
+ * TexImage1D driver hook: store a new 1D image and mark the level dirty.
+ *
+ * An existing driver texture object is swapped out first; otherwise one is
+ * allocated, and GL_OUT_OF_MEMORY is raised if that fails.
+ *
+ * NOTE(review): the \p packing argument is not used; \c &ctx->Unpack is
+ * passed to _mesa_store_teximage1d() instead.
+ */
+static void r200TexImage1D( GLcontext *ctx, GLenum target, GLint level,
+                              GLint internalFormat,
+                              GLint width, GLint border,
+                              GLenum format, GLenum type, const GLvoid *pixels,
+                              const struct gl_pixelstore_attrib *packing,
+                              struct gl_texture_object *texObj,
+                              struct gl_texture_image *texImage )
+{
+   driTextureObject * driObj = (driTextureObject *) texObj->DriverData;
+
+   if ( driObj == NULL ) {
+      driObj = (driTextureObject *) r200AllocTexObj( texObj );
+      if ( driObj == NULL ) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage1D");
+         return;
+      }
+   }
+   else {
+      driSwapOutTextureObject( driObj );
+   }
+
+   /* Note, this will call ChooseTextureFormat */
+   _mesa_store_teximage1d(ctx, target, level, internalFormat,
+                          width, border, format, type, pixels,
+                          &ctx->Unpack, texObj, texImage);
+
+   driObj->dirty_images[0] |= (1 << level);
+}
+
+
+/**
+ * TexSubImage1D driver hook: update part of a 1D image and mark the level
+ * dirty.
+ *
+ * A driver texture object is expected to exist already (asserted).  If
+ * assertions are compiled out and it is missing, one is allocated, and
+ * GL_OUT_OF_MEMORY is raised if that fails.
+ */
+static void r200TexSubImage1D( GLcontext *ctx, GLenum target, GLint level,
+                                 GLint xoffset,
+                                 GLsizei width,
+                                 GLenum format, GLenum type,
+                                 const GLvoid *pixels,
+                                 const struct gl_pixelstore_attrib *packing,
+                                 struct gl_texture_object *texObj,
+                                 struct gl_texture_image *texImage )
+{
+   driTextureObject * driObj = (driTextureObject *) texObj->DriverData;
+
+   assert( driObj ); /* this _should_ be true */
+   if ( driObj == NULL ) {
+      driObj = (driTextureObject *) r200AllocTexObj( texObj );
+      if ( driObj == NULL ) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage1D");
+         return;
+      }
+   }
+   else {
+      driSwapOutTextureObject( driObj );
+   }
+
+   _mesa_store_texsubimage1d(ctx, target, level, xoffset, width,
+                             format, type, pixels, packing, texObj,
+                             texImage);
+
+   driObj->dirty_images[0] |= (1 << level);
+}
+
+
+/**
+ * TexImage2D driver hook (also used for the six cube map faces).
+ *
+ * An existing driver texture object is swapped out first; otherwise one is
+ * allocated, and GL_OUT_OF_MEMORY is raised if that fails.  The image is
+ * then either bound directly to the client's buffer (when
+ * r200ValidateClientStorage() accepts it) or stored through
+ * _mesa_store_teximage2d(), in which case the level is marked dirty for
+ * the affected face.
+ *
+ * NOTE(review): the normal path passes \c &ctx->Unpack rather than
+ * \p packing to _mesa_store_teximage2d().
+ */
+static void r200TexImage2D( GLcontext *ctx, GLenum target, GLint level,
+                              GLint internalFormat,
+                              GLint width, GLint height, GLint border,
+                              GLenum format, GLenum type, const GLvoid *pixels,
+                              const struct gl_pixelstore_attrib *packing,
+                              struct gl_texture_object *texObj,
+                              struct gl_texture_image *texImage )
+{
+   driTextureObject * driObj = (driTextureObject *) texObj->DriverData;
+   GLuint face = 0;   /* ordinary 2D image unless a cube face is named */
+
+   /* The six cube face targets are consecutive enum values starting at
+    * GL_TEXTURE_CUBE_MAP_POSITIVE_X.
+    */
+   if ( target >= GL_TEXTURE_CUBE_MAP_POSITIVE_X &&
+        target <= GL_TEXTURE_CUBE_MAP_NEGATIVE_Z ) {
+      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+      ASSERT(face < 6);
+   }
+
+   if ( driObj == NULL ) {
+      driObj = (driTextureObject *) r200AllocTexObj( texObj );
+      if ( driObj == NULL ) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage2D");
+         return;
+      }
+   }
+   else {
+      driSwapOutTextureObject( driObj );
+   }
+
+   texImage->IsClientData = GL_FALSE;
+
+   if (r200ValidateClientStorage( ctx, target,
+                                  internalFormat,
+                                  width, height,
+                                  format, type, pixels,
+                                  packing, texObj, texImage)) {
+      if (R200_DEBUG & DEBUG_TEXTURE)
+         fprintf(stderr, "%s: Using client storage\n", __FUNCTION__); 
+      return;
+   }
+
+   if (R200_DEBUG & DEBUG_TEXTURE)
+      fprintf(stderr, "%s: Using normal storage\n", __FUNCTION__); 
+
+   /* Normal path: copy (to cached memory) and eventually upload
+    * via another copy to GART memory and then a blit...  Could
+    * eliminate one copy by going straight to (permanent) GART.
+    *
+    * Note, this will call r200ChooseTextureFormat.
+    */
+   _mesa_store_teximage2d(ctx, target, level, internalFormat,
+                          width, height, border, format, type, pixels,
+                          &ctx->Unpack, texObj, texImage);
+
+   driObj->dirty_images[face] |= (1 << level);
+}
+
+
+/**
+ * TexSubImage2D driver hook (also used for the six cube map faces):
+ * update part of an image and mark the level dirty for the affected face.
+ *
+ * A driver texture object is expected to exist already (asserted).  If
+ * assertions are compiled out and it is missing, one is allocated, and
+ * GL_OUT_OF_MEMORY is raised if that fails.
+ */
+static void r200TexSubImage2D( GLcontext *ctx, GLenum target, GLint level,
+                                 GLint xoffset, GLint yoffset,
+                                 GLsizei width, GLsizei height,
+                                 GLenum format, GLenum type,
+                                 const GLvoid *pixels,
+                                 const struct gl_pixelstore_attrib *packing,
+                                 struct gl_texture_object *texObj,
+                                 struct gl_texture_image *texImage )
+{
+   driTextureObject * driObj = (driTextureObject *) texObj->DriverData;
+   GLuint face = 0;   /* ordinary 2D image unless a cube face is named */
+
+   /* The six cube face targets are consecutive enum values starting at
+    * GL_TEXTURE_CUBE_MAP_POSITIVE_X.
+    */
+   if ( target >= GL_TEXTURE_CUBE_MAP_POSITIVE_X &&
+        target <= GL_TEXTURE_CUBE_MAP_NEGATIVE_Z ) {
+      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+      ASSERT(face < 6);
+   }
+
+   assert( driObj ); /* this _should_ be true */
+   if ( driObj == NULL ) {
+      driObj = (driTextureObject *) r200AllocTexObj( texObj );
+      if ( driObj == NULL ) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage2D");
+         return;
+      }
+   }
+   else {
+      driSwapOutTextureObject( driObj );
+   }
+
+   _mesa_store_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
+                             height, format, type, pixels, packing, texObj,
+                             texImage);
+
+   driObj->dirty_images[face] |= (1 << level);
+}
+
+
+/**
+ * CompressedTexImage2D driver hook (also used for the six cube map
+ * faces): store a compressed image and mark the level dirty for the
+ * affected face.
+ *
+ * An existing driver texture object is swapped out first; otherwise one is
+ * allocated, and GL_OUT_OF_MEMORY is raised if that fails.
+ *
+ * Client storage is never attempted here: r200ValidateClientStorage()
+ * takes format/type/packing arguments this hook does not receive, and it
+ * rejects compressed images anyway.
+ */
+static void r200CompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
+                              GLint internalFormat,
+                              GLint width, GLint height, GLint border,
+                              GLsizei imageSize, const GLvoid *data,
+                              struct gl_texture_object *texObj,
+                              struct gl_texture_image *texImage )
+{
+   driTextureObject * driObj = (driTextureObject *) texObj->DriverData;
+   GLuint face = 0;   /* ordinary 2D image unless a cube face is named */
+
+   /* The six cube face targets are consecutive enum values starting at
+    * GL_TEXTURE_CUBE_MAP_POSITIVE_X.
+    */
+   if ( target >= GL_TEXTURE_CUBE_MAP_POSITIVE_X &&
+        target <= GL_TEXTURE_CUBE_MAP_NEGATIVE_Z ) {
+      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+      ASSERT(face < 6);
+   }
+
+   if ( driObj == NULL ) {
+      driObj = (driTextureObject *) r200AllocTexObj( texObj );
+      if ( driObj == NULL ) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexImage2D");
+         return;
+      }
+   }
+   else {
+      driSwapOutTextureObject( driObj );
+   }
+
+   texImage->IsClientData = GL_FALSE;
+
+   if (R200_DEBUG & DEBUG_TEXTURE)
+      fprintf(stderr, "%s: Using normal storage\n", __FUNCTION__);
+
+   /* Normal path: copy (to cached memory) and eventually upload
+    * via another copy to GART memory and then a blit...  Could
+    * eliminate one copy by going straight to (permanent) GART.
+    *
+    * Note, this will call r200ChooseTextureFormat.
+    */
+   _mesa_store_compressed_teximage2d(ctx, target, level, internalFormat, width,
+                              height, border, imageSize, data, texObj, texImage);
+
+   driObj->dirty_images[face] |= (1 << level);
+}
+
+
+/**
+ * CompressedTexSubImage2D driver hook (also used for the six cube map
+ * faces): update part of a compressed image and mark the level dirty for
+ * the affected face.
+ *
+ * A driver texture object is expected to exist already (asserted).  If
+ * assertions are compiled out and it is missing, one is allocated, and
+ * GL_OUT_OF_MEMORY is raised if that fails.
+ */
+static void r200CompressedTexSubImage2D( GLcontext *ctx, GLenum target, GLint level,
+                                 GLint xoffset, GLint yoffset,
+                                 GLsizei width, GLsizei height,
+                                 GLenum format,
+                                 GLsizei imageSize, const GLvoid *data,
+                                 struct gl_texture_object *texObj,
+                                 struct gl_texture_image *texImage )
+{
+   driTextureObject * driObj = (driTextureObject *) texObj->DriverData;
+   GLuint face = 0;   /* ordinary 2D image unless a cube face is named */
+
+   /* The six cube face targets are consecutive enum values starting at
+    * GL_TEXTURE_CUBE_MAP_POSITIVE_X.
+    */
+   if ( target >= GL_TEXTURE_CUBE_MAP_POSITIVE_X &&
+        target <= GL_TEXTURE_CUBE_MAP_NEGATIVE_Z ) {
+      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+      ASSERT(face < 6);
+   }
+
+   assert( driObj ); /* this _should_ be true */
+   if ( driObj == NULL ) {
+      driObj = (driTextureObject *) r200AllocTexObj( texObj );
+      if ( driObj == NULL ) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexSubImage2D");
+         return;
+      }
+   }
+   else {
+      driSwapOutTextureObject( driObj );
+   }
+
+   _mesa_store_compressed_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
+                            height, format, imageSize, data, texObj, texImage);
+
+   driObj->dirty_images[face] |= (1 << level);
+}
+
+
+#if ENABLE_HW_3D_TEXTURE
+/**
+ * TexImage3D driver hook: store a new 3D image and mark the level dirty.
+ *
+ * An existing driver texture object is swapped out first; otherwise one is
+ * allocated, and GL_OUT_OF_MEMORY is raised if that fails.  Client storage
+ * is not attempted for 3D images.
+ *
+ * NOTE(review): the \p packing argument is not used; \c &ctx->Unpack is
+ * passed to _mesa_store_teximage3d() instead.
+ */
+static void r200TexImage3D( GLcontext *ctx, GLenum target, GLint level,
+                            GLint internalFormat,
+                            GLint width, GLint height, GLint depth,
+                            GLint border,
+                            GLenum format, GLenum type, const GLvoid *pixels,
+                            const struct gl_pixelstore_attrib *packing,
+                            struct gl_texture_object *texObj,
+                            struct gl_texture_image *texImage )
+{
+   driTextureObject * driObj = (driTextureObject *) texObj->DriverData;
+
+   if ( driObj == NULL ) {
+      driObj = (driTextureObject *) r200AllocTexObj( texObj );
+      if ( driObj == NULL ) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage3D");
+         return;
+      }
+   }
+   else {
+      driSwapOutTextureObject( driObj );
+   }
+
+   texImage->IsClientData = GL_FALSE;
+
+   if (R200_DEBUG & DEBUG_TEXTURE)
+      fprintf(stderr, "%s: Using normal storage\n", __FUNCTION__); 
+
+   /* Normal path: copy (to cached memory) and eventually upload
+    * via another copy to GART memory and then a blit...  Could
+    * eliminate one copy by going straight to (permanent) GART.
+    *
+    * Note, this will call r200ChooseTextureFormat.
+    */
+   _mesa_store_teximage3d(ctx, target, level, internalFormat,
+                          width, height, depth, border,
+                          format, type, pixels,
+                          &ctx->Unpack, texObj, texImage);
+
+   driObj->dirty_images[0] |= (1 << level);
+}
+#endif
+
+
+#if ENABLE_HW_3D_TEXTURE
+/* glTexSubImage3D driver hook: update part of a 3D image through core
+ * Mesa and flag the mipmap level for a fresh upload. */
+static void
+r200TexSubImage3D( GLcontext *ctx, GLenum target, GLint level,
+                   GLint xoffset, GLint yoffset, GLint zoffset,
+                   GLsizei width, GLsizei height, GLsizei depth,
+                   GLenum format, GLenum type, const GLvoid *pixels,
+                   const struct gl_pixelstore_attrib *packing,
+                   struct gl_texture_object *texObj,
+                   struct gl_texture_image *texImage )
+{
+   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+
+   assert( t ); /* this _should_ be true */
+   if ( !t ) {
+      /* Only reachable when assertions are compiled out. */
+      t = (driTextureObject *) r200AllocTexObj( texObj );
+      if ( !t ) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage3D");
+         return;
+      }
+      texObj->DriverData = t;
+   }
+   else {
+      driSwapOutTextureObject( t );
+   }
+
+   _mesa_store_texsubimage3d(ctx, target, level, xoffset, yoffset, zoffset,
+                             width, height, depth,
+                             format, type, pixels, packing, texObj, texImage);
+
+   t->dirty_images[0] |= (1 << level);
+}
+#endif
+
+
+
+/* glTexEnv driver hook: mirrors env colour, LOD bias and point-sprite
+ * coord replace of the current unit into rmesa->hw; other pnames: no-op. */
+static void r200TexEnv( GLcontext *ctx, GLenum target,
+			  GLenum pname, const GLfloat *param )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLuint unit = ctx->Texture.CurrentUnit;
+   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+
+   if ( R200_DEBUG & DEBUG_STATE )
+      fprintf( stderr, "%s( %s )\n",
+	       __FUNCTION__, _mesa_lookup_enum_by_nr( pname ) );
+
+   /* This is incorrect: Need to maintain this data for each of
+    * GL_TEXTURE_{123}D, GL_TEXTURE_RECTANGLE_NV, etc, and switch
+    * between them according to _ReallyEnabled. */
+   switch ( pname ) {
+   case GL_TEXTURE_ENV_COLOR: {
+      GLubyte c[4];
+      GLuint envColor;
+      UNCLAMPED_FLOAT_TO_RGBA_CHAN( c, texUnit->EnvColor );
+      envColor = r200PackColor( 4, c[0], c[1], c[2], c[3] );
+      if ( rmesa->hw.tf.cmd[TF_TFACTOR_0 + unit] != envColor ) {
+	 R200_STATECHANGE( rmesa, tf );
+	 rmesa->hw.tf.cmd[TF_TFACTOR_0 + unit] = envColor;
+      }
+      break;
+   }
+   case GL_TEXTURE_LOD_BIAS_EXT: {
+      GLfloat bias, min;
+      GLuint b;
+      const int fixed_one = 0x8000000;
+
+      /* The R200's LOD bias is a signed 2's complement value with a
+       * range of -16.0 <= bias < 16.0. 
+       *
+       * NOTE: Add a small bias to the bias for conform mipsel.c test.
+       */
+      bias = *param + .01;
+      min = driQueryOptionb (&rmesa->optionCache, "no_neg_lod_bias") ?
+	  0.0 : -16.0;
+      bias = CLAMP( bias, min, 16.0 );
+      /* 16.0 * fixed_one is 2^31, which no int can hold (the conversion is
+       * undefined), so saturate at the largest positive bias instead. */
+      b = ((bias < 16.0) ? (int)(bias * fixed_one) : 0x7fffffff)
+	  & R200_LOD_BIAS_MASK;
+      if ( (rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT_X] & R200_LOD_BIAS_MASK) != b ) {
+	 R200_STATECHANGE( rmesa, tex[unit] );
+	 rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT_X] &= ~R200_LOD_BIAS_MASK;
+	 rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT_X] |= b;
+      }
+      break;
+   }
+   case GL_COORD_REPLACE_ARB:
+      if (ctx->Point.PointSprite) {
+	 R200_STATECHANGE( rmesa, spr );
+	 if ((GLenum)param[0])
+	    rmesa->hw.spr.cmd[SPR_POINT_SPRITE_CNTL] |= R200_PS_GEN_TEX_0 << unit;
+	 else
+	    rmesa->hw.spr.cmd[SPR_POINT_SPRITE_CNTL] &= ~(R200_PS_GEN_TEX_0 << unit);
+      }
+      break;
+   default:
+      return;
+   }
+}
+
+
+/**
+ * Changes variables and flags for a state update, which will happen at the
+ * next UpdateTextureState
+ */
+
+static void r200TexParameter( GLcontext *ctx, GLenum target,
+				struct gl_texture_object *texObj,
+				GLenum pname, const GLfloat *params )
+{
+   r200TexObjPtr rtex = (r200TexObjPtr) texObj->DriverData;
+
+   if ( R200_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) )
+      fprintf( stderr, "%s( %s )\n", __FUNCTION__,
+	       _mesa_lookup_enum_by_nr( pname ) );
+
+   /* Refresh the driver-side copy of whichever parameter group changed;
+    * the current values are always re-read from texObj, not from params.
+    */
+   switch ( pname ) {
+   case GL_TEXTURE_BORDER_COLOR:
+      r200SetTexBorderColor( rtex, texObj->_BorderChan );
+      break;
+
+   case GL_TEXTURE_WRAP_S:
+   case GL_TEXTURE_WRAP_T:
+   case GL_TEXTURE_WRAP_R:
+      r200SetTexWrap( rtex, texObj->WrapS, texObj->WrapT, texObj->WrapR );
+      break;
+
+   case GL_TEXTURE_MIN_FILTER:
+   case GL_TEXTURE_MAG_FILTER:
+   case GL_TEXTURE_MAX_ANISOTROPY_EXT:
+      r200SetTexMaxAnisotropy( rtex, texObj->MaxAnisotropy );
+      r200SetTexFilter( rtex, texObj->MinFilter, texObj->MagFilter );
+      break;
+
+   case GL_TEXTURE_BASE_LEVEL:
+   case GL_TEXTURE_MAX_LEVEL:
+   case GL_TEXTURE_MIN_LOD:
+   case GL_TEXTURE_MAX_LOD:
+      /* There's no LOD clamping, so a clamped LOD is simulated by loading
+       * only the right subset of mipmap levels.  This isn't the most
+       * efficient solution, but there doesn't appear to be a nice
+       * alternative.
+       */
+      driSwapOutTextureObject( (driTextureObject *) rtex );
+      break;
+
+   default:
+      return;
+   }
+
+   rtex->dirty_state = TEX_ALL;  /* mark texobj dirty (one bit per tex unit) */
+}
+
+
+
+static void r200BindTexture( GLcontext *ctx, GLenum target,
+			       struct gl_texture_object *texObj )
+{
+   if ( R200_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) )
+      fprintf( stderr, "%s( %p ) unit=%d\n", __FUNCTION__, (void *)texObj,
+	       ctx->Texture.CurrentUnit );
+   /* Sanity check only: these targets must already carry driver data. */
+   switch ( target ) {
+   case GL_TEXTURE_1D:
+   case GL_TEXTURE_2D:
+#if ENABLE_HW_3D_TEXTURE
+   case GL_TEXTURE_3D:
+#endif
+   case GL_TEXTURE_CUBE_MAP:
+   case GL_TEXTURE_RECTANGLE_NV:
+      assert( texObj->DriverData != NULL );
+   }
+}
+
+
+static void r200DeleteTexture( GLcontext *ctx,
+				 struct gl_texture_object *texObj )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   driTextureObject *drvTex = (driTextureObject *) texObj->DriverData;
+
+   if ( R200_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) )
+      fprintf( stderr, "%s( %p (target = %s) )\n", __FUNCTION__, (void *)texObj,
+	       _mesa_lookup_enum_by_nr( texObj->Target ) );
+
+   /* Tear down the driver-side object first, if there is one. */
+   if ( drvTex ) {
+      if ( rmesa ) {
+         R200_FIREVERTICES( rmesa );
+      }
+      driDestroyTextureObject( drvTex );
+   }
+
+   /* Free mipmap images and the texture object itself */
+   _mesa_delete_texture_object(ctx, texObj);
+}
+
+/* Need:  
+ *  - Same GEN_MODE for all active bits
+ *  - Same EyePlane/ObjPlane for all active bits when using Eye/Obj
+ *  - STRQ presumably all supported (matrix means incoming R values
+ *    can end up in STQ, this has implications for vertex support,
+ *    presumably ok if maos is used, though?)
+ *  
+ * Basically impossible to do this on the fly - just collect some
+ * basic info & do the checks from ValidateState().
+ */
+static void r200TexGen( GLcontext *ctx, GLenum coord, GLenum pname,
+			  const GLfloat *params )
+{
+   /* No validation is done here: only request a texgen re-check for the
+    * currently active unit (coord, pname and params are not looked at). */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   const GLuint curUnit = ctx->Texture.CurrentUnit;
+   rmesa->recheck_texgen[curUnit] = GL_TRUE;
+}
+
+
+/**
+ * Allocate a new texture object (ctx->Driver.NewTextureObject hook; also
+ * called during context creation for the default texture objects).
+ * MaxAnisotropy is fixed up according to user preference.
+ * \return the new object, or NULL when either allocation fails.
+ */
+static struct gl_texture_object *
+r200NewTextureObject( GLcontext *ctx, GLuint name, GLenum target )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   struct gl_texture_object *obj = _mesa_new_texture_object(ctx, name, target);
+   if (!obj)
+      return NULL;
+   obj->MaxAnisotropy = rmesa->initialMaxAnisotropy;
+   if (!r200AllocTexObj( obj )) {
+      /* Never hand out an object without driver data: undo and fail. */
+      _mesa_delete_texture_object(ctx, obj);
+      return NULL;
+   }
+   return obj;
+}
+
+
+void r200InitTextureFuncs( struct dd_function_table *functions )
+{
+   /* _mesa_init_driver_functions() was already called, so only the
+    * functions implemented in this driver are plugged in here.
+    */
+   functions->ChooseTextureFormat	= r200ChooseTextureFormat;
+   /* Texture object management */
+   functions->NewTextureObject		= r200NewTextureObject;
+   functions->BindTexture		= r200BindTexture;
+   functions->DeleteTexture		= r200DeleteTexture;
+   functions->IsTextureResident		= driIsTextureResident;
+
+   /* Image specification */
+   functions->TexImage1D		= r200TexImage1D;
+   functions->TexImage2D		= r200TexImage2D;
+   functions->TexSubImage1D		= r200TexSubImage1D;
+   functions->TexSubImage2D		= r200TexSubImage2D;
+   functions->CompressedTexImage2D	= r200CompressedTexImage2D;
+   functions->CompressedTexSubImage2D	= r200CompressedTexSubImage2D;
+#if ENABLE_HW_3D_TEXTURE
+   functions->TexImage3D		= r200TexImage3D;
+   functions->TexSubImage3D		= r200TexSubImage3D;
+#else
+   functions->TexImage3D		= _mesa_store_teximage3d;
+   functions->TexSubImage3D		= _mesa_store_texsubimage3d;
+#endif
+
+   /* Texture state */
+   functions->TexEnv			= r200TexEnv;
+   functions->TexParameter		= r200TexParameter;
+   functions->TexGen			= r200TexGen;
+
+   driInitTextureFormats();
+
+#if 000
+   /* moved or obsolete code */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   driInitTextureObjects( ctx, & rmesa->swapped,
+			  DRI_TEXMGR_DO_TEXTURE_1D
+			  | DRI_TEXMGR_DO_TEXTURE_2D );
+
+   /* Hack: r200NewTextureObject is not yet installed when the
+    * default textures are created. Therefore set MaxAnisotropy of the
+    * default 2D texture now. */
+   ctx->Shared->Default2D->MaxAnisotropy = driQueryOptionf (&rmesa->optionCache,
+							    "def_max_anisotropy");
+#endif
+}
diff --git a/r200/r200_tex.h b/r200/r200_tex.h
new file mode 100644
index 0000000..68e9a0e
--- /dev/null
+++ b/r200/r200_tex.h
@@ -0,0 +1,55 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_tex.h,v 1.1 2002/10/30 12:51:53 alanh Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef __R200_TEX_H__
+#define __R200_TEX_H__
+
+extern void r200SetTexOffset(__DRIcontext *pDRICtx, GLint texname,
+			     unsigned long long offset, GLint depth,
+			     GLuint pitch);
+
+extern void r200UpdateTextureState( GLcontext *ctx );
+
+extern int r200UploadTexImages( r200ContextPtr rmesa, r200TexObjPtr t, GLuint face );
+
+extern void r200DestroyTexObj( r200ContextPtr rmesa, r200TexObjPtr t );
+
+extern void r200InitTextureFuncs( struct dd_function_table *functions );
+
+extern void r200UpdateFragmentShader( GLcontext *ctx );
+
+extern void set_re_cntl_d3d( GLcontext *ctx, int unit, GLboolean use_d3d );
+
+#endif /* __R200_TEX_H__ */
diff --git a/r200/r200_texmem.c b/r200/r200_texmem.c
new file mode 100644
index 0000000..9daafcf
--- /dev/null
+++ b/r200/r200_texmem.c
@@ -0,0 +1,531 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_texmem.c,v 1.5 2002/12/17 00:32:56 dawes Exp $ */
+/**************************************************************************
+
+Copyright (C) Tungsten Graphics 2002.  All Rights Reserved.  
+The Weather Channel, Inc. funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86
+license. This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation on the rights to use, copy, modify, merge, publish,
+distribute, sub license, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NON-INFRINGEMENT. IN NO EVENT SHALL ATI, VA LINUX SYSTEMS AND/OR THEIR
+SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Kevin E. Martin <martin@valinux.com>
+ *   Gareth Hughes <gareth@valinux.com>
+ *
+ */
+ 
+#include <errno.h>
+
+#include "glheader.h"
+#include "imports.h"
+#include "context.h"
+#include "colormac.h"
+#include "macros.h"
+#include "r200_context.h"
+#include "r200_ioctl.h"
+#include "r200_tex.h"
+#include "radeon_reg.h"
+
+#include <unistd.h>  /* for usleep() */
+
+
+/**
+ * Destroy any device-dependent state associated with the texture.  This may
+ * include NULLing out hardware state that points to the texture.
+ */
+void
+r200DestroyTexObj( r200ContextPtr rmesa, r200TexObjPtr t )
+{
+   unsigned unit;
+
+   if ( R200_DEBUG & DEBUG_TEXTURE )
+      fprintf( stderr, "%s( %p, %p )\n", __FUNCTION__, 
+	       (void *)t, (void *)t->base.tObj );
+
+   if ( rmesa == NULL )
+      return;
+
+   /* Unhook the texture from every unit that still points at it. */
+   for ( unit = 0 ; unit < rmesa->glCtx->Const.MaxTextureUnits ; unit++ ) {
+      if ( t != rmesa->state.texture.unit[unit].texobj )
+	 continue;
+      rmesa->state.texture.unit[unit].texobj = NULL;
+      rmesa->hw.tex[unit].dirty = GL_FALSE;
+      rmesa->hw.cube[unit].dirty = GL_FALSE;
+   }
+}
+
+
+/* ------------------------------------------------------------
+ * Texture image conversions
+ */
+
+
+/* Blit a texture image that lives in client (GART) memory to the
+ * texture's buffer at t->bufAddr.
+ *
+ * rmesa     context used to emit the wait and blit commands
+ * t         driver texture object receiving the image
+ * texImage  Mesa image whose Data pointer is the blit source
+ * hwlevel   index into t->image[0][] describing the destination
+ * x, y      offset of the blit inside the level
+ * width, height   ignored: replaced by the full image size below
+ */
+static void r200UploadGARTClientSubImage( r200ContextPtr rmesa,
+					  r200TexObjPtr t, 
+					  struct gl_texture_image *texImage,
+					  GLint hwlevel,
+					  GLint x, GLint y, 
+					  GLint width, GLint height )
+{
+   const struct gl_texture_format *texFormat = texImage->TexFormat;
+   /* Source and destination share one pitch: a row of the base image. */
+   const GLuint pitch = t->image[0][0].width * texFormat->TexelBytes;
+   int blit_format;
+   int srcOffset;
+
+   /*
+    * XXX it appears that we always upload the full image, not a subimage.
+    * I.e. x==0, y==0, width=texWidth, height=texHeight.  If this is ever
+    * changed, the src pitch will have to change.
+    */
+   switch ( texFormat->TexelBytes ) {
+   case 1:  blit_format = R200_CP_COLOR_FORMAT_CI8;       break;
+   case 2:  blit_format = R200_CP_COLOR_FORMAT_RGB565;    break;
+   case 4:  blit_format = R200_CP_COLOR_FORMAT_ARGB8888;  break;
+   default:
+      /* Texel sizes without a matching blit format are not uploaded. */
+      return;
+   }
+
+   /* Remember where the data of this level came from. */
+   t->image[0][hwlevel].data = texImage->Data;
+   srcOffset = r200GartOffsetFromVirtual( rmesa, texImage->Data );
+
+   assert( srcOffset != ~0 );
+
+   /* Don't currently need to cope with small pitches?
+    */
+   width = texImage->Width;
+   height = texImage->Height;
+
+   /* The blit is bracketed by a 3D wait before and a 2D wait after. */
+   r200EmitWait( rmesa, RADEON_WAIT_3D );
+
+   r200EmitBlit( rmesa, blit_format, 
+		 pitch,
+		 srcOffset,
+		 pitch,
+		 t->bufAddr,
+		 x, y, 
+		 t->image[0][hwlevel].x + x,
+		 t->image[0][hwlevel].y + y, 
+		 width, height );
+
+   r200EmitWait( rmesa, RADEON_WAIT_2D );
+}
+
+static void r200UploadRectSubImage( r200ContextPtr rmesa,
+				    r200TexObjPtr t, 
+				    struct gl_texture_image *texImage,
+				    GLint x, GLint y, 
+				    GLint width, GLint height )
+{
+   const struct gl_texture_format *texFormat = texImage->TexFormat;
+   int blit_format, dstPitch, done;
+   /* Upload the image of a rectangle texture; x and y are never read. */
+   switch ( texFormat->TexelBytes ) {
+   case 1:
+      blit_format = R200_CP_COLOR_FORMAT_CI8;
+      break;
+   case 2:
+      blit_format = R200_CP_COLOR_FORMAT_RGB565;
+      break;
+   case 4:
+      blit_format = R200_CP_COLOR_FORMAT_ARGB8888;
+      break;
+   default:
+      return; /* no blit format for this texel size: upload nothing */
+   }
+
+   t->image[0][0].data = texImage->Data;
+
+   /* Currently don't need to cope with small pitches: the whole image is
+    * uploaded, whatever width/height the caller asked for. */
+   width = texImage->Width;
+   height = texImage->Height;
+   dstPitch = t->pp_txpitch + 32;
+   /* NOTE(review): pp_txpitch looks like it is kept as (pitch - 32). */
+   if (rmesa->prefer_gart_client_texturing && texImage->IsClientData) {
+      /* In this case, could also use GART texturing.  This is
+       * currently disabled, but has been tested & works.
+       */
+      if ( !t->image_override )
+         t->pp_txoffset = r200GartOffsetFromVirtual( rmesa, texImage->Data );
+      t->pp_txpitch = texImage->RowStride * texFormat->TexelBytes - 32;
+
+      if (R200_DEBUG & DEBUG_TEXTURE)
+	 fprintf(stderr, 
+		 "Using GART texturing for rectangular client texture\n");
+
+      /* Release FB memory allocated for this image:
+       */
+      /* FIXME This may not be correct as driSwapOutTextureObject sets
+       * FIXME dirty_images.  It may be fine, though.
+       */
+      if ( t->base.memBlock ) {
+	 driSwapOutTextureObject( (driTextureObject *) t );
+      }
+   }
+   else if (texImage->IsClientData) {
+      /* Data already in GART memory, with usable pitch: blit it from
+       * there to t->bufAddr in one go. */
+      GLuint srcPitch;
+      srcPitch = texImage->RowStride * texFormat->TexelBytes;
+      r200EmitBlit( rmesa, 
+		    blit_format, 
+		    srcPitch,
+		    r200GartOffsetFromVirtual( rmesa, texImage->Data ),   
+		    dstPitch, t->bufAddr,
+		    0, 0, 
+		    0, 0, 
+		    width, height );
+   }
+   else {
+      /* Data not in GART memory, or bad pitch: stage it through DMA
+       * regions, as many rows at a time as fit in RADEON_BUFFER_SIZE. */
+      for (done = 0; done < height ; ) {
+	 struct r200_dma_region region;
+	 int lines = MIN2( height - done, RADEON_BUFFER_SIZE / dstPitch );
+	 int src_pitch;
+	 char *tex;
+	 /* NOTE(review): lines is 0 if dstPitch > RADEON_BUFFER_SIZE; the loop then never advances. */
+         src_pitch = texImage->RowStride * texFormat->TexelBytes;
+
+	 tex = (char *)texImage->Data + done * src_pitch;
+
+	 memset(&region, 0, sizeof(region));
+	 r200AllocDmaRegion( rmesa, &region, lines * dstPitch, 1024 );
+
+	 /* Copy texdata to dma:
+	  */
+	 if (0)
+	    fprintf(stderr, "%s: src_pitch %d dst_pitch %d\n",
+		    __FUNCTION__, src_pitch, dstPitch);
+	 /* Equal pitches allow one bulk copy; otherwise repack row by row. */
+	 if (src_pitch == dstPitch) {
+	    memcpy( region.address + region.start, tex, lines * src_pitch );
+	 } 
+	 else {
+	    char *buf = region.address + region.start;
+	    int i;
+	    for (i = 0 ; i < lines ; i++) {
+	       memcpy( buf, tex, src_pitch ); /* NOTE(review): assumes src_pitch <= dstPitch */
+	       buf += dstPitch;
+	       tex += src_pitch;
+	    }
+	 }
+
+	 r200EmitWait( rmesa, RADEON_WAIT_3D );
+
+	 /* Blit to framebuffer: the staged rows go to row 'done' of the
+	  * buffer at t->bufAddr. */
+	 r200EmitBlit( rmesa,
+		       blit_format,
+		       dstPitch, GET_START( &region ),
+		       dstPitch | (t->tile_bits >> 16),
+		       t->bufAddr,
+		       0, 0,
+		       0, done,
+		       width, lines );
+	 
+	 r200EmitWait( rmesa, RADEON_WAIT_2D );
+
+	 r200ReleaseDmaRegion( rmesa, &region, __FUNCTION__ );
+	 done += lines;
+      }
+   }
+}
+
+
+/**
+ * Upload one mipmap image of texture \a t: hardware level \a hwlevel
+ * (GL level minus t->base.firstLevel) of cube face \a face.
+ */
+static void uploadSubImage( r200ContextPtr rmesa, r200TexObjPtr t, 
+			    GLint hwlevel,
+			    GLint x, GLint y, GLint width, GLint height,
+			    GLuint face )
+{
+   struct gl_texture_image *texImage = NULL;
+   GLuint offset;
+   GLint imageWidth, imageHeight;
+   GLint ret;
+   drm_radeon_texture_t tex;
+   drm_radeon_tex_image_t tmp;
+   const int level = hwlevel + t->base.firstLevel;
+
+   if ( R200_DEBUG & DEBUG_TEXTURE ) {
+      fprintf( stderr, "%s( %p, %p ) level/width/height/face = %d/%d/%d/%u\n", 
+	       __FUNCTION__, (void *)t, (void *)t->base.tObj,
+	       level, width, height, face );
+   }
+
+   ASSERT(face < 6);
+
+   /* Ensure we have a valid texture to upload */
+   if ( ( hwlevel < 0 ) || ( hwlevel >= RADEON_MAX_TEXTURE_LEVELS ) ) {
+      _mesa_problem(NULL, "bad texture level in %s", __FUNCTION__);
+      return;
+   }
+
+   texImage = t->base.tObj->Image[face][level];
+
+   if ( !texImage ) {
+      if ( R200_DEBUG & DEBUG_TEXTURE )
+	 fprintf( stderr, "%s: texImage %d is NULL!\n", __FUNCTION__, level );
+      return;
+   }
+   if ( !texImage->Data ) {
+      if ( R200_DEBUG & DEBUG_TEXTURE )
+	 fprintf( stderr, "%s: image data is NULL!\n", __FUNCTION__ );
+      return;
+   }
+   /* Rectangle and client-storage images have dedicated upload paths. */
+
+   if (t->base.tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+      assert(level == 0);
+      assert(hwlevel == 0);
+      if ( R200_DEBUG & DEBUG_TEXTURE )
+	 fprintf( stderr, "%s: image data is rectangular\n", __FUNCTION__);
+      r200UploadRectSubImage( rmesa, t, texImage, x, y, width, height );
+      return;
+   }
+   else if (texImage->IsClientData) {
+      if ( R200_DEBUG & DEBUG_TEXTURE )
+	 fprintf( stderr, "%s: image data is in GART client storage\n",
+		  __FUNCTION__);
+      r200UploadGARTClientSubImage( rmesa, t, texImage, hwlevel,
+				   x, y, width, height );
+      return;
+   }
+   else if ( R200_DEBUG & DEBUG_TEXTURE )
+      fprintf( stderr, "%s: image data is in normal memory\n",
+	       __FUNCTION__);
+      
+   /* Normal path: DRM_RADEON_TEXTURE ioctl; x/y/width/height unused below. */
+   imageWidth = texImage->Width;
+   imageHeight = texImage->Height;
+   /* Each cube face starts a further totalSize / 6 into the buffer. */
+   offset = t->bufAddr + t->base.totalSize / 6 * face;
+
+   if ( R200_DEBUG & (DEBUG_TEXTURE|DEBUG_IOCTL) ) {
+      GLint imageX = 0;
+      GLint imageY = 0;
+      GLint blitX = t->image[face][hwlevel].x;
+      GLint blitY = t->image[face][hwlevel].y;
+      GLint blitWidth = t->image[face][hwlevel].width;
+      GLint blitHeight = t->image[face][hwlevel].height;
+      fprintf( stderr, "   upload image: %d,%d at %d,%d\n",
+	       imageWidth, imageHeight, imageX, imageY );
+      fprintf( stderr, "   upload  blit: %d,%d at %d,%d\n",
+	       blitWidth, blitHeight, blitX, blitY );
+      fprintf( stderr, "       blit ofs: 0x%07x level: %d/%d\n",
+	       (GLuint)offset, hwlevel, level );
+   }
+
+   t->image[face][hwlevel].data = texImage->Data;
+
+   /* Init the DRM_RADEON_TEXTURE command / drm_radeon_texture_t struct.
+    * NOTE: we always use a 1KB-wide blit and I8 texture format.
+    * We used to use 1, 2 and 4-byte texels and used to use the texture
+    * width to dictate the blit width - but that won't work for compressed
+    * textures. (Brian)
+    * NOTE: can't do that with texture tiling. (sroland)
+    */
+   tex.offset = offset;
+   tex.image = &tmp;
+   /* copy (x,y,width,height,data) */
+   memcpy( &tmp, &t->image[face][hwlevel], sizeof(tmp) );
+   /* TexelBytes == 0 selects the compressed (DXT) branch further down. */
+   if (texImage->TexFormat->TexelBytes) {
+      /* use multi-byte upload scheme */
+      tex.height = imageHeight;
+      tex.width = imageWidth;
+      tex.format = t->pp_txformat & R200_TXFORMAT_FORMAT_MASK;
+      if (tex.format == R200_TXFORMAT_ABGR8888) {
+	 /* drm will refuse abgr8888 textures. */
+	 tex.format = R200_TXFORMAT_ARGB8888;
+      }
+      tex.pitch = MAX2((texImage->Width * texImage->TexFormat->TexelBytes) / 64, 1);
+      tex.offset += tmp.x & ~1023;
+      tmp.x = tmp.x % 1024;
+      if (t->tile_bits & R200_TXO_MICRO_TILE) {
+	 /* need something like "tiled coordinates" ? */
+	 tmp.y = tmp.x / (tex.pitch * 128) * 2;
+	 tmp.x = tmp.x % (tex.pitch * 128) / 2 / texImage->TexFormat->TexelBytes;
+	 tex.pitch |= RADEON_DST_TILE_MICRO >> 22;
+      }
+      else {
+	 tmp.x = tmp.x >> (texImage->TexFormat->TexelBytes >> 1);
+      }
+      if ((t->tile_bits & R200_TXO_MACRO_TILE) &&
+	 (texImage->Width * texImage->TexFormat->TexelBytes >= 256) &&
+	 ((!(t->tile_bits & R200_TXO_MICRO_TILE) && (texImage->Height >= 8)) ||
+	    (texImage->Height >= 16))) {
+	 /* weird: R200 disables macro tiling if mip width is smaller than 256 bytes,
+	    OR if height is smaller than 8 automatically, but if micro tiling is active
+	    the limit is height 16 instead ? */
+	 tex.pitch |= RADEON_DST_TILE_MACRO >> 22;
+      }
+   }
+   else {
+      /* In case of for instance 8x8 texture (2x2 dxt blocks), padding after the first two blocks is
+         needed (only with dxt1 since 2 dxt3/dxt5 blocks already use 32 Byte). */
+      /* set tex.height to 1/4 since 1 "macropixel" (dxt-block) has 4 real pixels. Needed
+         so the kernel module reads the right amount of data. */
+      tex.format = R200_TXFORMAT_I8; /* any 1-byte texel format */
+      tex.pitch = (BLIT_WIDTH_BYTES / 64);
+      tex.height = (imageHeight + 3) / 4;
+      tex.width = (imageWidth + 3) / 4;
+      switch (t->pp_txformat & R200_TXFORMAT_FORMAT_MASK) {
+      case R200_TXFORMAT_DXT1:
+           tex.width *= 8;
+           break;
+      case R200_TXFORMAT_DXT23:
+      case R200_TXFORMAT_DXT45:
+           tex.width *= 16;
+           break;
+      default:
+          fprintf(stderr, "unknown compressed tex format in uploadSubImage\n");
+      }
+   }
+   /* Keep retrying the ioctl for as long as it returns -EAGAIN. */
+   LOCK_HARDWARE( rmesa );
+   do {
+      ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_TEXTURE,
+                                 &tex, sizeof(drm_radeon_texture_t) );
+      if (ret) {
+	 if (R200_DEBUG & DEBUG_IOCTL)
+	    fprintf(stderr, "DRM_RADEON_TEXTURE:  again!\n");
+	 usleep(1);
+      }
+   } while ( ret == -EAGAIN );
+
+   UNLOCK_HARDWARE( rmesa );
+   /* Any remaining error is fatal: report the details and exit. */
+   if ( ret ) {
+      fprintf( stderr, "DRM_RADEON_TEXTURE: return = %d\n", ret );
+      fprintf( stderr, "   offset=0x%08x\n",
+	       offset );
+      fprintf( stderr, "   image width=%d height=%d\n",
+	       imageWidth, imageHeight );
+      fprintf( stderr, "    blit width=%d height=%d data=%p\n",
+	       t->image[face][hwlevel].width, t->image[face][hwlevel].height,
+	       t->image[face][hwlevel].data );
+      exit( 1 );
+   }
+}
+
+
+/**
+ * Upload the texture images associated with texture \a t.  This might
+ * require the allocation of texture memory.
+ * 
+ * \param rmesa Context pointer
+ * \param t Texture to be uploaded (NULL is tolerated and ignored)
+ * \param face Cube map face to be uploaded.  Zero for non-cube maps.
+ * \return 0 on success or nothing to do, -1 if no texture memory was found
+ */
+int r200UploadTexImages( r200ContextPtr rmesa, r200TexObjPtr t, GLuint face )
+{
+   int numLevels;
+
+   /* Reject NULL before anything below dereferences t. */
+   if ( !t )
+      return 0;
+   numLevels = t->base.lastLevel - t->base.firstLevel + 1;
+
+   if ( R200_DEBUG & (DEBUG_TEXTURE|DEBUG_IOCTL) ) {
+      fprintf( stderr, "%s( %p, %p ) sz=%d lvls=%d-%d\n", __FUNCTION__,
+	       (void *)rmesa->glCtx, (void *)t->base.tObj, t->base.totalSize,
+	       t->base.firstLevel, t->base.lastLevel );
+   }
+
+   if ( t->base.totalSize == 0 || t->image_override )
+      return 0;
+
+   if (R200_DEBUG & DEBUG_SYNC) {
+      fprintf(stderr, "%s: Syncing\n", __FUNCTION__ );
+      r200Finish( rmesa->glCtx );
+   }
+
+   LOCK_HARDWARE( rmesa );
+
+   if ( t->base.memBlock == NULL ) {
+      int heap = driAllocateTexture( rmesa->texture_heaps, rmesa->nr_heaps,
+				     (driTextureObject *) t );
+      if ( heap == -1 ) {
+	 UNLOCK_HARDWARE( rmesa );
+	 return -1;
+      }
+
+      /* Set the base offset of the texture image */
+      t->bufAddr = rmesa->r200Screen->texOffset[heap] 
+	   + t->base.memBlock->ofs;
+      t->pp_txoffset = t->bufAddr;
+
+      if (!(t->base.tObj->Image[0][0]->IsClientData)) {
+	 /* hope it's safe to add that here... */
+	 t->pp_txoffset |= t->tile_bits;
+      }
+
+      /* Mark this texobj as dirty on all units: */
+      t->dirty_state = TEX_ALL;
+   }
+
+   /* Let the world know we've used this memory recently. */
+   driUpdateTextureLRU( (driTextureObject *) t );
+   UNLOCK_HARDWARE( rmesa );
+
+   /* Upload any images that are new */
+   if (t->base.dirty_images[face]) {
+      int i;
+      for ( i = 0 ; i < numLevels ; i++ ) {
+         if ( (t->base.dirty_images[face] & (1 << (i+t->base.firstLevel))) != 0 ) {
+            uploadSubImage( rmesa, t, i, 0, 0, t->image[face][i].width,
+			    t->image[face][i].height, face );
+         }
+      }
+      t->base.dirty_images[face] = 0;
+   }
+
+   if (R200_DEBUG & DEBUG_SYNC) {
+      fprintf(stderr, "%s: Syncing\n", __FUNCTION__ );
+      r200Finish( rmesa->glCtx );
+   }
+
+   return 0;
+}
diff --git a/r200/r200_texstate.c b/r200/r200_texstate.c
new file mode 100644
index 0000000..d12c3bc
--- /dev/null
+++ b/r200/r200_texstate.c
@@ -0,0 +1,1954 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/r200/r200_texstate.c,v 1.3 2003/02/15 22:18:47 dawes Exp $ */
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "context.h"
+#include "macros.h"
+#include "texformat.h"
+#include "texobj.h"
+#include "enums.h"
+
+#include "r200_context.h"
+#include "r200_state.h"
+#include "r200_ioctl.h"
+#include "r200_swtcl.h"
+#include "r200_tex.h"
+#include "r200_tcl.h"
+
+/* Aliases that let the table macros below paste a MESA_FORMAT_* suffix directly onto R200_TXFORMAT_. */
+#define R200_TXFORMAT_A8        R200_TXFORMAT_I8
+#define R200_TXFORMAT_L8        R200_TXFORMAT_I8
+#define R200_TXFORMAT_AL88      R200_TXFORMAT_AI88
+#define R200_TXFORMAT_YCBCR     R200_TXFORMAT_YVYU422
+#define R200_TXFORMAT_YCBCR_REV R200_TXFORMAT_VYUY422
+#define R200_TXFORMAT_RGB_DXT1  R200_TXFORMAT_DXT1
+#define R200_TXFORMAT_RGBA_DXT1 R200_TXFORMAT_DXT1
+#define R200_TXFORMAT_RGBA_DXT3 R200_TXFORMAT_DXT23
+#define R200_TXFORMAT_RGBA_DXT5 R200_TXFORMAT_DXT45
+/* Designated-initializer helpers: each expands to one { format, filter } table entry indexed by MESA_FORMAT_<f>.  _INVALID stores the 0xffffffff sentinel that VALID_FORMAT looks for; note VALID_FORMAT always consults tx_table_be, whichever table is used afterwards. */
+#define _COLOR(f) \
+    [ MESA_FORMAT_ ## f ] = { R200_TXFORMAT_ ## f, 0 }
+#define _COLOR_REV(f) \
+    [ MESA_FORMAT_ ## f ## _REV ] = { R200_TXFORMAT_ ## f, 0 }
+#define _ALPHA(f) \
+    [ MESA_FORMAT_ ## f ] = { R200_TXFORMAT_ ## f | R200_TXFORMAT_ALPHA_IN_MAP, 0 }
+#define _ALPHA_REV(f) \
+    [ MESA_FORMAT_ ## f ## _REV ] = { R200_TXFORMAT_ ## f | R200_TXFORMAT_ALPHA_IN_MAP, 0 }
+#define _YUV(f) \
+    [ MESA_FORMAT_ ## f ] = { R200_TXFORMAT_ ## f, R200_YUV_TO_RGB }
+#define _INVALID(f) \
+    [ MESA_FORMAT_ ## f ] = { 0xffffffff, 0 }
+#define VALID_FORMAT(f) ( ((f) <= MESA_FORMAT_RGBA_DXT5) \
+			     && (tx_table_be[f].format != 0xffffffff) )
+/* One row of the texture-format tables: r200SetTexImages() ORs 'format' into pp_txformat and 'filter' into pp_txfilter. */
+struct tx_table {
+   GLuint format, filter;
+};
+/* Format table r200SetTexImages() selects when _mesa_little_endian() is false; it is also the table VALID_FORMAT inspects. */
+static const struct tx_table tx_table_be[] =
+{
+   [ MESA_FORMAT_RGBA8888 ] = { R200_TXFORMAT_ABGR8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   _ALPHA_REV(RGBA8888),
+   _ALPHA(ARGB8888),
+   _ALPHA_REV(ARGB8888),
+   _INVALID(RGB888), /* makes VALID_FORMAT reject RGB888 */
+   _COLOR(RGB565),
+   _COLOR_REV(RGB565),
+   _ALPHA(ARGB4444),
+   _ALPHA_REV(ARGB4444),
+   _ALPHA(ARGB1555),
+   _ALPHA_REV(ARGB1555),
+   _ALPHA(AL88),
+   _ALPHA_REV(AL88),
+   _ALPHA(A8),
+   _COLOR(L8),
+   _ALPHA(I8),
+   _INVALID(CI8),
+   _YUV(YCBCR),
+   _YUV(YCBCR_REV),
+   _INVALID(RGB_FXT1),
+   _INVALID(RGBA_FXT1),
+   _COLOR(RGB_DXT1),
+   _ALPHA(RGBA_DXT1),
+   _ALPHA(RGBA_DXT3),
+   _ALPHA(RGBA_DXT5),
+};
+/* Format table r200SetTexImages() selects when _mesa_little_endian() is true; r200SetTexOffset() reads it unconditionally. */
+static const struct tx_table tx_table_le[] =
+{
+   _ALPHA(RGBA8888),
+   [ MESA_FORMAT_RGBA8888_REV ] = { R200_TXFORMAT_ABGR8888 | R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   _ALPHA(ARGB8888),
+   _ALPHA_REV(ARGB8888),
+   [ MESA_FORMAT_RGB888 ] = { R200_TXFORMAT_ARGB8888, 0 }, /* used by r200SetTexOffset(); r200SetTexImages() never gets here because VALID_FORMAT rejects RGB888 */
+   _COLOR(RGB565),
+   _COLOR_REV(RGB565),
+   _ALPHA(ARGB4444),
+   _ALPHA_REV(ARGB4444),
+   _ALPHA(ARGB1555),
+   _ALPHA_REV(ARGB1555),
+   _ALPHA(AL88),
+   _ALPHA_REV(AL88),
+   _ALPHA(A8),
+   _COLOR(L8),
+   _ALPHA(I8),
+   _INVALID(CI8),
+   _YUV(YCBCR),
+   _YUV(YCBCR_REV),
+   _INVALID(RGB_FXT1),
+   _INVALID(RGBA_FXT1),
+   _COLOR(RGB_DXT1),
+   _ALPHA(RGBA_DXT1),
+   _ALPHA(RGBA_DXT3),
+   _ALPHA(RGBA_DXT5),
+};
+/* Drop three of the table-building helpers; _COLOR_REV, _ALPHA_REV and _YUV are left defined. */
+#undef _COLOR
+#undef _ALPHA
+#undef _INVALID
+
+/**
+ * Lays out a texture object in texture memory and derives its hardware
+ * state.  Fills the \c image[face][level] x/y/width/height blit rectangles
+ * for levels \c firstLevel..lastLevel, sets \c base.totalSize (six faces for
+ * cube maps) and \c tile_bits, and updates \c pp_txformat, \c pp_txformat_x,
+ * \c pp_txfilter, \c pp_txsize, \c pp_cubic_faces and \c pp_txpitch.
+ *
+ * \param rmesa Context pointer (only \c texmicrotile is read here)
+ * \param tObj GL texture object whose images are to be posted to
+ *                 hardware state.
+ */
+static void r200SetTexImages( r200ContextPtr rmesa,
+			      struct gl_texture_object *tObj )
+{
+   r200TexObjPtr t = (r200TexObjPtr)tObj->DriverData;
+   const struct gl_texture_image *baseImage = tObj->Image[0][tObj->BaseLevel];
+   GLint curOffset, blitWidth;
+   GLint i, texelBytes;
+   GLint numLevels;
+   GLint log2Width, log2Height, log2Depth;
+
+   /* Set the hardware texture format
+    */
+   if ( !t->image_override ) { /* with image_override set, the format bits written by r200SetTexOffset() are kept */
+      if ( VALID_FORMAT( baseImage->TexFormat->MesaFormat ) ) {
+	 const struct tx_table *table = _mesa_little_endian() ? tx_table_le :
+								tx_table_be;
+
+         t->pp_txformat &= ~(R200_TXFORMAT_FORMAT_MASK |
+                             R200_TXFORMAT_ALPHA_IN_MAP);
+         t->pp_txfilter &= ~R200_YUV_TO_RGB;
+
+	 t->pp_txformat |= table[ baseImage->TexFormat->MesaFormat ].format;
+	 t->pp_txfilter |= table[ baseImage->TexFormat->MesaFormat ].filter;
+      }
+      else {
+         _mesa_problem(NULL, "unexpected texture format in %s", __FUNCTION__);
+         return;
+      }
+   }
+
+   texelBytes = baseImage->TexFormat->TexelBytes;
+
+   /* Compute which mipmap levels we really want to send to the hardware.
+    */
+
+   driCalculateTextureFirstLastLevel( (driTextureObject *) t );
+   log2Width  = tObj->Image[0][t->base.firstLevel]->WidthLog2;
+   log2Height = tObj->Image[0][t->base.firstLevel]->HeightLog2;
+   log2Depth  = tObj->Image[0][t->base.firstLevel]->DepthLog2;
+
+   numLevels = t->base.lastLevel - t->base.firstLevel + 1;
+
+   assert(numLevels <= RADEON_MAX_TEXTURE_LEVELS);
+
+   /* Calculate mipmap offsets and dimensions for blitting (uploading)
+    * The idea is that we lay out the mipmap levels within a block of
+    * memory organized as a rectangle of width BLIT_WIDTH_BYTES.
+    */
+   curOffset = 0;
+   blitWidth = BLIT_WIDTH_BYTES;
+   t->tile_bits = 0;
+
+   /* figure out if this texture is suitable for tiling. */
+   if (texelBytes) { /* skipped when TexelBytes is 0, so tile_bits stays 0 */
+      if (rmesa->texmicrotile  && (tObj->Target != GL_TEXTURE_RECTANGLE_NV) &&
+      /* texrect might be able to use micro tiling too in theory? */
+	 (baseImage->Height > 1)) {
+	 /* allow 32 (bytes) x 1 mip (which will use two times the space
+	 the non-tiled version would use) max if base texture is large enough */
+	 if ((numLevels == 1) ||
+	   (((baseImage->Width * texelBytes / baseImage->Height) <= 32) &&
+	       (baseImage->Width * texelBytes > 64)) ||
+	    ((baseImage->Width * texelBytes / baseImage->Height) <= 16)) {
+	    t->tile_bits |= R200_TXO_MICRO_TILE;
+	 }
+      }
+      if (tObj->Target != GL_TEXTURE_RECTANGLE_NV) {
+	 /* we can set macro tiling even for small textures, they will be untiled anyway */
+	 t->tile_bits |= R200_TXO_MACRO_TILE;
+      }
+   }
+
+   for (i = 0; i < numLevels; i++) {
+      const struct gl_texture_image *texImage;
+      GLuint size;
+
+      texImage = tObj->Image[0][i + t->base.firstLevel];
+      if ( !texImage )
+	 break; /* stops at the first missing level; numLevels itself is not reduced */
+
+      /* find image size in bytes */
+      if (texImage->IsCompressed) {
+      /* need to calculate the size AFTER padding even though the texture is
+         submitted without padding.
+         Only handle pot textures currently - don't know if npot is even possible,
+         size calculation would certainly need (trivial) adjustments.
+         Align (and later pad) to 32byte, not sure what that 64byte blit width is
+         good for? */
+         if ((t->pp_txformat & R200_TXFORMAT_FORMAT_MASK) == R200_TXFORMAT_DXT1) {
+            /* RGB_DXT1/RGBA_DXT1, 8 bytes per block */
+            if ((texImage->Width + 3) < 8) /* width one block */
+               size = texImage->CompressedSize * 4;
+            else if ((texImage->Width + 3) < 16)
+               size = texImage->CompressedSize * 2;
+            else size = texImage->CompressedSize;
+         }
+         else /* DXT3/5, 16 bytes per block */
+            if ((texImage->Width + 3) < 8)
+               size = texImage->CompressedSize * 2;
+            else size = texImage->CompressedSize;
+      }
+      else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+	 size = ((texImage->Width * texelBytes + 63) & ~63) * texImage->Height;
+      }
+      else if (t->tile_bits & R200_TXO_MICRO_TILE) {
+	 /* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
+	    though the actual offset may be different (if texture is less than
+	    32 bytes width) to the untiled case */
+	 int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
+	 size = (w * ((texImage->Height + 1) / 2)) * texImage->Depth;
+	 blitWidth = MAX2(texImage->Width, 64 / texelBytes);
+      }
+      else {
+	 int w = (texImage->Width * texelBytes + 31) & ~31;
+	 size = w * texImage->Height * texImage->Depth;
+	 blitWidth = MAX2(texImage->Width, 64 / texelBytes);
+      }
+      assert(size > 0);
+
+      /* Align to 32-byte offset.  It is faster to do this unconditionally
+       * (no branch penalty).
+       */
+
+      curOffset = (curOffset + 0x1f) & ~0x1f;
+
+      if (texelBytes) {
+	 t->image[0][i].x = curOffset; /* fix x and y coords up later together with offset */
+	 t->image[0][i].y = 0;
+	 t->image[0][i].width = MIN2(size / texelBytes, blitWidth);
+	 t->image[0][i].height = (size / texelBytes) / t->image[0][i].width;
+      }
+      else {
+         t->image[0][i].x = curOffset % BLIT_WIDTH_BYTES;
+         t->image[0][i].y = curOffset / BLIT_WIDTH_BYTES;
+         t->image[0][i].width  = MIN2(size, BLIT_WIDTH_BYTES);
+         t->image[0][i].height = size / t->image[0][i].width;     
+      }
+
+#if 0
+      /* for debugging only and only  applicable to non-rectangle targets */
+      assert(size % t->image[0][i].width == 0);
+      assert(t->image[0][i].x == 0
+             || (size < BLIT_WIDTH_BYTES && t->image[0][i].height == 1));
+#endif
+
+      if (0)
+         fprintf(stderr,
+                 "level %d: %dx%d x=%d y=%d w=%d h=%d size=%d at %d\n",
+                 i, texImage->Width, texImage->Height,
+                 t->image[0][i].x, t->image[0][i].y,
+                 t->image[0][i].width, t->image[0][i].height, size, curOffset);
+
+      curOffset += size;
+
+   }
+
+   /* Align the total size of texture memory block.
+    */
+   t->base.totalSize = (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
+
+   /* Setup remaining cube face blits, if needed */
+   if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
+      const GLuint faceSize = t->base.totalSize;
+      GLuint face;
+      /* reuse face 0 x/y/width/height - just update the offset when uploading */
+      for (face = 1; face < 6; face++) {
+         for (i = 0; i < numLevels; i++) {
+            t->image[face][i].x =  t->image[0][i].x;
+            t->image[face][i].y =  t->image[0][i].y;
+            t->image[face][i].width  = t->image[0][i].width;
+            t->image[face][i].height = t->image[0][i].height;
+         }
+      }
+      t->base.totalSize = 6 * faceSize; /* total texmem needed */
+   }
+
+
+   /* Hardware state:
+    */
+   t->pp_txfilter &= ~R200_MAX_MIP_LEVEL_MASK;
+   t->pp_txfilter |= (numLevels - 1) << R200_MAX_MIP_LEVEL_SHIFT;
+
+   t->pp_txformat &= ~(R200_TXFORMAT_WIDTH_MASK |
+		       R200_TXFORMAT_HEIGHT_MASK |
+                       R200_TXFORMAT_CUBIC_MAP_ENABLE |
+                       R200_TXFORMAT_F5_WIDTH_MASK |
+                       R200_TXFORMAT_F5_HEIGHT_MASK);
+   t->pp_txformat |= ((log2Width << R200_TXFORMAT_WIDTH_SHIFT) |
+		      (log2Height << R200_TXFORMAT_HEIGHT_SHIFT));
+
+   t->pp_txformat_x &= ~(R200_DEPTH_LOG2_MASK | R200_TEXCOORD_MASK);
+   if (tObj->Target == GL_TEXTURE_3D) {
+      t->pp_txformat_x |= (log2Depth << R200_DEPTH_LOG2_SHIFT);
+      t->pp_txformat_x |= R200_TEXCOORD_VOLUME;
+   }
+   else if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
+      ASSERT(log2Width == log2Height);
+      t->pp_txformat |= ((log2Width << R200_TXFORMAT_F5_WIDTH_SHIFT) |
+                         (log2Height << R200_TXFORMAT_F5_HEIGHT_SHIFT) |
+/* don't think we need this bit, if it exists at all - fglrx does not set it */
+                         (R200_TXFORMAT_CUBIC_MAP_ENABLE));
+      t->pp_txformat_x |= R200_TEXCOORD_CUBIC_ENV;
+      t->pp_cubic_faces = ((log2Width << R200_FACE_WIDTH_1_SHIFT) |
+                           (log2Height << R200_FACE_HEIGHT_1_SHIFT) |
+                           (log2Width << R200_FACE_WIDTH_2_SHIFT) |
+                           (log2Height << R200_FACE_HEIGHT_2_SHIFT) |
+                           (log2Width << R200_FACE_WIDTH_3_SHIFT) |
+                           (log2Height << R200_FACE_HEIGHT_3_SHIFT) |
+                           (log2Width << R200_FACE_WIDTH_4_SHIFT) |
+                           (log2Height << R200_FACE_HEIGHT_4_SHIFT));
+   }
+   else {
+      /* If we don't in fact send enough texture coordinates, q will be 1,
+       * making TEXCOORD_PROJ act like TEXCOORD_NONPROJ (Right?)
+       */
+      t->pp_txformat_x |= R200_TEXCOORD_PROJ;
+   }
+
+   t->pp_txsize = (((tObj->Image[0][t->base.firstLevel]->Width - 1) << 0) |
+                   ((tObj->Image[0][t->base.firstLevel]->Height - 1) << 16));
+
+   /* Only need to round to nearest 32 for textures, but the blitter
+    * requires 64-byte aligned pitches, and we may/may not need the
+    * blitter.   NPOT only!
+    */
+   if ( !t->image_override ) {
+      if (baseImage->IsCompressed)
+         t->pp_txpitch = (tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
+      else
+         t->pp_txpitch = ((tObj->Image[0][t->base.firstLevel]->Width * texelBytes) + 63) & ~(63);
+      t->pp_txpitch -= 32; /* same "- 32" bias that r200SetTexOffset() applies to its pitch argument */
+   }
+
+   t->dirty_state = TEX_ALL;
+
+   /* FYI: r200UploadTexImages( rmesa, t ) used to be called here */
+}
+
+
+
+/* ================================================================
+ * Texture combine functions
+ */
+
+/* GL_ARB_texture_env_combine support
+ */
+
+/* Color argument tables (slot A encoding).  Row = operand - GL_SRC_COLOR, i.e. GL_SRC_COLOR,
+ * GL_ONE_MINUS_SRC_COLOR, GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA; column = register R0..R5.
+ */
+static GLuint r200_register_color[][R200_MAX_TEXTURE_UNITS] =
+{
+   {
+      R200_TXC_ARG_A_R0_COLOR,
+      R200_TXC_ARG_A_R1_COLOR,
+      R200_TXC_ARG_A_R2_COLOR,
+      R200_TXC_ARG_A_R3_COLOR,
+      R200_TXC_ARG_A_R4_COLOR,
+      R200_TXC_ARG_A_R5_COLOR
+   },
+   {
+      R200_TXC_ARG_A_R0_COLOR | R200_TXC_COMP_ARG_A,
+      R200_TXC_ARG_A_R1_COLOR | R200_TXC_COMP_ARG_A,
+      R200_TXC_ARG_A_R2_COLOR | R200_TXC_COMP_ARG_A,
+      R200_TXC_ARG_A_R3_COLOR | R200_TXC_COMP_ARG_A,
+      R200_TXC_ARG_A_R4_COLOR | R200_TXC_COMP_ARG_A,
+      R200_TXC_ARG_A_R5_COLOR | R200_TXC_COMP_ARG_A
+   },
+   {
+      R200_TXC_ARG_A_R0_ALPHA,
+      R200_TXC_ARG_A_R1_ALPHA,
+      R200_TXC_ARG_A_R2_ALPHA,
+      R200_TXC_ARG_A_R3_ALPHA,
+      R200_TXC_ARG_A_R4_ALPHA,
+      R200_TXC_ARG_A_R5_ALPHA
+   },
+   {
+      R200_TXC_ARG_A_R0_ALPHA | R200_TXC_COMP_ARG_A,
+      R200_TXC_ARG_A_R1_ALPHA | R200_TXC_COMP_ARG_A,
+      R200_TXC_ARG_A_R2_ALPHA | R200_TXC_COMP_ARG_A,
+      R200_TXC_ARG_A_R3_ALPHA | R200_TXC_COMP_ARG_A,
+      R200_TXC_ARG_A_R4_ALPHA | R200_TXC_COMP_ARG_A,
+      R200_TXC_ARG_A_R5_ALPHA | R200_TXC_COMP_ARG_A
+   },
+};
+/* Color arguments for a GL_CONSTANT source, indexed by operand - GL_SRC_COLOR. */
+static GLuint r200_tfactor_color[] =
+{
+   R200_TXC_ARG_A_TFACTOR_COLOR,
+   R200_TXC_ARG_A_TFACTOR_COLOR | R200_TXC_COMP_ARG_A,
+   R200_TXC_ARG_A_TFACTOR_ALPHA,
+   R200_TXC_ARG_A_TFACTOR_ALPHA | R200_TXC_COMP_ARG_A
+};
+/* TFACTOR1 variant, used by r200UpdateTextureEnv() when GL_PREVIOUS is replaced by another unit's GL_CONSTANT source. */
+static GLuint r200_tfactor1_color[] =
+{
+   R200_TXC_ARG_A_TFACTOR1_COLOR,
+   R200_TXC_ARG_A_TFACTOR1_COLOR | R200_TXC_COMP_ARG_A,
+   R200_TXC_ARG_A_TFACTOR1_ALPHA,
+   R200_TXC_ARG_A_TFACTOR1_ALPHA | R200_TXC_COMP_ARG_A
+};
+/* Color arguments for GL_PRIMARY_COLOR (also used for GL_PREVIOUS when slot == 0), indexed by operand - GL_SRC_COLOR. */
+static GLuint r200_primary_color[] =
+{
+   R200_TXC_ARG_A_DIFFUSE_COLOR,
+   R200_TXC_ARG_A_DIFFUSE_COLOR | R200_TXC_COMP_ARG_A,
+   R200_TXC_ARG_A_DIFFUSE_ALPHA,
+   R200_TXC_ARG_A_DIFFUSE_ALPHA | R200_TXC_COMP_ARG_A
+};
+
+/* GL_ZERO uses indices 0-3 (index = op); GL_ONE uses indices 1-4 (index = op+1),
+ * so for GL_ONE each operand lands on the entry with the opposite complement bit.
+ */
+static GLuint r200_zero_color[] =
+{
+   R200_TXC_ARG_A_ZERO,
+   R200_TXC_ARG_A_ZERO | R200_TXC_COMP_ARG_A,
+   R200_TXC_ARG_A_ZERO,
+   R200_TXC_ARG_A_ZERO | R200_TXC_COMP_ARG_A,
+   R200_TXC_ARG_A_ZERO
+};
+
+/* Alpha argument tables (slot A encoding): row = operand - GL_SRC_ALPHA, so only
+ * GL_SRC_ALPHA and GL_ONE_MINUS_SRC_ALPHA exist; column = register R0..R5. */
+static GLuint r200_register_alpha[][R200_MAX_TEXTURE_UNITS] =
+{
+   {
+      R200_TXA_ARG_A_R0_ALPHA,
+      R200_TXA_ARG_A_R1_ALPHA,
+      R200_TXA_ARG_A_R2_ALPHA,
+      R200_TXA_ARG_A_R3_ALPHA,
+      R200_TXA_ARG_A_R4_ALPHA,
+      R200_TXA_ARG_A_R5_ALPHA
+   },
+   {
+      R200_TXA_ARG_A_R0_ALPHA | R200_TXA_COMP_ARG_A,
+      R200_TXA_ARG_A_R1_ALPHA | R200_TXA_COMP_ARG_A,
+      R200_TXA_ARG_A_R2_ALPHA | R200_TXA_COMP_ARG_A,
+      R200_TXA_ARG_A_R3_ALPHA | R200_TXA_COMP_ARG_A,
+      R200_TXA_ARG_A_R4_ALPHA | R200_TXA_COMP_ARG_A,
+      R200_TXA_ARG_A_R5_ALPHA | R200_TXA_COMP_ARG_A
+   },
+};
+/* Alpha arguments for a GL_CONSTANT source, indexed by operand - GL_SRC_ALPHA. */
+static GLuint r200_tfactor_alpha[] =
+{
+   R200_TXA_ARG_A_TFACTOR_ALPHA,
+   R200_TXA_ARG_A_TFACTOR_ALPHA | R200_TXA_COMP_ARG_A
+};
+/* TFACTOR1 variant, used by r200UpdateTextureEnv() when GL_PREVIOUS is replaced by another unit's GL_CONSTANT source. */
+static GLuint r200_tfactor1_alpha[] =
+{
+   R200_TXA_ARG_A_TFACTOR1_ALPHA,
+   R200_TXA_ARG_A_TFACTOR1_ALPHA | R200_TXA_COMP_ARG_A
+};
+/* Alpha arguments for GL_PRIMARY_COLOR (also used for GL_PREVIOUS when slot == 0), indexed by operand - GL_SRC_ALPHA. */
+static GLuint r200_primary_alpha[] =
+{
+   R200_TXA_ARG_A_DIFFUSE_ALPHA,
+   R200_TXA_ARG_A_DIFFUSE_ALPHA | R200_TXA_COMP_ARG_A
+};
+
+/* GL_ZERO uses indices 0-1 (index = op); GL_ONE uses indices 1-2 (index = op+1),
+ * so for GL_ONE each operand lands on the entry with the opposite complement bit.
+ */
+static GLuint r200_zero_alpha[] =
+{
+   R200_TXA_ARG_A_ZERO,
+   R200_TXA_ARG_A_ZERO | R200_TXA_COMP_ARG_A,
+   R200_TXA_ARG_A_ZERO,
+};
+
+
+/* OR color_arg[n] (encoded for slot A) into the local color_combine: the selector is
+ * shifted into argument slot 'arg' and the complement bit into that slot's COMP bit.
+ * Relies on locals named color_arg and color_combine being in scope at the use site. */
+#define R200_COLOR_ARG( n, arg )			\
+do {							\
+   color_combine |=					\
+      ((color_arg[n] & R200_TXC_ARG_A_MASK)		\
+       << R200_TXC_ARG_##arg##_SHIFT);			\
+   color_combine |=					\
+      ((color_arg[n] >> R200_TXC_COMP_ARG_A_SHIFT)	\
+       << R200_TXC_COMP_ARG_##arg##_SHIFT);		\
+} while (0)
+/* Alpha counterpart: ORs alpha_arg[n] into the local alpha_combine at argument slot 'arg', using the R200_TXA_* masks and shifts. */
+#define R200_ALPHA_ARG( n, arg )			\
+do {							\
+   alpha_combine |=					\
+      ((alpha_arg[n] & R200_TXA_ARG_A_MASK)		\
+       << R200_TXA_ARG_##arg##_SHIFT);			\
+   alpha_combine |=					\
+      ((alpha_arg[n] >> R200_TXA_COMP_ARG_A_SHIFT)	\
+       << R200_TXA_COMP_ARG_##arg##_SHIFT);		\
+} while (0)
+
+
+/* ================================================================
+ * Texture unit state management
+ */
+/* Encode the combine state of GL texture unit 'unit' into rmesa->hw.pix[slot] (TXCBLEND, TXABLEND, TXCBLEND2, TXABLEND2).  When 'replaceargs' differs from 'unit', every GL_PREVIOUS source is replaced by argument 0 of unit 'replaceargs'.  Returns GL_FALSE for a source or combine mode with no case below, GL_TRUE otherwise. */
+static GLboolean r200UpdateTextureEnv( GLcontext *ctx, int unit, int slot, GLuint replaceargs )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+   GLuint color_combine, alpha_combine;
+   GLuint color_scale = rmesa->hw.pix[slot].cmd[PIX_PP_TXCBLEND2] &
+      ~(R200_TXC_SCALE_MASK | R200_TXC_OUTPUT_REG_MASK | R200_TXC_TFACTOR_SEL_MASK |
+	R200_TXC_TFACTOR1_SEL_MASK);
+   GLuint alpha_scale = rmesa->hw.pix[slot].cmd[PIX_PP_TXABLEND2] &
+      ~(R200_TXA_DOT_ALPHA | R200_TXA_SCALE_MASK | R200_TXA_OUTPUT_REG_MASK |
+	R200_TXA_TFACTOR_SEL_MASK | R200_TXA_TFACTOR1_SEL_MASK);
+
+   /* texUnit->_Current can be NULL if and only if the texture unit is
+    * not actually enabled.
+    */
+   assert( (texUnit->_ReallyEnabled == 0)
+	   || (texUnit->_Current != NULL) );
+
+   if ( R200_DEBUG & DEBUG_TEXTURE ) {
+      fprintf( stderr, "%s( %p, %d )\n", __FUNCTION__, (void *)ctx, unit );
+   }
+
+   /* Set the texture environment state.  Isn't this nice and clean?
+    * The chip will automagically set the texture alpha to 0xff when
+    * the texture format does not include an alpha component.  This
+    * reduces the amount of special-casing we have to do, alpha-only
+    * textures being a notable exception.
+    */
+
+   color_scale |= ((rmesa->state.texture.unit[unit].outputreg + 1) << R200_TXC_OUTPUT_REG_SHIFT) |
+			(unit << R200_TXC_TFACTOR_SEL_SHIFT) |
+			(replaceargs << R200_TXC_TFACTOR1_SEL_SHIFT);
+   alpha_scale |= ((rmesa->state.texture.unit[unit].outputreg + 1) << R200_TXA_OUTPUT_REG_SHIFT) |
+			(unit << R200_TXA_TFACTOR_SEL_SHIFT) |
+			(replaceargs << R200_TXA_TFACTOR1_SEL_SHIFT);
+
+   if ( !texUnit->_ReallyEnabled ) {
+      assert( unit == 0); /* the disabled-unit path (ZERO * ZERO + diffuse) is only expected for unit 0 */
+      color_combine = R200_TXC_ARG_A_ZERO | R200_TXC_ARG_B_ZERO
+	  | R200_TXC_ARG_C_DIFFUSE_COLOR | R200_TXC_OP_MADD;
+      alpha_combine = R200_TXA_ARG_A_ZERO | R200_TXA_ARG_B_ZERO
+	  | R200_TXA_ARG_C_DIFFUSE_ALPHA | R200_TXA_OP_MADD;
+   }
+   else {
+      GLuint color_arg[3], alpha_arg[3];
+      GLuint i;
+      const GLuint numColorArgs = texUnit->_CurrentCombine->_NumArgsRGB;
+      const GLuint numAlphaArgs = texUnit->_CurrentCombine->_NumArgsA;
+      GLuint RGBshift = texUnit->_CurrentCombine->ScaleShiftRGB;
+      GLuint Ashift = texUnit->_CurrentCombine->ScaleShiftA;
+
+      /* Operand indices of argument 0 of unit 'replaceargs'; XOR'ed into 'op' below when GL_PREVIOUS is replaced by that argument. */
+      const GLint replaceoprgb =
+	 ctx->Texture.Unit[replaceargs]._CurrentCombine->OperandRGB[0] - GL_SRC_COLOR;
+      const GLint replaceopa =
+	 ctx->Texture.Unit[replaceargs]._CurrentCombine->OperandA[0] - GL_SRC_ALPHA;
+
+      /* Step 1:
+       * Extract the color and alpha combine function arguments.
+       */
+      for ( i = 0 ; i < numColorArgs ; i++ ) {
+	 GLint op = texUnit->_CurrentCombine->OperandRGB[i] - GL_SRC_COLOR;
+	 const GLint srcRGBi = texUnit->_CurrentCombine->SourceRGB[i];
+	 assert(op >= 0);
+	 assert(op <= 3);
+	 switch ( srcRGBi ) {
+	 case GL_TEXTURE:
+	    color_arg[i] = r200_register_color[op][unit];
+	    break;
+	 case GL_CONSTANT:
+	    color_arg[i] = r200_tfactor_color[op];
+	    break;
+	 case GL_PRIMARY_COLOR:
+	    color_arg[i] = r200_primary_color[op];
+	    break;
+	 case GL_PREVIOUS:
+	    if (replaceargs != unit) {
+	       const GLint srcRGBreplace =
+		  ctx->Texture.Unit[replaceargs]._CurrentCombine->SourceRGB[0];
+	       if (op >= 2) {
+		  op = op ^ replaceopa;
+	       }
+	       else {
+		  op = op ^ replaceoprgb;
+	       }
+	       switch (srcRGBreplace) {
+	       case GL_TEXTURE:
+		  color_arg[i] = r200_register_color[op][replaceargs];
+		  break;
+	       case GL_CONSTANT:
+		  color_arg[i] = r200_tfactor1_color[op];
+		  break;
+	       case GL_PRIMARY_COLOR:
+		  color_arg[i] = r200_primary_color[op];
+		  break;
+	       case GL_PREVIOUS:
+		  if (slot == 0)
+		     color_arg[i] = r200_primary_color[op];
+		  else
+		     color_arg[i] = r200_register_color[op]
+			[rmesa->state.texture.unit[replaceargs - 1].outputreg];
+		  break;
+	       case GL_ZERO:
+		  color_arg[i] = r200_zero_color[op];
+		  break;
+	       case GL_ONE:
+		  color_arg[i] = r200_zero_color[op+1];
+		  break;
+	       case GL_TEXTURE0:
+	       case GL_TEXTURE1:
+	       case GL_TEXTURE2:
+	       case GL_TEXTURE3:
+	       case GL_TEXTURE4:
+	       case GL_TEXTURE5:
+		  color_arg[i] = r200_register_color[op][srcRGBreplace - GL_TEXTURE0];
+		  break;
+	       default:
+	       return GL_FALSE;
+	       }
+	    }
+	    else {
+	       if (slot == 0)
+		  color_arg[i] = r200_primary_color[op];
+	       else
+		  color_arg[i] = r200_register_color[op]
+		     [rmesa->state.texture.unit[unit - 1].outputreg];
+            }
+	    break;
+	 case GL_ZERO:
+	    color_arg[i] = r200_zero_color[op];
+	    break;
+	 case GL_ONE:
+	    color_arg[i] = r200_zero_color[op+1];
+	    break;
+	 case GL_TEXTURE0:
+	 case GL_TEXTURE1:
+	 case GL_TEXTURE2:
+	 case GL_TEXTURE3:
+	 case GL_TEXTURE4:
+	 case GL_TEXTURE5:
+	    color_arg[i] = r200_register_color[op][srcRGBi - GL_TEXTURE0];
+	    break;
+	 default:
+	    return GL_FALSE;
+	 }
+      }
+
+      for ( i = 0 ; i < numAlphaArgs ; i++ ) {
+	 GLint op = texUnit->_CurrentCombine->OperandA[i] - GL_SRC_ALPHA;
+	 const GLint srcAi = texUnit->_CurrentCombine->SourceA[i];
+	 assert(op >= 0);
+	 assert(op <= 1);
+	 switch ( srcAi ) {
+	 case GL_TEXTURE:
+	    alpha_arg[i] = r200_register_alpha[op][unit];
+	    break;
+	 case GL_CONSTANT:
+	    alpha_arg[i] = r200_tfactor_alpha[op];
+	    break;
+	 case GL_PRIMARY_COLOR:
+	    alpha_arg[i] = r200_primary_alpha[op];
+	    break;
+	 case GL_PREVIOUS:
+	    if (replaceargs != unit) {
+	       const GLint srcAreplace =
+		  ctx->Texture.Unit[replaceargs]._CurrentCombine->SourceA[0];
+	       op = op ^ replaceopa;
+	       switch (srcAreplace) {
+	       case GL_TEXTURE:
+		  alpha_arg[i] = r200_register_alpha[op][replaceargs];
+		  break;
+	       case GL_CONSTANT:
+		  alpha_arg[i] = r200_tfactor1_alpha[op];
+		  break;
+	       case GL_PRIMARY_COLOR:
+		  alpha_arg[i] = r200_primary_alpha[op];
+		  break;
+	       case GL_PREVIOUS:
+		  if (slot == 0)
+		     alpha_arg[i] = r200_primary_alpha[op];
+		  else
+		     alpha_arg[i] = r200_register_alpha[op]
+			[rmesa->state.texture.unit[replaceargs - 1].outputreg];
+		  break;
+	       case GL_ZERO:
+		  alpha_arg[i] = r200_zero_alpha[op];
+		  break;
+	       case GL_ONE:
+		  alpha_arg[i] = r200_zero_alpha[op+1];
+		  break;
+	       case GL_TEXTURE0:
+	       case GL_TEXTURE1:
+	       case GL_TEXTURE2:
+	       case GL_TEXTURE3:
+	       case GL_TEXTURE4:
+	       case GL_TEXTURE5:
+		  alpha_arg[i] = r200_register_alpha[op][srcAreplace - GL_TEXTURE0];
+		  break;
+	       default:
+	       return GL_FALSE;
+	       }
+	    }
+	    else {
+	       if (slot == 0)
+		  alpha_arg[i] = r200_primary_alpha[op];
+	       else
+		  alpha_arg[i] = r200_register_alpha[op]
+		    [rmesa->state.texture.unit[unit - 1].outputreg];
+            }
+	    break;
+	 case GL_ZERO:
+	    alpha_arg[i] = r200_zero_alpha[op];
+	    break;
+	 case GL_ONE:
+	    alpha_arg[i] = r200_zero_alpha[op+1];
+	    break;
+	 case GL_TEXTURE0:
+	 case GL_TEXTURE1:
+	 case GL_TEXTURE2:
+	 case GL_TEXTURE3:
+	 case GL_TEXTURE4:
+	 case GL_TEXTURE5:
+	    alpha_arg[i] = r200_register_alpha[op][srcAi - GL_TEXTURE0];
+	    break;
+	 default:
+	    return GL_FALSE;
+	 }
+      }
+
+      /* Step 2:
+       * Build up the color and alpha combine functions.
+       */
+      switch ( texUnit->_CurrentCombine->ModeRGB ) {
+      case GL_REPLACE:
+	 color_combine = (R200_TXC_ARG_A_ZERO |
+			  R200_TXC_ARG_B_ZERO |
+			  R200_TXC_OP_MADD);
+	 R200_COLOR_ARG( 0, C );
+	 break;
+      case GL_MODULATE:
+	 color_combine = (R200_TXC_ARG_C_ZERO |
+			  R200_TXC_OP_MADD);
+	 R200_COLOR_ARG( 0, A );
+	 R200_COLOR_ARG( 1, B );
+	 break;
+      case GL_ADD:
+	 color_combine = (R200_TXC_ARG_B_ZERO |
+			  R200_TXC_COMP_ARG_B | 
+			  R200_TXC_OP_MADD);
+	 R200_COLOR_ARG( 0, A );
+	 R200_COLOR_ARG( 1, C );
+	 break;
+      case GL_ADD_SIGNED:
+	 color_combine = (R200_TXC_ARG_B_ZERO |
+			  R200_TXC_COMP_ARG_B |
+			  R200_TXC_BIAS_ARG_C |	/* new */
+			  R200_TXC_OP_MADD); /* was ADDSIGNED */
+	 R200_COLOR_ARG( 0, A );
+	 R200_COLOR_ARG( 1, C );
+	 break;
+      case GL_SUBTRACT:
+	 color_combine = (R200_TXC_ARG_B_ZERO |
+			  R200_TXC_COMP_ARG_B | 
+			  R200_TXC_NEG_ARG_C |
+			  R200_TXC_OP_MADD);
+	 R200_COLOR_ARG( 0, A );
+	 R200_COLOR_ARG( 1, C );
+	 break;
+      case GL_INTERPOLATE:
+	 color_combine = (R200_TXC_OP_LERP);
+	 R200_COLOR_ARG( 0, B );
+	 R200_COLOR_ARG( 1, A );
+	 R200_COLOR_ARG( 2, C );
+	 break;
+
+      case GL_DOT3_RGB_EXT:
+      case GL_DOT3_RGBA_EXT:
+	 /* The EXT version of the DOT3 extension does not support the
+	  * scale factor, but the ARB version (and the version in OpenGL
+	  * 1.3) does.
+	  */
+	 RGBshift = 0;
+	 /* FALLTHROUGH */
+
+      case GL_DOT3_RGB:
+      case GL_DOT3_RGBA:
+	 /* DOT3 works differently on R200 than on R100.  On R100, just
+	  * setting the DOT3 mode did everything for you.  On R200, the
+	  * driver has to enable the biasing and scale in the inputs to
+	  * put them in the proper [-1,1] range.  This is what the 4x and
+	  * the -0.5 in the DOT3 spec do.  The post-scale is then set
+	  * normally.
+	  */
+
+	 color_combine = (R200_TXC_ARG_C_ZERO |
+			  R200_TXC_OP_DOT3 |
+			  R200_TXC_BIAS_ARG_A |
+			  R200_TXC_BIAS_ARG_B |
+			  R200_TXC_SCALE_ARG_A |
+			  R200_TXC_SCALE_ARG_B);
+	 R200_COLOR_ARG( 0, A );
+	 R200_COLOR_ARG( 1, B );
+	 break;
+
+      case GL_MODULATE_ADD_ATI:
+	 color_combine = (R200_TXC_OP_MADD);
+	 R200_COLOR_ARG( 0, A );
+	 R200_COLOR_ARG( 1, C );
+	 R200_COLOR_ARG( 2, B );
+	 break;
+      case GL_MODULATE_SIGNED_ADD_ATI:
+	 color_combine = (R200_TXC_BIAS_ARG_C |	/* new */
+			  R200_TXC_OP_MADD); /* was ADDSIGNED */
+	 R200_COLOR_ARG( 0, A );
+	 R200_COLOR_ARG( 1, C );
+	 R200_COLOR_ARG( 2, B );
+	 break;
+      case GL_MODULATE_SUBTRACT_ATI:
+	 color_combine = (R200_TXC_NEG_ARG_C |
+			  R200_TXC_OP_MADD);
+	 R200_COLOR_ARG( 0, A );
+	 R200_COLOR_ARG( 1, C );
+	 R200_COLOR_ARG( 2, B );
+	 break;
+      default:
+	 return GL_FALSE;
+      }
+
+      switch ( texUnit->_CurrentCombine->ModeA ) {
+      case GL_REPLACE:
+	 alpha_combine = (R200_TXA_ARG_A_ZERO |
+			  R200_TXA_ARG_B_ZERO |
+			  R200_TXA_OP_MADD);
+	 R200_ALPHA_ARG( 0, C );
+	 break;
+      case GL_MODULATE:
+	 alpha_combine = (R200_TXA_ARG_C_ZERO |
+			  R200_TXA_OP_MADD);
+	 R200_ALPHA_ARG( 0, A );
+	 R200_ALPHA_ARG( 1, B );
+	 break;
+      case GL_ADD:
+	 alpha_combine = (R200_TXA_ARG_B_ZERO |
+			  R200_TXA_COMP_ARG_B |
+			  R200_TXA_OP_MADD);
+	 R200_ALPHA_ARG( 0, A );
+	 R200_ALPHA_ARG( 1, C );
+	 break;
+      case GL_ADD_SIGNED:
+	 alpha_combine = (R200_TXA_ARG_B_ZERO |
+			  R200_TXA_COMP_ARG_B |
+			  R200_TXA_BIAS_ARG_C |	/* new */
+			  R200_TXA_OP_MADD); /* was ADDSIGNED */
+	 R200_ALPHA_ARG( 0, A );
+	 R200_ALPHA_ARG( 1, C );
+	 break;
+      case GL_SUBTRACT:
+	 alpha_combine = (R200_TXA_ARG_B_ZERO |
+			  R200_TXA_COMP_ARG_B |
+			  R200_TXA_NEG_ARG_C |
+			  R200_TXA_OP_MADD);
+	 R200_ALPHA_ARG( 0, A );
+	 R200_ALPHA_ARG( 1, C );
+	 break;
+      case GL_INTERPOLATE:
+	 alpha_combine = (R200_TXA_OP_LERP);
+	 R200_ALPHA_ARG( 0, B );
+	 R200_ALPHA_ARG( 1, A );
+	 R200_ALPHA_ARG( 2, C );
+	 break;
+
+      case GL_MODULATE_ADD_ATI:
+	 alpha_combine = (R200_TXA_OP_MADD);
+	 R200_ALPHA_ARG( 0, A );
+	 R200_ALPHA_ARG( 1, C );
+	 R200_ALPHA_ARG( 2, B );
+	 break;
+      case GL_MODULATE_SIGNED_ADD_ATI:
+	 alpha_combine = (R200_TXA_BIAS_ARG_C |	/* new */
+			  R200_TXA_OP_MADD); /* was ADDSIGNED */
+	 R200_ALPHA_ARG( 0, A );
+	 R200_ALPHA_ARG( 1, C );
+	 R200_ALPHA_ARG( 2, B );
+	 break;
+      case GL_MODULATE_SUBTRACT_ATI:
+	 alpha_combine = (R200_TXA_NEG_ARG_C |
+			  R200_TXA_OP_MADD);
+	 R200_ALPHA_ARG( 0, A );
+	 R200_ALPHA_ARG( 1, C );
+	 R200_ALPHA_ARG( 2, B );
+	 break;
+      default:
+	 return GL_FALSE;
+      }
+
+      if ( (texUnit->_CurrentCombine->ModeRGB == GL_DOT3_RGBA_EXT)
+	   || (texUnit->_CurrentCombine->ModeRGB == GL_DOT3_RGBA) ) {
+	 alpha_scale |= R200_TXA_DOT_ALPHA;
+	 Ashift = RGBshift; /* for the DOT3 RGBA modes the alpha scale follows the RGB scale */
+      }
+
+      /* Step 3:
+       * Apply the scale factor.
+       */
+      color_scale |= (RGBshift << R200_TXC_SCALE_SHIFT);
+      alpha_scale |= (Ashift   << R200_TXA_SCALE_SHIFT);
+
+      /* All done!
+       */
+   }
+
+   if ( rmesa->hw.pix[slot].cmd[PIX_PP_TXCBLEND] != color_combine ||
+	rmesa->hw.pix[slot].cmd[PIX_PP_TXABLEND] != alpha_combine ||
+	rmesa->hw.pix[slot].cmd[PIX_PP_TXCBLEND2] != color_scale ||
+	rmesa->hw.pix[slot].cmd[PIX_PP_TXABLEND2] != alpha_scale) {
+      R200_STATECHANGE( rmesa, pix[slot] ); /* only flag a state change when one of the four words differs */
+      rmesa->hw.pix[slot].cmd[PIX_PP_TXCBLEND] = color_combine;
+      rmesa->hw.pix[slot].cmd[PIX_PP_TXABLEND] = alpha_combine;
+      rmesa->hw.pix[slot].cmd[PIX_PP_TXCBLEND2] = color_scale;
+      rmesa->hw.pix[slot].cmd[PIX_PP_TXABLEND2] = alpha_scale;
+   }
+
+   return GL_TRUE;
+}
+
+void r200SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
+		      unsigned long long offset, GLint depth, GLuint pitch)
+{	/* Mark texture `texname` as image-overridden and, if offset != 0, point it at caller-supplied memory. */
+	r200ContextPtr rmesa =
+	    (r200ContextPtr) ((__DRIcontextPrivate *) pDRICtx->private)->
+	    driverPrivate;
+	struct gl_texture_object *tObj =
+	    _mesa_lookup_texture(rmesa->glCtx, texname);
+	r200TexObjPtr t;
+
+	if (!tObj)
+		return;	/* no texture object with that name: nothing to do */
+
+	t = (r200TexObjPtr) tObj->DriverData;	/* NOTE(review): used below without a NULL check - confirm it is always set */
+
+	t->image_override = GL_TRUE;	/* set even when offset is 0 */
+
+	if (!offset)
+		return;	/* zero offset: offset, pitch and format fields are left untouched */
+
+	t->pp_txoffset = offset;	/* NOTE(review): 64-bit argument; confirm pp_txoffset is wide enough */
+	t->pp_txpitch = pitch - 32;	/* presumably the hw pitch field is biased by 32 - TODO confirm */
+
+	switch (depth) {	/* choose format/filter bits from tx_table_le by depth */
+	case 32:
+		t->pp_txformat = tx_table_le[MESA_FORMAT_ARGB8888].format;
+		t->pp_txfilter |= tx_table_le[MESA_FORMAT_ARGB8888].filter;
+		break;
+	case 24:
+	default:	/* every depth other than 32 and 16 is handled like 24 */
+		t->pp_txformat = tx_table_le[MESA_FORMAT_RGB888].format;
+		t->pp_txfilter |= tx_table_le[MESA_FORMAT_RGB888].filter;
+		break;
+	case 16:
+		t->pp_txformat = tx_table_le[MESA_FORMAT_RGB565].format;
+		t->pp_txfilter |= tx_table_le[MESA_FORMAT_RGB565].filter;
+		break;
+	}
+}
+
+#define REF_COLOR 1	/* stageref bit: the stage's color result is read later */
+#define REF_ALPHA 2	/* stageref bit: the stage's alpha result is read later */
+
+static GLboolean r200UpdateAllTexEnv( GLcontext *ctx )
+{  /* Assign output registers and emit env instructions for all referenced stages; GL_FALSE means fallback. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLint i, j, currslot;
+   GLint maxunitused = -1;
+   GLboolean texregfree[6] = {GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE}; /* GL_FALSE once a stage sources that texture */
+   GLubyte stageref[7] = {0, 0, 0, 0, 0, 0, 0}; /* stageref[k + 1]: REF_* bits telling how stage k's result is used */
+   GLint nextunit[R200_MAX_TEXTURE_UNITS] = {0, 0, 0, 0, 0, 0}; /* next higher enabled+referenced unit, or -1 */
+   GLint currentnext = -1;
+   GLboolean ok;
+
+   /* find highest used unit */
+   for ( j = 0; j < R200_MAX_TEXTURE_UNITS; j++) {
+      if (ctx->Texture.Unit[j]._ReallyEnabled) {
+	 maxunitused = j;
+      }
+   }
+   stageref[maxunitused + 1] = REF_COLOR | REF_ALPHA; /* the last stage's color and alpha are always needed */
+
+   for ( j = maxunitused; j >= 0; j-- ) { /* walk backwards so references propagate to lower stages */
+      const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[j];
+
+      rmesa->state.texture.unit[j].outputreg = -1;
+
+      if (stageref[j + 1]) {
+
+	 /* use the lowest available reg. That gets us automatically reg0 for the last stage.
+	    need this even for disabled units, as it may get referenced due to the replace
+	    optimization */
+	 for ( i = 0 ; i < R200_MAX_TEXTURE_UNITS; i++ ) {
+	    if (texregfree[i]) {
+	       rmesa->state.texture.unit[j].outputreg = i;
+	       break;
+	    }
+	 }
+	 if (rmesa->state.texture.unit[j].outputreg == -1) {
+	    /* no more free regs we can use. Need a fallback :-( */
+	    return GL_FALSE;
+         }
+
+         nextunit[j] = currentnext;
+
+         if (!texUnit->_ReallyEnabled) {
+	 /* the not enabled stages are referenced "indirectly",
+            must not cut off the lower stages */
+	    stageref[j] = REF_COLOR | REF_ALPHA;
+	    continue;
+         }
+	 currentnext = j;
+ 
+	 const GLuint numColorArgs = texUnit->_CurrentCombine->_NumArgsRGB;
+	 const GLuint numAlphaArgs = texUnit->_CurrentCombine->_NumArgsA;
+	 const GLboolean isdot3rgba = (texUnit->_CurrentCombine->ModeRGB == GL_DOT3_RGBA) ||
+				      (texUnit->_CurrentCombine->ModeRGB == GL_DOT3_RGBA_EXT);
+
+
+	 /* check if we need the color part, special case for dot3_rgba
+	    as if only the alpha part is referenced later on it still is using the color part */
+	 if ((stageref[j + 1] & REF_COLOR) || isdot3rgba) {
+	    for ( i = 0 ; i < numColorArgs ; i++ ) {
+	       const GLuint srcRGBi = texUnit->_CurrentCombine->SourceRGB[i];
+	       const GLuint op = texUnit->_CurrentCombine->OperandRGB[i];
+	       switch ( srcRGBi ) {
+	       case GL_PREVIOUS:
+		  /* operand offset from GL_SRC_COLOR: 0/1 read color (REF_COLOR), 2/3 read alpha (REF_ALPHA) */
+		  stageref[j] |= ((op - GL_SRC_COLOR) >> 1) + 1;
+	          break;
+	       case GL_TEXTURE:
+		  texregfree[j] = GL_FALSE;
+		  break;
+	       case GL_TEXTURE0:
+	       case GL_TEXTURE1:
+	       case GL_TEXTURE2:
+	       case GL_TEXTURE3:
+	       case GL_TEXTURE4:
+	       case GL_TEXTURE5:
+		  texregfree[srcRGBi - GL_TEXTURE0] = GL_FALSE;
+	          break;
+	       default: /* don't care about other sources here */
+		  break;
+	       }
+	    }
+	 }
+
+	 /* alpha args are ignored for dot3_rgba */
+	 if ((stageref[j + 1] & REF_ALPHA) && !isdot3rgba) {
+
+	    for ( i = 0 ; i < numAlphaArgs ; i++ ) {
+	       const GLuint srcAi = texUnit->_CurrentCombine->SourceA[i];
+	       switch ( srcAi ) {
+	       case GL_PREVIOUS:
+		  stageref[j] |= REF_ALPHA;
+		  break;
+	       case GL_TEXTURE:
+		  texregfree[j] = GL_FALSE;
+		  break;
+	       case GL_TEXTURE0:
+	       case GL_TEXTURE1:
+	       case GL_TEXTURE2:
+	       case GL_TEXTURE3:
+	       case GL_TEXTURE4:
+	       case GL_TEXTURE5:
+		  texregfree[srcAi - GL_TEXTURE0] = GL_FALSE;
+		  break;
+	       default: /* don't care about other sources here */
+		  break;
+	       }
+	    }
+	 }
+      }
+   }
+
+   /* don't enable texture sampling for units if the result is not used */
+   for (i = 0; i < R200_MAX_TEXTURE_UNITS; i++) {
+      if (ctx->Texture.Unit[i]._ReallyEnabled && !texregfree[i])
+	 rmesa->state.texture.unit[i].unitneeded = ctx->Texture.Unit[i]._ReallyEnabled;
+      else rmesa->state.texture.unit[i].unitneeded = 0;
+   }
+
+   ok = GL_TRUE;
+   currslot = 0;
+   rmesa->state.envneeded = 1; /* slot 0 is always marked as needed */
+
+   i = 0;
+   while ((i <= maxunitused) && (i >= 0)) {
+      /* only output instruction if the results are referenced */
+      if (ctx->Texture.Unit[i]._ReallyEnabled && stageref[i+1]) {
+         GLuint replaceunit = i;
+	 /* try to optimize GL_REPLACE away (only one level deep though) */
+	 if (	(ctx->Texture.Unit[i]._CurrentCombine->ModeRGB == GL_REPLACE) &&
+		(ctx->Texture.Unit[i]._CurrentCombine->ModeA == GL_REPLACE) &&
+		(ctx->Texture.Unit[i]._CurrentCombine->ScaleShiftRGB == 0) &&
+		(ctx->Texture.Unit[i]._CurrentCombine->ScaleShiftA == 0) &&
+		(nextunit[i] > 0) ) {
+	    /* yippie! can optimize it away! */
+	    replaceunit = i;
+	    i = nextunit[i]; /* emit the following unit's instruction, passing the skipped unit as replaceunit */
+	 }
+
+	 /* need env instruction slot */
+	 rmesa->state.envneeded |= 1 << currslot;
+	 ok = r200UpdateTextureEnv( ctx, i, currslot, replaceunit );
+	 if (!ok) return GL_FALSE;
+	 currslot++;
+      }
+      i = i + 1;
+   }
+
+   if (currslot == 0) {
+      /* need one stage at least */
+      rmesa->state.texture.unit[0].outputreg = 0;
+      ok = r200UpdateTextureEnv( ctx, 0, 0, 0 );
+   }
+
+   R200_STATECHANGE( rmesa, ctx );
+   rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~(R200_TEX_BLEND_ENABLE_MASK | R200_MULTI_PASS_ENABLE);
+   rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= rmesa->state.envneeded << R200_TEX_BLEND_0_ENABLE_SHIFT;
+
+   return ok;
+}
+
+#undef REF_COLOR
+#undef REF_ALPHA
+
+
+#define TEXOBJ_TXFILTER_MASK (R200_MAX_MIP_LEVEL_MASK |		\
+			      R200_MIN_FILTER_MASK | 		\
+			      R200_MAG_FILTER_MASK |		\
+			      R200_MAX_ANISO_MASK |		\
+			      R200_YUV_TO_RGB |			\
+			      R200_YUV_TEMPERATURE_MASK |	\
+			      R200_CLAMP_S_MASK | 		\
+			      R200_CLAMP_T_MASK | 		\
+			      R200_BORDER_MODE_D3D ) /* PP_TXFILTER bits copied from the texture object */
+
+#define TEXOBJ_TXFORMAT_MASK (R200_TXFORMAT_WIDTH_MASK |	\
+			      R200_TXFORMAT_HEIGHT_MASK |	\
+			      R200_TXFORMAT_FORMAT_MASK |	\
+			      R200_TXFORMAT_F5_WIDTH_MASK |	\
+			      R200_TXFORMAT_F5_HEIGHT_MASK |	\
+			      R200_TXFORMAT_ALPHA_IN_MAP |	\
+			      R200_TXFORMAT_CUBIC_MAP_ENABLE |	\
+			      R200_TXFORMAT_NON_POWER2) /* PP_TXFORMAT bits copied from the texture object */
+
+#define TEXOBJ_TXFORMAT_X_MASK (R200_DEPTH_LOG2_MASK |		\
+                                R200_TEXCOORD_MASK |		\
+                                R200_CLAMP_Q_MASK | 		\
+                                R200_VOLUME_FILTER_MASK) /* PP_TXFORMAT_X bits copied from the texture object */
+
+
+static void import_tex_obj_state( r200ContextPtr rmesa,
+				  int unit,
+				  r200TexObjPtr texobj )
+{ /* Copy texobj's register values into the tex (and, for cube maps, cube) state of `unit`. */
+/* do not use RADEON_DB_STATE to avoid stale texture caches */
+   int *cmd = &rmesa->hw.tex[unit].cmd[TEX_CMD_0];
+
+   R200_STATECHANGE( rmesa, tex[unit] );
+
+   cmd[TEX_PP_TXFILTER] &= ~TEXOBJ_TXFILTER_MASK; /* replace only the masked bits, keep the rest */
+   cmd[TEX_PP_TXFILTER] |= texobj->pp_txfilter & TEXOBJ_TXFILTER_MASK;
+   cmd[TEX_PP_TXFORMAT] &= ~TEXOBJ_TXFORMAT_MASK;
+   cmd[TEX_PP_TXFORMAT] |= texobj->pp_txformat & TEXOBJ_TXFORMAT_MASK;
+   cmd[TEX_PP_TXFORMAT_X] &= ~TEXOBJ_TXFORMAT_X_MASK;
+   cmd[TEX_PP_TXFORMAT_X] |= texobj->pp_txformat_x & TEXOBJ_TXFORMAT_X_MASK;
+   cmd[TEX_PP_TXSIZE] = texobj->pp_txsize; /* NPOT only! */
+   cmd[TEX_PP_TXPITCH] = texobj->pp_txpitch; /* NPOT only! */
+   cmd[TEX_PP_BORDER_COLOR] = texobj->pp_border_color;
+   if (rmesa->r200Screen->drmSupportsFragShader) { /* the offset slot index depends on this drm capability flag */
+      cmd[TEX_PP_TXOFFSET_NEWDRM] = texobj->pp_txoffset;
+   }
+   else {
+      cmd[TEX_PP_TXOFFSET_OLDDRM] = texobj->pp_txoffset;
+   }
+
+   if (texobj->base.tObj->Target == GL_TEXTURE_CUBE_MAP) {
+      int *cube_cmd = &rmesa->hw.cube[unit].cmd[CUBE_CMD_0];
+      GLuint bytesPerFace = texobj->base.totalSize / 6; /* faces 1..5 are placed at equal strides after pp_txoffset */
+      ASSERT(texobj->base.totalSize % 6 == 0);
+
+      R200_STATECHANGE( rmesa, cube[unit] );
+      cube_cmd[CUBE_PP_CUBIC_FACES] = texobj->pp_cubic_faces;
+      if (rmesa->r200Screen->drmSupportsFragShader) {
+	 /* that value is submitted twice. could change cube atom
+	    to not include that command when new drm is used */
+	 cmd[TEX_PP_CUBIC_FACES] = texobj->pp_cubic_faces;
+      }
+      cube_cmd[CUBE_PP_CUBIC_OFFSET_F1] = texobj->pp_txoffset + 1 * bytesPerFace;
+      cube_cmd[CUBE_PP_CUBIC_OFFSET_F2] = texobj->pp_txoffset + 2 * bytesPerFace;
+      cube_cmd[CUBE_PP_CUBIC_OFFSET_F3] = texobj->pp_txoffset + 3 * bytesPerFace;
+      cube_cmd[CUBE_PP_CUBIC_OFFSET_F4] = texobj->pp_txoffset + 4 * bytesPerFace;
+      cube_cmd[CUBE_PP_CUBIC_OFFSET_F5] = texobj->pp_txoffset + 5 * bytesPerFace;
+   }
+
+   texobj->dirty_state &= ~(1<<unit); /* state for this unit is now up to date */
+}
+
+
+static void set_texgen_matrix( r200ContextPtr rmesa, 
+			       GLuint unit,
+			       const GLfloat *s_plane,
+			       const GLfloat *t_plane,
+			       const GLfloat *r_plane,
+			       const GLfloat *q_plane )
+{ /* Pack the four 4-float planes into TexGenMatrix[unit] and flag the texmat as enabled for that unit. */
+   GLfloat m[16];
+
+   m[0]  = s_plane[0]; /* the S plane fills elements 0, 4, 8, 12 */
+   m[4]  = s_plane[1];
+   m[8]  = s_plane[2];
+   m[12] = s_plane[3];
+
+   m[1]  = t_plane[0]; /* the T plane fills elements 1, 5, 9, 13 */
+   m[5]  = t_plane[1];
+   m[9]  = t_plane[2];
+   m[13] = t_plane[3];
+
+   m[2]  = r_plane[0]; /* the R plane fills elements 2, 6, 10, 14 */
+   m[6]  = r_plane[1];
+   m[10] = r_plane[2];
+   m[14] = r_plane[3];
+
+   m[3]  = q_plane[0]; /* the Q plane fills elements 3, 7, 11, 15 */
+   m[7]  = q_plane[1];
+   m[11] = q_plane[2];
+   m[15] = q_plane[3];
+
+   _math_matrix_loadf( &(rmesa->TexGenMatrix[unit]), m);
+   _math_matrix_analyse( &(rmesa->TexGenMatrix[unit]) );
+   rmesa->TexGenEnabled |= R200_TEXMAT_0_ENABLE<<unit;
+}
+
+
+static GLuint r200_need_dis_texgen(const GLbitfield texGenEnabled,
+				   const GLfloat *planeS,
+				   const GLfloat *planeT,
+				   const GLfloat *planeR,
+				   const GLfloat *planeQ)
+{ /* Return the S/T/R/Q bits of disabled coords that some enabled coord's plane references (nonzero coefficient). */
+   GLuint needtgenable = 0;
+
+   if (!(texGenEnabled & S_BIT)) { /* S disabled: is coefficient [0] used by an enabled T, R or Q plane? */
+      if (((texGenEnabled & T_BIT) && planeT[0] != 0.0) ||
+	 ((texGenEnabled & R_BIT) && planeR[0] != 0.0) ||
+	 ((texGenEnabled & Q_BIT) && planeQ[0] != 0.0)) {
+	 needtgenable |= S_BIT;
+      }
+   }
+   if (!(texGenEnabled & T_BIT)) { /* T disabled: same test on coefficient [1] */
+      if (((texGenEnabled & S_BIT) && planeS[1] != 0.0) ||
+	 ((texGenEnabled & R_BIT) && planeR[1] != 0.0) ||
+	 ((texGenEnabled & Q_BIT) && planeQ[1] != 0.0)) {
+	 needtgenable |= T_BIT;
+     }
+   }
+   if (!(texGenEnabled & R_BIT)) { /* R disabled: same test on coefficient [2] */
+      if (((texGenEnabled & S_BIT) && planeS[2] != 0.0) ||
+	 ((texGenEnabled & T_BIT) && planeT[2] != 0.0) ||
+	 ((texGenEnabled & Q_BIT) && planeQ[2] != 0.0)) {
+	 needtgenable |= R_BIT;
+      }
+   }
+   if (!(texGenEnabled & Q_BIT)) { /* Q disabled: same test on coefficient [3] */
+      if (((texGenEnabled & S_BIT) && planeS[3] != 0.0) ||
+	 ((texGenEnabled & T_BIT) && planeT[3] != 0.0) ||
+	 ((texGenEnabled & R_BIT) && planeR[3] != 0.0)) {
+	 needtgenable |= Q_BIT;
+      }
+   }
+
+   return needtgenable;
+}
+
+
+/*
+ * Program texgen input/component selection for `unit`; returns GL_FALSE if a fallback is required.
+ */
+static GLboolean r200_validate_texgen( GLcontext *ctx, GLuint unit )
+{  
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+   GLuint inputshift = R200_TEXGEN_0_INPUT_SHIFT + unit*4;
+   GLuint tgi, tgcm; /* new values for TCG_TEX_PROC_CTL_1 and TCG_TEX_PROC_CTL_2 */
+   GLuint mode = 0; /* stays 0 when S texgen is disabled */
+   GLboolean mixed_fallback = GL_FALSE;
+   static const GLfloat I[16] = { /* identity rows, used for coords without texgen */
+      1,  0,  0,  0,
+      0,  1,  0,  0,
+      0,  0,  1,  0,
+      0,  0,  0,  1 };
+   static const GLfloat reflect[16] = { /* identity with S, T and R negated */
+      -1,  0,  0,  0,
+       0, -1,  0,  0,
+       0,  0,  -1, 0,
+       0,  0,  0,  1 };
+
+   rmesa->TexGenCompSel &= ~(R200_OUTPUT_TEX_0 << unit); /* start from a clean per-unit state */
+   rmesa->TexGenEnabled &= ~(R200_TEXGEN_TEXMAT_0_ENABLE<<unit);
+   rmesa->TexGenEnabled &= ~(R200_TEXMAT_0_ENABLE<<unit);
+   rmesa->TexGenNeedNormals[unit] = GL_FALSE;
+   tgi = rmesa->hw.tcg.cmd[TCG_TEX_PROC_CTL_1] & ~(R200_TEXGEN_INPUT_MASK <<
+						   inputshift);
+   tgcm = rmesa->hw.tcg.cmd[TCG_TEX_PROC_CTL_2] & ~(R200_TEXGEN_COMP_MASK <<
+						    (unit * 4));
+
+   if (0) 
+      fprintf(stderr, "%s unit %d\n", __FUNCTION__, unit);
+
+   if (texUnit->TexGenEnabled & S_BIT) { /* the S mode is the reference every other coord must match */
+      mode = texUnit->GenModeS;
+   } else {
+      tgcm |= R200_TEXGEN_COMP_S << (unit * 4);
+   }
+
+   if (texUnit->TexGenEnabled & T_BIT) {
+      if (texUnit->GenModeT != mode)
+	 mixed_fallback = GL_TRUE;
+   } else {
+      tgcm |= R200_TEXGEN_COMP_T << (unit * 4);
+   }
+
+   if (texUnit->TexGenEnabled & R_BIT) {
+      if (texUnit->GenModeR != mode)
+	 mixed_fallback = GL_TRUE;
+   } else {
+      tgcm |= R200_TEXGEN_COMP_R << (unit * 4);
+   }
+
+   if (texUnit->TexGenEnabled & Q_BIT) {
+      if (texUnit->GenModeQ != mode)
+	 mixed_fallback = GL_TRUE;
+   } else {
+      tgcm |= R200_TEXGEN_COMP_Q << (unit * 4);
+   }
+
+   if (mixed_fallback) { /* enabled coords use different modes: not handled here */
+      if (R200_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "fallback mixed texgen, 0x%x (0x%x 0x%x 0x%x 0x%x)\n",
+		 texUnit->TexGenEnabled, texUnit->GenModeS, texUnit->GenModeT,
+		 texUnit->GenModeR, texUnit->GenModeQ);
+      return GL_FALSE;
+   }
+
+/* we CANNOT do mixed mode if the texgen mode requires a plane where the input
+   is not enabled for texgen, since the planes are concatenated into texmat,
+   and thus the input will come from texcoord rather than tex gen equation!
+   Either fallback or just hope that those texcoords aren't really needed...
+   Assuming the former will cause lots of unnecessary fallbacks, the latter will
+   generate bogus results sometimes - it's pretty much impossible to really know
+   when a fallback is needed, depends on texmat and what sort of texture is bound
+   etc, - for now fallback if we're missing either S or T bits, there's a high
+   probability we need the texcoords in that case.
+   That's a lot of work for some obscure texgen mixed mode fixup - why oh why
+   doesn't the chip just directly accept the plane parameters :-(. */
+   switch (mode) {
+   case GL_OBJECT_LINEAR: {
+      GLuint needtgenable = r200_need_dis_texgen( texUnit->TexGenEnabled,
+				texUnit->ObjectPlaneS, texUnit->ObjectPlaneT,
+				texUnit->ObjectPlaneR, texUnit->ObjectPlaneQ );
+      if (needtgenable & (S_BIT | T_BIT)) { /* a disabled S or T is referenced by an enabled plane */
+	 if (R200_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "fallback mixed texgen / obj plane, 0x%x\n",
+		 texUnit->TexGenEnabled);
+	 return GL_FALSE;
+      }
+      if (needtgenable & (R_BIT)) { /* clear the COMP_R bit set above for the disabled R coord */
+	 tgcm &= ~(R200_TEXGEN_COMP_R << (unit * 4));
+      }
+      if (needtgenable & (Q_BIT)) { /* likewise for Q */
+	 tgcm &= ~(R200_TEXGEN_COMP_Q << (unit * 4));
+      }
+
+      tgi |= R200_TEXGEN_INPUT_OBJ << inputshift;
+      set_texgen_matrix( rmesa, unit, 
+	 (texUnit->TexGenEnabled & S_BIT) ? texUnit->ObjectPlaneS : I,
+	 (texUnit->TexGenEnabled & T_BIT) ? texUnit->ObjectPlaneT : I + 4,
+	 (texUnit->TexGenEnabled & R_BIT) ? texUnit->ObjectPlaneR : I + 8,
+	 (texUnit->TexGenEnabled & Q_BIT) ? texUnit->ObjectPlaneQ : I + 12);
+      }
+      break;
+
+   case GL_EYE_LINEAR: { /* same handling as GL_OBJECT_LINEAR, using the eye planes and eye input */
+      GLuint needtgenable = r200_need_dis_texgen( texUnit->TexGenEnabled,
+				texUnit->EyePlaneS, texUnit->EyePlaneT,
+				texUnit->EyePlaneR, texUnit->EyePlaneQ );
+      if (needtgenable & (S_BIT | T_BIT)) {
+	 if (R200_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "fallback mixed texgen / eye plane, 0x%x\n",
+		 texUnit->TexGenEnabled);
+	 return GL_FALSE;
+      }
+      if (needtgenable & (R_BIT)) {
+	 tgcm &= ~(R200_TEXGEN_COMP_R << (unit * 4));
+      }
+      if (needtgenable & (Q_BIT)) {
+	 tgcm &= ~(R200_TEXGEN_COMP_Q << (unit * 4));
+      }
+      tgi |= R200_TEXGEN_INPUT_EYE << inputshift;
+      set_texgen_matrix( rmesa, unit,
+	 (texUnit->TexGenEnabled & S_BIT) ? texUnit->EyePlaneS : I,
+	 (texUnit->TexGenEnabled & T_BIT) ? texUnit->EyePlaneT : I + 4,
+	 (texUnit->TexGenEnabled & R_BIT) ? texUnit->EyePlaneR : I + 8,
+	 (texUnit->TexGenEnabled & Q_BIT) ? texUnit->EyePlaneQ : I + 12);
+      }
+      break;
+
+   case GL_REFLECTION_MAP_NV:
+      rmesa->TexGenNeedNormals[unit] = GL_TRUE;
+      tgi |= R200_TEXGEN_INPUT_EYE_REFLECT << inputshift;
+      /* pretty weird, must only negate when lighting is enabled? */
+      if (ctx->Light.Enabled)
+	 set_texgen_matrix( rmesa, unit, 
+	    (texUnit->TexGenEnabled & S_BIT) ? reflect : I,
+	    (texUnit->TexGenEnabled & T_BIT) ? reflect + 4 : I + 4,
+	    (texUnit->TexGenEnabled & R_BIT) ? reflect + 8 : I + 8,
+	    I + 12);
+      break;
+
+   case GL_NORMAL_MAP_NV:
+      rmesa->TexGenNeedNormals[unit] = GL_TRUE;
+      tgi |= R200_TEXGEN_INPUT_EYE_NORMAL<<inputshift;
+      break;
+
+   case GL_SPHERE_MAP:
+      rmesa->TexGenNeedNormals[unit] = GL_TRUE;
+      tgi |= R200_TEXGEN_INPUT_SPHERE<<inputshift;
+      break;
+
+   case 0:
+      /* All texgen units were disabled, so just pass coords through. */
+      tgi |= unit << inputshift;
+      break;
+
+   default:
+      /* Unsupported mode, fallback:
+       */
+      if (R200_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "fallback unsupported texgen, %d\n",
+		 texUnit->GenModeS);
+      return GL_FALSE;
+   }
+
+   rmesa->TexGenEnabled |= R200_TEXGEN_TEXMAT_0_ENABLE << unit;
+   rmesa->TexGenCompSel |= R200_OUTPUT_TEX_0 << unit;
+
+   if (tgi != rmesa->hw.tcg.cmd[TCG_TEX_PROC_CTL_1] || 
+       tgcm != rmesa->hw.tcg.cmd[TCG_TEX_PROC_CTL_2])
+   { /* only flag a state change when a register value actually differs */
+      R200_STATECHANGE(rmesa, tcg);
+      rmesa->hw.tcg.cmd[TCG_TEX_PROC_CTL_1] = tgi;
+      rmesa->hw.tcg.cmd[TCG_TEX_PROC_CTL_2] = tgcm;
+   }
+
+   return GL_TRUE;
+}
+
+
+static void disable_tex( GLcontext *ctx, int unit )
+{ /* Turn texture unit `unit` off in hw state; does nothing if its enable bit is already clear. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+
+   if (rmesa->hw.ctx.cmd[CTX_PP_CNTL] & (R200_TEX_0_ENABLE<<unit)) {
+      /* Texture unit disabled */
+      if ( rmesa->state.texture.unit[unit].texobj != NULL ) {
+	 /* The old texture is no longer bound to this texture unit.
+	  * Mark it as such.
+	  */
+
+	 rmesa->state.texture.unit[unit].texobj->base.bound &= ~(1UL << unit);
+	 rmesa->state.texture.unit[unit].texobj = NULL;
+      }
+
+      R200_STATECHANGE( rmesa, ctx );
+      rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~(R200_TEX_0_ENABLE << unit);
+	 
+      R200_STATECHANGE( rmesa, vtx );
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] &= ~(7 << (unit * 3)); /* clear this unit's 3-bit field */
+	 
+      if (rmesa->TclFallback & (R200_TCL_FALLBACK_TEXGEN_0<<unit)) { /* drop a texgen fallback left over for this unit */
+	 TCL_FALLBACK( ctx, (R200_TCL_FALLBACK_TEXGEN_0<<unit), GL_FALSE);
+      }
+
+      /* Actually want to keep all units less than max active texture
+       * enabled, right?  Fix this for >2 texunits.
+       */
+
+      {
+	 GLuint tmp = rmesa->TexGenEnabled; /* remembered to detect whether the bits below change */
+
+	 rmesa->TexGenEnabled &= ~(R200_TEXGEN_TEXMAT_0_ENABLE<<unit);
+	 rmesa->TexGenEnabled &= ~(R200_TEXMAT_0_ENABLE<<unit);
+	 rmesa->TexGenNeedNormals[unit] = GL_FALSE;
+	 rmesa->TexGenCompSel &= ~(R200_OUTPUT_TEX_0 << unit);
+
+	 if (tmp != rmesa->TexGenEnabled) {
+	    rmesa->recheck_texgen[unit] = GL_TRUE;
+	    rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
+	 }
+      }
+   }
+}
+
+void set_re_cntl_d3d( GLcontext *ctx, int unit, GLboolean use_d3d )
+{ /* Set or clear the per-unit R200_VTX_STQ0_D3D bit in SET_RE_CNTL, emitting a state change only on a difference. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+
+   GLuint re_cntl;
+
+   re_cntl = rmesa->hw.set.cmd[SET_RE_CNTL] & ~(R200_VTX_STQ0_D3D << (2 * unit)); /* per-unit bits are 2 apart */
+   if (use_d3d)
+      re_cntl |= R200_VTX_STQ0_D3D << (2 * unit);
+
+   if ( re_cntl != rmesa->hw.set.cmd[SET_RE_CNTL] ) {
+      R200_STATECHANGE( rmesa, set );
+      rmesa->hw.set.cmd[SET_RE_CNTL] = re_cntl;
+   }
+}
+
+static GLboolean enable_tex_2d( GLcontext *ctx, int unit )
+{ /* Prepare the 1D/2D texture bound to `unit`; GL_FALSE if it has neither texture memory nor an image override. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+   struct gl_texture_object *tObj = texUnit->_Current;
+   r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
+
+   /* Need to load the 2d images associated with this unit.
+    */
+   if (t->pp_txformat & R200_TXFORMAT_NON_POWER2) { /* format still flagged NPOT: clear it and mark images dirty */
+      t->pp_txformat &= ~R200_TXFORMAT_NON_POWER2;
+      t->base.dirty_images[0] = ~0;
+   }
+
+   ASSERT(tObj->Target == GL_TEXTURE_2D || tObj->Target == GL_TEXTURE_1D);
+
+   if ( t->base.dirty_images[0] ) {
+      R200_FIREVERTICES( rmesa );
+      r200SetTexImages( rmesa, tObj );
+      r200UploadTexImages( rmesa, (r200TexObjPtr) tObj->DriverData, 0 );
+      if ( !t->base.memBlock && !t->image_override ) 
+	 return GL_FALSE;
+   }
+
+   set_re_cntl_d3d( ctx, unit, GL_FALSE );
+
+   return GL_TRUE;
+}
+
+#if ENABLE_HW_3D_TEXTURE
+static GLboolean enable_tex_3d( GLcontext *ctx, int unit )
+{ /* Prepare the 3D texture bound to `unit`; GL_FALSE on a mipmapped min filter or missing texture memory. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+   struct gl_texture_object *tObj = texUnit->_Current;
+   r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
+
+   /* Need to load the 3d images associated with this unit.
+    */
+   if (t->pp_txformat & R200_TXFORMAT_NON_POWER2) { /* format still flagged NPOT: clear it and mark images dirty */
+      t->pp_txformat &= ~R200_TXFORMAT_NON_POWER2;
+      t->base.dirty_images[0] = ~0;
+   }
+
+   ASSERT(tObj->Target == GL_TEXTURE_3D);
+
+   /* R100 & R200 do not support mipmaps for 3D textures.
+    */
+   if ( (tObj->MinFilter != GL_NEAREST) && (tObj->MinFilter != GL_LINEAR) ) {
+      return GL_FALSE;
+   }
+
+   if ( t->base.dirty_images[0] ) {
+      R200_FIREVERTICES( rmesa );
+      r200SetTexImages( rmesa, tObj );
+      r200UploadTexImages( rmesa, (r200TexObjPtr) tObj->DriverData, 0 );
+      if ( !t->base.memBlock ) 
+	 return GL_FALSE;
+   }
+
+   set_re_cntl_d3d( ctx, unit, GL_TRUE );
+
+   return GL_TRUE;
+}
+#endif /* ENABLE_HW_3D_TEXTURE */
+
+static GLboolean enable_tex_cube( GLcontext *ctx, int unit )
+{ /* Prepare the cube map bound to `unit`, uploading each dirty face; GL_FALSE if it has no texture memory. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+   struct gl_texture_object *tObj = texUnit->_Current;
+   r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
+   GLuint face;
+
+   /* Need to load the 2d images associated with this unit.
+    */
+   if (t->pp_txformat & R200_TXFORMAT_NON_POWER2) { /* format still flagged NPOT: clear it and mark all faces dirty */
+      t->pp_txformat &= ~R200_TXFORMAT_NON_POWER2;
+      for (face = 0; face < 6; face++)
+         t->base.dirty_images[face] = ~0;
+   }
+
+   ASSERT(tObj->Target == GL_TEXTURE_CUBE_MAP);
+
+   if ( t->base.dirty_images[0] || t->base.dirty_images[1] ||
+        t->base.dirty_images[2] || t->base.dirty_images[3] ||
+        t->base.dirty_images[4] || t->base.dirty_images[5] ) {
+      /* flush */
+      R200_FIREVERTICES( rmesa );
+      /* layout memory space, once for all faces */
+      r200SetTexImages( rmesa, tObj );
+   }
+
+   /* upload (per face) */
+   for (face = 0; face < 6; face++) {
+      if (t->base.dirty_images[face]) {
+         r200UploadTexImages( rmesa, (r200TexObjPtr) tObj->DriverData, face );
+      }
+   }
+      
+   if ( !t->base.memBlock ) {
+      /* texmem alloc failed, use s/w fallback */
+      return GL_FALSE;
+   }
+
+   set_re_cntl_d3d( ctx, unit, GL_TRUE );
+
+   return GL_TRUE;
+}
+
+static GLboolean enable_tex_rect( GLcontext *ctx, int unit )
+{ /* Prepare the rectangle texture bound to `unit`; GL_FALSE only if no memory, override or gart texturing applies. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+   struct gl_texture_object *tObj = texUnit->_Current;
+   r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
+
+   if (!(t->pp_txformat & R200_TXFORMAT_NON_POWER2)) { /* opposite of the 2D path: set the NPOT flag, mark images dirty */
+      t->pp_txformat |= R200_TXFORMAT_NON_POWER2;
+      t->base.dirty_images[0] = ~0;
+   }
+
+   ASSERT(tObj->Target == GL_TEXTURE_RECTANGLE_NV);
+
+   if ( t->base.dirty_images[0] ) {
+      R200_FIREVERTICES( rmesa );
+      r200SetTexImages( rmesa, tObj );
+      r200UploadTexImages( rmesa, (r200TexObjPtr) tObj->DriverData, 0 );
+      if ( !t->base.memBlock &&
+           !t->image_override &&
+           !rmesa->prefer_gart_client_texturing ) 
+	 return GL_FALSE;
+   }
+
+   set_re_cntl_d3d( ctx, unit, GL_FALSE );
+
+   return GL_TRUE;
+}
+
+
+static GLboolean update_tex_common( GLcontext *ctx, int unit )
+{ /* Target-independent part of enabling `unit`: bind bookkeeping, enable bits, state import, texgen recheck. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+   struct gl_texture_object *tObj = texUnit->_Current;
+   r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
+
+   /* Fallback if there's a texture border */
+   if ( tObj->Image[0][tObj->BaseLevel]->Border > 0 )
+       return GL_FALSE;
+
+   /* Update state if this is a different texture object to last
+    * time.
+    */
+   if ( rmesa->state.texture.unit[unit].texobj != t ) {
+      if ( rmesa->state.texture.unit[unit].texobj != NULL ) {
+	 /* The old texture is no longer bound to this texture unit.
+	  * Mark it as such.
+	  */
+
+	 rmesa->state.texture.unit[unit].texobj->base.bound &= 
+	     ~(1UL << unit);
+      }
+
+      rmesa->state.texture.unit[unit].texobj = t;
+      t->base.bound |= (1UL << unit);
+      t->dirty_state |= 1<<unit; /* forces import_tex_obj_state() below */
+      driUpdateTextureLRU( (driTextureObject *) t ); /* XXX: should be locked! */
+   }
+
+
+   /* Newly enabled? (the leading "1||" makes this branch run on every call)
+    */
+   if ( 1|| !(rmesa->hw.ctx.cmd[CTX_PP_CNTL] & (R200_TEX_0_ENABLE<<unit))) {
+      R200_STATECHANGE( rmesa, ctx );
+      rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= R200_TEX_0_ENABLE << unit;
+
+      R200_STATECHANGE( rmesa, vtx );
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] &= ~(7 << (unit * 3));
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] |= 4 << (unit * 3); /* presumably selects 4 texcoord components - TODO confirm */
+
+      rmesa->recheck_texgen[unit] = GL_TRUE;
+   }
+
+   if (t->dirty_state & (1<<unit)) {
+      import_tex_obj_state( rmesa, unit, t );
+   }
+
+   if (rmesa->recheck_texgen[unit]) {
+      GLboolean fallback = !r200_validate_texgen( ctx, unit ); /* GL_FALSE from validation means a TCL fallback */
+      TCL_FALLBACK( ctx, (R200_TCL_FALLBACK_TEXGEN_0<<unit), fallback);
+      rmesa->recheck_texgen[unit] = 0;
+      rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
+   }
+
+   FALLBACK( rmesa, R200_FALLBACK_BORDER_MODE, t->border_fallback );
+   return !t->border_fallback;
+}
+
+
+
+static GLboolean r200UpdateTextureUnit( GLcontext *ctx, int unit )
+{ /* Enable `unit` for the target named in unitneeded, or disable it when unitneeded is 0; GL_FALSE means fallback. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLuint unitneeded = rmesa->state.texture.unit[unit].unitneeded;
+
+   if ( unitneeded & (TEXTURE_RECT_BIT) ) { /* targets are tested in this order; the first match wins */
+      return (enable_tex_rect( ctx, unit ) &&
+	      update_tex_common( ctx, unit ));
+   }
+   else if ( unitneeded & (TEXTURE_1D_BIT | TEXTURE_2D_BIT) ) {
+      return (enable_tex_2d( ctx, unit ) &&
+	      update_tex_common( ctx, unit ));
+   }
+#if ENABLE_HW_3D_TEXTURE
+   else if ( unitneeded & (TEXTURE_3D_BIT) ) {
+      return (enable_tex_3d( ctx, unit ) &&
+	      update_tex_common( ctx, unit ));
+   }
+#endif
+   else if ( unitneeded ) { /* some other target bit is set: not handled in hardware here */
+      return GL_FALSE;
+   }
+   else {
+      disable_tex( ctx, unit );
+      return GL_TRUE;
+   }
+}
+
+
+void r200UpdateTextureState( GLcontext *ctx )
+{ /* Revalidate texenv and all six texture units, set the texture fallback flag, apply the R200-only workarounds. */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLboolean ok;
+   GLuint dbg;
+
+   if (ctx->ATIFragmentShader._Enabled) { /* with a fragment shader every really-enabled unit is needed */
+      GLuint i;
+      for (i = 0; i < R200_MAX_TEXTURE_UNITS; i++) {
+	 rmesa->state.texture.unit[i].unitneeded = ctx->Texture.Unit[i]._ReallyEnabled;
+      }
+      ok = GL_TRUE;
+   }
+   else {
+      ok = r200UpdateAllTexEnv( ctx );
+   }
+   if (ok) { /* && short-circuits: units after the first failure are not updated */
+      ok = (r200UpdateTextureUnit( ctx, 0 ) &&
+	 r200UpdateTextureUnit( ctx, 1 ) &&
+	 r200UpdateTextureUnit( ctx, 2 ) &&
+	 r200UpdateTextureUnit( ctx, 3 ) &&
+	 r200UpdateTextureUnit( ctx, 4 ) &&
+	 r200UpdateTextureUnit( ctx, 5 ));
+   }
+
+   if (ok && ctx->ATIFragmentShader._Enabled) {
+      r200UpdateFragmentShader(ctx);
+   }
+
+   FALLBACK( rmesa, R200_FALLBACK_TEXTURE, !ok );
+
+   if (rmesa->TclFallback)
+      r200ChooseVertexState( ctx );
+
+
+   if (rmesa->r200Screen->chip_family == CHIP_FAMILY_R200) {
+
+      /*
+       * T0 hang workaround -------------
+       * not needed for r200 derivatives
+        */
+      if ((rmesa->hw.ctx.cmd[CTX_PP_CNTL] & R200_TEX_ENABLE_MASK) == R200_TEX_0_ENABLE &&
+	 (rmesa->hw.tex[0].cmd[TEX_PP_TXFILTER] & R200_MIN_FILTER_MASK) > R200_MIN_FILTER_LINEAR) {
+
+	 R200_STATECHANGE(rmesa, ctx); /* only unit 0 enabled, min filter above LINEAR: also enable unit 1, lookup disabled */
+	 R200_STATECHANGE(rmesa, tex[1]);
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= R200_TEX_1_ENABLE;
+	 if (!(rmesa->hw.cst.cmd[CST_PP_CNTL_X] & R200_PPX_TEX_1_ENABLE))
+	    rmesa->hw.tex[1].cmd[TEX_PP_TXFORMAT] &= ~TEXOBJ_TXFORMAT_MASK;
+	 rmesa->hw.tex[1].cmd[TEX_PP_TXFORMAT] |= R200_TXFORMAT_LOOKUP_DISABLE;
+      }
+      else if (!ctx->ATIFragmentShader._Enabled) { /* undo the workaround bit when unit 1 is enabled */
+	 if ((rmesa->hw.ctx.cmd[CTX_PP_CNTL] & R200_TEX_1_ENABLE) &&
+	    (rmesa->hw.tex[1].cmd[TEX_PP_TXFORMAT] & R200_TXFORMAT_LOOKUP_DISABLE)) {
+	    R200_STATECHANGE(rmesa, tex[1]);
+	    rmesa->hw.tex[1].cmd[TEX_PP_TXFORMAT] &= ~R200_TXFORMAT_LOOKUP_DISABLE;
+         }
+      }
+      /* do the same workaround for the first pass of a fragment shader.
+       * completely unknown if necessary / sufficient.
+       */
+      if ((rmesa->hw.cst.cmd[CST_PP_CNTL_X] & R200_PPX_TEX_ENABLE_MASK) == R200_PPX_TEX_0_ENABLE &&
+	 (rmesa->hw.tex[0].cmd[TEX_PP_TXFILTER] & R200_MIN_FILTER_MASK) > R200_MIN_FILTER_LINEAR) {
+
+	 R200_STATECHANGE(rmesa, cst);
+	 R200_STATECHANGE(rmesa, tex[1]);
+	 rmesa->hw.cst.cmd[CST_PP_CNTL_X] |= R200_PPX_TEX_1_ENABLE;
+	 if (!(rmesa->hw.ctx.cmd[CTX_PP_CNTL] & R200_TEX_1_ENABLE))
+	    rmesa->hw.tex[1].cmd[TEX_PP_TXFORMAT] &= ~TEXOBJ_TXFORMAT_MASK;
+	 rmesa->hw.tex[1].cmd[TEX_PP_TXMULTI_CTL] |= R200_PASS1_TXFORMAT_LOOKUP_DISABLE;
+      }
+
+      /* maybe needs to be done pairwise due to 2 parallel (physical) tex units ?
+         looks like that's not the case, if 8500/9100 owners don't complain remove this...
+      for ( i = 0; i < ctx->Const.MaxTextureUnits; i += 2) {
+         if (((rmesa->hw.ctx.cmd[CTX_PP_CNTL] & ((R200_TEX_0_ENABLE |
+            R200_TEX_1_ENABLE ) << i)) == (R200_TEX_0_ENABLE << i)) &&
+            ((rmesa->hw.tex[i].cmd[TEX_PP_TXFILTER] & R200_MIN_FILTER_MASK) >
+            R200_MIN_FILTER_LINEAR)) {
+            R200_STATECHANGE(rmesa, ctx);
+            R200_STATECHANGE(rmesa, tex[i+1]);
+            rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= (R200_TEX_1_ENABLE << i);
+            rmesa->hw.tex[i+1].cmd[TEX_PP_TXFORMAT] &= ~TEXOBJ_TXFORMAT_MASK;
+            rmesa->hw.tex[i+1].cmd[TEX_PP_TXFORMAT] |= 0x08000000;
+         }
+         else {
+            if ((rmesa->hw.ctx.cmd[CTX_PP_CNTL] & (R200_TEX_1_ENABLE << i)) &&
+               (rmesa->hw.tex[i+1].cmd[TEX_PP_TXFORMAT] & 0x08000000)) {
+               R200_STATECHANGE(rmesa, tex[i+1]);
+               rmesa->hw.tex[i+1].cmd[TEX_PP_TXFORMAT] &= ~0x08000000;
+            }
+         }
+      } */
+
+      /*
+       * Texture cache LRU hang workaround -------------
+       * not needed for r200 derivatives
+       * hopefully this covers first pass of a shader as well
+       */
+
+      /* While the cases below attempt to only enable the workaround in the
+       * specific cases necessary, they were insufficient.  See bugzilla #1519,
+       * #729, #814.  Tests with quake3 showed no impact on performance.
+       */
+      dbg = 0x6; /* 0x02 | 0x04: both bits the disabled tests below would set, now unconditional */
+
+      /*
+      if (((rmesa->hw.ctx.cmd[CTX_PP_CNTL] & (R200_TEX_0_ENABLE )) &&
+         ((((rmesa->hw.tex[0].cmd[TEX_PP_TXFILTER] & R200_MIN_FILTER_MASK)) &
+         0x04) == 0)) ||
+         ((rmesa->hw.ctx.cmd[CTX_PP_CNTL] & R200_TEX_2_ENABLE) &&
+         ((((rmesa->hw.tex[2].cmd[TEX_PP_TXFILTER] & R200_MIN_FILTER_MASK)) &
+         0x04) == 0)) ||
+         ((rmesa->hw.ctx.cmd[CTX_PP_CNTL] & R200_TEX_4_ENABLE) &&
+         ((((rmesa->hw.tex[4].cmd[TEX_PP_TXFILTER] & R200_MIN_FILTER_MASK)) &
+         0x04) == 0)))
+      {
+         dbg |= 0x02;
+      }
+
+      if (((rmesa->hw.ctx.cmd[CTX_PP_CNTL] & (R200_TEX_1_ENABLE )) &&
+         ((((rmesa->hw.tex[1].cmd[TEX_PP_TXFILTER] & R200_MIN_FILTER_MASK)) &
+         0x04) == 0)) ||
+         ((rmesa->hw.ctx.cmd[CTX_PP_CNTL] & R200_TEX_3_ENABLE) &&
+         ((((rmesa->hw.tex[3].cmd[TEX_PP_TXFILTER] & R200_MIN_FILTER_MASK)) &
+         0x04) == 0)) ||
+         ((rmesa->hw.ctx.cmd[CTX_PP_CNTL] & R200_TEX_5_ENABLE) &&
+         ((((rmesa->hw.tex[5].cmd[TEX_PP_TXFILTER] & R200_MIN_FILTER_MASK)) &
+         0x04) == 0)))
+      {
+         dbg |= 0x04;
+      }*/
+
+      if (dbg != rmesa->hw.tam.cmd[TAM_DEBUG3]) {
+         R200_STATECHANGE( rmesa, tam );
+         rmesa->hw.tam.cmd[TAM_DEBUG3] = dbg;
+         if (0) printf("TEXCACHE LRU HANG WORKAROUND %x\n", dbg);
+      }
+   }
+}
diff --git a/r200/r200_vertprog.c b/r200/r200_vertprog.c
new file mode 100644
index 0000000..6089d61
--- /dev/null
+++ b/r200/r200_vertprog.c
@@ -0,0 +1,1256 @@
+/**************************************************************************
+
+Copyright (C) 2005 Aapo Tahkola.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Aapo Tahkola <aet@rasterburn.org>
+ *   Roland Scheidegger <rscheidegger_lists@hispeed.ch>
+ */
+#include "glheader.h"
+#include "macros.h"
+#include "enums.h"
+#include "program.h"
+#include "shader/prog_instruction.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_statevars.h"
+#include "shader/programopt.h"
+#include "tnl/tnl.h"
+
+#include "r200_context.h"
+#include "r200_vertprog.h"
+#include "r200_ioctl.h"
+#include "r200_tcl.h"
+
+#if SWIZZLE_X != VSF_IN_COMPONENT_X || \
+    SWIZZLE_Y != VSF_IN_COMPONENT_Y || \
+    SWIZZLE_Z != VSF_IN_COMPONENT_Z || \
+    SWIZZLE_W != VSF_IN_COMPONENT_W || \
+    SWIZZLE_ZERO != VSF_IN_COMPONENT_ZERO || \
+    SWIZZLE_ONE != VSF_IN_COMPONENT_ONE || \
+    WRITEMASK_X != VSF_FLAG_X || \
+    WRITEMASK_Y != VSF_FLAG_Y || \
+    WRITEMASK_Z != VSF_FLAG_Z || \
+    WRITEMASK_W != VSF_FLAG_W
+#error Cannot change these!
+#endif
+
+#define SCALAR_FLAG (1U<<31) /* op reads scalar sources; unsigned because 1<<31 overflows int (UB) */
+#define FLAG_MASK (1U<<31)   /* same bit as SCALAR_FLAG, kept unsigned for the same reason */
+#define OP_MASK (0xf)  /* operand-count bits of op_names[].ip; we are unlikely to have more than 15 */
+#define OPN(operator, ip) {#operator, OPCODE_##operator, ip} /* op_names[] row: name, opcode, ip */
+
+static struct{ /* one row per Mesa opcode known to this translator; searched by op_operands() */
+   char *name; /* opcode mnemonic, produced by stringifying the OPN() argument */
+   int opcode; /* the matching OPCODE_* value */
+   unsigned long ip; /* number of input operands and flags */
+}op_names[]={
+   OPN(ABS, 1),
+   OPN(ADD, 2),
+   OPN(ARL, 1|SCALAR_FLAG),
+   OPN(DP3, 2),
+   OPN(DP4, 2),
+   OPN(DPH, 2),
+   OPN(DST, 2),
+   OPN(EX2, 1|SCALAR_FLAG),
+   OPN(EXP, 1|SCALAR_FLAG),
+   OPN(FLR, 1),
+   OPN(FRC, 1),
+   OPN(LG2, 1|SCALAR_FLAG),
+   OPN(LIT, 1),
+   OPN(LOG, 1|SCALAR_FLAG),
+   OPN(MAD, 3),
+   OPN(MAX, 2),
+   OPN(MIN, 2),
+   OPN(MOV, 1),
+   OPN(MUL, 2),
+   OPN(POW, 2|SCALAR_FLAG),
+   OPN(RCP, 1|SCALAR_FLAG),
+   OPN(RSQ, 1|SCALAR_FLAG),
+   OPN(SGE, 2),
+   OPN(SLT, 2),
+   OPN(SUB, 2),
+   OPN(SWZ, 1),
+   OPN(XPD, 2),
+   OPN(PRINT, 0),
+   OPN(END, 0),
+};
+#undef OPN
+
+/* Upload the program's parameter values: the first 96 vec4s go to vpp[0], the
+ * rest to vpp[1].  GL_FALSE if there are more than R200_VSF_MAX_PARAM. */
+static GLboolean r200VertexProgUpdateParams(GLcontext *ctx, struct r200_vertex_program *vp)
+{
+   r200ContextPtr rmesa = R200_CONTEXT( ctx );
+   struct gl_vertex_program *mesa_vp = &vp->mesa_program;
+   struct gl_program_parameter_list *paramList;
+   drm_radeon_cmd_header_t hdr;
+   GLfloat *out = (GLfloat *)&rmesa->hw.vpp[0].cmd[VPP_CMD_0 + 1];
+   int pi, comp, low_count;
+
+   R200_STATECHANGE( rmesa, vpp[0] );
+   R200_STATECHANGE( rmesa, vpp[1] );
+   assert(mesa_vp->Base.Parameters);
+   _mesa_load_state_parameters(ctx, mesa_vp->Base.Parameters);
+   paramList = mesa_vp->Base.Parameters;
+
+   if(paramList->NumParameters > R200_VSF_MAX_PARAM){
+      fprintf(stderr, "%s:Params exhausted\n", __FUNCTION__);
+      return GL_FALSE;
+   }
+
+   for(pi = 0; pi < paramList->NumParameters; pi++) {
+      switch(paramList->Parameters[pi].Type) {
+      case PROGRAM_STATE_VAR:
+      case PROGRAM_NAMED_PARAM:
+      case PROGRAM_CONSTANT:
+	 /* all three kinds are uploaded the same way: one vec4 each */
+	 for (comp = 0; comp < 4; comp++)
+	    *out++ = paramList->ParameterValues[pi][comp];
+	 break;
+      default:
+	 _mesa_problem(NULL, "Bad param type in %s", __FUNCTION__);
+	 break;
+      }
+      /* parameter 95 was the last one stored in vpp[0]; continue in vpp[1] */
+      if (pi == 95)
+	 out = (GLfloat *)&rmesa->hw.vpp[1].cmd[VPP_CMD_0 + 1];
+   }
+   /* hack up the cmd_size so not the whole state atom is emitted always. */
+   low_count = (paramList->NumParameters > 96) ? 96 : paramList->NumParameters;
+   rmesa->hw.vpp[0].cmd_size = 1 + 4 * low_count;
+   hdr.i = rmesa->hw.vpp[0].cmd[VPP_CMD_0];
+   hdr.veclinear.count = low_count;
+   rmesa->hw.vpp[0].cmd[VPP_CMD_0] = hdr.i;
+   if (paramList->NumParameters > 96) {
+      rmesa->hw.vpp[1].cmd_size = 1 + 4 * (paramList->NumParameters - 96);
+      hdr.i = rmesa->hw.vpp[1].cmd[VPP_CMD_0];
+      hdr.veclinear.count = paramList->NumParameters - 96;
+      rmesa->hw.vpp[1].cmd[VPP_CMD_0] = hdr.i;
+   }
+   return GL_TRUE;
+}
+
+/* Translate a Mesa write mask: WRITEMASK_* bits equal VSF_FLAG_* (see the #if check above). */
+static __inline unsigned long t_dst_mask(GLuint mask)
+{
+   return VSF_FLAG_ALL & mask;
+}
+
+/* Encode a Mesa destination register as hw output class/index bits.  Unknown
+ * registers are reported and exit with a failure status (-1), not exit(0). */
+static unsigned long t_dst(struct prog_dst_register *dst)
+{
+   switch(dst->File) {
+   case PROGRAM_TEMPORARY:
+      return ((dst->Index << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP);
+   case PROGRAM_OUTPUT:
+      switch (dst->Index) {
+      case VERT_RESULT_HPOS:
+	 return R200_VSF_OUT_CLASS_RESULT_POS;
+      case VERT_RESULT_COL0:
+	 return R200_VSF_OUT_CLASS_RESULT_COLOR;
+      case VERT_RESULT_COL1:
+	 return ((1 << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_RESULT_COLOR);
+      case VERT_RESULT_FOGC:
+	 return R200_VSF_OUT_CLASS_RESULT_FOGC;
+      case VERT_RESULT_TEX0:
+      case VERT_RESULT_TEX1:
+      case VERT_RESULT_TEX2:
+      case VERT_RESULT_TEX3:
+      case VERT_RESULT_TEX4:
+      case VERT_RESULT_TEX5:
+	 return (((dst->Index - VERT_RESULT_TEX0) << R200_VPI_OUT_REG_INDEX_SHIFT)
+	    | R200_VSF_OUT_CLASS_RESULT_TEXC);
+      case VERT_RESULT_PSIZ:
+	 return R200_VSF_OUT_CLASS_RESULT_POINTSIZE;
+      default:
+	 fprintf(stderr, "problem in %s, unknown dst output reg %d\n", __FUNCTION__, dst->Index);
+	 exit(-1);
+	 return 0;
+      }
+   case PROGRAM_ADDRESS:
+      assert (dst->Index == 0);
+      return R200_VSF_OUT_CLASS_ADDR;
+   default:
+      fprintf(stderr, "problem in %s, unknown register type %d\n", __FUNCTION__, dst->File);
+      exit(-1);
+      return 0;
+   }
+}
+
+/* Map a Mesa register file to the hw source class.  A file with no source
+ * class is reported (newline-terminated) and ends the process with a failure
+ * status; the trailing return only keeps the function well-formed. */
+static unsigned long t_src_class(enum register_file file)
+{
+   switch(file){
+   case PROGRAM_TEMPORARY:
+      return VSF_IN_CLASS_TMP;
+
+   case PROGRAM_INPUT:
+      return VSF_IN_CLASS_ATTR;
+
+   case PROGRAM_LOCAL_PARAM:
+   case PROGRAM_ENV_PARAM:
+   case PROGRAM_NAMED_PARAM:
+   case PROGRAM_STATE_VAR:
+      return VSF_IN_CLASS_PARAM;
+
+   /* PROGRAM_OUTPUT, PROGRAM_WRITE_ONLY, PROGRAM_ADDRESS and anything else */
+   default:
+      fprintf(stderr, "problem in %s, unknown register file %d\n", __FUNCTION__, (int) file);
+      exit(-1);
+      return 0; /* not reached */
+   }
+}
+
+/* Mesa SWIZZLE_* values equal the hw VSF_IN_COMPONENT_* ones, so this passes them through. */
+static __inline unsigned long t_swizzle(GLubyte swizzle)
+{
+   return (unsigned long) swizzle;
+}
+
+#if 0
+/* Debug aid (compiled out): print every entry of vp->inputs[] to stderr, tagged with caller. */
+static void vp_dump_inputs(struct r200_vertex_program *vp, char *caller)
+{
+   int attr = 0;
+
+   if (!vp) {
+      fprintf(stderr, "vp null in call to %s from %s\n", __FUNCTION__, caller);
+      return;
+   }
+
+   fprintf(stderr, "%s:<", caller);
+   while (attr < VERT_ATTRIB_MAX)
+      fprintf(stderr, "%d ", vp->inputs[attr++]);
+   fprintf(stderr, ">\n");
+}
+#endif
+
+/**
+ * Hardware register index for a source operand.
+ *
+ * Vertex attributes go through vp->inputs[], which
+ * r200_translate_vertex_program() fills in before any instruction is
+ * translated (-1 means "not mapped", hence the assert).  All other register
+ * files use the Mesa index unchanged, except that a negative index (an
+ * offset for indirect addressing, per the warning) is unsupported and yields 0.
+ * (An older scheme that lazily handed out the next free input slot here is
+ * no longer used.)
+ *
+ * \return  the index to encode in the hw source word.
+ */
+static unsigned long t_src_index(struct r200_vertex_program *vp, struct prog_src_register *src)
+{
+   if (src->File != PROGRAM_INPUT) {
+      if (src->Index >= 0)
+	 return src->Index;
+
+      fprintf(stderr, "WARNING negative offsets for indirect addressing do not work\n");
+      return 0;
+   }
+
+   /* attribute: must already have been given a hw input slot */
+   assert(vp->inputs[src->Index] != -1);
+   return vp->inputs[src->Index];
+}
+
+/* Build a vector source word for src: its four swizzles and NegateBase as given, RelAddr at bit 4. */
+static unsigned long t_src(struct r200_vertex_program *vp, struct prog_src_register *src)
+{
+   unsigned long word = MAKE_VSF_SOURCE(t_src_index(vp, src),
+			t_swizzle(GET_SWZ(src->Swizzle, 0)),
+			t_swizzle(GET_SWZ(src->Swizzle, 1)),
+			t_swizzle(GET_SWZ(src->Swizzle, 2)),
+			t_swizzle(GET_SWZ(src->Swizzle, 3)),
+			t_src_class(src->File), src->NegateBase);
+   return word | (src->RelAddr << 4);
+}
+
+/* Build a scalar source word: swizzle component 0 is replicated to all four
+ * lanes, and any negate bit in src negates all of them. */
+static unsigned long t_src_scalar(struct r200_vertex_program *vp, struct prog_src_register *src)
+{
+   const unsigned long lane = t_swizzle(GET_SWZ(src->Swizzle, 0));
+   unsigned long word = MAKE_VSF_SOURCE(t_src_index(vp, src), lane, lane, lane, lane,
+			t_src_class(src->File),
+			src->NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE);
+
+   return word | (src->RelAddr << 4);
+}
+
+/* Hw opcode for Mesa opcodes that need no special handling by the translator.
+ * Any other opcode is reported on stderr and the process exits with -1. */
+static unsigned long t_opcode(enum prog_opcode opcode)
+{
+   unsigned long hw_op = 0;
+   switch(opcode){
+   case OPCODE_ADD: hw_op = R200_VPI_OUT_OP_ADD; break;
+   /* FIXME: ARL works fine, but negative offsets won't work - fglrx just
+    * seems to ignore neg offsets which isn't quite correct... */
+   case OPCODE_ARL: hw_op = R200_VPI_OUT_OP_ARL; break;
+   case OPCODE_DP4: hw_op = R200_VPI_OUT_OP_DOT; break;
+   case OPCODE_DST: hw_op = R200_VPI_OUT_OP_DST; break;
+   case OPCODE_EX2: hw_op = R200_VPI_OUT_OP_EX2; break;
+   case OPCODE_EXP: hw_op = R200_VPI_OUT_OP_EXP; break;
+   case OPCODE_FRC: hw_op = R200_VPI_OUT_OP_FRC; break;
+   case OPCODE_LG2: hw_op = R200_VPI_OUT_OP_LG2; break;
+   case OPCODE_LIT: hw_op = R200_VPI_OUT_OP_LIT; break;
+   case OPCODE_LOG: hw_op = R200_VPI_OUT_OP_LOG; break;
+   case OPCODE_MAX: hw_op = R200_VPI_OUT_OP_MAX; break;
+   case OPCODE_MIN: hw_op = R200_VPI_OUT_OP_MIN; break;
+   case OPCODE_MUL: hw_op = R200_VPI_OUT_OP_MUL; break;
+   case OPCODE_RCP: hw_op = R200_VPI_OUT_OP_RCP; break;
+   case OPCODE_RSQ: hw_op = R200_VPI_OUT_OP_RSQ; break;
+   case OPCODE_SGE: hw_op = R200_VPI_OUT_OP_SGE; break;
+   case OPCODE_SLT: hw_op = R200_VPI_OUT_OP_SLT; break;
+   default:
+      fprintf(stderr, "%s: Should not be called with opcode %d!", __FUNCTION__, opcode);
+      exit(-1);
+   }
+   return hw_op;
+}
+
+/* Operand count and flag bits (op_names[].ip) for opcode; exits if not found.
+ * Linear search, so nothing here relies on the ordering of Mesa's opcodes. */
+static unsigned long op_operands(enum prog_opcode opcode)
+{
+   const unsigned int entries = sizeof(op_names) / sizeof(op_names[0]);
+   unsigned int idx;
+   for (idx = 0; idx < entries; idx++)
+      if (op_names[idx].opcode == opcode)
+	 return op_names[idx].ip;
+   fprintf(stderr, "op %d not found in op_names\n", opcode);
+   exit(-1);
+   return 0;
+}
+
+/* True when a and b differ in Index or RelAddr and are both PARAM class or
+ * both ATTR class.  TODO: Get rid of t_src_class call */
+#define CMP_SRCS(a, b) ((((a).RelAddr != (b).RelAddr) || ((a).Index != (b).Index)) && \
+		       ((t_src_class((a).File) == VSF_IN_CLASS_PARAM && \
+			 t_src_class((b).File) == VSF_IN_CLASS_PARAM) || \
+			(t_src_class((a).File) == VSF_IN_CLASS_ATTR && \
+			 t_src_class((b).File) == VSF_IN_CLASS_ATTR)))
+/* fglrx on rv250 codes up unused sources as follows:
+   unused but necessary sources are same as previous source, zero-ed out.
+   unnecessary sources are same as previous source but with VSF_IN_CLASS_NONE set.
+   i.e. an add (2 args) has its 2nd arg (if you use it as mov) zero-ed out, and 3rd arg
+   set to VSF_IN_CLASS_NONE. Not sure if strictly necessary. */
+
+/* use these simpler definitions. Must obviously not be used with not yet set up regs.
+   Those are NOT semantically equivalent to the r300 ones, requires code changes */
+#define ZERO_SRC_0 (((o_inst->src0 & ~(0xfff << R200_VPI_IN_X_SHIFT)) \
+				   | ((R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_X_SHIFT) \
+				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Y_SHIFT) \
+				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Z_SHIFT) \
+				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_W_SHIFT))))
+
+#define ZERO_SRC_1 (((o_inst->src1 & ~(0xfff << R200_VPI_IN_X_SHIFT)) \
+				   | ((R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_X_SHIFT) \
+				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Y_SHIFT) \
+				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Z_SHIFT) \
+				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_W_SHIFT))))
+
+#define ZERO_SRC_2 (((o_inst->src2 & ~(0xfff << R200_VPI_IN_X_SHIFT)) \
+				   | ((R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_X_SHIFT) \
+				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Y_SHIFT) \
+				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_Z_SHIFT) \
+				   | (R200_VPI_IN_SELECT_ZERO << R200_VPI_IN_W_SHIFT))))
+
+/* UNUSED_SRC_n: o_inst->srcN with its low four bits replaced by 9 (presumably VSF_IN_CLASS_NONE - confirm). */
+#define UNUSED_SRC_0 ((o_inst->src0 & ~15) | 9)
+
+#define UNUSED_SRC_1 ((o_inst->src1 & ~15) | 9)
+
+#define UNUSED_SRC_2 ((o_inst->src2 & ~15) | 9)
+
+
+/**
+ * Generate an R200 vertex program from Mesa's internal representation.
+ *
+ * \return  GL_TRUE for success, GL_FALSE for failure.
+ */
+static GLboolean r200_translate_vertex_program(GLcontext *ctx, struct r200_vertex_program *vp)
+{
+   struct gl_vertex_program *mesa_vp = &vp->mesa_program;
+   struct prog_instruction *vpi;
+   int i;
+   VERTEX_SHADER_INSTRUCTION *o_inst;
+   unsigned long operands;
+   int are_srcs_scalar;
+   unsigned long hw_op;
+   int dofogfix = 0;
+   int fog_temp_i = 0;
+   int free_inputs;
+   int array_count = 0;
+
+   vp->native = GL_FALSE;
+   vp->translated = GL_TRUE;
+   vp->fogmode = ctx->Fog.Mode;
+
+   if (mesa_vp->Base.NumInstructions == 0)
+      return GL_FALSE;
+
+#if 0
+   if ((mesa_vp->Base.InputsRead &
+      ~(VERT_BIT_POS | VERT_BIT_NORMAL | VERT_BIT_COLOR0 | VERT_BIT_COLOR1 |
+      VERT_BIT_FOG | VERT_BIT_TEX0 | VERT_BIT_TEX1 | VERT_BIT_TEX2 |
+      VERT_BIT_TEX3 | VERT_BIT_TEX4 | VERT_BIT_TEX5)) != 0) {
+      if (R200_DEBUG & DEBUG_FALLBACKS) {
+	 fprintf(stderr, "can't handle vert prog inputs 0x%x\n",
+	    mesa_vp->Base.InputsRead);
+      }
+      return GL_FALSE;
+   }
+#endif
+
+   if ((mesa_vp->Base.OutputsWritten &
+      ~((1 << VERT_RESULT_HPOS) | (1 << VERT_RESULT_COL0) | (1 << VERT_RESULT_COL1) |
+      (1 << VERT_RESULT_FOGC) | (1 << VERT_RESULT_TEX0) | (1 << VERT_RESULT_TEX1) |
+      (1 << VERT_RESULT_TEX2) | (1 << VERT_RESULT_TEX3) | (1 << VERT_RESULT_TEX4) |
+      (1 << VERT_RESULT_TEX5) | (1 << VERT_RESULT_PSIZ))) != 0) {
+      if (R200_DEBUG & DEBUG_FALLBACKS) {
+	 fprintf(stderr, "can't handle vert prog outputs 0x%x\n",
+	    mesa_vp->Base.OutputsWritten);
+      }
+      return GL_FALSE;
+   }
+
+   if (mesa_vp->IsNVProgram) {
+   /* subtle differences in spec like guaranteed initialized regs could cause
+      headaches. Might want to remove the driconf option to enable it completely */
+      return GL_FALSE;
+   }
+   /* Initial value should be last tmp reg that hw supports.
+      Strangely enough r300 doesnt mind even though these would be out of range.
+      Smart enough to realize that it doesnt need it? */
+   int u_temp_i = R200_VSF_MAX_TEMPS - 1;
+   struct prog_src_register src[3];
+   struct prog_dst_register dst;
+
+/* FIXME: is changing the prog safe to do here? */
+   if (mesa_vp->IsPositionInvariant &&
+      /* make sure we only do this once */
+       !(mesa_vp->Base.OutputsWritten & (1 << VERT_RESULT_HPOS))) {
+	 _mesa_insert_mvp_code(ctx, mesa_vp);
+      }
+
+   /* for fogc, can't change mesa_vp, as it would hose swtnl, and exp with
+      base e isn't directly available neither. */
+   if ((mesa_vp->Base.OutputsWritten & (1 << VERT_RESULT_FOGC)) && !vp->fogpidx) {
+      struct gl_program_parameter_list *paramList;
+      gl_state_index tokens[STATE_LENGTH] = { STATE_FOG_PARAMS, 0, 0, 0, 0 };
+      paramList = mesa_vp->Base.Parameters;
+      vp->fogpidx = _mesa_add_state_reference(paramList, tokens);
+   }
+
+   vp->pos_end = 0;
+   mesa_vp->Base.NumNativeInstructions = 0;
+   if (mesa_vp->Base.Parameters)
+      mesa_vp->Base.NumNativeParameters = mesa_vp->Base.Parameters->NumParameters;
+   else
+      mesa_vp->Base.NumNativeParameters = 0;
+
+   for(i = 0; i < VERT_ATTRIB_MAX; i++)
+      vp->inputs[i] = -1;
+   for(i = 0; i < 15; i++)
+      vp->inputmap_rev[i] = 255;
+   free_inputs = 0x2ffd;
+
+/* fglrx uses fixed inputs as follows for conventional attribs.
+   generic attribs use non-fixed assignment, fglrx will always use the
+   lowest attrib values available. We'll just do the same.
+   There are 12 generic attribs possible, corresponding to attrib 0, 2-11
+   and 13 in a hw vertex prog.
+   attr 1 and 12 aren't used for generic attribs as those cannot be made vec4
+   (correspond to vertex normal/weight - maybe weight actually could be made vec4).
+   Additionally, not more than 12 arrays in total are possible I think.
+   attr 0 is pos, R200_VTX_XY1|R200_VTX_Z1|R200_VTX_W1 in R200_SE_VTX_FMT_0
+   attr 2-5 use colors 0-3 (R200_VTX_FP_RGBA << R200_VTX_COLOR_0/1/2/3_SHIFT in R200_SE_VTX_FMT_0)
+   attr 6-11 use tex 0-5 (4 << R200_VTX_TEX0/1/2/3/4/5_COMP_CNT_SHIFT in R200_SE_VTX_FMT_1)
+   attr 13 uses vtx1 pos (R200_VTX_XY1|R200_VTX_Z1|R200_VTX_W1 in R200_SE_VTX_FMT_0)
+*/
+
+/* attr 4,5 and 13 are only used with generic attribs.
+   Haven't seen attr 14 used, maybe that's for the hw pointsize vec1 (which is
+   not possibe to use with vertex progs as it is lacking in vert prog specification) */
+/* may look different when using idx buf / input_route instead of se_vtx_fmt? */
+   if (mesa_vp->Base.InputsRead & VERT_BIT_POS) {
+      vp->inputs[VERT_ATTRIB_POS] = 0;
+      vp->inputmap_rev[0] = VERT_ATTRIB_POS;
+      free_inputs &= ~(1 << 0);
+      array_count++;
+   }
+   if (mesa_vp->Base.InputsRead & VERT_BIT_WEIGHT) {
+      vp->inputs[VERT_ATTRIB_WEIGHT] = 12;
+      vp->inputmap_rev[1] = VERT_ATTRIB_WEIGHT;
+      array_count++;
+   }
+   if (mesa_vp->Base.InputsRead & VERT_BIT_NORMAL) {
+      vp->inputs[VERT_ATTRIB_NORMAL] = 1;
+      vp->inputmap_rev[2] = VERT_ATTRIB_NORMAL;
+      array_count++;
+   }
+   if (mesa_vp->Base.InputsRead & VERT_BIT_COLOR0) {
+      vp->inputs[VERT_ATTRIB_COLOR0] = 2;
+      vp->inputmap_rev[4] = VERT_ATTRIB_COLOR0;
+      free_inputs &= ~(1 << 2);
+      array_count++;
+   }
+   if (mesa_vp->Base.InputsRead & VERT_BIT_COLOR1) {
+      vp->inputs[VERT_ATTRIB_COLOR1] = 3;
+      vp->inputmap_rev[5] = VERT_ATTRIB_COLOR1;
+      free_inputs &= ~(1 << 3);
+      array_count++;
+   }
+   if (mesa_vp->Base.InputsRead & VERT_BIT_FOG) {
+      vp->inputs[VERT_ATTRIB_FOG] = 15; array_count++;
+      vp->inputmap_rev[3] = VERT_ATTRIB_FOG;
+      array_count++;
+   }
+   for (i = VERT_ATTRIB_TEX0; i <= VERT_ATTRIB_TEX5; i++) {
+      if (mesa_vp->Base.InputsRead & (1 << i)) {
+	 vp->inputs[i] = i - VERT_ATTRIB_TEX0 + 6;
+	 vp->inputmap_rev[8 + i - VERT_ATTRIB_TEX0] = i;
+	 free_inputs &= ~(1 << (i - VERT_ATTRIB_TEX0 + 6));
+	 array_count++;
+      }
+   }
+   /* using VERT_ATTRIB_TEX6/7 would be illegal */
+   /* completely ignore aliasing? */
+   for (i = VERT_ATTRIB_GENERIC0; i < VERT_ATTRIB_MAX; i++) {
+      int j;
+   /* completely ignore aliasing? */
+      if (mesa_vp->Base.InputsRead & (1 << i)) {
+	 array_count++;
+	 if (array_count > 12) {
+	    if (R200_DEBUG & DEBUG_FALLBACKS) {
+	       fprintf(stderr, "more than 12 attribs used in vert prog\n");
+	    }
+	    return GL_FALSE;
+	 }
+	 for (j = 0; j < 14; j++) {
+	    /* will always find one due to limited array_count */
+	    if (free_inputs & (1 << j)) {
+	       free_inputs &= ~(1 << j);
+	       vp->inputs[i] = j;
+	       if (j == 0) vp->inputmap_rev[j] = i; /* mapped to pos */
+	       else if (j < 12) vp->inputmap_rev[j + 2] = i; /* mapped to col/tex */
+	       else vp->inputmap_rev[j + 1] = i; /* mapped to pos1 */
+	       break;
+	    }
+	 }
+      }
+   }
+
+   if (!(mesa_vp->Base.OutputsWritten & (1 << VERT_RESULT_HPOS))) {
+      if (R200_DEBUG & DEBUG_FALLBACKS) {
+	 fprintf(stderr, "can't handle vert prog without position output\n");
+      }
+      return GL_FALSE;
+   }
+   if (free_inputs & 1) {
+      if (R200_DEBUG & DEBUG_FALLBACKS) {
+	 fprintf(stderr, "can't handle vert prog without position input\n");
+      }
+      return GL_FALSE;
+   }
+
+   o_inst = vp->instr;
+   for (vpi = mesa_vp->Base.Instructions; vpi->Opcode != OPCODE_END; vpi++, o_inst++){
+      operands = op_operands(vpi->Opcode);
+      are_srcs_scalar = operands & SCALAR_FLAG;
+      operands &= OP_MASK;
+
+      for(i = 0; i < operands; i++) {
+	 src[i] = vpi->SrcReg[i];
+	 /* hack up default attrib values as per spec as swizzling.
+	    normal, fog, secondary color. Crazy?
+	    May need more if we don't submit vec4 elements? */
+	 if (src[i].File == PROGRAM_INPUT) {
+	    if (src[i].Index == VERT_ATTRIB_NORMAL) {
+	       int j;
+	       for (j = 0; j < 4; j++) {
+		  if (GET_SWZ(src[i].Swizzle, j) == SWIZZLE_W) {
+		     src[i].Swizzle &= ~(SWIZZLE_W << (j*3));
+		     src[i].Swizzle |= SWIZZLE_ONE << (j*3);
+		  }
+	       }
+	    }
+	    else if (src[i].Index == VERT_ATTRIB_COLOR1) {
+	       int j;
+	       for (j = 0; j < 4; j++) {
+		  if (GET_SWZ(src[i].Swizzle, j) == SWIZZLE_W) {
+		     src[i].Swizzle &= ~(SWIZZLE_W << (j*3));
+		     src[i].Swizzle |= SWIZZLE_ZERO << (j*3);
+		  }
+	       }
+	    }
+	    else if (src[i].Index == VERT_ATTRIB_FOG) {
+	       int j;
+	       for (j = 0; j < 4; j++) {
+		  if (GET_SWZ(src[i].Swizzle, j) == SWIZZLE_W) {
+		     src[i].Swizzle &= ~(SWIZZLE_W << (j*3));
+		     src[i].Swizzle |= SWIZZLE_ONE << (j*3);
+		  }
+		  else if ((GET_SWZ(src[i].Swizzle, j) == SWIZZLE_Y) ||
+			    GET_SWZ(src[i].Swizzle, j) == SWIZZLE_Z) {
+		     src[i].Swizzle &= ~(SWIZZLE_W << (j*3));
+		     src[i].Swizzle |= SWIZZLE_ZERO << (j*3);
+		  }
+	       }
+	    }
+	 }
+      }
+
+      if(operands == 3){
+	 if( CMP_SRCS(src[1], src[2]) || CMP_SRCS(src[0], src[2]) ){
+	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
+		(u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
+		VSF_FLAG_ALL);
+
+	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[2]),
+		  SWIZZLE_X, SWIZZLE_Y,
+		  SWIZZLE_Z, SWIZZLE_W,
+		  t_src_class(src[2].File), VSF_FLAG_NONE) | (src[2].RelAddr << 4);
+
+	    o_inst->src1 = ZERO_SRC_0;
+	    o_inst->src2 = UNUSED_SRC_1;
+	    o_inst++;
+
+	    src[2].File = PROGRAM_TEMPORARY;
+	    src[2].Index = u_temp_i;
+	    src[2].RelAddr = 0;
+	    u_temp_i--;
+	 }
+      }
+
+      if(operands >= 2){
+	 if( CMP_SRCS(src[1], src[0]) ){
+	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
+		(u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
+		VSF_FLAG_ALL);
+
+	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+		  SWIZZLE_X, SWIZZLE_Y,
+		  SWIZZLE_Z, SWIZZLE_W,
+		  t_src_class(src[0].File), VSF_FLAG_NONE) | (src[0].RelAddr << 4);
+
+	    o_inst->src1 = ZERO_SRC_0;
+	    o_inst->src2 = UNUSED_SRC_1;
+	    o_inst++;
+
+	    src[0].File = PROGRAM_TEMPORARY;
+	    src[0].Index = u_temp_i;
+	    src[0].RelAddr = 0;
+	    u_temp_i--;
+	 }
+      }
+
+      dst = vpi->DstReg;
+      if (dst.File == PROGRAM_OUTPUT &&
+	  dst.Index == VERT_RESULT_FOGC &&
+	  dst.WriteMask & WRITEMASK_X) {
+	  fog_temp_i = u_temp_i;
+	  dst.File = PROGRAM_TEMPORARY;
+	  dst.Index = fog_temp_i;
+	  dofogfix = 1;
+	  u_temp_i--;
+      }
+
+      /* These ops need special handling. */
+      switch(vpi->Opcode){
+      case OPCODE_POW:
+/* pow takes only one argument, first scalar is in slot x, 2nd in slot z (other slots don't matter).
+   So may need to insert additional instruction */
+	 if ((src[0].File == src[1].File) &&
+	     (src[0].Index == src[1].Index)) {
+	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_POW, t_dst(&dst),
+		   t_dst_mask(dst.WriteMask));
+	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+		   t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+		   SWIZZLE_ZERO,
+		   t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
+		   SWIZZLE_ZERO,
+		   t_src_class(src[0].File),
+		   src[0].NegateBase) | (src[0].RelAddr << 4);
+	    o_inst->src1 = UNUSED_SRC_0;
+	    o_inst->src2 = UNUSED_SRC_0;
+	 }
+	 else {
+	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
+		   (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
+		   VSF_FLAG_ALL);
+	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+		   t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+		   SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO,
+		   t_src_class(src[0].File),
+		   src[0].NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
+	    o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
+		   SWIZZLE_ZERO, SWIZZLE_ZERO,
+		   t_swizzle(GET_SWZ(src[1].Swizzle, 0)), SWIZZLE_ZERO,
+		   t_src_class(src[1].File),
+		   src[1].NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
+	    o_inst->src2 = UNUSED_SRC_1;
+	    o_inst++;
+
+	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_POW, t_dst(&dst),
+		   t_dst_mask(dst.WriteMask));
+	    o_inst->src0 = MAKE_VSF_SOURCE(u_temp_i,
+		   VSF_IN_COMPONENT_X,
+		   VSF_IN_COMPONENT_Y,
+		   VSF_IN_COMPONENT_Z,
+		   VSF_IN_COMPONENT_W,
+		   VSF_IN_CLASS_TMP,
+		   VSF_FLAG_NONE);
+	    o_inst->src1 = UNUSED_SRC_0;
+	    o_inst->src2 = UNUSED_SRC_0;
+	    u_temp_i--;
+	 }
+	 goto next;
+
+      case OPCODE_MOV://ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{} {ZERO ZERO ZERO ZERO} 
+      case OPCODE_SWZ:
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&dst),
+		t_dst_mask(dst.WriteMask));
+	 o_inst->src0 = t_src(vp, &src[0]);
+	 o_inst->src1 = ZERO_SRC_0;
+	 o_inst->src2 = UNUSED_SRC_1;
+	 goto next;
+
+      case OPCODE_MAD:
+	 hw_op=(src[0].File == PROGRAM_TEMPORARY &&
+	    src[1].File == PROGRAM_TEMPORARY &&
+	    src[2].File == PROGRAM_TEMPORARY) ? R200_VPI_OUT_OP_MAD_2 : R200_VPI_OUT_OP_MAD;
+
+	 o_inst->op = MAKE_VSF_OP(hw_op, t_dst(&dst),
+	    t_dst_mask(dst.WriteMask));
+	 o_inst->src0 = t_src(vp, &src[0]);
+#if 0
+if ((o_inst - vp->instr) == 31) {
+/* fix up the broken vertex program of quake4 demo... */
+o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
+			SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X,
+			t_src_class(src[1].File),
+			src[1].NegateBase) | (src[1].RelAddr << 4);
+o_inst->src2 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
+			SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y,
+			t_src_class(src[1].File),
+			src[1].NegateBase) | (src[1].RelAddr << 4);
+}
+else {
+	 o_inst->src1 = t_src(vp, &src[1]);
+	 o_inst->src2 = t_src(vp, &src[2]);
+}
+#else
+	 o_inst->src1 = t_src(vp, &src[1]);
+	 o_inst->src2 = t_src(vp, &src[2]);
+#endif
+	 goto next;
+
+      case OPCODE_DP3://DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ZERO} PARAM 0{} {X Y Z ZERO} 
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_DOT, t_dst(&dst),
+		t_dst_mask(dst.WriteMask));
+
+	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
+		SWIZZLE_ZERO,
+		t_src_class(src[0].File),
+		src[0].NegateBase) | (src[0].RelAddr << 4);
+
+	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
+		t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
+		t_swizzle(GET_SWZ(src[1].Swizzle, 1)),
+		t_swizzle(GET_SWZ(src[1].Swizzle, 2)),
+		SWIZZLE_ZERO,
+		t_src_class(src[1].File),
+		src[1].NegateBase) | (src[1].RelAddr << 4);
+
+	 o_inst->src2 = UNUSED_SRC_1;
+	 goto next;
+
+      case OPCODE_DPH://DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ONE} PARAM 0{} {X Y Z W} 
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_DOT, t_dst(&dst),
+		t_dst_mask(dst.WriteMask));
+
+	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
+		VSF_IN_COMPONENT_ONE,
+		t_src_class(src[0].File),
+		src[0].NegateBase) | (src[0].RelAddr << 4);
+	 o_inst->src1 = t_src(vp, &src[1]);
+	 o_inst->src2 = UNUSED_SRC_1;
+	 goto next;
+
+      case OPCODE_SUB://ADD RESULT 1.X Y Z W TMP 0{} {X Y Z W} PARAM 1{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&dst),
+		t_dst_mask(dst.WriteMask));
+
+	 o_inst->src0 = t_src(vp, &src[0]);
+	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
+		t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
+		t_swizzle(GET_SWZ(src[1].Swizzle, 1)),
+		t_swizzle(GET_SWZ(src[1].Swizzle, 2)),
+		t_swizzle(GET_SWZ(src[1].Swizzle, 3)),
+		t_src_class(src[1].File),
+		(!src[1].NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
+	 o_inst->src2 = UNUSED_SRC_1;
+	 goto next;
+
+      case OPCODE_ABS://MAX RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
+	 o_inst->op=MAKE_VSF_OP(R200_VPI_OUT_OP_MAX, t_dst(&dst),
+		t_dst_mask(dst.WriteMask));
+
+	 o_inst->src0=t_src(vp, &src[0]);
+	 o_inst->src1=MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 3)),
+		t_src_class(src[0].File),
+		(!src[0].NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
+	 o_inst->src2 = UNUSED_SRC_1;
+	 goto next;
+
+      case OPCODE_FLR:
+      /* FRC TMP 0.X Y Z W PARAM 0{} {X Y Z W} 
+         ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} TMP 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W */
+
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_FRC,
+	    (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
+	    t_dst_mask(dst.WriteMask));
+
+	 o_inst->src0 = t_src(vp, &src[0]);
+	 o_inst->src1 = UNUSED_SRC_0;
+	 o_inst->src2 = UNUSED_SRC_1;
+	 o_inst++;
+
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&dst),
+		t_dst_mask(dst.WriteMask));
+
+	 o_inst->src0 = t_src(vp, &src[0]);
+	 o_inst->src1 = MAKE_VSF_SOURCE(u_temp_i,
+		VSF_IN_COMPONENT_X,
+		VSF_IN_COMPONENT_Y,
+		VSF_IN_COMPONENT_Z,
+		VSF_IN_COMPONENT_W,
+		VSF_IN_CLASS_TMP,
+		/* Not 100% sure about this */
+		(!src[0].NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE/*VSF_FLAG_ALL*/);
+
+	 o_inst->src2 = UNUSED_SRC_0;
+	 u_temp_i--;
+	 goto next;
+
+      case OPCODE_XPD:
+	 /* mul r0, r1.yzxw, r2.zxyw
+	    mad r0, -r2.yzxw, r1.zxyw, r0
+	    NOTE: might need MAD_2
+	  */
+
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
+	    (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
+	    t_dst_mask(dst.WriteMask));
+
+	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 1)), // y
+		t_swizzle(GET_SWZ(src[0].Swizzle, 2)), // z
+		t_swizzle(GET_SWZ(src[0].Swizzle, 0)), // x
+		t_swizzle(GET_SWZ(src[0].Swizzle, 3)), // w
+		t_src_class(src[0].File),
+		src[0].NegateBase) | (src[0].RelAddr << 4);
+
+	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
+		t_swizzle(GET_SWZ(src[1].Swizzle, 2)), // z
+		t_swizzle(GET_SWZ(src[1].Swizzle, 0)), // x
+		t_swizzle(GET_SWZ(src[1].Swizzle, 1)), // y
+		t_swizzle(GET_SWZ(src[1].Swizzle, 3)), // w
+		t_src_class(src[1].File),
+		src[1].NegateBase) | (src[1].RelAddr << 4);
+
+	 o_inst->src2 = UNUSED_SRC_1;
+	 o_inst++;
+	 u_temp_i--;
+
+	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MAD, t_dst(&dst),
+		t_dst_mask(dst.WriteMask));
+
+	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
+		t_swizzle(GET_SWZ(src[1].Swizzle, 1)), // y
+		t_swizzle(GET_SWZ(src[1].Swizzle, 2)), // z
+		t_swizzle(GET_SWZ(src[1].Swizzle, 0)), // x
+		t_swizzle(GET_SWZ(src[1].Swizzle, 3)), // w
+		t_src_class(src[1].File),
+		(!src[1].NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
+
+	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+		t_swizzle(GET_SWZ(src[0].Swizzle, 2)), // z
+		t_swizzle(GET_SWZ(src[0].Swizzle, 0)), // x
+		t_swizzle(GET_SWZ(src[0].Swizzle, 1)), // y
+		t_swizzle(GET_SWZ(src[0].Swizzle, 3)), // w
+		t_src_class(src[0].File),
+		src[0].NegateBase) | (src[0].RelAddr << 4);
+
+	 o_inst->src2 = MAKE_VSF_SOURCE(u_temp_i+1,
+		VSF_IN_COMPONENT_X,
+		VSF_IN_COMPONENT_Y,
+		VSF_IN_COMPONENT_Z,
+		VSF_IN_COMPONENT_W,
+		VSF_IN_CLASS_TMP,
+		VSF_FLAG_NONE);
+	 goto next;
+
+      case OPCODE_END:
+	 assert(0);
+      default:
+	 break;
+      }
+
+      o_inst->op = MAKE_VSF_OP(t_opcode(vpi->Opcode), t_dst(&dst),
+	    t_dst_mask(dst.WriteMask));
+
+      if(are_srcs_scalar){
+	 switch(operands){
+	    case 1:
+		o_inst->src0 = t_src_scalar(vp, &src[0]);
+		o_inst->src1 = UNUSED_SRC_0;
+		o_inst->src2 = UNUSED_SRC_1;
+	    break;
+
+	    case 2:
+		o_inst->src0 = t_src_scalar(vp, &src[0]);
+		o_inst->src1 = t_src_scalar(vp, &src[1]);
+		o_inst->src2 = UNUSED_SRC_1;
+	    break;
+
+	    case 3:
+		o_inst->src0 = t_src_scalar(vp, &src[0]);
+		o_inst->src1 = t_src_scalar(vp, &src[1]);
+		o_inst->src2 = t_src_scalar(vp, &src[2]);
+	    break;
+
+	    default:
+		fprintf(stderr, "illegal number of operands %lu\n", operands);
+		exit(-1);
+	    break;
+	 }
+      } else {
+	 switch(operands){
+	    case 1:
+		o_inst->src0 = t_src(vp, &src[0]);
+		o_inst->src1 = UNUSED_SRC_0;
+		o_inst->src2 = UNUSED_SRC_1;
+	    break;
+
+	    case 2:
+		o_inst->src0 = t_src(vp, &src[0]);
+		o_inst->src1 = t_src(vp, &src[1]);
+		o_inst->src2 = UNUSED_SRC_1;
+	    break;
+
+	    case 3:
+		o_inst->src0 = t_src(vp, &src[0]);
+		o_inst->src1 = t_src(vp, &src[1]);
+		o_inst->src2 = t_src(vp, &src[2]);
+	    break;
+
+	    default:
+		fprintf(stderr, "illegal number of operands %lu\n", operands);
+		exit(-1);
+	    break;
+	 }
+      }
+      next:
+
+      if (dofogfix) {
+	 o_inst++;
+	 if (vp->fogmode == GL_EXP) {
+	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
+		(fog_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
+		VSF_FLAG_X);
+	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
+	    o_inst->src1 = EASY_VSF_SOURCE(vp->fogpidx, X, X, X, X, PARAM, NONE);
+	    o_inst->src2 = UNUSED_SRC_1;
+	    o_inst++;
+	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_EXP_E,
+		R200_VSF_OUT_CLASS_RESULT_FOGC,
+		VSF_FLAG_X);
+	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, ALL);
+	    o_inst->src1 = UNUSED_SRC_0;
+	    o_inst->src2 = UNUSED_SRC_1;
+	 }
+	 else if (vp->fogmode == GL_EXP2) {
+	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
+		(fog_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
+		VSF_FLAG_X);
+	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
+	    o_inst->src1 = EASY_VSF_SOURCE(vp->fogpidx, X, X, X, X, PARAM, NONE);
+	    o_inst->src2 = UNUSED_SRC_1;
+	    o_inst++;
+	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
+		(fog_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
+		VSF_FLAG_X);
+	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
+	    o_inst->src1 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
+	    o_inst->src2 = UNUSED_SRC_1;
+	    o_inst++;
+	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_EXP_E,
+		R200_VSF_OUT_CLASS_RESULT_FOGC,
+		VSF_FLAG_X);
+	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, ALL);
+	    o_inst->src1 = UNUSED_SRC_0;
+	    o_inst->src2 = UNUSED_SRC_1;
+	 }
+	 else { /* fogmode == GL_LINEAR */
+		/* could do that with single op (dot) if using params like
+		   with fixed function pipeline fog */
+	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
+		(fog_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
+		VSF_FLAG_X);
+	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, ALL);
+	    o_inst->src1 = EASY_VSF_SOURCE(vp->fogpidx, Z, Z, Z, Z, PARAM, NONE);
+	    o_inst->src2 = UNUSED_SRC_1;
+	    o_inst++;
+	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
+		R200_VSF_OUT_CLASS_RESULT_FOGC,
+		VSF_FLAG_X);
+	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
+	    o_inst->src1 = EASY_VSF_SOURCE(vp->fogpidx, W, W, W, W, PARAM, NONE);
+	    o_inst->src2 = UNUSED_SRC_1;
+
+	 }
+         dofogfix = 0;
+      }
+
+      if (mesa_vp->Base.NumNativeTemporaries <
+	 (mesa_vp->Base.NumTemporaries + (R200_VSF_MAX_TEMPS - 1 - u_temp_i))) {
+	 mesa_vp->Base.NumNativeTemporaries =
+	    mesa_vp->Base.NumTemporaries + (R200_VSF_MAX_TEMPS - 1 - u_temp_i);
+      }
+      if (u_temp_i < mesa_vp->Base.NumTemporaries) {
+	 if (R200_DEBUG & DEBUG_FALLBACKS) {
+	    fprintf(stderr, "Ran out of temps, num temps %d, us %d\n", mesa_vp->Base.NumTemporaries, u_temp_i);
+	 }
+	 return GL_FALSE;
+      }
+      u_temp_i = R200_VSF_MAX_TEMPS - 1;
+      if(o_inst - vp->instr >= R200_VSF_MAX_INST) {
+	 mesa_vp->Base.NumNativeInstructions = 129;
+	 if (R200_DEBUG & DEBUG_FALLBACKS) {
+	    fprintf(stderr, "more than 128 native instructions\n");
+	 }
+	 return GL_FALSE;
+      }
+      if ((o_inst->op & R200_VSF_OUT_CLASS_MASK) == R200_VSF_OUT_CLASS_RESULT_POS) {
+	 vp->pos_end = (o_inst - vp->instr);
+      }
+   }
+
+   vp->native = GL_TRUE;
+   mesa_vp->Base.NumNativeInstructions = (o_inst - vp->instr);
+#if 0
+   fprintf(stderr, "hw program:\n");
+   for(i=0; i < vp->program.length; i++)
+      fprintf(stderr, "%08x\n", vp->instr[i]);
+#endif
+   return GL_TRUE;
+}
+
+void r200SetupVertexProg( GLcontext *ctx ) { /* Program the hw for ctx->VertexProgram.Current, or flag a TCL fallback */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   struct r200_vertex_program *vp = (struct r200_vertex_program *)ctx->VertexProgram.Current;
+   GLboolean fallback;
+   GLint i;
+
+   if (!vp->translated || (ctx->Fog.Enabled && ctx->Fog.Mode != vp->fogmode)) { /* never translated, or translated for another fog mode */
+      rmesa->curr_vp_hw = NULL; /* forces the instruction re-upload at the end of this function */
+      r200_translate_vertex_program(ctx, vp);
+   }
+   /* could optimize setting up vertex progs away for non-tcl hw */
+   fallback = !(vp->native && r200VertexProgUpdateParams(ctx, vp) &&
+      rmesa->r200Screen->drmSupportsVertexProgram);
+   TCL_FALLBACK(ctx, R200_TCL_FALLBACK_VERTEX_PROGRAM, fallback);
+   if (rmesa->TclFallback) return; /* some TCL fallback is active: no hw program setup */
+
+   R200_STATECHANGE( rmesa, vap );
+   /* FIXME: fglrx sets R200_VAP_SINGLE_BUF_STATE_ENABLE too. Do we need it?
+             maybe only when using more than 64 inst / 96 param? */
+   rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] |= R200_VAP_PROG_VTX_SHADER_ENABLE /*| R200_VAP_SINGLE_BUF_STATE_ENABLE*/;
+
+   R200_STATECHANGE( rmesa, pvs );
+
+   rmesa->hw.pvs.cmd[PVS_CNTL_1] = (0 << R200_PVS_CNTL_1_PROGRAM_START_SHIFT) | /* start at 0, end at last native instruction */
+      ((vp->mesa_program.Base.NumNativeInstructions - 1) << R200_PVS_CNTL_1_PROGRAM_END_SHIFT) |
+      (vp->pos_end << R200_PVS_CNTL_1_POS_END_SHIFT);
+   rmesa->hw.pvs.cmd[PVS_CNTL_2] = (0 << R200_PVS_CNTL_2_PARAM_OFFSET_SHIFT) |
+      (vp->mesa_program.Base.NumNativeParameters << R200_PVS_CNTL_2_PARAM_COUNT_SHIFT);
+
+   /* maybe user clip planes just work with vertex progs... untested */
+   if (ctx->Transform.ClipPlanesEnabled) {
+      R200_STATECHANGE( rmesa, tcl );
+      if (vp->mesa_program.IsPositionInvariant) {
+	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= (ctx->Transform.ClipPlanesEnabled << 2);
+      }
+      else {
+	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~(0xfc); /* clears bits 2..7, the bits the branch above sets */
+      }
+   }
+
+   if (vp != rmesa->curr_vp_hw) { /* hw copy is stale: (re)fill the instruction atoms */
+      GLuint count = vp->mesa_program.Base.NumNativeInstructions;
+      drm_radeon_cmd_header_t tmp;
+
+      R200_STATECHANGE( rmesa, vpi[0] );
+      R200_STATECHANGE( rmesa, vpi[1] );
+
+      /* FIXME: what about using a memcopy... */
+      for (i = 0; (i < 64) && i < count; i++) { /* first 64 instructions go to vpi[0], 4 dwords each */
+	 rmesa->hw.vpi[0].cmd[VPI_OPDST_0 + 4 * i] = vp->instr[i].op;
+	 rmesa->hw.vpi[0].cmd[VPI_SRC0_0 + 4 * i] = vp->instr[i].src0;
+	 rmesa->hw.vpi[0].cmd[VPI_SRC1_0 + 4 * i] = vp->instr[i].src1;
+	 rmesa->hw.vpi[0].cmd[VPI_SRC2_0 + 4 * i] = vp->instr[i].src2;
+      }
+      /* hack up the cmd_size so not the whole state atom is emitted always.
+         This may require some more thought, we may emit half progs on lost state, but
+         hopefully it won't matter?
+         WARNING: must not use R200_DB_STATECHANGE, this will produce bogus (and rejected)
+         packet emits (due to the mismatched cmd_size and count in cmd/last_cmd) */
+      rmesa->hw.vpi[0].cmd_size = 1 + 4 * ((count > 64) ? 64 : count); /* 1 header dword + 4 per instruction */
+      tmp.i = rmesa->hw.vpi[0].cmd[VPI_CMD_0];
+      tmp.veclinear.count = (count > 64) ? 64 : count; /* patch the count field of the existing header */
+      rmesa->hw.vpi[0].cmd[VPI_CMD_0] = tmp.i;
+      if (count > 64) { /* remaining instructions go to vpi[1] */
+	 for (i = 0; i < (count - 64); i++) {
+	    rmesa->hw.vpi[1].cmd[VPI_OPDST_0 + 4 * i] = vp->instr[i + 64].op;
+	    rmesa->hw.vpi[1].cmd[VPI_SRC0_0 + 4 * i] = vp->instr[i + 64].src0;
+	    rmesa->hw.vpi[1].cmd[VPI_SRC1_0 + 4 * i] = vp->instr[i + 64].src1;
+	    rmesa->hw.vpi[1].cmd[VPI_SRC2_0 + 4 * i] = vp->instr[i + 64].src2;
+	 }
+	 rmesa->hw.vpi[1].cmd_size = 1 + 4 * (count - 64);
+	 tmp.i = rmesa->hw.vpi[1].cmd[VPI_CMD_0];
+	 tmp.veclinear.count = count - 64;
+	 rmesa->hw.vpi[1].cmd[VPI_CMD_0] = tmp.i;
+      }
+      rmesa->curr_vp_hw = vp; /* remember which program the atoms now hold */
+   }
+}
+
+
+static void /* BindProgram hook, installed by r200InitShaderFuncs() */
+r200BindProgram(GLcontext *ctx, GLenum target, struct gl_program *prog)
+{ /* note: prog itself is not used here */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+
+   switch(target){
+   case GL_VERTEX_PROGRAM_ARB:
+      rmesa->curr_vp_hw = NULL; /* r200SetupVertexProg() re-uploads when vp != curr_vp_hw */
+      break;
+   default:
+      _mesa_problem(ctx, "Target not supported yet!");
+      break;
+   }
+}
+
+static struct gl_program * /* NewProgram hook: allocate the program object for target, NULL on bad target */
+r200NewProgram(GLcontext *ctx, GLenum target, GLuint id)
+{
+   struct r200_vertex_program *vp;
+
+   switch(target){
+   case GL_VERTEX_PROGRAM_ARB:
+      vp = CALLOC_STRUCT(r200_vertex_program); /* driver struct wrapping the gl_vertex_program */
+      return _mesa_init_vertex_program(ctx, vp ? &vp->mesa_program : NULL, target, id); /* pass NULL on alloc failure, like the fragment path */
+   case GL_FRAGMENT_PROGRAM_ARB:
+   case GL_FRAGMENT_PROGRAM_NV:
+      return _mesa_init_fragment_program( ctx, CALLOC_STRUCT(gl_fragment_program), target, id );
+   default:
+      _mesa_problem(ctx, "Bad target in r200NewProgram");
+   }
+   return NULL; /* unsupported target */
+}
+
+
+static void /* DeleteProgram hook: simply forwards to _mesa_delete_program() */
+r200DeleteProgram(GLcontext *ctx, struct gl_program *prog)
+{
+   _mesa_delete_program(ctx, prog);
+}
+
+static void /* ProgramStringNotify hook, installed by r200InitShaderFuncs() */
+r200ProgramStringNotify(GLcontext *ctx, GLenum target, struct gl_program *prog)
+{
+   struct r200_vertex_program *vp = (void *)prog; /* only dereferenced in the vertex program case */
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+
+   switch(target) {
+   case GL_VERTEX_PROGRAM_ARB:
+      vp->translated = GL_FALSE; /* reset driver state, then translate right away */
+      vp->fogpidx = 0;
+/*      memset(&vp->translated, 0, sizeof(struct r200_vertex_program) - sizeof(struct gl_vertex_program));*/
+      r200_translate_vertex_program(ctx, vp);
+      rmesa->curr_vp_hw = NULL; /* r200SetupVertexProg() re-uploads when vp != curr_vp_hw */
+      break;
+   case GL_FRAGMENT_SHADER_ATI:
+      rmesa->afs_loaded = NULL; /* presumably invalidates the loaded ATI fragment shader - confirm */
+      break;
+   }
+   /* need this for tcl fallbacks */
+   _tnl_program_string(ctx, target, prog);
+}
+
+static GLboolean /* IsProgramNative hook: returns vp->native for vertex programs, 0 for other targets */
+r200IsProgramNative(GLcontext *ctx, GLenum target, struct gl_program *prog)
+{
+   struct r200_vertex_program *vp = (void *)prog; /* only dereferenced for vertex program targets */
+
+   switch(target){
+   case GL_VERTEX_STATE_PROGRAM_NV:
+   case GL_VERTEX_PROGRAM_ARB:
+      if (!vp->translated) { /* translate lazily before reading vp->native */
+	 r200_translate_vertex_program(ctx, vp);
+      }
+     /* does not take parameters etc. into account */
+      return vp->native;
+   default:
+      _mesa_problem(ctx, "Bad target in r200IsProgramNative");
+   }
+   return 0;
+}
+
+void r200InitShaderFuncs(struct dd_function_table *functions) /* install the r200 program hooks defined in this file */
+{
+   functions->NewProgram = r200NewProgram;
+   functions->BindProgram = r200BindProgram;
+   functions->DeleteProgram = r200DeleteProgram;
+   functions->ProgramStringNotify = r200ProgramStringNotify;
+   functions->IsProgramNative = r200IsProgramNative;
+}
diff --git a/r200/r200_vertprog.h b/r200/r200_vertprog.h
new file mode 100644
index 0000000..9382376
--- /dev/null
+++ b/r200/r200_vertprog.h
@@ -0,0 +1,163 @@
+#ifndef __VERTEX_SHADER_H__
+#define __VERTEX_SHADER_H__
+
+#include "r200_reg.h"
+
+typedef struct { /* one vertex shader instruction: four 32-bit words */
+   uint32_t op; /* opcode | output register | write mask, see MAKE_VSF_OP */
+   uint32_t src0; /* source operands, see MAKE_VSF_SOURCE */
+   uint32_t src1;
+   uint32_t src2;
+} VERTEX_SHADER_INSTRUCTION;
+
+extern void r200InitShaderFuncs(struct dd_function_table *functions);
+extern void r200SetupVertexProg( GLcontext *ctx );
+
+#define VSF_FLAG_X	1
+#define VSF_FLAG_Y	2
+#define VSF_FLAG_Z	4
+#define VSF_FLAG_W	8
+#define VSF_FLAG_XYZ	(VSF_FLAG_X | VSF_FLAG_Y | VSF_FLAG_Z)
+#define VSF_FLAG_ALL	0xf
+#define VSF_FLAG_NONE	0
+
+#define R200_VSF_MAX_INST	128
+#define R200_VSF_MAX_PARAM	192
+#define R200_VSF_MAX_TEMPS	12
+
+#define R200_VPI_OUT_REG_INDEX_SHIFT            13
+#define R200_VPI_OUT_REG_INDEX_MASK             (31 << 13) /* GUESS based on fglrx native limits */
+
+#define R200_VPI_OUT_WRITE_X                    (1 << 20)
+#define R200_VPI_OUT_WRITE_Y                    (1 << 21)
+#define R200_VPI_OUT_WRITE_Z                    (1 << 22)
+#define R200_VPI_OUT_WRITE_W                    (1 << 23)
+
+#define R200_VPI_IN_REG_CLASS_TEMPORARY         (0 << 0)
+#define R200_VPI_IN_REG_CLASS_ATTRIBUTE         (1 << 0)
+#define R200_VPI_IN_REG_CLASS_PARAMETER         (2 << 0)
+#define R200_VPI_IN_REG_CLASS_NONE              (9 << 0)
+#define R200_VPI_IN_REG_CLASS_MASK              (31 << 0) /* GUESS */
+
+#define R200_VPI_IN_REG_INDEX_SHIFT             5
+#define R200_VPI_IN_REG_INDEX_MASK              (255 << 5) /* GUESS based on fglrx native limits */
+
+/* The R200 can select components from the input register arbitrarily.
+ * Use the following constants, shifted by the shift of the component
+ * you want to select (R200_VPI_IN_X_SHIFT .. R200_VPI_IN_W_SHIFT). */
+#define R200_VPI_IN_SELECT_X    0
+#define R200_VPI_IN_SELECT_Y    1
+#define R200_VPI_IN_SELECT_Z    2
+#define R200_VPI_IN_SELECT_W    3
+#define R200_VPI_IN_SELECT_ZERO 4
+#define R200_VPI_IN_SELECT_ONE  5
+#define R200_VPI_IN_SELECT_MASK 7
+
+#define R200_VPI_IN_X_SHIFT                     13
+#define R200_VPI_IN_Y_SHIFT                     16
+#define R200_VPI_IN_Z_SHIFT                     19
+#define R200_VPI_IN_W_SHIFT                     22
+
+#define R200_VPI_IN_NEG_X                       (1 << 25)
+#define R200_VPI_IN_NEG_Y                       (1 << 26)
+#define R200_VPI_IN_NEG_Z                       (1 << 27)
+#define R200_VPI_IN_NEG_W                       (1 << 28)
+
+#define R200_VSF_OUT_CLASS_TMP			(0 << 8)
+#define R200_VSF_OUT_CLASS_ADDR			(3 << 8)
+#define R200_VSF_OUT_CLASS_RESULT_POS		(4 << 8)
+#define R200_VSF_OUT_CLASS_RESULT_COLOR		(5 << 8)
+#define R200_VSF_OUT_CLASS_RESULT_TEXC		(6 << 8)
+#define R200_VSF_OUT_CLASS_RESULT_FOGC		(7 << 8)
+#define R200_VSF_OUT_CLASS_RESULT_POINTSIZE	(8 << 8)
+#define R200_VSF_OUT_CLASS_MASK			(31 << 8)
+
+/* opcodes - they all are the same as on r300 it seems, however
+   LIT and POW require different setup */
+#define R200_VPI_OUT_OP_DOT                     (1 << 0)
+#define R200_VPI_OUT_OP_MUL                     (2 << 0)
+#define R200_VPI_OUT_OP_ADD                     (3 << 0)
+#define R200_VPI_OUT_OP_MAD                     (4 << 0)
+#define R200_VPI_OUT_OP_DST                     (5 << 0)
+#define R200_VPI_OUT_OP_FRC                     (6 << 0)
+#define R200_VPI_OUT_OP_MAX                     (7 << 0)
+#define R200_VPI_OUT_OP_MIN                     (8 << 0)
+#define R200_VPI_OUT_OP_SGE                     (9 << 0)
+#define R200_VPI_OUT_OP_SLT                     (10 << 0)
+
+#define R200_VPI_OUT_OP_ARL                     (13 << 0)
+
+#define R200_VPI_OUT_OP_EXP                     (65 << 0)
+#define R200_VPI_OUT_OP_LOG                     (66 << 0)
+/* base e exp. Useful for fog. */
+#define R200_VPI_OUT_OP_EXP_E                   (67 << 0)
+
+#define R200_VPI_OUT_OP_LIT                     (68 << 0)
+#define R200_VPI_OUT_OP_POW                     (69 << 0)
+#define R200_VPI_OUT_OP_RCP                     (70 << 0)
+#define R200_VPI_OUT_OP_RSQ                     (72 << 0)
+
+#define R200_VPI_OUT_OP_EX2                     (75 << 0)
+#define R200_VPI_OUT_OP_LG2                     (76 << 0)
+
+#define R200_VPI_OUT_OP_MAD_2                   (128 << 0)
+
+/* first CARD32 of an instruction */
+
+/* possible operations: 
+    DOT, MUL, ADD, MAD, FRC, MAX, MIN, SGE, SLT, EXP, LOG, LIT, POW, RCP, RSQ, EX2,
+    LG2, MAD_2, ARL */
+
+#define MAKE_VSF_OP(op, out_reg, out_reg_fields) \
+   ((op) | (out_reg) | ((out_reg_fields) << 20) ) /* out_reg_fields: VSF_FLAG_* write mask; bit 20 is R200_VPI_OUT_WRITE_X */
+
+#define VSF_IN_CLASS_TMP	0
+#define VSF_IN_CLASS_ATTR	1
+#define VSF_IN_CLASS_PARAM	2
+#define VSF_IN_CLASS_NONE	9
+
+#define VSF_IN_COMPONENT_X	0
+#define VSF_IN_COMPONENT_Y	1
+#define VSF_IN_COMPONENT_Z	2
+#define VSF_IN_COMPONENT_W	3
+#define VSF_IN_COMPONENT_ZERO	4
+#define VSF_IN_COMPONENT_ONE	5
+
+#define MAKE_VSF_SOURCE(in_reg_index, comp_x, comp_y, comp_z, comp_w, class, negate) \
+	( ((in_reg_index)<<R200_VPI_IN_REG_INDEX_SHIFT) \
+	   | ((comp_x)<<R200_VPI_IN_X_SHIFT) \
+	   | ((comp_y)<<R200_VPI_IN_Y_SHIFT) \
+	   | ((comp_z)<<R200_VPI_IN_Z_SHIFT) \
+	   | ((comp_w)<<R200_VPI_IN_W_SHIFT) \
+	   | ((negate)<<25) | ((class))) /* negate: VSF_FLAG_* mask; bit 25 is R200_VPI_IN_NEG_X, so X..W land on NEG_X..NEG_W */
+
+#define EASY_VSF_SOURCE(in_reg_index, comp_x, comp_y, comp_z, comp_w, class, negate) \
+	MAKE_VSF_SOURCE(in_reg_index, \
+		VSF_IN_COMPONENT_##comp_x, \
+		VSF_IN_COMPONENT_##comp_y, \
+		VSF_IN_COMPONENT_##comp_z, \
+		VSF_IN_COMPONENT_##comp_w, \
+		VSF_IN_CLASS_##class, VSF_FLAG_##negate) /* args are bare suffixes (e.g. X, ONE, TMP, ALL) pasted onto the prefixes */
+
+/* special sources: */
+
+/* (1.0,1.0,1.0,1.0) vector (ATTR, plain ) */
+#define VSF_ATTR_UNITY(reg) 	EASY_VSF_SOURCE(reg, ONE, ONE, ONE, ONE, ATTR, NONE)
+#define VSF_UNITY(reg) 	EASY_VSF_SOURCE(reg, ONE, ONE, ONE, ONE, NONE, NONE)
+
+/* contents of unmodified register */
+#define VSF_REG(reg) 	EASY_VSF_SOURCE(reg, X, Y, Z, W, ATTR, NONE)
+
+/* contents of unmodified parameter */
+#define VSF_PARAM(reg) 	EASY_VSF_SOURCE(reg, X, Y, Z, W, PARAM, NONE)
+
+/* contents of unmodified temporary register */
+#define VSF_TMP(reg) 	EASY_VSF_SOURCE(reg, X, Y, Z, W, TMP, NONE)
+
+/* components of ATTR register */
+#define VSF_ATTR_X(reg) EASY_VSF_SOURCE(reg, X, X, X, X, ATTR, NONE)
+#define VSF_ATTR_Y(reg) EASY_VSF_SOURCE(reg, Y, Y, Y, Y, ATTR, NONE)
+#define VSF_ATTR_Z(reg) EASY_VSF_SOURCE(reg, Z, Z, Z, Z, ATTR, NONE)
+#define VSF_ATTR_W(reg) EASY_VSF_SOURCE(reg, W, W, W, W, ATTR, NONE)
+
+#endif
diff --git a/r300/Lindent b/r300/Lindent
new file mode 100755
index 0000000..7d8d889
--- /dev/null
+++ b/r300/Lindent
@@ -0,0 +1,2 @@
+#!/bin/sh
+indent -npro -kr -i8 -ts8 -sob -l80 -ss -ncs "$@"
diff --git a/r300/Makefile.am b/r300/Makefile.am
new file mode 100644
index 0000000..0992115
--- /dev/null
+++ b/r300/Makefile.am
@@ -0,0 +1,30 @@
+AM_CFLAGS = -DIN_DRI_DRIVER -DGLX_DIRECT_RENDERING -DGLX_INDIRECT_RENDERING
+
+R300_CFLAGS = -DCOMPILE_R300 -DR200_MERGED=0 -DRADEON_COMMON=1 -DRADEON_COMMON_FOR_R300
+R300_CFLAGS += -I../radeon -I../radeon/server
+
+r300_dri_la_LTLIBRARIES = r300_dri.la
+r300_dri_la_CFLAGS = $(AM_CFLAGS) $(DRM_CFLAGS) $(DRI_CFLAGS) $(R300_CFLAGS)
+r300_dri_la_LDFLAGS = -module -noprefix -avoid-version -lm -ldl \
+		$(DRM_LIBS) $(DRI_LIBS)
+r300_dri_ladir = @libdir@/dri
+r300_dri_la_SOURCES = \
+	../radeon/radeon_screen.c \
+	radeon_context.c \
+	radeon_ioctl.c \
+	radeon_lock.c \
+	radeon_span.c \
+	radeon_state.c \
+	r300_mem.c \
+	r300_context.c \
+	r300_ioctl.c \
+	r300_cmdbuf.c \
+	r300_state.c \
+	r300_render.c \
+	r300_texmem.c \
+	r300_tex.c \
+	r300_texstate.c \
+	r300_vertprog.c \
+	r300_fragprog.c \
+	r300_shader.c \
+	r300_emit.c
diff --git a/r300/r300_cmdbuf.c b/r300/r300_cmdbuf.c
new file mode 100644
index 0000000..3befa58
--- /dev/null
+++ b/r300/r300_cmdbuf.c
@@ -0,0 +1,590 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#include "glheader.h"
+#include "state.h"
+#include "imports.h"
+#include "macros.h"
+#include "context.h"
+#include "swrast/swrast.h"
+#include "simple_list.h"
+
+#include "drm.h"
+#include "radeon_drm.h"
+
+#include "radeon_ioctl.h"
+#include "r300_context.h"
+#include "r300_ioctl.h"
+#include "radeon_reg.h"
+#include "r300_reg.h"
+#include "r300_cmdbuf.h"
+#include "r300_emit.h"
+#include "r300_state.h"
+
+// Set this to 1 for extremely verbose debugging of command buffers
+#define DEBUG_CMDBUF		0
+
+/**
+ * Send the current command buffer via ioctl to the hardware and reset it (r300FlushCmdBuf is the locking wrapper).
+ */
+int r300FlushCmdBufLocked(r300ContextPtr r300, const char *caller)
+{
+	int ret;
+	int i;
+	drm_radeon_cmd_buffer_t cmd;
+	int start;
+
+	if (r300->radeon.lost_context) { /* context was lost: submit from dword 0 */
+		start = 0;
+		r300->radeon.lost_context = GL_FALSE;
+	} else
+		start = r300->cmdbuf.count_reemit; /* skip the clean state r300EmitState put at the buffer start */
+
+	if (RADEON_DEBUG & DEBUG_IOCTL) {
+		fprintf(stderr, "%s from %s - %i cliprects\n",
+			__FUNCTION__, caller, r300->radeon.numClipRects);
+
+		if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_VERBOSE)
+			for (i = start; i < r300->cmdbuf.count_used; ++i)
+				fprintf(stderr, "%d: %08x\n", i,
+					r300->cmdbuf.cmd_buf[i]);
+	}
+
+	cmd.buf = (char *)(r300->cmdbuf.cmd_buf + start);
+	cmd.bufsz = (r300->cmdbuf.count_used - start) * 4; /* dword count -> bytes */
+
+	if (r300->radeon.state.scissor.enabled) { /* scissoring: use the scissor clip rects instead */
+		cmd.nbox = r300->radeon.state.scissor.numClipRects;
+		cmd.boxes =
+		    (drm_clip_rect_t *) r300->radeon.state.scissor.pClipRects;
+	} else {
+		cmd.nbox = r300->radeon.numClipRects;
+		cmd.boxes = (drm_clip_rect_t *) r300->radeon.pClipRects;
+	}
+
+	ret = drmCommandWrite(r300->radeon.dri.fd,
+			      DRM_RADEON_CMDBUF, &cmd, sizeof(cmd));
+
+	if (RADEON_DEBUG & DEBUG_SYNC) {
+		fprintf(stderr, "Syncing in %s (from %s)\n\n",
+			__FUNCTION__, caller);
+		radeonWaitForIdleLocked(&r300->radeon);
+	}
+
+	r300->dma.nr_released_bufs = 0; /* bookkeeping is reset whatever the ioctl returned */
+	r300->cmdbuf.count_used = 0;
+	r300->cmdbuf.count_reemit = 0;
+
+	return ret; /* drmCommandWrite() result; callers treat non-zero as failure */
+}
+
+int r300FlushCmdBuf(r300ContextPtr r300, const char *caller) /* locking wrapper around r300FlushCmdBufLocked() */
+{
+	int ret;
+
+	LOCK_HARDWARE(&r300->radeon);
+
+	ret = r300FlushCmdBufLocked(r300, caller);
+
+	UNLOCK_HARDWARE(&r300->radeon);
+
+	if (ret) { /* non-zero submit result: report it and call _mesa_exit() */
+		fprintf(stderr, "drmRadeonCmdBuffer: %d\n", ret);
+		_mesa_exit(ret);
+	}
+
+	return ret;
+}
+
+static void r300PrintStateAtom(r300ContextPtr r300, struct r300_state_atom *state) /* debug dump of one atom to stderr */
+{
+	int i;
+	int dwords = (*state->check) (r300, state); /* dword count the atom's check callback reports */
+
+	fprintf(stderr, "  emit %s/%d/%d\n", state->name, dwords,
+		state->cmd_size);
+
+	if (RADEON_DEBUG & DEBUG_VERBOSE) /* verbose: also dump each command dword */
+		for (i = 0; i < dwords; i++)
+			fprintf(stderr, "      %s[%d]: %08X\n",
+				state->name, i, state->cmd[i]);
+}
+
+/**
+ * Emit all atoms whose effective dirty state (atom->dirty || hw.all_dirty) equals dirty.
+ *
+ * The caller must have ensured that there is enough space in the command
+ * buffer.
+ */
+static __inline__ void r300EmitAtoms(r300ContextPtr r300, GLboolean dirty)
+{
+	struct r300_state_atom *atom;
+	uint32_t *dest;
+
+	dest = r300->cmdbuf.cmd_buf + r300->cmdbuf.count_used; /* write cursor: first unused dword */
+
+	if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_STATE) { /* debug only: list what is about to be emitted or skipped */
+		foreach(atom, &r300->hw.atomlist) {
+			if ((atom->dirty || r300->hw.all_dirty) == dirty) {
+				int dwords = (*atom->check) (r300, atom);
+
+				if (dwords)
+					r300PrintStateAtom(r300, atom);
+				else
+					fprintf(stderr,
+						"  skip state %s\n",
+						atom->name);
+			}
+		}
+	}
+
+	/* Emit WAIT */
+	*dest = cmdwait(R300_WAIT_3D | R300_WAIT_3D_CLEAN); /* the preamble below is 4 dwords on every call */
+	dest++;
+	r300->cmdbuf.count_used++;
+
+	/* Emit cache flush */
+	*dest = cmdpacket0(R300_TX_CNTL, 1);
+	dest++;
+	r300->cmdbuf.count_used++;
+
+	*dest = R300_TX_FLUSH;
+	dest++;
+	r300->cmdbuf.count_used++;
+
+	/* Emit END3D */
+	*dest = cmdpacify();
+	dest++;
+	r300->cmdbuf.count_used++;
+
+	/* Emit actual atoms */
+
+	foreach(atom, &r300->hw.atomlist) {
+		if ((atom->dirty || r300->hw.all_dirty) == dirty) {
+			int dwords = (*atom->check) (r300, atom); /* 0 means the atom has nothing to emit */
+
+			if (dwords) {
+				memcpy(dest, atom->cmd, dwords * 4); /* dwords -> bytes */
+				dest += dwords;
+				r300->cmdbuf.count_used += dwords;
+				atom->dirty = GL_FALSE; /* emitted atoms become clean */
+			}
+		}
+	}
+}
+
+/**
+ * Copy dirty hardware state atoms into the command buffer.
+ *
+ * We also copy out clean state if we're at the start of a buffer. That makes
+ * it easy to recover from lost contexts.
+ */
+void r300EmitState(r300ContextPtr r300)
+{
+	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_PRIMS))
+		fprintf(stderr, "%s\n", __FUNCTION__);
+
+	if (r300->cmdbuf.count_used && !r300->hw.is_dirty /* buffer already started and nothing dirty: nothing to emit */
+	    && !r300->hw.all_dirty)
+		return;
+
+	/* To avoid going across the entire set of states multiple times, just check
+	 * for enough space for the case of emitting all state, and inline the
+	 * r300AllocCmdBuf code here without all the checks.
+	 * NOTE(review): each r300EmitAtoms call adds 4 preamble dwords and can run twice here; max_state_size is seeded with 2 + 2 - confirm slack. */
+	r300EnsureCmdBufSpace(r300, r300->hw.max_state_size, __FUNCTION__);
+
+	if (!r300->cmdbuf.count_used) { /* empty buffer: emit the clean atoms first */
+		if (RADEON_DEBUG & DEBUG_STATE)
+			fprintf(stderr, "Begin reemit state\n");
+
+		r300EmitAtoms(r300, GL_FALSE);
+		r300->cmdbuf.count_reemit = r300->cmdbuf.count_used; /* r300FlushCmdBufLocked starts here unless the context was lost */
+	}
+
+	if (RADEON_DEBUG & DEBUG_STATE)
+		fprintf(stderr, "Begin dirty state\n");
+
+	r300EmitAtoms(r300, GL_TRUE);
+
+	assert(r300->cmdbuf.count_used < r300->cmdbuf.size);
+
+	r300->hw.is_dirty = GL_FALSE;
+	r300->hw.all_dirty = GL_FALSE;
+}
+
+#define CHECK( NM, COUNT )				\
+static int check_##NM( r300ContextPtr r300, 		\
+			struct r300_state_atom* atom )	\
+{							\
+   (void) atom;	(void) r300;				\
+   return (COUNT);					\
+} /* defines check_<NM>(): returns the number of dwords to emit for an atom */
+
+#define packet0_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->packet0.count)
+#define vpu_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->vpu.count)
+
+CHECK(always, atom->cmd_size) /* check_always: the whole atom */
+    CHECK(variable, packet0_count(atom->cmd) ? (1 + packet0_count(atom->cmd)) : 0) /* header + count dwords, or 0 if count is 0 */
+    CHECK(vpu, vpu_count(atom->cmd) ? (1 + vpu_count(atom->cmd) * 4) : 0) /* header + 4 dwords per count, or 0 if count is 0 */
+#undef packet0_count
+#undef vpu_count
+#define ALLOC_STATE( ATOM, CHK, SZ, IDX )				\
+   do {									\
+      r300->hw.ATOM.cmd_size = (SZ);					\
+      r300->hw.ATOM.cmd = (uint32_t*)CALLOC((SZ) * sizeof(uint32_t));	\
+      r300->hw.ATOM.name = #ATOM;					\
+      r300->hw.ATOM.idx = (IDX);					\
+      r300->hw.ATOM.check = check_##CHK;				\
+      r300->hw.ATOM.dirty = GL_FALSE;					\
+      r300->hw.max_state_size += (SZ);					\
+      insert_at_tail(&r300->hw.atomlist, &r300->hw.ATOM);		\
+   } while (0) /* sets up atom r300->hw.ATOM with SZ dwords and appends it to the atom list; NOTE(review): CALLOC result is unchecked */
+/**
+ * Allocate memory for the command buffer and initialize the state atom
+ * list. Note that the initial hardware state is set by r300InitState().
+ */
+void r300InitCmdBuf(r300ContextPtr r300)
+{
+	int size, mtu;
+	int has_tcl = 1;
+
+	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+		has_tcl = 0;
+
+	r300->hw.max_state_size = 2 + 2;	/* reserve extra space for WAIT_IDLE and tex cache flush */
+
+	mtu = r300->radeon.glCtx->Const.MaxTextureUnits;
+	if (RADEON_DEBUG & DEBUG_TEXTURE) {
+		fprintf(stderr, "Using %d maximum texture units..\n", mtu);
+	}
+
+	/* Setup the atom linked list */
+	make_empty_list(&r300->hw.atomlist);
+	r300->hw.atomlist.name = "atom-list";
+
+	/* Initialize state atoms */
+	ALLOC_STATE(vpt, always, R300_VPT_CMDSIZE, 0);
+	r300->hw.vpt.cmd[R300_VPT_CMD_0] = cmdpacket0(R300_SE_VPORT_XSCALE, 6);
+	ALLOC_STATE(vap_cntl, always, 2, 0);
+	r300->hw.vap_cntl.cmd[0] = cmdpacket0(R300_VAP_CNTL, 1);
+	ALLOC_STATE(vte, always, 3, 0);
+	r300->hw.vte.cmd[0] = cmdpacket0(R300_SE_VTE_CNTL, 2);
+	ALLOC_STATE(unk2134, always, 3, 0);
+	r300->hw.unk2134.cmd[0] = cmdpacket0(0x2134, 2);
+	ALLOC_STATE(vap_cntl_status, always, 2, 0);
+	r300->hw.vap_cntl_status.cmd[0] = cmdpacket0(R300_VAP_CNTL_STATUS, 1);
+	ALLOC_STATE(vir[0], variable, R300_VIR_CMDSIZE, 0);
+	r300->hw.vir[0].cmd[R300_VIR_CMD_0] =
+	    cmdpacket0(R300_VAP_INPUT_ROUTE_0_0, 1);
+	ALLOC_STATE(vir[1], variable, R300_VIR_CMDSIZE, 1);
+	r300->hw.vir[1].cmd[R300_VIR_CMD_0] =
+	    cmdpacket0(R300_VAP_INPUT_ROUTE_1_0, 1);
+	ALLOC_STATE(vic, always, R300_VIC_CMDSIZE, 0);
+	r300->hw.vic.cmd[R300_VIC_CMD_0] = cmdpacket0(R300_VAP_INPUT_CNTL_0, 2);
+	ALLOC_STATE(unk21DC, always, 2, 0);
+	r300->hw.unk21DC.cmd[0] = cmdpacket0(0x21DC, 1);
+	ALLOC_STATE(vap_clip_cntl, always, 2, 0);
+	r300->hw.vap_clip_cntl.cmd[0] = cmdpacket0(R300_VAP_CLIP_CNTL, 1);
+	ALLOC_STATE(unk2220, always, 5, 0);
+	r300->hw.unk2220.cmd[0] = cmdpacket0(0x2220, 4);
+	ALLOC_STATE(unk2288, always, 2, 0);
+	r300->hw.unk2288.cmd[0] = cmdpacket0(R300_VAP_UNKNOWN_2288, 1);
+	ALLOC_STATE(vof, always, R300_VOF_CMDSIZE, 0);
+	r300->hw.vof.cmd[R300_VOF_CMD_0] =
+	    cmdpacket0(R300_VAP_OUTPUT_VTX_FMT_0, 2);
+
+	if (has_tcl) {
+		ALLOC_STATE(pvs, always, R300_PVS_CMDSIZE, 0);
+		r300->hw.pvs.cmd[R300_PVS_CMD_0] =
+		    cmdpacket0(R300_VAP_PVS_CNTL_1, 3);
+	}
+
+	ALLOC_STATE(gb_enable, always, 2, 0);
+	r300->hw.gb_enable.cmd[0] = cmdpacket0(R300_GB_ENABLE, 1);
+	ALLOC_STATE(gb_misc, always, R300_GB_MISC_CMDSIZE, 0);
+	r300->hw.gb_misc.cmd[0] = cmdpacket0(R300_GB_MSPOS0, 5);
+	ALLOC_STATE(txe, always, R300_TXE_CMDSIZE, 0);
+	r300->hw.txe.cmd[R300_TXE_CMD_0] = cmdpacket0(R300_TX_ENABLE, 1);
+	ALLOC_STATE(unk4200, always, 5, 0);
+	r300->hw.unk4200.cmd[0] = cmdpacket0(0x4200, 4);
+	ALLOC_STATE(unk4214, always, 2, 0);
+	r300->hw.unk4214.cmd[0] = cmdpacket0(0x4214, 1);
+	ALLOC_STATE(ps, always, R300_PS_CMDSIZE, 0);
+	r300->hw.ps.cmd[0] = cmdpacket0(R300_RE_POINTSIZE, 1);
+	ALLOC_STATE(unk4230, always, 4, 0);
+	r300->hw.unk4230.cmd[0] = cmdpacket0(0x4230, 3);
+	ALLOC_STATE(lcntl, always, 2, 0);
+	r300->hw.lcntl.cmd[0] = cmdpacket0(R300_RE_LINE_CNT, 1);
+	ALLOC_STATE(unk4260, always, 4, 0);
+	r300->hw.unk4260.cmd[0] = cmdpacket0(0x4260, 3);
+	ALLOC_STATE(shade, always, 5, 0);
+	r300->hw.shade.cmd[0] = cmdpacket0(R300_RE_SHADE, 4);
+	ALLOC_STATE(polygon_mode, always, 4, 0);
+	r300->hw.polygon_mode.cmd[0] = cmdpacket0(R300_RE_POLYGON_MODE, 3);
+	ALLOC_STATE(fogp, always, 3, 0);
+	r300->hw.fogp.cmd[0] = cmdpacket0(R300_RE_FOG_SCALE, 2);
+	ALLOC_STATE(zbias_cntl, always, 2, 0);
+	r300->hw.zbias_cntl.cmd[0] = cmdpacket0(R300_RE_ZBIAS_CNTL, 1);
+	ALLOC_STATE(zbs, always, R300_ZBS_CMDSIZE, 0);
+	r300->hw.zbs.cmd[R300_ZBS_CMD_0] =
+	    cmdpacket0(R300_RE_ZBIAS_T_FACTOR, 4);
+	ALLOC_STATE(occlusion_cntl, always, 2, 0);
+	r300->hw.occlusion_cntl.cmd[0] = cmdpacket0(R300_RE_OCCLUSION_CNTL, 1);
+	ALLOC_STATE(cul, always, R300_CUL_CMDSIZE, 0);
+	r300->hw.cul.cmd[R300_CUL_CMD_0] = cmdpacket0(R300_RE_CULL_CNTL, 1);
+	ALLOC_STATE(unk42C0, always, 3, 0);
+	r300->hw.unk42C0.cmd[0] = cmdpacket0(0x42C0, 2);
+	ALLOC_STATE(rc, always, R300_RC_CMDSIZE, 0);
+	r300->hw.rc.cmd[R300_RC_CMD_0] = cmdpacket0(R300_RS_CNTL_0, 2);
+	ALLOC_STATE(ri, always, R300_RI_CMDSIZE, 0);
+	r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(R300_RS_INTERP_0, 8);
+	ALLOC_STATE(rr, variable, R300_RR_CMDSIZE, 0);
+	r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R300_RS_ROUTE_0, 1);
+	ALLOC_STATE(unk43A4, always, 3, 0);
+	r300->hw.unk43A4.cmd[0] = cmdpacket0(0x43A4, 2);
+	ALLOC_STATE(unk43E8, always, 2, 0);
+	r300->hw.unk43E8.cmd[0] = cmdpacket0(0x43E8, 1);
+	ALLOC_STATE(fp, always, R300_FP_CMDSIZE, 0);
+	r300->hw.fp.cmd[R300_FP_CMD_0] = cmdpacket0(R300_PFS_CNTL_0, 3);
+	r300->hw.fp.cmd[R300_FP_CMD_1] = cmdpacket0(R300_PFS_NODE_0, 4);
+	ALLOC_STATE(fpt, variable, R300_FPT_CMDSIZE, 0);
+	r300->hw.fpt.cmd[R300_FPT_CMD_0] = cmdpacket0(R300_PFS_TEXI_0, 0);
+	ALLOC_STATE(unk46A4, always, 6, 0);
+	r300->hw.unk46A4.cmd[0] = cmdpacket0(0x46A4, 5);
+	ALLOC_STATE(fpi[0], variable, R300_FPI_CMDSIZE, 0);
+	r300->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_PFS_INSTR0_0, 1);
+	ALLOC_STATE(fpi[1], variable, R300_FPI_CMDSIZE, 1);
+	r300->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_PFS_INSTR1_0, 1);
+	ALLOC_STATE(fpi[2], variable, R300_FPI_CMDSIZE, 2);
+	r300->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_PFS_INSTR2_0, 1);
+	ALLOC_STATE(fpi[3], variable, R300_FPI_CMDSIZE, 3);
+	r300->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_PFS_INSTR3_0, 1);
+	ALLOC_STATE(fogs, always, R300_FOGS_CMDSIZE, 0);
+	r300->hw.fogs.cmd[R300_FOGS_CMD_0] = cmdpacket0(R300_RE_FOG_STATE, 1);
+	ALLOC_STATE(fogc, always, R300_FOGC_CMDSIZE, 0);
+	r300->hw.fogc.cmd[R300_FOGC_CMD_0] = cmdpacket0(R300_FOG_COLOR_R, 3);
+	ALLOC_STATE(at, always, R300_AT_CMDSIZE, 0);
+	r300->hw.at.cmd[R300_AT_CMD_0] = cmdpacket0(R300_PP_ALPHA_TEST, 2);
+	ALLOC_STATE(unk4BD8, always, 2, 0);
+	r300->hw.unk4BD8.cmd[0] = cmdpacket0(0x4BD8, 1);
+	ALLOC_STATE(fpp, variable, R300_FPP_CMDSIZE, 0);
+	r300->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, 0);
+	ALLOC_STATE(unk4E00, always, 2, 0);
+	r300->hw.unk4E00.cmd[0] = cmdpacket0(0x4E00, 1);
+	ALLOC_STATE(bld, always, R300_BLD_CMDSIZE, 0);
+	r300->hw.bld.cmd[R300_BLD_CMD_0] = cmdpacket0(R300_RB3D_CBLEND, 2);
+	ALLOC_STATE(cmk, always, R300_CMK_CMDSIZE, 0);
+	r300->hw.cmk.cmd[R300_CMK_CMD_0] = cmdpacket0(R300_RB3D_COLORMASK, 1);
+	ALLOC_STATE(blend_color, always, 4, 0);
+	r300->hw.blend_color.cmd[0] = cmdpacket0(R300_RB3D_BLEND_COLOR, 3);
+	ALLOC_STATE(cb, always, R300_CB_CMDSIZE, 0);
+	r300->hw.cb.cmd[R300_CB_CMD_0] = cmdpacket0(R300_RB3D_COLOROFFSET0, 1);
+	r300->hw.cb.cmd[R300_CB_CMD_1] = cmdpacket0(R300_RB3D_COLORPITCH0, 1);
+	ALLOC_STATE(unk4E50, always, 10, 0);
+	r300->hw.unk4E50.cmd[0] = cmdpacket0(0x4E50, 9);
+	ALLOC_STATE(unk4E88, always, 2, 0);
+	r300->hw.unk4E88.cmd[0] = cmdpacket0(0x4E88, 1);
+	ALLOC_STATE(unk4EA0, always, 3, 0);
+	r300->hw.unk4EA0.cmd[0] = cmdpacket0(0x4EA0, 2);
+	ALLOC_STATE(zs, always, R300_ZS_CMDSIZE, 0);
+	r300->hw.zs.cmd[R300_ZS_CMD_0] =
+	    cmdpacket0(R300_RB3D_ZSTENCIL_CNTL_0, 3);
+	ALLOC_STATE(zstencil_format, always, 5, 0);
+	r300->hw.zstencil_format.cmd[0] =
+	    cmdpacket0(R300_RB3D_ZSTENCIL_FORMAT, 4);
+	ALLOC_STATE(zb, always, R300_ZB_CMDSIZE, 0);
+	r300->hw.zb.cmd[R300_ZB_CMD_0] = cmdpacket0(R300_RB3D_DEPTHOFFSET, 2);
+	ALLOC_STATE(unk4F28, always, 2, 0);
+	r300->hw.unk4F28.cmd[0] = cmdpacket0(0x4F28, 1);
+	ALLOC_STATE(unk4F30, always, 3, 0);
+	r300->hw.unk4F30.cmd[0] = cmdpacket0(0x4F30, 2);
+	ALLOC_STATE(unk4F44, always, 2, 0);
+	r300->hw.unk4F44.cmd[0] = cmdpacket0(0x4F44, 1);
+	ALLOC_STATE(unk4F54, always, 2, 0);
+	r300->hw.unk4F54.cmd[0] = cmdpacket0(0x4F54, 1);
+
+	/* VPU only on TCL */
+	if (has_tcl) {
+   	        int i;
+		ALLOC_STATE(vpi, vpu, R300_VPI_CMDSIZE, 0);
+		r300->hw.vpi.cmd[R300_VPI_CMD_0] =
+		    cmdvpu(R300_PVS_UPLOAD_PROGRAM, 0);
+
+		ALLOC_STATE(vpp, vpu, R300_VPP_CMDSIZE, 0);
+		r300->hw.vpp.cmd[R300_VPP_CMD_0] =
+		    cmdvpu(R300_PVS_UPLOAD_PARAMETERS, 0);
+
+		ALLOC_STATE(vps, vpu, R300_VPS_CMDSIZE, 0);
+		r300->hw.vps.cmd[R300_VPS_CMD_0] =
+		    cmdvpu(R300_PVS_UPLOAD_POINTSIZE, 1);
+
+		for (i = 0; i < 6; i++) {
+		  ALLOC_STATE(vpucp[i], vpu, R300_VPUCP_CMDSIZE, 0);
+		  r300->hw.vpucp[i].cmd[R300_VPUCP_CMD_0] =
+ 		    cmdvpu(R300_PVS_UPLOAD_CLIP_PLANE0+i, 1);
+		}
+	}
+
+	/* Textures */
+	ALLOC_STATE(tex.filter, variable, mtu + 1, 0);
+	r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_FILTER_0, 0);
+
+	ALLOC_STATE(tex.filter_1, variable, mtu + 1, 0);
+	r300->hw.tex.filter_1.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_FILTER1_0, 0);
+
+	ALLOC_STATE(tex.size, variable, mtu + 1, 0);
+	r300->hw.tex.size.cmd[R300_TEX_CMD_0] = cmdpacket0(R300_TX_SIZE_0, 0);
+
+	ALLOC_STATE(tex.format, variable, mtu + 1, 0);
+	r300->hw.tex.format.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_FORMAT_0, 0);
+
+	ALLOC_STATE(tex.pitch, variable, mtu + 1, 0);
+	r300->hw.tex.pitch.cmd[R300_TEX_CMD_0] = cmdpacket0(R300_TX_PITCH_0, 0);
+
+	ALLOC_STATE(tex.offset, variable, mtu + 1, 0);
+	r300->hw.tex.offset.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_OFFSET_0, 0);
+
+	ALLOC_STATE(tex.chroma_key, variable, mtu + 1, 0);
+	r300->hw.tex.chroma_key.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_CHROMA_KEY_0, 0);
+
+	ALLOC_STATE(tex.border_color, variable, mtu + 1, 0);
+	r300->hw.tex.border_color.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_BORDER_COLOR_0, 0);
+
+	r300->hw.is_dirty = GL_TRUE;
+	r300->hw.all_dirty = GL_TRUE;
+
+	/* Initialize command buffer */
+	size =
+	    256 * driQueryOptioni(&r300->radeon.optionCache,
+				  "command_buffer_size");
+	if (size < 2 * r300->hw.max_state_size) {
+		size = 2 * r300->hw.max_state_size + 65535;
+	}
+	if (size > 64 * 256)
+		size = 64 * 256;
+
+	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA)) {
+		fprintf(stderr, "sizeof(drm_r300_cmd_header_t)=%zd\n",
+			sizeof(drm_r300_cmd_header_t));
+		fprintf(stderr, "sizeof(drm_radeon_cmd_buffer_t)=%zd\n",
+			sizeof(drm_radeon_cmd_buffer_t));
+		fprintf(stderr,
+			"Allocating %d bytes command buffer (max state is %d bytes)\n",
+			size * 4, r300->hw.max_state_size * 4);
+	}
+
+	r300->cmdbuf.size = size;
+	r300->cmdbuf.cmd_buf = (uint32_t *) CALLOC(size * 4);
+	r300->cmdbuf.count_used = 0;
+	r300->cmdbuf.count_reemit = 0;
+}
+
+/**
+ * Destroy the command buffer and state atoms.
+ */
+void r300DestroyCmdBuf(r300ContextPtr r300)
+{
+	struct r300_state_atom *entry;
+
+	/* Release each state atom's cmd storage, then the command buffer. */
+	foreach(entry, &r300->hw.atomlist)
+		FREE(entry->cmd);
+
+	FREE(r300->cmdbuf.cmd_buf);
+}
+
+void r300EmitBlit(r300ContextPtr rmesa,
+		  GLuint color_fmt,
+		  GLuint src_pitch,
+		  GLuint src_offset,
+		  GLuint dst_pitch,
+		  GLuint dst_offset,
+		  GLint srcx, GLint srcy,
+		  GLint dstx, GLint dsty, GLuint w, GLuint h)
+{
+	drm_r300_cmd_header_t *cmd;
+	/* Queue one raw packet3 (8 dwords) carrying a BITBLT_MULTI request. */
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr,
+			"%s src %x/%x %d,%d dst: %x/%x %d,%d sz: %dx%d\n",
+			__FUNCTION__, src_pitch, src_offset, srcx, srcy,
+			dst_pitch, dst_offset, dstx, dsty, w, h);
+
+	assert((src_pitch & 63) == 0);	/* pitches: multiples of 64 */
+	assert((dst_pitch & 63) == 0);
+	assert((src_offset & 1023) == 0);	/* offsets: multiples of 1024 */
+	assert((dst_offset & 1023) == 0);
+	assert(w < (1 << 16));	/* width and height must fit in 16 bits */
+	assert(h < (1 << 16));
+
+	cmd = (drm_r300_cmd_header_t *) r300AllocCmdBuf(rmesa, 8, __FUNCTION__);
+	cmd[0].u = 0;	/* clear stale pad bytes, as r300EmitWait() does */
+	cmd[0].header.cmd_type = R300_CMD_PACKET3;
+	cmd[0].header.pad0 = R300_CMD_PACKET3_RAW;
+	cmd[1].u = R300_CP_CMD_BITBLT_MULTI | (5 << 16);
+	cmd[2].u = (RADEON_GMC_SRC_PITCH_OFFSET_CNTL |
+		    RADEON_GMC_DST_PITCH_OFFSET_CNTL |
+		    RADEON_GMC_BRUSH_NONE |
+		    (color_fmt << 8) |
+		    RADEON_GMC_SRC_DATATYPE_COLOR |
+		    RADEON_ROP3_S |
+		    RADEON_DP_SRC_SOURCE_MEMORY |
+		    RADEON_GMC_CLR_CMP_CNTL_DIS | RADEON_GMC_WR_MSK_DIS);
+
+	cmd[3].u = ((src_pitch / 64) << 22) | (src_offset >> 10);	/* src */
+	cmd[4].u = ((dst_pitch / 64) << 22) | (dst_offset >> 10);	/* dst */
+	cmd[5].u = (srcx << 16) | srcy;	/* src x in high half, y in low */
+	cmd[6].u = (dstx << 16) | dsty;	/* dst */
+	cmd[7].u = (w << 16) | h;	/* size */
+}
+
+void r300EmitWait(r300ContextPtr rmesa, GLuint flags)
+{
+	drm_r300_cmd_header_t *hdr;
+
+	/* Only R300_WAIT_2D and R300_WAIT_3D may be set in flags. */
+	assert(!(flags & ~(R300_WAIT_2D | R300_WAIT_3D)));
+	hdr = (drm_r300_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
+	hdr->u = 0;	/* clear the whole dword before filling the fields */
+	hdr->wait.cmd_type = R300_CMD_WAIT;
+	hdr->wait.flags = flags;
+}
diff --git a/r300/r300_cmdbuf.h b/r300/r300_cmdbuf.h
new file mode 100644
index 0000000..bfb2eda
--- /dev/null
+++ b/r300/r300_cmdbuf.h
@@ -0,0 +1,116 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __R300_CMDBUF_H__
+#define __R300_CMDBUF_H__
+
+#include "r300_context.h"
+
+extern int r300FlushCmdBufLocked(r300ContextPtr r300, const char *caller);
+extern int r300FlushCmdBuf(r300ContextPtr r300, const char *caller);
+
+extern void r300EmitState(r300ContextPtr r300);
+
+extern void r300InitCmdBuf(r300ContextPtr r300);
+extern void r300DestroyCmdBuf(r300ContextPtr r300);
+
+/**
+ * Make sure that enough space is available in the command buffer
+ * by flushing if necessary.
+ *
+ * \param dwords The number of dwords we need to be free on the command buffer
+ */
+static __inline__ void r300EnsureCmdBufSpace(r300ContextPtr r300,
+					     int dwords, const char *caller)
+{
+	assert(dwords < r300->cmdbuf.size);
+	/* Flush when the request would run past the end of the buffer. */
+	if (r300->cmdbuf.size < r300->cmdbuf.count_used + dwords)
+		r300FlushCmdBuf(r300, caller);
+}
+
+/**
+ * Allocate the given number of dwords in the command buffer and return
+ * a pointer to the allocated area.
+ * When necessary, these functions cause a flush. r300AllocCmdBuf() also
+ * causes state reemission after a flush. This is necessary to ensure
+ * correct hardware state after an unlock.
+ */
+static __inline__ uint32_t *r300RawAllocCmdBuf(r300ContextPtr r300,
+					       int dwords, const char *caller)
+{
+	uint32_t *start;
+
+	/* Make room (flushing if needed) before reading count_used. */
+	r300EnsureCmdBufSpace(r300, dwords, caller);
+	start = r300->cmdbuf.cmd_buf + r300->cmdbuf.count_used;
+	r300->cmdbuf.count_used += dwords;
+	return start;
+}
+
+static __inline__ uint32_t *r300AllocCmdBuf(r300ContextPtr r300,
+					    int dwords, const char *caller)
+{
+	uint32_t *start;
+
+	r300EnsureCmdBufSpace(r300, dwords, caller);
+
+	/* Buffer is empty (e.g. right after a flush): emit the state first. */
+	if (r300->cmdbuf.count_used == 0) {
+		if (RADEON_DEBUG & DEBUG_IOCTL)
+			fprintf(stderr,
+				"Reemit state after flush (from %s)\n", caller);
+		r300EmitState(r300);
+	}
+	start = r300->cmdbuf.cmd_buf + r300->cmdbuf.count_used;
+	r300->cmdbuf.count_used += dwords;
+	return start;
+}
+
+extern void r300EmitBlit(r300ContextPtr rmesa,
+			 GLuint color_fmt,
+			 GLuint src_pitch,
+			 GLuint src_offset,
+			 GLuint dst_pitch,
+			 GLuint dst_offset,
+			 GLint srcx, GLint srcy,
+			 GLint dstx, GLint dsty, GLuint w, GLuint h);
+
+extern void r300EmitWait(r300ContextPtr rmesa, GLuint flags);
+extern void r300EmitLOAD_VBPNTR(r300ContextPtr rmesa, int start);
+extern void r300EmitVertexShader(r300ContextPtr rmesa);
+extern void r300EmitPixelShader(r300ContextPtr rmesa);
+
+#endif				/* __R300_CMDBUF_H__ */
diff --git a/r300/r300_context.c b/r300/r300_context.c
new file mode 100644
index 0000000..9ea14ab
--- /dev/null
+++ b/r300/r300_context.c
@@ -0,0 +1,532 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ *
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#include "glheader.h"
+#include "api_arrayelt.h"
+#include "context.h"
+#include "simple_list.h"
+#include "imports.h"
+#include "matrix.h"
+#include "extensions.h"
+#include "state.h"
+#include "bufferobj.h"
+
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+#include "vbo/vbo.h"
+
+#include "tnl/tnl.h"
+#include "tnl/t_pipeline.h"
+#include "tnl/t_vp_build.h"
+
+#include "drivers/common/driverfuncs.h"
+
+#include "radeon_ioctl.h"
+#include "radeon_span.h"
+#include "r300_context.h"
+#include "r300_cmdbuf.h"
+#include "r300_state.h"
+#include "r300_ioctl.h"
+#include "r300_tex.h"
+#include "r300_emit.h"
+
+#ifdef USER_BUFFERS
+#include "r300_mem.h"
+#endif
+
+#include "vblank.h"
+#include "utils.h"
+#include "xmlpool.h"		/* for symbolic values of enum-type options */
+
+/* hw_tcl_on derives from future_hw_tcl_on when it's safe to change it. */
+int future_hw_tcl_on = 1;
+int hw_tcl_on = 1;
+
+#define need_GL_EXT_stencil_two_side
+#define need_GL_ARB_multisample
+#define need_GL_ARB_texture_compression
+#define need_GL_ARB_vertex_buffer_object
+#define need_GL_ARB_vertex_program
+#define need_GL_EXT_blend_minmax
+//#define need_GL_EXT_fog_coord
+#define need_GL_EXT_secondary_color
+#define need_GL_EXT_blend_equation_separate
+#define need_GL_EXT_blend_func_separate
+#define need_GL_EXT_gpu_program_parameters
+#define need_GL_NV_vertex_program
+#include "extension_helper.h"
+
+const struct dri_extension card_extensions[] = {	/* { name, *_functions table or NULL } */
+  /* *INDENT-OFF* */
+  {"GL_ARB_multisample",		GL_ARB_multisample_functions},
+  {"GL_ARB_multitexture",		NULL},
+  {"GL_ARB_texture_border_clamp",	NULL},
+  {"GL_ARB_texture_compression",	GL_ARB_texture_compression_functions},
+  {"GL_ARB_texture_cube_map",		NULL},
+  {"GL_ARB_texture_env_add",		NULL},
+  {"GL_ARB_texture_env_combine",	NULL},
+  {"GL_ARB_texture_env_crossbar",	NULL},
+  {"GL_ARB_texture_env_dot3",		NULL},
+  {"GL_ARB_texture_mirrored_repeat",	NULL},
+  {"GL_ARB_vertex_buffer_object",	GL_ARB_vertex_buffer_object_functions},
+  {"GL_ARB_vertex_program",		GL_ARB_vertex_program_functions},
+  {"GL_ARB_fragment_program",		NULL},
+  {"GL_EXT_blend_equation_separate",	GL_EXT_blend_equation_separate_functions},
+  {"GL_EXT_blend_func_separate",	GL_EXT_blend_func_separate_functions},
+  {"GL_EXT_blend_minmax",		GL_EXT_blend_minmax_functions},
+  {"GL_EXT_blend_subtract",		NULL},
+//  {"GL_EXT_fog_coord",			GL_EXT_fog_coord_functions },
+  {"GL_EXT_gpu_program_parameters",     GL_EXT_gpu_program_parameters_functions},
+  {"GL_EXT_secondary_color", 		GL_EXT_secondary_color_functions},
+  {"GL_EXT_stencil_two_side",		GL_EXT_stencil_two_side_functions},
+  {"GL_EXT_stencil_wrap",		NULL},
+  {"GL_EXT_texture_edge_clamp",		NULL},
+  {"GL_EXT_texture_env_combine", 	NULL},
+  {"GL_EXT_texture_env_dot3", 		NULL},
+  {"GL_EXT_texture_filter_anisotropic",	NULL},
+  {"GL_EXT_texture_lod_bias",		NULL},
+  {"GL_EXT_texture_mirror_clamp",	NULL},
+  {"GL_EXT_texture_rectangle",		NULL},
+  {"GL_ATI_texture_env_combine3",	NULL},
+  {"GL_ATI_texture_mirror_once",	NULL},
+  {"GL_MESA_pack_invert",		NULL},
+  {"GL_MESA_ycbcr_texture",		NULL},
+  {"GL_MESAX_texture_float",		NULL},
+  {"GL_NV_blend_square",		NULL},
+  {"GL_NV_vertex_program",		GL_NV_vertex_program_functions},
+  {"GL_SGIS_generate_mipmap",		NULL},
+  {NULL,				NULL}	/* all-NULL entry closes the table */
+  /* *INDENT-ON* */
+};
+
+extern struct tnl_pipeline_stage _r300_render_stage;
+extern const struct tnl_pipeline_stage _r300_tcl_stage;
+
+static const struct tnl_pipeline_stage *r300_pipeline[] = {	/* installed by r300CreateContext() */
+
+	/* Try and go straight to t&l
+	 */
+	&_r300_tcl_stage,
+
+	/* Catch any t&l fallbacks
+	 */
+	&_tnl_vertex_transform_stage,
+	&_tnl_normal_transform_stage,
+	&_tnl_lighting_stage,
+	&_tnl_fog_coordinate_stage,
+	&_tnl_texgen_stage,
+	&_tnl_texture_transform_stage,
+	&_tnl_vertex_program_stage,
+
+	/* Try again to go to tcl?
+	 *     - no good for asymmetric-twoside (do with multipass)
+	 *     - no good for asymmetric-unfilled (do with multipass)
+	 *     - good for material
+	 *     - good for texgen
+	 *     - need to manipulate a bit of state
+	 *
+	 * - worth it/not worth it?
+	 */
+
+	/* Else do them here.
+	 */
+	&_r300_render_stage,
+	&_tnl_render_stage,	/* FALLBACK  */
+	0,	/* null entry ends the list */
+};
+
+/* Create the device specific rendering context.  Returns GL_TRUE on success;
+ * returns GL_FALSE after releasing what was set up if creation fails. */
+GLboolean r300CreateContext(const __GLcontextModes * glVisual,
+			    __DRIcontextPrivate * driContextPriv,
+			    void *sharedContextPrivate)
+{
+	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
+	struct dd_function_table functions;
+	r300ContextPtr r300;
+	GLcontext *ctx;
+	int tcl_mode, i;
+
+	assert(glVisual);
+	assert(driContextPriv);
+	assert(screen);
+
+	/* Allocate the R300 context */
+	r300 = (r300ContextPtr) CALLOC(sizeof(*r300));
+	if (!r300)
+		return GL_FALSE;
+
+	if (!(screen->chip_flags & RADEON_CHIPSET_TCL))
+		hw_tcl_on = future_hw_tcl_on = 0;
+
+	/* Parse configuration files.
+	 * Do this here so that initialMaxAnisotropy is set before we create
+	 * the default textures.
+	 */
+	driParseConfigFiles(&r300->radeon.optionCache, &screen->optionCache,
+			    screen->driScreen->myNum, "r300");
+	r300->initialMaxAnisotropy = driQueryOptionf(&r300->radeon.optionCache,
+						     "def_max_anisotropy");
+
+	/* Init default driver functions then plug in our R300-specific functions
+	 * (the texture functions are especially important)
+	 */
+	_mesa_init_driver_functions(&functions);
+	r300InitIoctlFuncs(&functions);
+	r300InitStateFuncs(&functions);
+	r300InitTextureFuncs(&functions);
+	r300InitShaderFuncs(&functions);
+
+#ifdef USER_BUFFERS
+	r300_mem_init(r300);
+#endif
+
+	if (!radeonInitContext(&r300->radeon, &functions,
+			       glVisual, driContextPriv,
+			       sharedContextPrivate)) {
+		/* Undo the setup above, mirroring r300DestroyContext(). */
+#ifdef USER_BUFFERS
+		r300_mem_destroy(r300);
+#endif
+		driDestroyOptionCache(&r300->radeon.optionCache);
+		FREE(r300);
+		return GL_FALSE;
+	}
+
+	/* Init r300 context data */
+	r300->dma.buf0_address =
+	    r300->radeon.radeonScreen->buffers->list[0].address;
+
+	(void)memset(r300->texture_heaps, 0, sizeof(r300->texture_heaps));
+	make_empty_list(&r300->swapped);
+
+	r300->nr_heaps = 1 /* screen->numTexHeaps */ ;
+	assert(r300->nr_heaps < RADEON_NR_TEX_HEAPS);
+	for (i = 0; i < r300->nr_heaps; i++) {
+		/* *INDENT-OFF* */
+		r300->texture_heaps[i] =
+		    driCreateTextureHeap(i, r300, screen->texSize[i], 12,
+					 RADEON_NR_TEX_REGIONS,
+					 (drmTextureRegionPtr)
+					 r300->radeon.sarea->tex_list[i],
+					 &r300->radeon.sarea->tex_age[i],
+					 &r300->swapped,
+					 sizeof(r300TexObj),
+					 (destroy_texture_object_t *)
+					 r300DestroyTexObj);
+		/* *INDENT-ON* */
+	}
+	r300->texture_depth = driQueryOptioni(&r300->radeon.optionCache,
+					      "texture_depth");
+	if (r300->texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
+		r300->texture_depth = (screen->cpp == 4) ?
+		    DRI_CONF_TEXTURE_DEPTH_32 : DRI_CONF_TEXTURE_DEPTH_16;
+
+	/* Set the maximum texture size small enough that we can guarantee that
+	 * all texture units can bind a maximal texture and have them both in
+	 * texturable memory at once.
+	 */
+
+	ctx = r300->radeon.glCtx;
+
+	ctx->Const.MaxTextureImageUnits =
+	    driQueryOptioni(&r300->radeon.optionCache, "texture_image_units");
+	ctx->Const.MaxTextureCoordUnits =
+	    driQueryOptioni(&r300->radeon.optionCache, "texture_coord_units");
+	ctx->Const.MaxTextureUnits =
+	    MIN2(ctx->Const.MaxTextureImageUnits,
+		 ctx->Const.MaxTextureCoordUnits);
+	ctx->Const.MaxTextureMaxAnisotropy = 16.0;
+
+	ctx->Const.MinPointSize = 1.0;
+	ctx->Const.MinPointSizeAA = 1.0;
+	ctx->Const.MaxPointSize = R300_POINTSIZE_MAX;
+	ctx->Const.MaxPointSizeAA = R300_POINTSIZE_MAX;
+
+	ctx->Const.MinLineWidth = 1.0;
+	ctx->Const.MinLineWidthAA = 1.0;
+	ctx->Const.MaxLineWidth = R300_LINESIZE_MAX;
+	ctx->Const.MaxLineWidthAA = R300_LINESIZE_MAX;
+
+#ifdef USER_BUFFERS
+	/* Needs further modifications */
+#if 0
+	ctx->Const.MaxArrayLockSize =
+	    ( /*512 */ RADEON_BUFFER_SIZE * 16 * 1024) / (4 * 4);
+#endif
+#endif
+
+	/* Initialize the software rasterizer and helper modules.
+	 */
+	_swrast_CreateContext(ctx);
+	_vbo_CreateContext(ctx);
+	_tnl_CreateContext(ctx);
+	_swsetup_CreateContext(ctx);
+	_swsetup_Wakeup(ctx);
+	_ae_create_context(ctx);
+
+	/* Install the customized pipeline:
+	 */
+	_tnl_destroy_pipeline(ctx);
+	_tnl_install_pipeline(ctx, r300_pipeline);
+
+	/* Try and keep materials and vertices separate:
+	 */
+/* 	_tnl_isolate_materials(ctx, GL_TRUE); */
+
+	/* Configure swrast and TNL to match hardware characteristics:
+	 */
+	_swrast_allow_pixel_fog(ctx, GL_FALSE);
+	_swrast_allow_vertex_fog(ctx, GL_TRUE);
+	_tnl_allow_pixel_fog(ctx, GL_FALSE);
+	_tnl_allow_vertex_fog(ctx, GL_TRUE);
+
+	/* currently bogus data */
+	ctx->Const.VertexProgram.MaxInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
+	ctx->Const.VertexProgram.MaxNativeInstructions =
+	    VSF_MAX_FRAGMENT_LENGTH / 4;
+	ctx->Const.VertexProgram.MaxNativeAttribs = 16;	/* r420 */
+	ctx->Const.VertexProgram.MaxTemps = 32;
+	ctx->Const.VertexProgram.MaxNativeTemps =
+	    /*VSF_MAX_FRAGMENT_TEMPS */ 32;
+	ctx->Const.VertexProgram.MaxNativeParameters = 256;	/* r420 */
+	ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
+
+	ctx->Const.FragmentProgram.MaxNativeTemps = PFS_NUM_TEMP_REGS;
+	ctx->Const.FragmentProgram.MaxNativeAttribs = 11;	/* copy i915... */
+	ctx->Const.FragmentProgram.MaxNativeParameters = PFS_NUM_CONST_REGS;
+	ctx->Const.FragmentProgram.MaxNativeAluInstructions = PFS_MAX_ALU_INST;
+	ctx->Const.FragmentProgram.MaxNativeTexInstructions = PFS_MAX_TEX_INST;
+	ctx->Const.FragmentProgram.MaxNativeInstructions =
+	    PFS_MAX_ALU_INST + PFS_MAX_TEX_INST;
+	ctx->Const.FragmentProgram.MaxNativeTexIndirections =
+	    PFS_MAX_TEX_INDIRECT;
+	ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;	/* and these are?? */
+	_tnl_ProgramCacheInit(ctx);
+	ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
+
+	driInitExtensions(ctx, card_extensions, GL_TRUE);
+
+	if (driQueryOptionb
+	    (&r300->radeon.optionCache, "disable_stencil_two_side"))
+		_mesa_disable_extension(ctx, "GL_EXT_stencil_two_side");
+
+	if (r300->radeon.glCtx->Mesa_DXTn
+	    && !driQueryOptionb(&r300->radeon.optionCache, "disable_s3tc")) {
+		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+		_mesa_enable_extension(ctx, "GL_S3_s3tc");
+	} else
+	    if (driQueryOptionb(&r300->radeon.optionCache, "force_s3tc_enable"))
+	{
+		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+	}
+
+	r300->disable_lowimpact_fallback =
+	    driQueryOptionb(&r300->radeon.optionCache,
+			    "disable_lowimpact_fallback");
+
+	radeonInitSpanFuncs(ctx);
+	r300InitCmdBuf(r300);
+	r300InitState(r300);
+
+	TNL_CONTEXT(ctx)->Driver.RunPipeline = _tnl_run_pipeline;
+
+	tcl_mode = driQueryOptioni(&r300->radeon.optionCache, "tcl_mode");
+	if (driQueryOptionb(&r300->radeon.optionCache, "no_rast")) {
+		fprintf(stderr, "disabling 3D acceleration\n");
+#if R200_MERGED
+		FALLBACK(&r300->radeon, RADEON_FALLBACK_DISABLE, 1);
+#endif
+	}
+	if (tcl_mode == DRI_CONF_TCL_SW ||
+	    !(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
+		if (r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+			r300->radeon.radeonScreen->chip_flags &=
+			    ~RADEON_CHIPSET_TCL;
+			fprintf(stderr, "Disabling HW TCL support\n");
+		}
+		TCL_FALLBACK(r300->radeon.glCtx,
+			     RADEON_TCL_FALLBACK_TCL_DISABLE, 1);
+	}
+
+	return GL_TRUE;
+}
+
+static void r300FreeGartAllocations(r300ContextPtr r300)
+{
+	int i, ret, tries = 0, done_age, in_use = 0;
+	drm_radeon_mem_free_t memfree;
+	/* Release every still-pending rmm->u_list entry via DRM_RADEON_FREE. */
+	memfree.region = RADEON_MEM_REGION_GART;
+
+#ifdef USER_BUFFERS
+	for (i = r300->rmm->u_last; i > 0; i--) {
+		if (r300->rmm->u_list[i].ptr == NULL) {
+			continue;
+		}
+
+		/* check whether this buffer is still in use */
+		if (r300->rmm->u_list[i].pending) {
+			in_use++;
+		}
+	}
+	/* Cannot flush/lock if no context exists. */
+	if (in_use)
+		r300FlushCmdBuf(r300, __FUNCTION__);
+
+	done_age = radeonGetAge((radeonContextPtr) r300);
+
+	for (i = r300->rmm->u_last; i > 0; i--) {
+		if (r300->rmm->u_list[i].ptr == NULL) {
+			continue;
+		}
+
+		/* nothing to release for buffers that are not pending */
+		if (!r300->rmm->u_list[i].pending) {
+			continue;
+		}
+
+		assert(r300->rmm->u_list[i].h_pending == 0);
+
+		tries = 0;
+		while (r300->rmm->u_list[i].age > done_age && tries++ < 1000) {
+			usleep(10);
+			done_age = radeonGetAge((radeonContextPtr) r300);
+		}
+		if (tries > 1000) {	/* 1001 only when the poll limit ran out */
+			WARN_ONCE("Failed to idle region!");
+		}
+
+		memfree.region_offset = (char *)r300->rmm->u_list[i].ptr -
+		    (char *)r300->radeon.radeonScreen->gartTextures.map;
+
+		ret = drmCommandWrite(r300->radeon.radeonScreen->driScreen->fd,
+				      DRM_RADEON_FREE, &memfree,
+				      sizeof(memfree));
+		if (ret) {
+			fprintf(stderr, "Failed to free at %p\nret = %s\n",
+				r300->rmm->u_list[i].ptr, strerror(-ret));
+		} else {
+			if (i == r300->rmm->u_last)
+				r300->rmm->u_last--;
+
+			r300->rmm->u_list[i].pending = 0;
+			r300->rmm->u_list[i].ptr = NULL;
+		}
+	}
+	r300->rmm->u_head = i;
+#endif				/* USER_BUFFERS */
+}
+
+/* Destroy the device specific context: unbind it if it is current, shut down
+ * the Mesa helper modules, then release driver buffers, heaps and r300. */
+void r300DestroyContext(__DRIcontextPrivate * driContextPriv)
+{
+	GET_CURRENT_CONTEXT(ctx);
+	r300ContextPtr r300 = (r300ContextPtr) driContextPriv->driverPrivate;
+	radeonContextPtr radeon = (radeonContextPtr) r300;
+	radeonContextPtr current = ctx ? RADEON_CONTEXT(ctx) : NULL;
+
+	if (RADEON_DEBUG & DEBUG_DRI) {
+		fprintf(stderr, "Destroying context !\n");
+	}
+
+	assert(r300);		/* should never be null */
+
+	/* check if we're deleting the currently bound context */
+	if (r300 && &r300->radeon == current) {
+		radeonFlush(r300->radeon.glCtx);
+		_mesa_make_current(NULL, NULL, NULL);
+	}
+
+	/* Free r300 context resources */
+	if (r300) {
+		GLboolean release_texture_heaps;
+
+		release_texture_heaps =
+		    (r300->radeon.glCtx->Shared->RefCount == 1);
+		_swsetup_DestroyContext(r300->radeon.glCtx);
+		_tnl_ProgramCacheDestroy(r300->radeon.glCtx);
+		_tnl_DestroyContext(r300->radeon.glCtx);
+		_vbo_DestroyContext(r300->radeon.glCtx);
+		_swrast_DestroyContext(r300->radeon.glCtx);
+
+		if (r300->dma.current.buf) {
+			r300ReleaseDmaRegion(r300, &r300->dma.current,
+					     __FUNCTION__);
+#ifndef USER_BUFFERS
+			r300FlushCmdBuf(r300, __FUNCTION__);
+#endif
+		}
+		r300FreeGartAllocations(r300);
+		r300DestroyCmdBuf(r300);
+
+		if (radeon->state.scissor.pClipRects) {
+			FREE(radeon->state.scissor.pClipRects);
+			radeon->state.scissor.pClipRects = NULL;
+		}
+
+		if (release_texture_heaps) {
+			/* This share group is about to go away, free our private
+			 * texture object data.
+			 */
+			int i;
+
+			for (i = 0; i < r300->nr_heaps; i++) {
+				driDestroyTextureHeap(r300->texture_heaps[i]);
+				r300->texture_heaps[i] = NULL;
+			}
+
+			assert(is_empty_list(&r300->swapped));
+		}
+
+		radeonCleanupContext(&r300->radeon);
+
+#ifdef USER_BUFFERS
+		/* the memory manager might be accessed when Mesa frees the shared
+		 * state, so don't destroy it earlier
+		 */
+		r300_mem_destroy(r300);
+#endif
+
+		/* free the option cache */
+		driDestroyOptionCache(&r300->radeon.optionCache);
+
+		FREE(r300);
+	}
+}
diff --git a/r300/r300_context.h b/r300/r300_context.h
new file mode 100644
index 0000000..6b0a588
--- /dev/null
+++ b/r300/r300_context.h
@@ -0,0 +1,916 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __R300_CONTEXT_H__
+#define __R300_CONTEXT_H__
+
+#include "tnl/t_vertex.h"
+#include "drm.h"
+#include "radeon_drm.h"
+#include "dri_util.h"
+#include "texmem.h"
+
+#include "macros.h"
+#include "mtypes.h"
+#include "colormac.h"
+
+#define USER_BUFFERS
+
+//#define OPTIMIZE_ELTS
+
+struct r300_context;
+typedef struct r300_context r300ContextRec;
+typedef struct r300_context *r300ContextPtr;
+
+#include "radeon_lock.h"
+#include "mm.h"
+
+/* Print a printf-style warning on stderr the first time a call site runs.
+   ", ## __VA_ARGS__" is a GNU extension, see
+   http://gcc.gnu.org/onlinedocs/gcc-3.2.3/gcc/Variadic-Macros.html .
+   "warn##__LINE__" always pastes to warn__LINE__; the flag is block-local. */
+#define WARN_ONCE(a, ...)	{ \
+	static int warn##__LINE__=1; \
+	if(warn##__LINE__){ \
+		fprintf(stderr, "*********************************WARN_ONCE*********************************\n"); \
+		fprintf(stderr, "File %s function %s line %d\n", \
+			__FILE__, __FUNCTION__, __LINE__); \
+		fprintf(stderr,  a, ## __VA_ARGS__);\
+		fprintf(stderr, "***************************************************************************\n"); \
+		warn##__LINE__=0;\
+		} \
+	}
+
+#include "r300_vertprog.h"
+#include "r300_fragprog.h"
+
+/**
+ * Return the raw bit pattern of a float as a uint32_t (no value conversion).
+ */
+static __inline__ uint32_t r300PackFloat32(float fl)
+{
+	union {
+		float as_float;
+		uint32_t as_bits;
+	} pun;
+
+	pun.as_float = fl;
+	return pun.as_bits;
+}
+
+/* Pack a float into 24 bits: sign in bit 23, 7-bit exponent (bias 63)
+ * in bits 22:16, the 16 most significant mantissa bits in bits 15:0.
+ *
+ * A biased exponent outside 0..127 is clamped so it cannot spill into the
+ * sign bit: underflow packs as 0, overflow as the largest magnitude.
+ * NaN and infinity are not handled specially. */
+static __inline__ uint32_t r300PackFloat24(float f)
+{
+	float mantissa;
+	int exponent;
+	uint32_t float24 = 0;
+
+	if (f == 0.0)
+		return 0;
+
+	mantissa = frexpf(f, &exponent);	/* |mantissa| in [0.5, 1) */
+	if (mantissa < 0) {	/* Handle -ve */
+		float24 |= (1 << 23);
+		mantissa = mantissa * -1.0;
+	}
+	/* frexpf's exponent is the IEEE exponent + 1, hence 62 for bias 63 */
+	exponent += 62;
+	if (exponent < 0)
+		return 0;	/* too small to represent */
+	if (exponent > 127)
+		return float24 | 0x7FFFFF;	/* too large: saturate */
+	float24 |= ((uint32_t) exponent << 16);
+	/* Kill 7 LSB of mantissa */
+	return float24 | ((r300PackFloat32(mantissa) & 0x7FFFFF) >> 7);
+}
+
+/************ DMA BUFFERS **************/
+
+/* Need refcounting on dma buffers:
+ */
+struct r300_dma_buffer {
+	int refcount;		/**< the number of retained regions in buf */
+	drmBufPtr buf;
+	int id;
+};
+#undef GET_START
+#ifdef USER_BUFFERS
+#define GET_START(rvb) (r300GartOffsetFromVirtual(rmesa, (rvb)->address+(rvb)->start))
+#else
+#define GET_START(rvb) (rmesa->radeon.radeonScreen->gart_buffer_offset +		\
+			(rvb)->address - rmesa->dma.buf0_address +	\
+			(rvb)->start)
+#endif
+/* A retained region, eg vertices for indexed vertices.
+ */
+struct r300_dma_region {
+	struct r300_dma_buffer *buf;	/* refcounted buffer backing this region */
+	char *address;		/* buffer base; presumably buf->buf->address - TODO confirm */
+	int start, end, ptr;	/* offsets from start of buf */
+
+	int aos_offset;		/* address in GART memory */
+	int aos_stride;		/* distance between elements, in dwords */
+	int aos_size;		/* number of components (1-4) */
+	int aos_reg;		/* VAP register assignment */
+};
+
+struct r300_dma {
+	/* Active dma region.  Allocations for vertices and retained
+	 * regions come from here.  Also used for emitting random vertices,
+	 * these may be flushed by calling flush_current();
+	 */
+	struct r300_dma_region current;
+
+	void (*flush) (r300ContextPtr);
+
+	char *buf0_address;	/* start of buf[0], for index calcs */
+
+	/* Number of "in-flight" DMA buffers, i.e. the number of buffers
+	 * for which a DISCARD command is currently queued in the command buffer.
+	 */
+	GLuint nr_released_bufs;
+};
+
+       /* Texture related */
+
+typedef struct r300_tex_obj r300TexObj, *r300TexObjPtr;
+
+/* Texture object in locally shared texture space.
+ */
+struct r300_tex_obj {
+	driTextureObject base;
+
+	GLuint bufAddr;		/* Offset to start of locally
+				   shared texture block */
+
+	GLuint dirty_state;	/* Flags (1 per texunit) for
+				   whether or not this texobj
+				   has dirty hardware state
+				   (pp_*) that needs to be
+				   brought into the
+				   texunit. */
+
+	drm_radeon_tex_image_t image[6][RADEON_MAX_TEXTURE_LEVELS];
+	/* Six, for the cube faces */
+
+	GLboolean image_override;	/* Image overridden by GLX_EXT_tfp */
+
+	GLuint pitch;		/* this isn't sent to hardware just used in calculations */
+	/* hardware register values */
+	/* Note that R200 has 8 registers per texture and R300 only 7 */
+	GLuint filter;
+	GLuint filter_1;
+	GLuint pitch_reg;
+	GLuint size;		/* npot only */
+	GLuint format;
+	GLuint offset;		/* Image location in the card's address space.
+				   All cube faces follow. */
+	GLuint unknown4;
+	GLuint unknown5;
+	/* end hardware registers */
+
+	/* registers computed by r200 code - keep them here to
+	   compare against what is actually written.
+
+	   to be removed later.. */
+	GLuint pp_border_color;
+	GLuint pp_cubic_faces;	/* cube face 1,2,3,4 log2 sizes */
+	GLuint format_x;
+
+	GLboolean border_fallback;
+
+	GLuint tile_bits;	/* hw texture tile bits used on this texture */
+};
+
+struct r300_texture_env_state {
+	r300TexObjPtr texobj;
+	GLenum format;
+	GLenum envMode;
+};
+
+/* The blit width for texture uploads
+ */
+#define R300_BLIT_WIDTH_BYTES 1024
+#define R300_MAX_TEXTURE_UNITS 8
+
+struct r300_texture_state {
+	struct r300_texture_env_state unit[R300_MAX_TEXTURE_UNITS];
+	int tc_count;		/* number of incoming texture coordinates from VAP */
+};
+
+/**
+ * A block of hardware state.
+ *
+ * When check returns non-zero, the returned number of dwords must be
+ * copied verbatim into the command buffer in order to update a state atom
+ * when it is dirty.
+ */
+struct r300_state_atom {
+	struct r300_state_atom *next, *prev;
+	const char *name;	/* for debug */
+	int cmd_size;		/* maximum size in dwords */
+	GLuint idx;		/* index in an array (e.g. textures) */
+	uint32_t *cmd;
+	GLboolean dirty;
+
+	int (*check) (r300ContextPtr, struct r300_state_atom * atom);
+};
+
+#define R300_VPT_CMD_0		0
+#define R300_VPT_XSCALE		1
+#define R300_VPT_XOFFSET	2
+#define R300_VPT_YSCALE		3
+#define R300_VPT_YOFFSET	4
+#define R300_VPT_ZSCALE		5
+#define R300_VPT_ZOFFSET	6
+#define R300_VPT_CMDSIZE	7
+
+#define R300_VIR_CMD_0		0	/* vir is variable size (at least 1) */
+#define R300_VIR_CNTL_0		1
+#define R300_VIR_CNTL_1		2
+#define R300_VIR_CNTL_2		3
+#define R300_VIR_CNTL_3		4
+#define R300_VIR_CNTL_4		5
+#define R300_VIR_CNTL_5		6
+#define R300_VIR_CNTL_6		7
+#define R300_VIR_CNTL_7		8
+#define R300_VIR_CMDSIZE	9
+
+#define R300_VIC_CMD_0		0
+#define R300_VIC_CNTL_0		1
+#define R300_VIC_CNTL_1		2
+#define R300_VIC_CMDSIZE	3
+
+#define R300_VOF_CMD_0		0
+#define R300_VOF_CNTL_0		1
+#define R300_VOF_CNTL_1		2
+#define R300_VOF_CMDSIZE	3
+
+#define R300_PVS_CMD_0		0
+#define R300_PVS_CNTL_1		1
+#define R300_PVS_CNTL_2		2
+#define R300_PVS_CNTL_3		3
+#define R300_PVS_CMDSIZE	4
+
+#define R300_GB_MISC_CMD_0		0
+#define R300_GB_MISC_MSPOS_0		1
+#define R300_GB_MISC_MSPOS_1		2
+#define R300_GB_MISC_TILE_CONFIG	3
+#define R300_GB_MISC_SELECT		4
+#define R300_GB_MISC_AA_CONFIG		5
+#define R300_GB_MISC_CMDSIZE		6
+
+#define R300_TXE_CMD_0		0
+#define R300_TXE_ENABLE		1
+#define R300_TXE_CMDSIZE	2
+
+#define R300_PS_CMD_0		0
+#define R300_PS_POINTSIZE	1
+#define R300_PS_CMDSIZE		2
+
+#define R300_ZBS_CMD_0		0
+#define R300_ZBS_T_FACTOR	1
+#define R300_ZBS_T_CONSTANT	2
+#define R300_ZBS_W_FACTOR	3
+#define R300_ZBS_W_CONSTANT	4
+#define R300_ZBS_CMDSIZE	5
+
+#define R300_CUL_CMD_0		0
+#define R300_CUL_CULL		1
+#define R300_CUL_CMDSIZE	2
+
+#define R300_RC_CMD_0		0
+#define R300_RC_CNTL_0		1
+#define R300_RC_CNTL_1		2
+#define R300_RC_CMDSIZE		3
+
+#define R300_RI_CMD_0		0
+#define R300_RI_INTERP_0	1
+#define R300_RI_INTERP_1	2
+#define R300_RI_INTERP_2	3
+#define R300_RI_INTERP_3	4
+#define R300_RI_INTERP_4	5
+#define R300_RI_INTERP_5	6
+#define R300_RI_INTERP_6	7
+#define R300_RI_INTERP_7	8
+#define R300_RI_CMDSIZE		9
+
+#define R300_RR_CMD_0		0	/* rr is variable size (at least 1) */
+#define R300_RR_ROUTE_0		1
+#define R300_RR_ROUTE_1		2
+#define R300_RR_ROUTE_2		3
+#define R300_RR_ROUTE_3		4
+#define R300_RR_ROUTE_4		5
+#define R300_RR_ROUTE_5		6
+#define R300_RR_ROUTE_6		7
+#define R300_RR_ROUTE_7		8
+#define R300_RR_CMDSIZE		9
+
+#define R300_FP_CMD_0		0
+#define R300_FP_CNTL0		1
+#define R300_FP_CNTL1		2
+#define R300_FP_CNTL2		3
+#define R300_FP_CMD_1		4
+#define R300_FP_NODE0		5
+#define R300_FP_NODE1		6
+#define R300_FP_NODE2		7
+#define R300_FP_NODE3		8
+#define R300_FP_CMDSIZE		9
+
+#define R300_FPT_CMD_0		0
+#define R300_FPT_INSTR_0	1
+#define R300_FPT_CMDSIZE	65
+
+#define R300_FPI_CMD_0		0
+#define R300_FPI_INSTR_0	1
+#define R300_FPI_CMDSIZE	65
+
+#define R300_FPP_CMD_0		0
+#define R300_FPP_PARAM_0	1
+#define R300_FPP_CMDSIZE	(32*4+1)
+
+#define R300_FOGS_CMD_0		0
+#define R300_FOGS_STATE		1
+#define R300_FOGS_CMDSIZE	2
+
+#define R300_FOGC_CMD_0		0
+#define R300_FOGC_R		1
+#define R300_FOGC_G		2
+#define R300_FOGC_B		3
+#define R300_FOGC_CMDSIZE	4
+
+#define R300_FOGP_CMD_0		0
+#define R300_FOGP_SCALE		1
+#define R300_FOGP_START		2
+#define R300_FOGP_CMDSIZE	3
+
+#define R300_AT_CMD_0		0
+#define R300_AT_ALPHA_TEST	1
+#define R300_AT_UNKNOWN		2
+#define R300_AT_CMDSIZE		3
+
+#define R300_BLD_CMD_0		0
+#define R300_BLD_CBLEND		1
+#define R300_BLD_ABLEND		2
+#define R300_BLD_CMDSIZE	3
+
+#define R300_CMK_CMD_0		0
+#define R300_CMK_COLORMASK	1
+#define R300_CMK_CMDSIZE	2
+
+#define R300_CB_CMD_0		0
+#define R300_CB_OFFSET		1
+#define R300_CB_CMD_1		2
+#define R300_CB_PITCH		3
+#define R300_CB_CMDSIZE		4
+
+#define R300_ZS_CMD_0		0
+#define R300_ZS_CNTL_0		1
+#define R300_ZS_CNTL_1		2
+#define R300_ZS_CNTL_2		3
+#define R300_ZS_CMDSIZE		4
+
+#define R300_ZB_CMD_0		0
+#define R300_ZB_OFFSET		1
+#define R300_ZB_PITCH		2
+#define R300_ZB_CMDSIZE		3
+
+#define R300_VPI_CMD_0		0
+#define R300_VPI_INSTR_0	1
+#define R300_VPI_CMDSIZE	1025	/* 256 16 byte instructions */
+
+#define R300_VPP_CMD_0		0
+#define R300_VPP_PARAM_0	1
+#define R300_VPP_CMDSIZE	1025	/* 256 4-component parameters */
+
+#define R300_VPUCP_CMD_0		0
+#define R300_VPUCP_X            1
+#define R300_VPUCP_Y            2
+#define R300_VPUCP_Z            3
+#define R300_VPUCP_W            4
+#define R300_VPUCP_CMDSIZE	5	/* command header + one 4-component plane */
+
+#define R300_VPS_CMD_0		0
+#define R300_VPS_ZERO_0		1
+#define R300_VPS_ZERO_1		2
+#define R300_VPS_POINTSIZE	3
+#define R300_VPS_ZERO_3		4
+#define R300_VPS_CMDSIZE	5
+
+	/* the layout is common for all fields inside tex */
+#define R300_TEX_CMD_0		0
+#define R300_TEX_VALUE_0	1
+/* We don't really use this, instead specify mtu+1 dynamically
+#define R300_TEX_CMDSIZE	(MAX_TEXTURE_UNITS+1)
+*/
+
+/**
+ * Cache for hardware register state.
+ */
+struct r300_hw_state {
+	struct r300_state_atom atomlist;
+
+	GLboolean is_dirty;
+	GLboolean all_dirty;
+	int max_state_size;	/* in dwords */
+
+	struct r300_state_atom vpt;	/* viewport (1D98) */
+	struct r300_state_atom vap_cntl;
+	struct r300_state_atom vof;	/* VAP output format register 0x2090 */
+	struct r300_state_atom vte;	/* (20B0) */
+	struct r300_state_atom unk2134;	/* (2134) */
+	struct r300_state_atom vap_cntl_status;
+	struct r300_state_atom vir[2];	/* vap input route (2150/21E0) */
+	struct r300_state_atom vic;	/* vap input control (2180) */
+	struct r300_state_atom unk21DC;	/* (21DC) */
+	struct r300_state_atom vap_clip_cntl;
+	struct r300_state_atom unk2220;	/* (2220) */
+	struct r300_state_atom unk2288;	/* (2288) */
+	struct r300_state_atom pvs;	/* pvs_cntl (22D0) */
+	struct r300_state_atom gb_enable;	/* (4008) */
+	struct r300_state_atom gb_misc;	/* Multisampling position shifts ? (4010) */
+	struct r300_state_atom unk4200;	/* (4200) */
+	struct r300_state_atom unk4214;	/* (4214) */
+	struct r300_state_atom ps;	/* pointsize (421C) */
+	struct r300_state_atom unk4230;	/* (4230) */
+	struct r300_state_atom lcntl;	/* line control */
+	struct r300_state_atom unk4260;	/* (4260) */
+	struct r300_state_atom shade;
+	struct r300_state_atom polygon_mode;
+	struct r300_state_atom fogp;	/* fog parameters (4294) */
+	struct r300_state_atom unk429C;	/* (429C) */
+	struct r300_state_atom zbias_cntl;
+	struct r300_state_atom zbs;	/* zbias (42A4) */
+	struct r300_state_atom occlusion_cntl;
+	struct r300_state_atom cul;	/* cull cntl (42B8) */
+	struct r300_state_atom unk42C0;	/* (42C0) */
+	struct r300_state_atom rc;	/* rs control (4300) */
+	struct r300_state_atom ri;	/* rs interpolators (4310) */
+	struct r300_state_atom rr;	/* rs route (4330) */
+	struct r300_state_atom unk43A4;	/* (43A4) */
+	struct r300_state_atom unk43E8;	/* (43E8) */
+	struct r300_state_atom fp;	/* fragment program cntl + nodes (4600) */
+	struct r300_state_atom fpt;	/* texi - (4620) */
+	struct r300_state_atom unk46A4;	/* (46A4) */
+	struct r300_state_atom fpi[4];	/* fp instructions (46C0/47C0/48C0/49C0) */
+	struct r300_state_atom fogs;	/* fog state (4BC0) */
+	struct r300_state_atom fogc;	/* fog color (4BC8) */
+	struct r300_state_atom at;	/* alpha test (4BD4) */
+	struct r300_state_atom unk4BD8;	/* (4BD8) */
+	struct r300_state_atom fpp;	/* 0x4C00 and following */
+	struct r300_state_atom unk4E00;	/* (4E00) */
+	struct r300_state_atom bld;	/* blending (4E04) */
+	struct r300_state_atom cmk;	/* colormask (4E0C) */
+	struct r300_state_atom blend_color;	/* constant blend color */
+	struct r300_state_atom cb;	/* colorbuffer (4E28) */
+	struct r300_state_atom unk4E50;	/* (4E50) */
+	struct r300_state_atom unk4E88;	/* (4E88) */
+	struct r300_state_atom unk4EA0;	/* (4EA0) I saw it only written on RV350 hardware..  */
+	struct r300_state_atom zs;	/* zstencil control (4F00) */
+	struct r300_state_atom zstencil_format;
+	struct r300_state_atom zb;	/* z buffer (4F20) */
+	struct r300_state_atom unk4F28;	/* (4F28) */
+	struct r300_state_atom unk4F30;	/* (4F30) */
+	struct r300_state_atom unk4F44;	/* (4F44) */
+	struct r300_state_atom unk4F54;	/* (4F54) */
+
+	struct r300_state_atom vpi;	/* vp instructions */
+	struct r300_state_atom vpp;	/* vp parameters */
+	struct r300_state_atom vps;	/* vertex point size (?) */
+	struct r300_state_atom vpucp[6];	/* vp user clip plane - 6 */
+	/* 8 texture units */
+	/* the state is grouped by function and not by
+	   texture unit. This makes single unit updates
+	   really awkward - we are much better off
+	   updating the whole thing at once */
+	struct {
+		struct r300_state_atom filter;
+		struct r300_state_atom filter_1;
+		struct r300_state_atom size;
+		struct r300_state_atom format;
+		struct r300_state_atom pitch;
+		struct r300_state_atom offset;
+		struct r300_state_atom chroma_key;
+		struct r300_state_atom border_color;
+	} tex;
+	struct r300_state_atom txe;	/* tex enable (4104) */
+};
+
+/**
+ * This structure holds the command buffer while it is being constructed.
+ *
+ * The first batch of commands in the buffer is always the state that needs
+ * to be re-emitted when the context is lost. This batch can be skipped
+ * otherwise.
+ */
+struct r300_cmdbuf {
+	int size;		/* DWORDs allocated for buffer */
+	uint32_t *cmd_buf;
+	int count_used;		/* DWORDs filled so far */
+	int count_reemit;	/* size of re-emission batch */
+};
+
+/**
+ * State cache
+ */
+
+struct r300_depthbuffer_state {
+	GLfloat scale;
+};
+
+struct r300_stencilbuffer_state {
+	GLuint clear;
+	GLboolean hw_stencil;
+
+};
+
+/* Vertex shader state */
+
+/* Perhaps more if we store programs in vmem? */
+/* drm_r300_cmd_header_t->vpu->count is unsigned char */
+#define VSF_MAX_FRAGMENT_LENGTH (255*4)
+
+/* Can be tested with colormat currently. */
+#define VSF_MAX_FRAGMENT_TEMPS (14)
+
+#define STATE_R300_WINDOW_DIMENSION (STATE_INTERNAL_DRIVER+0)
+#define STATE_R300_TEXRECT_FACTOR (STATE_INTERNAL_DRIVER+1)
+
+struct r300_vertex_shader_fragment {
+	int length;
+	union {
+		GLuint d[VSF_MAX_FRAGMENT_LENGTH];
+		float f[VSF_MAX_FRAGMENT_LENGTH];
+		VERTEX_SHADER_INSTRUCTION i[VSF_MAX_FRAGMENT_LENGTH / 4];
+	} body;
+};
+
+#define VSF_DEST_PROGRAM	0x0
+#define VSF_DEST_MATRIX0	0x200
+#define VSF_DEST_MATRIX1	0x204
+#define VSF_DEST_MATRIX2	0x208
+#define VSF_DEST_VECTOR0	0x20c
+#define VSF_DEST_VECTOR1	0x20d
+#define VSF_DEST_UNKNOWN1	0x400
+#define VSF_DEST_UNKNOWN2	0x406
+
+struct r300_vertex_shader_state {
+	struct r300_vertex_shader_fragment program;
+
+	struct r300_vertex_shader_fragment unknown1;
+	struct r300_vertex_shader_fragment unknown2;
+
+	int program_start;
+	int unknown_ptr1;	/* pointer within program space */
+	int program_end;
+
+	int param_offset;
+	int param_count;
+
+	int unknown_ptr2;	/* pointer within program space */
+	int unknown_ptr3;	/* pointer within program space */
+};
+
+extern int hw_tcl_on;
+
+//#define CURRENT_VERTEX_SHADER(ctx) (ctx->VertexProgram._Current)
+#define CURRENT_VERTEX_SHADER(ctx) (R300_CONTEXT(ctx)->selected_vp)
+
+/* Should work, but doesn't */
+//#define CURRENT_VERTEX_SHADER(ctx) (R300_CONTEXT(ctx)->curr_vp)
+
+/* r300_vertex_shader_state and r300_vertex_program should probably be merged together someday.
+ * Keeping them separate for now should ensure the fixed pipeline keeps functioning properly.
+ */
+
+struct r300_vertex_program_key {
+	GLuint InputsRead;
+	GLuint OutputsWritten;
+};
+
+struct r300_vertex_program {
+	struct r300_vertex_program *next;
+	struct r300_vertex_program_key key;
+	int translated;
+
+	struct r300_vertex_shader_fragment program;
+
+	int pos_end;
+	int num_temporaries;	/* Number of temp vars used by program */
+	int wpos_idx;
+	int inputs[VERT_ATTRIB_MAX];
+	int outputs[VERT_RESULT_MAX];
+	int native;
+	int ref_count;
+	int use_ref_count;
+};
+
+struct r300_vertex_program_cont {
+	struct gl_vertex_program mesa_program;	/* Must be first */
+	struct r300_vertex_shader_fragment params;
+	struct r300_vertex_program *progs;
+};
+
+#define PFS_MAX_ALU_INST	64
+#define PFS_MAX_TEX_INST	64
+#define PFS_MAX_TEX_INDIRECT 4
+#define PFS_NUM_TEMP_REGS	32
+#define PFS_NUM_CONST_REGS	16
+
+/* Mapping Mesa registers to R300 temporaries */
+struct reg_acc {
+	int reg;		/* Assigned hw temp */
+	unsigned int refcount;	/* Number of uses by mesa program */
+};
+
+/**
+ * Describe the current lifetime information for an R300 temporary
+ */
+struct reg_lifetime {
+	/* Index of the first slot where this register is free in the sense
+	   that it can be used as a new destination register.
+	   This is -1 if the register has been assigned to a Mesa register
+	   and the last access to the register has not yet been emitted */
+	int free;
+
+	/* Index of the first slot where this register is currently reserved.
+	   This is used to stop e.g. a scalar operation from being moved
+	   before the allocation time of a register that was first allocated
+	   for a vector operation. */
+	int reserved;
+
+	/* Index of the first slot in which the register can be used as a
+	   source without losing the value that is written by the last
+	   emitted instruction that writes to the register */
+	int vector_valid;
+	int scalar_valid;
+
+	/* Index to the slot where the register was last read.
+	   This is also the first slot in which the register may be written again */
+	int vector_lastread;
+	int scalar_lastread;
+};
+
+/**
+ * Store usage information about an ALU instruction slot during the
+ * compilation of a fragment program.
+ */
+#define SLOT_SRC_VECTOR  (1<<0)
+#define SLOT_SRC_SCALAR  (1<<3)
+#define SLOT_SRC_BOTH    (SLOT_SRC_VECTOR | SLOT_SRC_SCALAR)
+#define SLOT_OP_VECTOR   (1<<16)
+#define SLOT_OP_SCALAR   (1<<17)
+#define SLOT_OP_BOTH     (SLOT_OP_VECTOR | SLOT_OP_SCALAR)
+
+struct r300_pfs_compile_slot {
+	/* Bitmask indicating which parts of the slot are used, using SLOT_ constants
+	   defined above */
+	unsigned int used;
+
+	/* Selected sources */
+	int vsrc[3];
+	int ssrc[3];
+};
+
+/**
+ * Store information during compilation of fragment programs.
+ */
+struct r300_pfs_compile_state {
+	int nrslots;		/* number of ALU slots used so far */
+
+	/* Track which (parts of) slots are already filled with instructions */
+	struct r300_pfs_compile_slot slot[PFS_MAX_ALU_INST];
+
+	/* Track the validity of R300 temporaries */
+	struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS];
+
+	/* Used to map Mesa's inputs/temps onto hardware temps */
+	int temp_in_use;
+	struct reg_acc temps[PFS_NUM_TEMP_REGS];
+	struct reg_acc inputs[32];	/* don't actually need 32... */
+
+	/* Track usage of hardware temps, for register allocation,
+	 * indirection detection, etc. */
+	GLuint used_in_node;
+	GLuint dest_in_node;
+};
+
+/**
+ * Store everything about a fragment program that is needed
+ * to render with that program.
+ */
+struct r300_fragment_program {
+	struct gl_fragment_program mesa_program;
+
+	GLcontext *ctx;
+	GLboolean translated;
+	GLboolean error;
+	struct r300_pfs_compile_state *cs;
+
+	struct {
+		int length;
+		GLuint inst[PFS_MAX_TEX_INST];
+	} tex;
+
+	struct {
+		struct {
+			GLuint inst0;
+			GLuint inst1;
+			GLuint inst2;
+			GLuint inst3;
+		} inst[PFS_MAX_ALU_INST];
+	} alu;
+
+	struct {
+		int tex_offset;
+		int tex_end;
+		int alu_offset;
+		int alu_end;
+		int flags;
+	} node[4];
+	int cur_node;
+	int first_node_has_tex;
+
+	int alu_offset;
+	int alu_end;
+	int tex_offset;
+	int tex_end;
+
+	/* Hardware constants.
+	 * Contains a pointer to the value. The destination of the pointer
+	 * is supposed to be updated when GL state changes.
+	 * Typically, this is either a pointer into
+	 * gl_program_parameter_list::ParameterValues, or a pointer to a
+	 * global constant (e.g. for sin/cos-approximation)
+	 */
+	const GLfloat *constant[PFS_NUM_CONST_REGS];
+	int const_nr;
+
+	int max_temp_idx;
+
+	GLuint optimization;
+};
+
+#define R300_MAX_AOS_ARRAYS		16
+
+#define AOS_FORMAT_USHORT	0
+#define AOS_FORMAT_FLOAT	1
+#define AOS_FORMAT_UBYTE	2
+#define AOS_FORMAT_FLOAT_COLOR	3
+
+#define REG_COORDS	0
+#define REG_COLOR0	1
+#define REG_TEX0	2
+
+struct dt {
+	GLint size;
+	GLenum type;
+	GLsizei stride;
+	void *data;
+};
+
+struct radeon_vertex_buffer {
+	int Count;
+	void *Elts;
+	int elt_size;
+	int elt_min, elt_max;	/* debug */
+
+	struct dt AttribPtr[VERT_ATTRIB_MAX];
+
+	const struct _mesa_prim *Primitive;
+	GLuint PrimitiveCount;
+	GLint LockFirst;
+	GLsizei LockCount;
+	int lock_uptodate;
+};
+
+struct r300_state {
+	struct r300_depthbuffer_state depth;
+	struct r300_texture_state texture;
+	int sw_tcl_inputs[VERT_ATTRIB_MAX];
+	struct r300_vertex_shader_state vertex_shader;
+	struct r300_pfs_compile_state pfs_compile;
+	struct r300_dma_region aos[R300_MAX_AOS_ARRAYS];
+	int aos_count;
+	struct radeon_vertex_buffer VB;
+
+	GLuint *Elts;
+	struct r300_dma_region elt_dma;
+
+	 DECLARE_RENDERINPUTS(render_inputs_bitset);	/* actual render inputs that R300 was configured for.
+							   They are the same as tnl->render_inputs for fixed pipeline */
+
+	struct {
+		int transform_offset;	/* Transform matrix offset, -1 if none */
+	} vap_param;		/* vertex processor parameter allocation - tells where to write parameters */
+
+	struct r300_stencilbuffer_state stencil;
+
+};
+
+#define R300_FALLBACK_NONE 0
+#define R300_FALLBACK_TCL 1
+#define R300_FALLBACK_RAST 2
+
+/**
+ * \brief R300 context structure.
+ */
+struct r300_context {
+	struct radeon_context radeon;	/* parent class, must be first */
+
+	struct r300_hw_state hw;
+	struct r300_cmdbuf cmdbuf;
+	struct r300_state state;
+	struct gl_vertex_program *curr_vp;
+	struct r300_vertex_program *selected_vp;
+
+	/* Vertex buffers
+	 */
+	struct r300_dma dma;
+	GLboolean save_on_next_unlock;
+	GLuint NewGLState;
+
+	/* Texture object bookkeeping
+	 */
+	unsigned nr_heaps;
+	driTexHeap *texture_heaps[RADEON_NR_TEX_HEAPS];
+	driTextureObject swapped;
+	int texture_depth;
+	float initialMaxAnisotropy;
+
+	/* Clientdata textures;
+	 */
+	GLuint prefer_gart_client_texturing;
+
+#ifdef USER_BUFFERS
+	struct r300_memory_manager *rmm;
+#endif
+
+	GLvector4f dummy_attrib[_TNL_ATTRIB_MAX];
+	GLvector4f *temp_attrib[_TNL_ATTRIB_MAX];
+
+	GLboolean disable_lowimpact_fallback;
+};
+
+struct r300_buffer_object {
+	struct gl_buffer_object mesa_obj;
+	int id;
+};
+
+#define R300_CONTEXT(ctx)		((r300ContextPtr)(ctx->DriverCtx))
+
+extern void r300DestroyContext(__DRIcontextPrivate * driContextPriv);
+extern GLboolean r300CreateContext(const __GLcontextModes * glVisual,
+				   __DRIcontextPrivate * driContextPriv,
+				   void *sharedContextPrivate);
+
+extern void r300SelectVertexShader(r300ContextPtr r300);
+extern void r300InitShaderFuncs(struct dd_function_table *functions);
+extern int r300VertexProgUpdateParams(GLcontext * ctx,
+				      struct r300_vertex_program_cont *vp,
+				      float *dst);
+
+#define RADEON_D_CAPTURE 0
+#define RADEON_D_PLAYBACK 1
+#define RADEON_D_PLAYBACK_RAW 2
+#define RADEON_D_T 3
+
+#endif				/* __R300_CONTEXT_H__ */
diff --git a/r300/r300_emit.c b/r300/r300_emit.c
new file mode 100644
index 0000000..2c26069
--- /dev/null
+++ b/r300/r300_emit.c
@@ -0,0 +1,627 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "mtypes.h"
+#include "colormac.h"
+#include "imports.h"
+#include "macros.h"
+#include "image.h"
+
+#include "swrast_setup/swrast_setup.h"
+#include "math/m_translate.h"
+#include "tnl/tnl.h"
+#include "tnl/t_context.h"
+
+#include "r300_context.h"
+#include "radeon_ioctl.h"
+#include "r300_state.h"
+#include "r300_emit.h"
+#include "r300_ioctl.h"
+
+#ifdef USER_BUFFERS
+#include "r300_mem.h"
+#endif
+
+#if SWIZZLE_X != R300_INPUT_ROUTE_SELECT_X || \
+    SWIZZLE_Y != R300_INPUT_ROUTE_SELECT_Y || \
+    SWIZZLE_Z != R300_INPUT_ROUTE_SELECT_Z || \
+    SWIZZLE_W != R300_INPUT_ROUTE_SELECT_W || \
+    SWIZZLE_ZERO != R300_INPUT_ROUTE_SELECT_ZERO || \
+    SWIZZLE_ONE != R300_INPUT_ROUTE_SELECT_ONE
+#error Cannot change these!
+#endif
+
+#define DEBUG_ALL DEBUG_VERTS
+
+#if defined(USE_X86_ASM)	/* copy nr dwords from src to dst; dst is left advanced past them */
+#define COPY_DWORDS( dst, src, nr )					\
+do {									\
+	int __tmp;							\
+	__asm__ __volatile__( "rep ; movsl"				\
+			      : "=%c" (__tmp), "=D" (dst), "=S" (__tmp)	\
+			      : "0" (nr),				\
+			        "D" ((long)dst),			\
+			        "S" ((long)src) );			\
+} while (0)
+#else	/* plain C variant: also advances dst by nr, src is read as int[] */
+#define COPY_DWORDS( dst, src, nr )		\
+do {						\
+   int j;					\
+   for ( j = 0 ; j < nr ; j++ )			\
+      dst[j] = ((int *)src)[j];			\
+   dst += nr;					\
+} while (0)
+#endif
+
+/* Copy `count` 1-dword elements, `stride` bytes apart, into rvb's memory. */
+static void r300EmitVec4(GLcontext * ctx, struct r300_dma_region *rvb,
+			 GLvoid * data, int stride, int count)
+{
+	int i;
+	int *out = (int *)(rvb->address + rvb->start);
+	char *src = (char *)data;	/* byte cursor: no void* arithmetic */
+
+	if (RADEON_DEBUG & DEBUG_VERTS)
+		fprintf(stderr, "%s count %d stride %d\n",
+			__FUNCTION__, count, stride);
+
+	if (stride == 4)	/* tightly packed: one bulk copy */
+		COPY_DWORDS(out, src, count);
+	else
+		for (i = 0; i < count; i++, src += stride) {
+			out[0] = ((int *)src)[0];
+			out++;
+		}
+}
+
+/* Copy `count` 2-dword elements, `stride` bytes apart, into rvb's memory. */
+static void r300EmitVec8(GLcontext * ctx, struct r300_dma_region *rvb,
+			 GLvoid * data, int stride, int count)
+{
+	int i;
+	int *out = (int *)(rvb->address + rvb->start);
+	char *src = (char *)data;	/* byte cursor: no void* arithmetic */
+
+	if (RADEON_DEBUG & DEBUG_VERTS)
+		fprintf(stderr, "%s count %d stride %d\n",
+			__FUNCTION__, count, stride);
+
+	if (stride == 8)	/* tightly packed: one bulk copy */
+		COPY_DWORDS(out, src, count * 2);
+	else
+		for (i = 0; i < count; i++, src += stride) {
+			out[0] = ((int *)src)[0];
+			out[1] = ((int *)src)[1];
+			out += 2;
+		}
+}
+
+/* Copy `count` 3-dword elements, `stride` bytes apart, into rvb's memory. */
+static void r300EmitVec12(GLcontext * ctx, struct r300_dma_region *rvb,
+			  GLvoid * data, int stride, int count)
+{
+	int i;
+	int *out = (int *)(rvb->address + rvb->start);
+	char *src = (char *)data;	/* byte cursor: no void* arithmetic */
+
+	if (RADEON_DEBUG & DEBUG_VERTS)
+		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
+			__FUNCTION__, count, stride, (void *)out, (void *)data);
+
+	if (stride == 12)	/* tightly packed: one bulk copy */
+		COPY_DWORDS(out, src, count * 3);
+	else
+		for (i = 0; i < count; i++, src += stride) {
+			out[0] = ((int *)src)[0];
+			out[1] = ((int *)src)[1];
+			out[2] = ((int *)src)[2];
+			out += 3;
+		}
+}
+
+/* Copy `count` 4-dword elements, `stride` bytes apart, into rvb's memory. */
+static void r300EmitVec16(GLcontext * ctx, struct r300_dma_region *rvb,
+			  GLvoid * data, int stride, int count)
+{
+	int i;
+	int *out = (int *)(rvb->address + rvb->start);
+	char *src = (char *)data;	/* byte cursor: no void* arithmetic */
+
+	if (RADEON_DEBUG & DEBUG_VERTS)
+		fprintf(stderr, "%s count %d stride %d\n",
+			__FUNCTION__, count, stride);
+
+	if (stride == 16)	/* tightly packed: one bulk copy */
+		COPY_DWORDS(out, src, count * 4);
+	else
+		for (i = 0; i < count; i++, src += stride) {
+			out[0] = ((int *)src)[0];
+			out[1] = ((int *)src)[1];
+			out[2] = ((int *)src)[2];
+			out[3] = ((int *)src)[3];
+			out += 4;
+		}
+}
+
+/**
+ * Allocate a DMA region for one attribute array and copy the data into it.
+ *
+ * \param size   dwords per element (1-4); others hit assert(0)/_mesa_exit(-1)
+ * \param stride bytes between source elements; 0 emits one element only
+ */
+static void r300EmitVec(GLcontext * ctx,
+			struct r300_dma_region *rvb,
+			GLvoid * data, int size, int stride, int count)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);	/* name used by GET_START() */
+
+	if (RADEON_DEBUG & DEBUG_VERTS)
+		fprintf(stderr, "%s count %d size %d stride %d\n",
+			__FUNCTION__, count, size, stride);
+
+	/* Gets triggered when playing with future_hw_tcl_on ... */
+	//assert(!rvb->buf);
+
+	if (stride == 0)
+		count = 1;	/* constant attribute: a single element */
+
+	r300AllocDmaRegion(rmesa, rvb, size * count * 4, 4);	/* alignment? */
+	rvb->aos_offset = GET_START(rvb);
+	rvb->aos_stride = (stride == 0) ? 0 : size;
+
+	/* Emit the data */
+	switch (size) {
+	case 1:
+		r300EmitVec4(ctx, rvb, data, stride, count);
+		break;
+	case 2:
+		r300EmitVec8(ctx, rvb, data, stride, count);
+		break;
+	case 3:
+		r300EmitVec12(ctx, rvb, data, stride, count);
+		break;
+	case 4:
+		r300EmitVec16(ctx, rvb, data, stride, count);
+		break;
+	default:
+		assert(0);
+		_mesa_exit(-1);
+		break;
+	}
+}
+
+static GLuint t_type(struct dt *dt)
+{
+	/* Map the GL component type of the array to its AOS_FORMAT_*
+	 * code.  Unhandled types trip the assert and, in builds where
+	 * asserts are disabled, are treated as float.
+	 */
+	if (dt->type == GL_UNSIGNED_BYTE)
+		return AOS_FORMAT_UBYTE;
+
+	if (dt->type == GL_SHORT)
+		return AOS_FORMAT_USHORT;
+
+	if (dt->type != GL_FLOAT)
+		assert(0);
+	return AOS_FORMAT_FLOAT;
+}
+
+static GLuint t_vir0_size(struct dt *dt)
+{
+	/* Value placed in the low bits of an input route entry by
+	 * t_vir0(): 4 for ubyte arrays, 7 for short arrays, component
+	 * count minus one for floats.  Unhandled types assert, else 0.
+	 */
+	if (dt->type == GL_UNSIGNED_BYTE)
+		return 4;
+	if (dt->type == GL_SHORT)
+		return 7;
+	if (dt->type == GL_FLOAT)
+		return dt->size - 1;
+
+	assert(0);
+	return 0;
+}
+
+static GLuint t_aos_size(struct dt *dt)
+{
+	/* Value stored in aos_size for an array: 1 for ubyte arrays, 2
+	 * for short arrays, the component count for float arrays.
+	 * Unhandled types trip the assert and otherwise yield 0.
+	 */
+	if (dt->type == GL_UNSIGNED_BYTE)
+		return 1;
+	if (dt->type == GL_SHORT)
+		return 2;
+	if (dt->type == GL_FLOAT)
+		return dt->size;
+
+	assert(0);
+	return 0;
+}
+
+static GLuint t_vir0(uint32_t * dst, struct dt *dt, int *inputs,
+		     GLint * tab, GLuint nr)
+{
+	GLuint i, dw;
+	/* Pack two 16-bit entries (size | input << 8 | type << 14) per dword. */
+	for (i = 0; i + 1 < nr; i += 2) {
+		dw = t_vir0_size(&dt[tab[i]]) | (inputs[tab[i]] << 8) |
+		    (t_type(&dt[tab[i]]) << 14);
+		dw |=
+		    (t_vir0_size(&dt[tab[i + 1]]) |
+		     (inputs[tab[i + 1]] << 8) | (t_type(&dt[tab[i + 1]])
+						  << 14)) << 16;
+		/* Bit 13 of an entry is set only on the very last entry. */
+		if (i + 2 == nr) {
+			dw |= (1 << (13 + 16));
+		}
+		dst[i >> 1] = dw;
+	}
+	/* Odd count: the last entry sits alone in the low half of its dword. */
+	if (nr & 1) {
+		dw = t_vir0_size(&dt[tab[nr - 1]]) | (inputs[tab[nr - 1]]
+						      << 8) |
+		    (t_type(&dt[tab[nr - 1]]) << 14);
+		dw |= 1 << 13;
+		/* nr >> 1 is the index of the dword after the full pairs. */
+		dst[nr >> 1] = dw;
+	}
+	/* Number of dwords written to dst. */
+	return (nr + 1) >> 1;
+}
+
+static GLuint t_swizzle(int swizzle[4])
+{
+	GLuint packed = swizzle[0] << R300_INPUT_ROUTE_X_SHIFT;	/* X */
+	packed |= swizzle[1] << R300_INPUT_ROUTE_Y_SHIFT;	/* Y */
+	packed |= swizzle[2] << R300_INPUT_ROUTE_Z_SHIFT;	/* Z */
+	return packed | (swizzle[3] << R300_INPUT_ROUTE_W_SHIFT);	/* W */
+}
+
+static GLuint t_vir1(uint32_t * dst, int swizzle[][4], GLuint nr)
+{
+	const GLuint ndwords = (nr + 1) >> 1;
+	GLuint pair;
+
+	/* Two enabled swizzle entries per dword: even index in the low
+	 * half, odd index in the high half.
+	 */
+	for (pair = 0; pair < (nr >> 1); pair++) {
+		GLuint lo = t_swizzle(swizzle[2 * pair]) | R300_INPUT_ROUTE_ENABLE;
+		GLuint hi = t_swizzle(swizzle[2 * pair + 1]) | R300_INPUT_ROUTE_ENABLE;
+		dst[pair] = lo | (hi << 16);
+	}
+	if (nr & 1)	/* odd count: last entry alone in the low half */
+		dst[nr >> 1] = t_swizzle(swizzle[nr - 1]) | R300_INPUT_ROUTE_ENABLE;
+	return ndwords;
+}
+
+static GLuint t_emit_size(struct dt *dt)
+{
+	return dt->size;	/* the array's size field, passed through unchanged */
+}
+
+static GLuint t_vic(GLcontext * ctx, GLuint InputsRead)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	GLuint cntl = 0;
+	GLuint unit;
+
+	/* Fixed attributes: one control bit per attribute that is read. */
+	cntl |= (InputsRead & (1 << VERT_ATTRIB_POS)) ? R300_INPUT_CNTL_POS : 0;
+	cntl |= (InputsRead & (1 << VERT_ATTRIB_NORMAL)) ? R300_INPUT_CNTL_NORMAL : 0;
+	cntl |= (InputsRead & (1 << VERT_ATTRIB_COLOR0)) ? R300_INPUT_CNTL_COLOR : 0;
+
+	/* Texture coordinates: set the per-unit bit and recount how many
+	 * texture coordinate sets are in use.
+	 */
+	r300->state.texture.tc_count = 0;
+	for (unit = 0; unit < ctx->Const.MaxTextureUnits; unit++) {
+		if (!(InputsRead & (1 << (VERT_ATTRIB_TEX0 + unit))))
+			continue;
+		r300->state.texture.tc_count++;
+		cntl |= R300_INPUT_CNTL_TC0 << unit;
+	}
+	return cntl;
+}
+
+/* Emit vertex data to GART memory and route inputs to the vertex
+ * processor.  Returns R300_FALLBACK_NONE on success or
+ * R300_FALLBACK_TCL when an array cannot be handled.
+ * This function should never return R300_FALLBACK_TCL when using software tcl.
+ */
+int r300EmitArrays(GLcontext * ctx)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	r300ContextPtr r300 = rmesa;
+	struct radeon_vertex_buffer *VB = &rmesa->state.VB;
+	GLuint nr;
+	GLuint count = VB->Count;
+	GLuint i;
+	GLuint InputsRead = 0, OutputsWritten = 0;
+	int *inputs = NULL;
+	int vir_inputs[VERT_ATTRIB_MAX];
+	GLint tab[VERT_ATTRIB_MAX];
+	int swizzle[VERT_ATTRIB_MAX][4];
+
+	if (hw_tcl_on) {
+		struct r300_vertex_program *prog =
+		    (struct r300_vertex_program *)
+		    CURRENT_VERTEX_SHADER(ctx);
+		inputs = prog->inputs;
+		InputsRead = CURRENT_VERTEX_SHADER(ctx)->key.InputsRead;
+		OutputsWritten = CURRENT_VERTEX_SHADER(ctx)->key.OutputsWritten;
+	} else {
+		DECLARE_RENDERINPUTS(inputs_bitset);
+		inputs = r300->state.sw_tcl_inputs;
+
+		RENDERINPUTS_COPY(inputs_bitset,
+				  TNL_CONTEXT(ctx)->render_inputs_bitset);
+
+		assert(RENDERINPUTS_TEST(inputs_bitset, _TNL_ATTRIB_POS));
+		InputsRead |= 1 << VERT_ATTRIB_POS;
+		OutputsWritten |= 1 << VERT_RESULT_HPOS;
+
+		assert(RENDERINPUTS_TEST(inputs_bitset, _TNL_ATTRIB_NORMAL)
+		       == 0);
+
+		assert(RENDERINPUTS_TEST(inputs_bitset, _TNL_ATTRIB_COLOR0));
+		InputsRead |= 1 << VERT_ATTRIB_COLOR0;
+		OutputsWritten |= 1 << VERT_RESULT_COL0;
+
+		if (RENDERINPUTS_TEST(inputs_bitset, _TNL_ATTRIB_COLOR1)) {
+			InputsRead |= 1 << VERT_ATTRIB_COLOR1;
+			OutputsWritten |= 1 << VERT_RESULT_COL1;
+		}
+
+		for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
+			if (RENDERINPUTS_TEST
+			    (inputs_bitset, _TNL_ATTRIB_TEX(i))) {
+				InputsRead |= 1 << (VERT_ATTRIB_TEX0 + i);
+				OutputsWritten |= 1 << (VERT_RESULT_TEX0 + i);
+			}
+
+		for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++)
+			if (InputsRead & (1 << i))
+				inputs[i] = nr++;
+			else
+				inputs[i] = -1;
+
+		if (!
+		    (r300->radeon.radeonScreen->
+		     chip_flags & RADEON_CHIPSET_TCL)) {
+			/* Fixed, apply to vir0 only */
+			memcpy(vir_inputs, inputs,
+			       VERT_ATTRIB_MAX * sizeof(int));
+			inputs = vir_inputs;
+
+			if (InputsRead & (1 << VERT_ATTRIB_POS))
+				inputs[VERT_ATTRIB_POS] = 0;
+
+			if (InputsRead & (1 << VERT_ATTRIB_COLOR0))
+				inputs[VERT_ATTRIB_COLOR0] = 2;
+
+			if (InputsRead & (1 << VERT_ATTRIB_COLOR1))
+				inputs[VERT_ATTRIB_COLOR1] = 3;
+
+			for (i = VERT_ATTRIB_TEX0; i <= VERT_ATTRIB_TEX7; i++)
+				if (InputsRead & (1 << i))
+					inputs[i] = 6 + (i - VERT_ATTRIB_TEX0);
+		}
+
+		RENDERINPUTS_COPY(rmesa->state.render_inputs_bitset,
+				  inputs_bitset);
+	}
+	assert(InputsRead);
+	assert(OutputsWritten);
+
+	for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++)
+		if (InputsRead & (1 << i))
+			tab[nr++] = i;
+
+	if (nr > R300_MAX_AOS_ARRAYS)
+		return R300_FALLBACK_TCL;
+
+	for (i = 0; i < nr; i++) {
+		int ci;
+		int comp_size, fix, found = 0;
+
+		swizzle[i][0] = SWIZZLE_ZERO;
+		swizzle[i][1] = SWIZZLE_ZERO;
+		swizzle[i][2] = SWIZZLE_ZERO;
+		swizzle[i][3] = SWIZZLE_ONE;
+
+		for (ci = 0; ci < VB->AttribPtr[tab[i]].size; ci++)
+			swizzle[i][ci] = ci;
+
+#if MESA_BIG_ENDIAN
+#define SWAP_INT(a, b) do { \
+	int __temp; \
+	__temp = a;\
+	a = b; \
+	b = __temp; \
+} while (0)
+
+		if (VB->AttribPtr[tab[i]].type == GL_UNSIGNED_BYTE) {
+			SWAP_INT(swizzle[i][0], swizzle[i][3]);
+			SWAP_INT(swizzle[i][1], swizzle[i][2]);
+		}
+#endif				/* MESA_BIG_ENDIAN */
+
+		if (r300IsGartMemory(rmesa, VB->AttribPtr[tab[i]].data,
+				     /*(count-1)*stride */ 4)) {
+			if (VB->AttribPtr[tab[i]].stride % 4)
+				return R300_FALLBACK_TCL;
+
+			rmesa->state.aos[i].address =
+			    VB->AttribPtr[tab[i]].data;
+			rmesa->state.aos[i].start = 0;
+			rmesa->state.aos[i].aos_offset =
+			    r300GartOffsetFromVirtual(rmesa,
+						      VB->
+						      AttribPtr[tab[i]].data);
+			rmesa->state.aos[i].aos_stride =
+			    VB->AttribPtr[tab[i]].stride / 4;
+
+			rmesa->state.aos[i].aos_size =
+			    t_emit_size(&VB->AttribPtr[tab[i]]);
+		} else {
+			/* TODO: r300EmitVec can only handle 4 byte vectors */
+			if (VB->AttribPtr[tab[i]].type != GL_FLOAT)
+				return R300_FALLBACK_TCL;
+
+			r300EmitVec(ctx, &rmesa->state.aos[i],
+				    VB->AttribPtr[tab[i]].data,
+				    t_emit_size(&VB->AttribPtr[tab[i]]),
+				    VB->AttribPtr[tab[i]].stride, count);
+		}
+
+		rmesa->state.aos[i].aos_size =
+		    t_aos_size(&VB->AttribPtr[tab[i]]);
+
+		comp_size = _mesa_sizeof_type(VB->AttribPtr[tab[i]].type);
+
+		for (fix = 0; fix <= 4 - VB->AttribPtr[tab[i]].size; fix++) {
+			if ((rmesa->state.aos[i].aos_offset -
+			     comp_size * fix) % 4)
+				continue;
+
+			found = 1;
+			break;
+		}
+
+		if (found) {
+			if (fix > 0) {
+				WARN_ONCE("Feeling lucky?\n");
+			}
+
+			rmesa->state.aos[i].aos_offset -= comp_size * fix;
+
+			for (ci = 0; ci < VB->AttribPtr[tab[i]].size; ci++)
+				swizzle[i][ci] += fix;
+		} else {
+			WARN_ONCE
+			    ("Cannot handle offset %x with stride %d, comp %d\n",
+			     rmesa->state.aos[i].aos_offset,
+			     rmesa->state.aos[i].aos_stride,
+			     VB->AttribPtr[tab[i]].size);
+			return R300_FALLBACK_TCL;
+		}
+	}
+
+	/* setup INPUT_ROUTE */
+	R300_STATECHANGE(r300, vir[0]);
+	((drm_r300_cmd_header_t *) r300->hw.vir[0].cmd)->packet0.count =
+	    t_vir0(&r300->hw.vir[0].cmd[R300_VIR_CNTL_0], VB->AttribPtr,
+		   inputs, tab, nr);
+
+	R300_STATECHANGE(r300, vir[1]);
+	((drm_r300_cmd_header_t *) r300->hw.vir[1].cmd)->packet0.count =
+	    t_vir1(&r300->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle, nr);
+
+	/* Set up input_cntl */
+	/* I don't think this is needed for vertex buffers, but it doesn't hurt anything */
+	R300_STATECHANGE(r300, vic);
+	r300->hw.vic.cmd[R300_VIC_CNTL_0] = 0x5555;	/* Hard coded value, no idea what it means */
+	r300->hw.vic.cmd[R300_VIC_CNTL_1] = t_vic(ctx, InputsRead);
+
+	/* Stage 3: VAP output */
+
+	R300_STATECHANGE(r300, vof);
+
+	r300->hw.vof.cmd[R300_VOF_CNTL_0] = 0;
+	r300->hw.vof.cmd[R300_VOF_CNTL_1] = 0;
+
+	if (OutputsWritten & (1 << VERT_RESULT_HPOS))
+		r300->hw.vof.cmd[R300_VOF_CNTL_0] |=
+		    R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT;
+
+	if (OutputsWritten & (1 << VERT_RESULT_COL0))
+		r300->hw.vof.cmd[R300_VOF_CNTL_0] |=
+		    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_PRESENT;
+
+	if (OutputsWritten & (1 << VERT_RESULT_COL1))
+		r300->hw.vof.cmd[R300_VOF_CNTL_0] |=
+		    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT;
+
+	/*if(OutputsWritten & (1 << VERT_RESULT_BFC0))
+	   r300->hw.vof.cmd[R300_VOF_CNTL_0] |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_2_PRESENT;
+
+	   if(OutputsWritten & (1 << VERT_RESULT_BFC1))
+	   r300->hw.vof.cmd[R300_VOF_CNTL_0] |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_3_PRESENT; */
+	//if(OutputsWritten & (1 << VERT_RESULT_FOGC))
+
+	if (OutputsWritten & (1 << VERT_RESULT_PSIZ))
+		r300->hw.vof.cmd[R300_VOF_CNTL_0] |=
+		    R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT;
+
+	for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
+		if (OutputsWritten & (1 << (VERT_RESULT_TEX0 + i)))
+			r300->hw.vof.cmd[R300_VOF_CNTL_1] |= (4 << (3 * i));
+
+	rmesa->state.aos_count = nr;
+
+	return R300_FALLBACK_NONE;
+}
+
+#ifdef USER_BUFFERS
+void r300UseArrays(GLcontext * ctx)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	int n;
+
+	/* Flag the element buffer, when present, via r300_mem_use(). */
+	if (rmesa->state.elt_dma.buf != NULL)
+		r300_mem_use(rmesa, rmesa->state.elt_dma.buf->id);
+	/* Same for each of the aos_count arrays that has a buffer. */
+	for (n = 0; n < rmesa->state.aos_count; n++)
+		if (rmesa->state.aos[n].buf != NULL)
+			r300_mem_use(rmesa, rmesa->state.aos[n].buf->id);
+}
+#endif
+
+void r300ReleaseArrays(GLcontext * ctx)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	int n = 0;
+	/* Release the element buffer region, then every array region. */
+	r300ReleaseDmaRegion(rmesa, &rmesa->state.elt_dma, __FUNCTION__);
+	for (; n < rmesa->state.aos_count; n++) {
+		r300ReleaseDmaRegion(rmesa, &rmesa->state.aos[n], __FUNCTION__);
+	}
+}
diff --git a/r300/r300_emit.h b/r300/r300_emit.h
new file mode 100644
index 0000000..7be098f
--- /dev/null
+++ b/r300/r300_emit.h
@@ -0,0 +1,238 @@
+/*
+ * Copyright (C) 2005 Vladimir Dergachev.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/*
+ * Authors:
+ *   Vladimir Dergachev <volodya@mindspring.com>
+ *   Nicolai Haehnle <prefect_@gmx.net>
+ *   Aapo Tahkola <aet@rasterburn.org>
+ *   Ben Skeggs <darktama@iinet.net.au>
+ *   Jerome Glisse <j.glisse@gmail.com>
+ */
+
+/* This file defines functions for accessing R300 hardware.
+ */
+#ifndef __R300_EMIT_H__
+#define __R300_EMIT_H__
+
+#include "glheader.h"
+#include "r300_context.h"
+#include "r300_cmdbuf.h"
+#include "radeon_reg.h"
+
+/*
+ * CP type-3 packets
+ */
+#define RADEON_CP_PACKET3_UNK1B                     0xC0001B00
+#define RADEON_CP_PACKET3_INDX_BUFFER               0xC0003300
+#define RADEON_CP_PACKET3_3D_DRAW_VBUF_2            0xC0003400
+#define RADEON_CP_PACKET3_3D_DRAW_IMMD_2            0xC0003500
+#define RADEON_CP_PACKET3_3D_DRAW_INDX_2            0xC0003600
+#define RADEON_CP_PACKET3_3D_LOAD_VBPNTR            0xC0002F00
+#define RADEON_CP_PACKET3_3D_CLEAR_ZMASK            0xC0003202
+#define RADEON_CP_PACKET3_3D_CLEAR_CMASK            0xC0003802
+#define RADEON_CP_PACKET3_3D_CLEAR_HIZ              0xC0003702
+
+#define CP_PACKET0(reg, n)	(RADEON_CP_PACKET0 | ((n)<<16) | ((reg)>>2))
+
+static __inline__ uint32_t cmdpacket0(int reg, int count)
+{
+	/* PACKET0 header: count field plus register address split in two bytes. */
+	const unsigned int addr = (unsigned int)reg;
+	drm_r300_cmd_header_t hdr;
+	hdr.packet0.cmd_type = R300_CMD_PACKET0;
+	hdr.packet0.count = count;
+	hdr.packet0.reghi = (addr >> 8) & 0xFF;
+	hdr.packet0.reglo = addr & 0xFF;
+	return hdr.u;
+}
+
+static __inline__ uint32_t cmdvpu(int addr, int count)
+{
+	/* VPU header: count field plus upload address split in two bytes. */
+	const unsigned int where = (unsigned int)addr;
+	drm_r300_cmd_header_t hdr;
+	hdr.vpu.cmd_type = R300_CMD_VPU;
+	hdr.vpu.count = count;
+	hdr.vpu.adrhi = (where >> 8) & 0xFF;
+	hdr.vpu.adrlo = where & 0xFF;
+	return hdr.u;
+}
+
+static __inline__ uint32_t cmdpacket3(int packet)
+{
+	drm_r300_cmd_header_t cmd;
+
+	cmd.u = 0;	/* define any byte the fields below don't cover */
+	cmd.packet3.cmd_type = R300_CMD_PACKET3;
+	cmd.packet3.packet = packet;
+	return cmd.u;	/* PACKET3 header word carrying `packet` */
+}
+
+static __inline__ uint32_t cmdcpdelay(unsigned short count)
+{
+	drm_r300_cmd_header_t cmd;
+
+	cmd.u = 0;	/* define any byte the fields below don't cover */
+	cmd.delay.cmd_type = R300_CMD_CP_DELAY;
+	cmd.delay.count = count;
+	return cmd.u;	/* CP_DELAY header word carrying `count` */
+}
+
+static __inline__ uint32_t cmdwait(unsigned char flags)
+{
+	drm_r300_cmd_header_t cmd;
+
+	cmd.u = 0;	/* define any byte the fields below don't cover */
+	cmd.wait.cmd_type = R300_CMD_WAIT;
+	cmd.wait.flags = flags;
+	return cmd.u;	/* WAIT header word carrying `flags` */
+}
+
+static __inline__ uint32_t cmdpacify(void)
+{
+	drm_r300_cmd_header_t cmd;
+
+	cmd.u = 0;	/* define every byte not covered by cmd_type */
+	cmd.header.cmd_type = R300_CMD_END3D;
+	return cmd.u;	/* END3D header word */
+}
+
+/**
+ * Reserve a PACKET0 for register reg followed by num_extra more registers
+ * (+4, +8, ...); the values are meant to be emitted afterwards with e32().
+ * Uses the caller's rmesa, cmd, cmd_reserved and cmd_written.
+ * NOTE(review): the expansion already ends in ';' - unsafe before an `else`. */
+#define reg_start(reg, num_extra)					\
+	do {								\
+		int _n;							\
+		_n=(num_extra);						\
+		cmd = (drm_radeon_cmd_header_t*)			\
+			r300AllocCmdBuf(rmesa,				\
+					(_n+2),				\
+					__FUNCTION__);			\
+		cmd_reserved=_n+2;					\
+		cmd_written=1;						\
+		cmd[0].i=cmdpacket0((reg), _n+1);			\
+	} while (0);
+
+/**
+ * Store one dword at cmd[cmd_written]; exits if cmd_reserved is exhausted.
+ */
+#define e32(dword)							\
+	do {								\
+		if(cmd_written<cmd_reserved) {				\
+			cmd[cmd_written].i=(dword);			\
+			cmd_written++;					\
+		} else {						\
+			fprintf(stderr,					\
+				"e32 but no previous packet "		\
+				"declaration.\n"			\
+				"Aborting! in %s::%s at line %d, "	\
+				"cmd_written=%d cmd_reserved=%d\n",	\
+				__FILE__, __FUNCTION__, __LINE__,	\
+				cmd_written, cmd_reserved);		\
+			_mesa_exit(-1);					\
+		}							\
+	} while(0)
+
+#define	efloat(f) e32(r300PackFloat32(f))
+/* Reserve a VPU upload of `length` dwords at `dest`: 1 header + length data. */
+#define vsf_start_fragment(dest, length)				\
+	do {								\
+		int _n;							\
+		_n = (length);						\
+		cmd = (drm_radeon_cmd_header_t*)			\
+			r300AllocCmdBuf(rmesa,				\
+					(_n+1),				\
+					__FUNCTION__);			\
+		cmd_reserved = _n+1; /* == dwords allocated above */	\
+		cmd_written =1;						\
+		cmd[0].i = cmdvpu((dest), _n/4);			\
+	} while (0);
+/* Open a raw PACKET3; the 14-bit count is validated before any allocation. */
+#define start_packet3(packet, count)					\
+	{								\
+		int _n;							\
+		GLuint _p;						\
+		_n = (count);						\
+		_p = (packet);						\
+		if(_n > 0x3fff) {					\
+			fprintf(stderr,"Too big packet3 %08x: cannot "	\
+				"store %d dwords\n",			\
+				_p, _n);				\
+			_mesa_exit(-1);					\
+		}							\
+		cmd = (drm_radeon_cmd_header_t*)			\
+			r300AllocCmdBuf(rmesa,				\
+					(_n+3),				\
+					__FUNCTION__);			\
+		cmd_reserved = _n+3;					\
+		cmd_written = 2;					\
+		cmd[0].i = cmdpacket3(R300_CMD_PACKET3_RAW);		\
+		cmd[1].i = _p | ((_n & 0x3fff)<<16);			\
+	}
+
+/**
+ * Must be sent to switch to 2d commands
+ */
+static inline void end_3d(r300ContextPtr rmesa)
+{
+	drm_radeon_cmd_header_t *cmd;
+	/* Reserve a single dword and tag it as an END3D command. */
+	cmd =
+	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
+	cmd[0].header.cmd_type = R300_CMD_END3D;
+}
+
+static inline void cp_delay(r300ContextPtr rmesa, unsigned short count)
+{
+	drm_radeon_cmd_header_t *cmd;
+	/* Reserve a single dword and fill it with a CP_DELAY header. */
+	cmd =
+	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
+	cmd[0].i = cmdcpdelay(count);
+}
+
+static inline void cp_wait(r300ContextPtr rmesa, unsigned char flags)
+{
+	drm_radeon_cmd_header_t *cmd;
+	/* Reserve a single dword and fill it with a WAIT header. */
+	cmd =
+	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
+	cmd[0].i = cmdwait(flags);
+}
+
+extern int r300EmitArrays(GLcontext * ctx);
+
+#ifdef USER_BUFFERS
+void r300UseArrays(GLcontext * ctx);
+#endif
+
+extern void r300ReleaseArrays(GLcontext * ctx);
+
+#endif
diff --git a/r300/r300_fragprog.c b/r300/r300_fragprog.c
new file mode 100644
index 0000000..cce8e68
--- /dev/null
+++ b/r300/r300_fragprog.c
@@ -0,0 +1,2472 @@
+/*
+ * Copyright (C) 2005 Ben Skeggs.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * \file
+ *
+ * \author Ben Skeggs <darktama@iinet.net.au>
+ *
+ * \author Jerome Glisse <j.glisse@gmail.com>
+ *
+ * \todo Depth write, WPOS/FOGC inputs
+ *
+ * \todo FogOption
+ *
+ * \todo Verify results of opcodes for accuracy, I've only checked them in
+ * specific cases.
+ */
+
+#include "glheader.h"
+#include "macros.h"
+#include "enums.h"
+#include "shader/prog_instruction.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
+
+#include "r300_context.h"
+#include "r300_fragprog.h"
+#include "r300_reg.h"
+#include "r300_state.h"
+
+/*
+ * Useful macros and values
+ */
+#define ERROR(fmt, args...) do {			\
+		fprintf(stderr, "%s::%s(): " fmt "\n",	\
+			__FILE__, __FUNCTION__, ##args);	\
+		fp->error = GL_TRUE;			\
+	} while(0)
+
+#define PFS_INVAL 0xFFFFFFFF
+#define COMPILE_STATE struct r300_pfs_compile_state *cs = fp->cs
+
+#define SWIZZLE_XYZ		0
+#define SWIZZLE_XXX		1
+#define SWIZZLE_YYY		2
+#define SWIZZLE_ZZZ		3
+#define SWIZZLE_WWW		4
+#define SWIZZLE_YZX		5
+#define SWIZZLE_ZXY		6
+#define SWIZZLE_WZY		7
+#define SWIZZLE_111		8
+#define SWIZZLE_000		9
+#define SWIZZLE_HHH		10
+
+#define swizzle(r, x, y, z, w) do_swizzle(fp, r,		\
+					  ((SWIZZLE_##x<<0)|	\
+					   (SWIZZLE_##y<<3)|	\
+					   (SWIZZLE_##z<<6)|	\
+					   (SWIZZLE_##w<<9)),	\
+					  0)
+
+#define REG_TYPE_INPUT		0
+#define REG_TYPE_OUTPUT		1
+#define REG_TYPE_TEMP		2
+#define REG_TYPE_CONST		3
+
+#define REG_TYPE_SHIFT		0
+#define REG_INDEX_SHIFT		2
+#define REG_VSWZ_SHIFT		8
+#define REG_SSWZ_SHIFT		13
+#define REG_NEGV_SHIFT		18
+#define REG_NEGS_SHIFT		19
+#define REG_ABS_SHIFT		20
+#define REG_NO_USE_SHIFT	21	// Hack for refcounting
+#define REG_VALID_SHIFT		22	// Does the register contain a defined value?
+#define REG_BUILTIN_SHIFT   23	// Is it a builtin (like all zero/all one)?
+
+#define REG_TYPE_MASK		(0x03 << REG_TYPE_SHIFT)
+#define REG_INDEX_MASK		(0x3F << REG_INDEX_SHIFT)
+#define REG_VSWZ_MASK		(0x1F << REG_VSWZ_SHIFT)
+#define REG_SSWZ_MASK		(0x1F << REG_SSWZ_SHIFT)
+#define REG_NEGV_MASK		(0x01 << REG_NEGV_SHIFT)
+#define REG_NEGS_MASK		(0x01 << REG_NEGS_SHIFT)
+#define REG_ABS_MASK		(0x01 << REG_ABS_SHIFT)
+#define REG_NO_USE_MASK		(0x01 << REG_NO_USE_SHIFT)
+#define REG_VALID_MASK		(0x01 << REG_VALID_SHIFT)
+#define REG_BUILTIN_MASK	(0x01 << REG_BUILTIN_SHIFT)
+/* Pack/unpack helpers for the REG_* bitfields; every argument is parenthesized. */
+#define REG(type, index, vswz, sswz, nouse, valid, builtin)	\
+	((((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK) |			\
+	 (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK) |		\
+	 (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |		\
+	 (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK) |		\
+	 (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |	\
+	 (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |			\
+	 (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
+#define REG_GET_TYPE(reg)						\
+	(((reg) & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
+#define REG_GET_INDEX(reg)						\
+	(((reg) & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
+#define REG_GET_VSWZ(reg)						\
+	(((reg) & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
+#define REG_GET_SSWZ(reg)						\
+	(((reg) & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
+#define REG_GET_NO_USE(reg)						\
+	(((reg) & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
+#define REG_GET_VALID(reg)						\
+	(((reg) & REG_VALID_MASK) >> REG_VALID_SHIFT)
+#define REG_GET_BUILTIN(reg)						\
+	(((reg) & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)
+#define REG_SET_TYPE(reg, type)						\
+	reg = (((reg) & ~REG_TYPE_MASK) |				\
+	       (((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK))
+#define REG_SET_INDEX(reg, index)					\
+	reg = (((reg) & ~REG_INDEX_MASK) |				\
+	       (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK))
+#define REG_SET_VSWZ(reg, vswz)						\
+	reg = (((reg) & ~REG_VSWZ_MASK) |				\
+	       (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK))
+#define REG_SET_SSWZ(reg, sswz)						\
+	reg = (((reg) & ~REG_SSWZ_MASK) |				\
+	       (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
+#define REG_SET_NO_USE(reg, nouse)					\
+	reg = (((reg) & ~REG_NO_USE_MASK) |				\
+	       (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK))
+#define REG_SET_VALID(reg, valid)					\
+	reg = (((reg) & ~REG_VALID_MASK) |				\
+	       (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK))
+#define REG_SET_BUILTIN(reg, builtin)					\
+	reg = (((reg) & ~REG_BUILTIN_MASK) |				\
+	       (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK))
+#define REG_ABS(reg)							\
+	reg = ((reg) | REG_ABS_MASK)
+#define REG_NEGV(reg)							\
+	reg = ((reg) | REG_NEGV_MASK)
+#define REG_NEGS(reg)							\
+	reg = ((reg) | REG_NEGS_MASK)
+
+/*
+ * Data structures for fragment program generation
+ */
+
+/* description of r300 native hw instructions */
+static const struct {
+	const char *name;
+	int argc;
+	int v_op;
+	int s_op;
+} r300_fpop[] = {
+	/* *INDENT-OFF* */
+	{"MAD", 3, R300_FPI0_OUTC_MAD, R300_FPI2_OUTA_MAD},
+	{"DP3", 2, R300_FPI0_OUTC_DP3, R300_FPI2_OUTA_DP4},
+	{"DP4", 2, R300_FPI0_OUTC_DP4, R300_FPI2_OUTA_DP4},
+	{"MIN", 2, R300_FPI0_OUTC_MIN, R300_FPI2_OUTA_MIN},
+	{"MAX", 2, R300_FPI0_OUTC_MAX, R300_FPI2_OUTA_MAX},
+	{"CMP", 3, R300_FPI0_OUTC_CMP, R300_FPI2_OUTA_CMP},
+	{"FRC", 1, R300_FPI0_OUTC_FRC, R300_FPI2_OUTA_FRC},
+	{"EX2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_EX2},
+	{"LG2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_LG2},
+	{"RCP", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RCP},
+	{"RSQ", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RSQ},
+	{"REPL_ALPHA", 1, R300_FPI0_OUTC_REPL_ALPHA, PFS_INVAL},
+	{"CMPH", 3, R300_FPI0_OUTC_CMPH, PFS_INVAL},
+	/* *INDENT-ON* */
+};
+
+/* vector swizzles r300 can support natively, with a couple of
+ * cases we handle specially
+ *
+ * REG_VSWZ/REG_SSWZ is an index into this table
+ */
+
+/* mapping from SWIZZLE_* to r300 native values for scalar insns */
+#define SWIZZLE_HALF 6
+
+#define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
+					  SWIZZLE_##y, \
+					  SWIZZLE_##z, \
+					  SWIZZLE_ZERO))
+/* native swizzles */
+static const struct r300_pfs_swizzle {
+	GLuint hash;		/* swizzle value this matches */
+	GLuint base;		/* base value for hw swizzle */
+	GLuint stride;		/* difference in base between arg0/1/2 */
+	GLuint flags;
+} v_swiz[] = {
+	/* *INDENT-OFF* */
+	{MAKE_SWZ3(X, Y, Z), R300_FPI0_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR},
+	{MAKE_SWZ3(X, X, X), R300_FPI0_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR},
+	{MAKE_SWZ3(Y, Y, Y), R300_FPI0_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR},
+	{MAKE_SWZ3(Z, Z, Z), R300_FPI0_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR},
+	{MAKE_SWZ3(W, W, W), R300_FPI0_ARGC_SRC0A, 1, SLOT_SRC_SCALAR},
+	{MAKE_SWZ3(Y, Z, X), R300_FPI0_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR},
+	{MAKE_SWZ3(Z, X, Y), R300_FPI0_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR},
+	{MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH},
+	{MAKE_SWZ3(ONE, ONE, ONE), R300_FPI0_ARGC_ONE, 0, 0},
+	{MAKE_SWZ3(ZERO, ZERO, ZERO), R300_FPI0_ARGC_ZERO, 0, 0},
+	{MAKE_SWZ3(HALF, HALF, HALF), R300_FPI0_ARGC_HALF, 0, 0},
+	{PFS_INVAL, 0, 0, 0},
+	/* *INDENT-ON* */
+};
+
+/* used during matching of non-native swizzles */
+#define SWZ_X_MASK (7 << 0)
+#define SWZ_Y_MASK (7 << 3)
+#define SWZ_Z_MASK (7 << 6)
+#define SWZ_W_MASK (7 << 9)
+static const struct {
+	GLuint hash;		/* used to mask matching swizzle components */
+	int mask;		/* actual outmask */
+	int count;		/* count of components matched */
+} s_mask[] = {
+	/* *INDENT-OFF* */
+	{SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK, 1 | 2 | 4, 3},
+	{SWZ_X_MASK | SWZ_Y_MASK, 1 | 2, 2},
+	{SWZ_X_MASK | SWZ_Z_MASK, 1 | 4, 2},
+	{SWZ_Y_MASK | SWZ_Z_MASK, 2 | 4, 2},
+	{SWZ_X_MASK, 1, 1},
+	{SWZ_Y_MASK, 2, 1},
+	{SWZ_Z_MASK, 4, 1},
+	{PFS_INVAL, PFS_INVAL, PFS_INVAL}
+	/* *INDENT-ON* */
+};
+
+static const struct {
+	int base;		/* hw value of swizzle */
+	int stride;		/* difference between SRC0/1/2 */
+	GLuint flags;
+} s_swiz[] = {
+	/* *INDENT-OFF* */
+	{R300_FPI2_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR},
+	{R300_FPI2_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR},
+	{R300_FPI2_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR},
+	{R300_FPI2_ARGA_SRC0A, 1, SLOT_SRC_SCALAR},
+	{R300_FPI2_ARGA_ZERO, 0, 0},
+	{R300_FPI2_ARGA_ONE, 0, 0},
+	{R300_FPI2_ARGA_HALF, 0, 0}
+	/* *INDENT-ON* */
+};
+
+/* boiler-plate reg, for convenience */
+static const GLuint undef = REG(REG_TYPE_TEMP,
+				0,
+				SWIZZLE_XYZ,
+				SWIZZLE_W,
+				GL_FALSE,
+				GL_FALSE,
+				GL_FALSE);
+
+/* constant one source */
+static const GLuint pfs_one = REG(REG_TYPE_CONST,
+				  0,
+				  SWIZZLE_111,
+				  SWIZZLE_ONE,
+				  GL_FALSE,
+				  GL_TRUE,
+				  GL_TRUE);
+
+/* constant half source */
+static const GLuint pfs_half = REG(REG_TYPE_CONST,
+				   0,
+				   SWIZZLE_HHH,
+				   SWIZZLE_HALF,
+				   GL_FALSE,
+				   GL_TRUE,
+				   GL_TRUE);
+
+/* constant zero source */
+static const GLuint pfs_zero = REG(REG_TYPE_CONST,
+				   0,
+				   SWIZZLE_000,
+				   SWIZZLE_ZERO,
+				   GL_FALSE,
+				   GL_TRUE,
+				   GL_TRUE);
+
+/*
+ * Common functions prototypes
+ */
+static void dump_program(struct r300_fragment_program *fp);
+static void emit_arith(struct r300_fragment_program *fp, int op,
+		       GLuint dest, int mask,
+		       GLuint src0, GLuint src1, GLuint src2, int flags);
+
+/**
+ * Get an R300 temporary that can be written to in the given slot.
+ */
+static int get_hw_temp(struct r300_fragment_program *fp, int slot)
+{
+	COMPILE_STATE;
+	int hw = 0;
+
+	/* First temp whose 'free' value is non-negative and not after 'slot'. */
+	while (hw < PFS_NUM_TEMP_REGS &&
+	       (cs->hwtemps[hw].free < 0 || cs->hwtemps[hw].free > slot))
+		hw++;
+
+	if (hw >= PFS_NUM_TEMP_REGS) {
+		ERROR("Out of hardware temps\n");
+		return 0;
+	}
+	/* Reserved is used to avoid the following scenario:
+	 *  R300 temporary X is first assigned to Mesa temporary Y during vector ops
+	 *  R300 temporary X is then assigned to Mesa temporary Z for further vector ops
+	 *  Then scalar ops on Mesa temporary Z are emitted and move back in time
+	 *  to overwrite the value of temporary Y.
+	 * End scenario. */
+	cs->hwtemps[hw].reserved = cs->hwtemps[hw].free;
+	cs->hwtemps[hw].free = -1;
+
+	/* Reset to some value that won't mess things up when the user
+	 * tries to read from a temporary that hasn't been assigned a value yet.
+	 * In the normal case, vector_valid and scalar_valid should be set to
+	 * a sane value by the first emit that writes to this temporary. */
+	cs->hwtemps[hw].vector_valid = 0;
+	cs->hwtemps[hw].scalar_valid = 0;
+
+	if (fp->max_temp_idx < hw)
+		fp->max_temp_idx = hw;
+
+	return hw;
+}
+
+/**
+ * Get an R300 temporary that will act as a TEX destination register.
+ */
+static int get_hw_temp_tex(struct r300_fragment_program *fp)
+{
+	COMPILE_STATE;
+	int hw;
+
+	for (hw = 0; hw < PFS_NUM_TEMP_REGS; ++hw) {
+		/* Skip temps whose bit is set in used_in_node. */
+		if ((cs->used_in_node & (1 << hw)) != 0)
+			continue;
+		/* Note: Be very careful here */
+		if (cs->hwtemps[hw].free >= 0 && cs->hwtemps[hw].free <= 0)
+			break;
+	}
+
+	if (hw >= PFS_NUM_TEMP_REGS)
+		return get_hw_temp(fp, 0);	/* Will cause an indirection */
+
+	cs->hwtemps[hw].reserved = cs->hwtemps[hw].free;
+	cs->hwtemps[hw].free = -1;
+
+	/* Reset to some value that won't mess things up when the user
+	 * tries to read from a temporary that hasn't been assigned a value yet.
+	 * In the normal case, vector_valid and scalar_valid should be set to
+	 * a sane value by the first emit that writes to this temporary. */
+	cs->hwtemps[hw].vector_valid = cs->nrslots;
+	cs->hwtemps[hw].scalar_valid = cs->nrslots;
+
+	if (fp->max_temp_idx < hw)
+		fp->max_temp_idx = hw;
+
+	return hw;
+}
+
+/**
+ * Mark the given hardware register as free.
+ */
+static void free_hw_temp(struct r300_fragment_program *fp, int idx)
+{
+	COMPILE_STATE;
+	// get_hw_temp() only hands a temp out for slots >= its 'free' value.
+	// Be very careful here. Consider sequences like
+	//  MAD r0, r1,r2,r3
+	//  TEX r4, ...
+	// The TEX instruction may be moved in front of the MAD instruction
+	// due to the way nodes work. We don't want to alias r1 and r4 in
+	// this case.
+	// I'm certain the register allocation could be further sanitized,
+	// but it's tricky because of stuff that can happen inside emit_tex
+	// and emit_arith.
+	cs->hwtemps[idx].free = cs->nrslots + 1;
+}
+
+/**
+ * Create a new Mesa temporary register.
+ */
+static GLuint get_temp_reg(struct r300_fragment_program *fp)
+{
+	COMPILE_STATE;
+	GLuint reg = undef;
+	/* ffs() is 1-based; 0 means every bit of temp_in_use is set. */
+	GLuint slot = ffs(~cs->temp_in_use);
+
+	if (slot == 0) {
+		ERROR("Out of program temps\n");
+		return reg;
+	}
+	slot -= 1;
+	/* Claim the temp; its hardware register index is reset to -1. */
+	cs->temp_in_use |= (1 << slot);
+	cs->temps[slot].refcount = 0xFFFFFFFF;
+	cs->temps[slot].reg = -1;
+	REG_SET_TYPE(reg, REG_TYPE_TEMP);
+	REG_SET_INDEX(reg, slot);
+	REG_SET_VALID(reg, GL_TRUE);
+	return reg;
+}
+
+/**
+ * Create a new Mesa temporary register that will act as the destination
+ * register for a texture read.
+ */
+static GLuint get_temp_reg_tex(struct r300_fragment_program *fp)
+{
+	COMPILE_STATE;
+	GLuint reg = undef;
+	/* ffs() is 1-based; 0 means every bit of temp_in_use is set. */
+	GLuint slot = ffs(~cs->temp_in_use);
+
+	if (slot == 0) {
+		ERROR("Out of program temps\n");
+		return reg;
+	}
+	slot -= 1;
+	/* Claim the temp and bind a hardware temp from get_hw_temp_tex(). */
+	cs->temp_in_use |= (1 << slot);
+	cs->temps[slot].refcount = 0xFFFFFFFF;
+	cs->temps[slot].reg = get_hw_temp_tex(fp);
+	REG_SET_TYPE(reg, REG_TYPE_TEMP);
+	REG_SET_INDEX(reg, slot);
+	REG_SET_VALID(reg, GL_TRUE);
+	return reg;
+}
+
+/**
+ * Free a Mesa temporary and the associated R300 temporary.
+ */
+static void free_temp(struct r300_fragment_program *fp, GLuint r)
+{
+	COMPILE_STATE;
+	const GLuint idx = REG_GET_INDEX(r);
+	const GLuint kind = REG_GET_TYPE(r);
+	/* Nothing to do unless bit idx of temp_in_use is set. */
+	if ((cs->temp_in_use & (1 << idx)) == 0)
+		return;
+	if (kind == REG_TYPE_INPUT) {
+		free_hw_temp(fp, cs->inputs[idx].reg);
+		cs->inputs[idx].reg = -1;
+	} else if (kind == REG_TYPE_TEMP) {
+		free_hw_temp(fp, cs->temps[idx].reg);
+		cs->temps[idx].reg = -1;
+		cs->temp_in_use &= ~(1 << idx);
+	}
+}
+
+/**
+ * Bind a constant/parameter to a hardware constant register.
+ *
+ * \p cp Stable pointer to an array of 4 floats. "Stable" means the pointer
+ *  has to stay valid and keep referring to the constant's contents for as
+ *  long as the fragment program lives (in practice: until the program is
+ *  translated again). Constants are identified by pointer, so passing the
+ *  same pointer twice yields the same register.
+ *
+ * \return a valid REG_TYPE_CONST register, or \c undef (after reporting an
+ * error) when all hardware constants are taken.
+ */
+static GLuint emit_const4fv(struct r300_fragment_program *fp,
+			    const GLfloat * cp)
+{
+	GLuint reg = undef;
+	int slot = 0;
+
+	/* Look for an existing entry that uses the same pointer. */
+	while (slot < fp->const_nr && fp->constant[slot] != cp)
+		slot++;
+
+	if (slot >= fp->const_nr) {
+		/* Not found: append, provided there is room. */
+		if (slot >= PFS_NUM_CONST_REGS) {
+			ERROR("Out of hw constants!\n");
+			return reg;
+		}
+
+		fp->constant[slot] = cp;
+		fp->const_nr++;
+	}
+
+	REG_SET_TYPE(reg, REG_TYPE_CONST);
+	REG_SET_INDEX(reg, slot);
+	REG_SET_VALID(reg, GL_TRUE);
+	return reg;
+}
+
+/* Return a copy of r with REG_NEGS and REG_NEGV applied (scalar and
+ * vector negation).
+ */
+static inline GLuint negate(GLuint r)
+{
+	GLuint negated = r;
+
+	REG_NEGS(negated);
+	REG_NEGV(negated);
+	return negated;
+}
+
+/* Hack: return a copy of r flagged NO_USE, so that sources which are read
+ * several times while emulating a non-native instruction do not get
+ * clobbered.
+ */
+static inline GLuint keep(GLuint r)
+{
+	GLuint kept = r;
+
+	REG_SET_NO_USE(kept, GL_TRUE);
+	return kept;
+}
+
+/* Return a copy of r with REG_ABS applied. */
+static inline GLuint absolute(GLuint r)
+{
+	GLuint abs_reg = r;
+
+	REG_ABS(abs_reg);
+	return abs_reg;
+}
+
+/**
+ * Apply the ARB negate flags to a source whose vector swizzle matched all
+ * three components.
+ *
+ * \p src    Source register with the matching swizzle already set.
+ * \p r      In/out result register. Overwritten with \p src when the
+ *           negation can be expressed directly; otherwise a temporary is
+ *           used (allocated here if *r is not valid) and filled by two MADs.
+ * \p arbneg Negate bits: bits 0-2 drive the vector negate flag, bit 3 the
+ *           scalar negate flag.
+ *
+ * \return always 3; do_swizzle() adds this to its count of matched
+ * vector components.
+ */
+static int swz_native(struct r300_fragment_program *fp,
+		      GLuint src, GLuint * r, GLuint arbneg)
+{
+	/* Native swizzle, handle negation */
+	/* Scalar negate flag is taken straight from bit 3 of arbneg. */
+	src = (src & ~REG_NEGS_MASK) | (((arbneg >> 3) & 1) << REG_NEGS_SHIFT);
+
+	if ((arbneg & 0x7) == 0x0) {
+		/* No vector component negated. */
+		src = src & ~REG_NEGV_MASK;
+		*r = src;
+	} else if ((arbneg & 0x7) == 0x7) {
+		/* All three vector components negated. */
+		src |= REG_NEGV_MASK;
+		*r = src;
+	} else {
+		/* Mixed negation: write the negated components first, then
+		 * the remaining vector components together with W from the
+		 * non-negated source.
+		 */
+		if (!REG_GET_VALID(*r))
+			*r = get_temp_reg(fp);
+		src |= REG_NEGV_MASK;
+		emit_arith(fp,
+			   PFS_OP_MAD,
+			   *r, arbneg & 0x7, keep(src), pfs_one, pfs_zero, 0);
+		src = src & ~REG_NEGV_MASK;
+		emit_arith(fp,
+			   PFS_OP_MAD,
+			   *r,
+			   (arbneg ^ 0x7) | WRITEMASK_W,
+			   src, pfs_one, pfs_zero, 0);
+	}
+
+	return 3;
+}
+
+/**
+ * Emit the MAD(s) for a swizzle that matches only part of the wanted
+ * vector components.
+ *
+ * \p src    Source register with the partially matching swizzle set.
+ * \p r      In/out result register; a temporary is allocated when *r is
+ *           not valid yet.
+ * \p mask   Index into s_mask selecting the components covered here.
+ * \p mc     Number of vector components matched by earlier calls.
+ * \p arbneg Negate bits (bits 0-2 vector components, bit 3 scalar).
+ *
+ * \return s_mask[mask].count, the number of components handled here.
+ */
+static int swz_emit_partial(struct r300_fragment_program *fp,
+			    GLuint src,
+			    GLuint * r, int mask, int mc, GLuint arbneg)
+{
+	GLuint tmp;
+	GLuint wmask = 0;
+
+	if (!REG_GET_VALID(*r))
+		*r = get_temp_reg(fp);
+
+	/* A partial match, VSWZ/mask define what parts of the
+	 * desired swizzle we match
+	 */
+	/* When this call brings the matched count to 3, W is written along
+	 * with it and the scalar negate flag is taken from bit 3 of arbneg.
+	 */
+	if (mc + s_mask[mask].count == 3) {
+		wmask = WRITEMASK_W;
+		src |= ((arbneg >> 3) & 1) << REG_NEGS_SHIFT;
+	}
+
+	/* tmp: components of this mask that must be negated. */
+	tmp = arbneg & s_mask[mask].mask;
+	if (tmp) {
+		/* tmp now: components of this mask that must NOT be negated. */
+		tmp = tmp ^ s_mask[mask].mask;
+		if (tmp) {
+			/* Mixed: negated components first, the rest second. */
+			emit_arith(fp,
+				   PFS_OP_MAD,
+				   *r,
+				   arbneg & s_mask[mask].mask,
+				   keep(src) | REG_NEGV_MASK,
+				   pfs_one, pfs_zero, 0);
+			/* NO_USE is only dropped on the emit that also
+			 * writes W, i.e. the last one for this source.
+			 */
+			if (!wmask) {
+				REG_SET_NO_USE(src, GL_TRUE);
+			} else {
+				REG_SET_NO_USE(src, GL_FALSE);
+			}
+			emit_arith(fp,
+				   PFS_OP_MAD,
+				   *r, tmp | wmask, src, pfs_one, pfs_zero, 0);
+		} else {
+			/* Every component of this mask is negated. */
+			if (!wmask) {
+				REG_SET_NO_USE(src, GL_TRUE);
+			} else {
+				REG_SET_NO_USE(src, GL_FALSE);
+			}
+			emit_arith(fp,
+				   PFS_OP_MAD,
+				   *r,
+				   (arbneg & s_mask[mask].mask) | wmask,
+				   src | REG_NEGV_MASK, pfs_one, pfs_zero, 0);
+		}
+	} else {
+		/* No component of this mask is negated. */
+		if (!wmask) {
+			REG_SET_NO_USE(src, GL_TRUE);
+		} else {
+			REG_SET_NO_USE(src, GL_FALSE);
+		}
+		emit_arith(fp, PFS_OP_MAD,
+			   *r,
+			   s_mask[mask].mask | wmask,
+			   src, pfs_one, pfs_zero, 0);
+	}
+
+	return s_mask[mask].count;
+}
+
+/**
+ * Apply an ARB swizzle and negate mask to a source register.
+ *
+ * \p src    Source register.
+ * \p arbswz Requested swizzle, 3 bits per component (read via GET_SWZ).
+ * \p arbneg Negate bits (bits 0-2 vector components, bit 3 scalar).
+ *
+ * The v_swiz table is searched for swizzles that match the request, either
+ * completely (swz_native) or for a subset of components described by
+ * s_mask (swz_emit_partial), until three vector components are covered.
+ *
+ * \return the register that holds (or directly expresses) the swizzled
+ * source. If the tables are exhausted an error is reported and whatever
+ * \c r holds at that point is returned.
+ */
+static GLuint do_swizzle(struct r300_fragment_program *fp,
+			 GLuint src, GLuint arbswz, GLuint arbneg)
+{
+	GLuint r = undef;
+	GLuint vswz;
+	int c_mask = 0;
+	int v_match = 0;
+
+	/* If swizzling from something without an XYZW native swizzle,
+	 * emit result to a temp, and do new swizzle from the temp.
+	 */
+#if 0
+	if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
+		GLuint temp = get_temp_reg(fp);
+		emit_arith(fp,
+			   PFS_OP_MAD,
+			   temp, WRITEMASK_XYZW, src, pfs_one, pfs_zero, 0);
+		src = temp;
+	}
+#endif
+
+	if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
+		/* src already carries a swizzle: compose it with arbswz so
+		 * that the search below works on the combined swizzle.
+		 */
+		GLuint vsrcswz =
+		    (v_swiz[REG_GET_VSWZ(src)].
+		     hash & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK)) |
+		    REG_GET_SSWZ(src) << 9;
+		GLint i;
+
+		GLuint newswz = 0;
+		GLuint offset;
+		for (i = 0; i < 4; ++i) {
+			offset = GET_SWZ(arbswz, i);
+
+			/* Selectors 0..3 are looked up in the existing
+			 * swizzle; larger values are copied unchanged.
+			 */
+			newswz |=
+			    (offset <= 3) ? GET_SWZ(vsrcswz,
+						    offset) << i *
+			    3 : offset << i * 3;
+		}
+
+		arbswz = newswz & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK);
+		REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
+	} else {
+		/* set scalar swizzling */
+		REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
+
+	}
+	/* Outer loop: component masks from s_mask. Inner loop: vector
+	 * swizzles from v_swiz, starting at src's current one.
+	 */
+	do {
+		vswz = REG_GET_VSWZ(src);
+		do {
+			int chash;
+
+			REG_SET_VSWZ(src, vswz);
+			chash = v_swiz[REG_GET_VSWZ(src)].hash &
+			    s_mask[c_mask].hash;
+
+			if (chash == (arbswz & s_mask[c_mask].hash)) {
+				if (s_mask[c_mask].count == 3) {
+					v_match += swz_native(fp,
+							      src, &r, arbneg);
+				} else {
+					v_match += swz_emit_partial(fp,
+								    src,
+								    &r,
+								    c_mask,
+								    v_match,
+								    arbneg);
+				}
+
+				if (v_match == 3)
+					return r;
+
+				/* Fill with something invalid.. all 0's was
+				 * wrong before, matched SWIZZLE_X.  So all
+				 * 1's will be okay for now
+				 */
+				arbswz |= (PFS_INVAL & s_mask[c_mask].hash);
+			}
+		} while (v_swiz[++vswz].hash != PFS_INVAL);
+		REG_SET_VSWZ(src, SWIZZLE_XYZ);
+	} while (s_mask[++c_mask].hash != PFS_INVAL);
+
+	ERROR("should NEVER get here\n");
+	return r;
+}
+
+/**
+ * Translate a Mesa source operand into the compiler's register encoding.
+ *
+ * Temporaries and inputs map directly onto REG_TYPE_TEMP/REG_TYPE_INPUT.
+ * Local, environment, state and named parameters are bound to hardware
+ * constants through emit_const4fv(). The operand's swizzle and negate
+ * flags are then applied with do_swizzle().
+ *
+ * \return the translated register, or \c undef (after reporting an error)
+ * for an unsupported register file.
+ */
+static GLuint t_src(struct r300_fragment_program *fp,
+		    struct prog_src_register fpsrc)
+{
+	GLuint r = undef;
+	const GLfloat *values;
+
+	switch (fpsrc.File) {
+	case PROGRAM_TEMPORARY:
+		REG_SET_INDEX(r, fpsrc.Index);
+		REG_SET_VALID(r, GL_TRUE);
+		REG_SET_TYPE(r, REG_TYPE_TEMP);
+		break;
+	case PROGRAM_INPUT:
+		REG_SET_INDEX(r, fpsrc.Index);
+		REG_SET_VALID(r, GL_TRUE);
+		REG_SET_TYPE(r, REG_TYPE_INPUT);
+		break;
+	case PROGRAM_LOCAL_PARAM:
+		values = fp->mesa_program.Base.LocalParams[fpsrc.Index];
+		r = emit_const4fv(fp, values);
+		break;
+	case PROGRAM_ENV_PARAM:
+		values = fp->ctx->FragmentProgram.Parameters[fpsrc.Index];
+		r = emit_const4fv(fp, values);
+		break;
+	case PROGRAM_STATE_VAR:
+	case PROGRAM_NAMED_PARAM:
+		values =
+		    fp->mesa_program.Base.Parameters->
+		    ParameterValues[fpsrc.Index];
+		r = emit_const4fv(fp, values);
+		break;
+	default:
+		ERROR("unknown SrcReg->File %x\n", fpsrc.File);
+		return r;
+	}
+
+	/* no point swizzling ONE/ZERO/HALF constants... */
+	if (REG_GET_VSWZ(r) >= SWIZZLE_111 && REG_GET_SSWZ(r) >= SWIZZLE_ZERO)
+		return r;
+
+	return do_swizzle(fp, r, fpsrc.Swizzle, fpsrc.NegateBase);
+}
+
+/**
+ * Translate a source operand for a scalar instruction: the swizzle's
+ * first (X) selector is replicated into all four positions before the
+ * operand is handed to t_src().
+ */
+static GLuint t_scalar_src(struct r300_fragment_program *fp,
+			   struct prog_src_register fpsrc)
+{
+	struct prog_src_register replicated = fpsrc;
+	int sel = GET_SWZ(fpsrc.Swizzle, 0);	/* X */
+
+	replicated.Swizzle = sel | (sel << 3) | (sel << 6) | (sel << 9);
+
+	return t_src(fp, replicated);
+}
+
+/**
+ * Translate a Mesa destination operand into the compiler's register
+ * encoding.
+ *
+ * Only temporaries and the colour/depth outputs are accepted. For anything
+ * else an error is reported and a register without the VALID flag is
+ * returned.
+ */
+static GLuint t_dst(struct r300_fragment_program *fp,
+		    struct prog_dst_register dest)
+{
+	GLuint r = undef;
+
+	if (dest.File == PROGRAM_TEMPORARY) {
+		REG_SET_INDEX(r, dest.Index);
+		REG_SET_VALID(r, GL_TRUE);
+		REG_SET_TYPE(r, REG_TYPE_TEMP);
+		return r;
+	}
+
+	if (dest.File == PROGRAM_OUTPUT) {
+		REG_SET_TYPE(r, REG_TYPE_OUTPUT);
+		if (dest.Index == FRAG_RESULT_COLR ||
+		    dest.Index == FRAG_RESULT_DEPR) {
+			REG_SET_INDEX(r, dest.Index);
+			REG_SET_VALID(r, GL_TRUE);
+		} else {
+			ERROR("Bad DstReg->Index 0x%x\n", dest.Index);
+		}
+		return r;
+	}
+
+	ERROR("Bad DstReg->File 0x%x\n", dest.File);
+	return r;
+}
+
+/**
+ * Resolve a source register to its hardware register number.
+ *
+ * \p src Source register in the compiler's encoding.
+ * \p tex GL_TRUE when the source feeds a texture instruction; in that case
+ *  the register is not recorded in cs->used_in_node.
+ *
+ * Side effect: unless \p src carries the NO_USE flag, the reference count
+ * of a temporary or input is decremented and the register released when
+ * the count reaches zero.
+ *
+ * \return the hardware temporary index, or the constant index or'ed with
+ * SRC_CONST for constants (also used, with index 0, on error).
+ */
+static int t_hw_src(struct r300_fragment_program *fp, GLuint src, GLboolean tex)
+{
+	COMPILE_STATE;
+	int hwreg;
+	int index = REG_GET_INDEX(src);
+
+	switch (REG_GET_TYPE(src)) {
+	case REG_TYPE_CONST:
+		return (index | SRC_CONST);
+	case REG_TYPE_TEMP:
+		/* NOTE: reg == -1 at this point means the program reads a
+		 *       temporary that was never written. The result is
+		 *       undefined; a register is allocated anyway.
+		 */
+		if (cs->temps[index].reg == -1)
+			cs->temps[index].reg = get_hw_temp(fp, cs->nrslots);
+
+		hwreg = cs->temps[index].reg;
+
+		if (!REG_GET_NO_USE(src)) {
+			if (--cs->temps[index].refcount == 0)
+				free_temp(fp, src);
+		}
+		break;
+	case REG_TYPE_INPUT:
+		hwreg = cs->inputs[index].reg;
+
+		if (!REG_GET_NO_USE(src)) {
+			if (--cs->inputs[index].refcount == 0)
+				free_hw_temp(fp, cs->inputs[index].reg);
+		}
+		break;
+	default:
+		ERROR("Invalid type for source reg\n");
+		return (0 | SRC_CONST);
+	}
+
+	if (!tex)
+		cs->used_in_node |= (1 << hwreg);
+
+	return hwreg;
+}
+
+/**
+ * Resolve a destination register to its hardware register number.
+ *
+ * \p dest Destination register; must be valid.
+ * \p tex  GL_TRUE when the destination is written by a texture
+ *         instruction; selects get_hw_temp_tex() for a new allocation.
+ * \p slot ALU slot passed to get_hw_temp() for a new allocation.
+ *
+ * Side effects: a temporary gets a hardware register if it has none, its
+ * reference count is decremented (unless NO_USE is set) and it is recorded
+ * in cs->dest_in_node and cs->used_in_node. For outputs the current node's
+ * colour/depth output flag is set.
+ *
+ * \return the hardware temporary index, the output index for outputs, or
+ * 0 after reporting an error for any other register type.
+ */
+static int t_hw_dst(struct r300_fragment_program *fp,
+		    GLuint dest, GLboolean tex, int slot)
+{
+	COMPILE_STATE;
+	int hwreg;
+	GLuint index = REG_GET_INDEX(dest);
+	assert(REG_GET_VALID(dest));
+
+	if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
+		if (index == FRAG_RESULT_COLR) {
+			fp->node[fp->cur_node].flags |=
+			    R300_PFS_NODE_OUTPUT_COLOR;
+		} else if (index == FRAG_RESULT_DEPR) {
+			fp->node[fp->cur_node].flags |=
+			    R300_PFS_NODE_OUTPUT_DEPTH;
+		}
+		return index;
+	}
+
+	if (REG_GET_TYPE(dest) != REG_TYPE_TEMP) {
+		ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
+		return 0;
+	}
+
+	/* Temporary: bind a hardware register on first write. */
+	if (cs->temps[index].reg == -1) {
+		if (tex)
+			cs->temps[index].reg = get_hw_temp_tex(fp);
+		else
+			cs->temps[index].reg = get_hw_temp(fp, slot);
+	}
+	hwreg = cs->temps[index].reg;
+
+	if (!REG_GET_NO_USE(dest)) {
+		if (--cs->temps[index].refcount == 0)
+			free_temp(fp, dest);
+	}
+
+	cs->dest_in_node |= (1 << hwreg);
+	cs->used_in_node |= (1 << hwreg);
+
+	return hwreg;
+}
+
+/**
+ * Append a NOP to the ALU program. Reports an error and does nothing when
+ * all PFS_MAX_ALU_INST slots are taken.
+ */
+static void emit_nop(struct r300_fragment_program *fp)
+{
+	COMPILE_STATE;
+	int slot = cs->nrslots;
+
+	if (slot >= PFS_MAX_ALU_INST) {
+		ERROR("Out of ALU instruction slots\n");
+		return;
+	}
+
+	fp->alu.inst[slot].inst0 = NOP_INST0;
+	fp->alu.inst[slot].inst1 = NOP_INST1;
+	fp->alu.inst[slot].inst2 = NOP_INST2;
+	fp->alu.inst[slot].inst3 = NOP_INST3;
+	cs->nrslots = slot + 1;
+}
+
+/**
+ * Emit a texture instruction (or KIL) for the given Mesa instruction.
+ *
+ * \p fpi    Mesa instruction; SrcReg[0] is the coordinate, DstReg the
+ *           destination (not used for KIL).
+ * \p opcode R300_FPITX_OP_* value to encode.
+ *
+ * Besides encoding the instruction this function
+ *  - scales the coordinates of rectangle textures with an extra MAD,
+ *  - routes writes to an output register through a temporary,
+ *  - starts a new node when the source was written, or the destination
+ *    was used, earlier in the current node.
+ */
+static void emit_tex(struct r300_fragment_program *fp,
+		     struct prog_instruction *fpi, int opcode)
+{
+	COMPILE_STATE;
+	GLuint coord = t_src(fp, fpi->SrcReg[0]);
+	GLuint dest = undef, rdest = undef;
+	GLuint din, uin;
+	int unit = fpi->TexSrcUnit;
+	int hwsrc, hwdest;
+	GLuint tempreg = 0;
+
+	/* Snapshot of the node's register usage before this instruction's
+	 * own operands are resolved.
+	 */
+	uin = cs->used_in_node;
+	din = cs->dest_in_node;
+
+	/* Resolve source/dest to hardware registers */
+	if (opcode != R300_FPITX_OP_KIL) {
+		if (fpi->TexSrcTarget == TEXTURE_RECT_INDEX) {
+			/**
+			 * Hardware uses [0..1]x[0..1] range for rectangle textures
+			 * instead of [0..Width]x[0..Height].
+			 * Add a scaling instruction.
+			 *
+			 * \todo Refactor this once we have proper rewriting/optimization
+			 * support for programs.
+			 */
+			gl_state_index tokens[STATE_LENGTH] = {
+				STATE_INTERNAL, STATE_R300_TEXRECT_FACTOR, 0, 0,
+				0
+			};
+			int factor_index;
+			GLuint factorreg;
+
+			/* The scale factor is a per-unit state parameter. */
+			tokens[2] = unit;
+			factor_index =
+			    _mesa_add_state_reference(fp->mesa_program.Base.
+						      Parameters, tokens);
+			factorreg =
+			    emit_const4fv(fp,
+					  fp->mesa_program.Base.Parameters->
+					  ParameterValues[factor_index]);
+			tempreg = keep(get_temp_reg(fp));
+
+			emit_arith(fp, PFS_OP_MAD, tempreg, WRITEMASK_XYZW,
+				   coord, factorreg, pfs_zero, 0);
+
+			/* Ensure correct node indirection */
+			uin = cs->used_in_node;
+			din = cs->dest_in_node;
+
+			hwsrc = t_hw_src(fp, tempreg, GL_TRUE);
+		} else {
+			hwsrc = t_hw_src(fp, coord, GL_TRUE);
+		}
+
+		dest = t_dst(fp, fpi->DstReg);
+
+		/* r300 doesn't seem to be able to do TEX->output reg */
+		if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
+			rdest = dest;
+			dest = get_temp_reg_tex(fp);
+		}
+		hwdest =
+		    t_hw_dst(fp, dest, GL_TRUE,
+			     fp->node[fp->cur_node].alu_offset);
+
+		/* Use a temp that hasn't been used in this node, rather
+		 * than causing an indirection
+		 */
+		if (uin & (1 << hwdest)) {
+			free_hw_temp(fp, hwdest);
+			hwdest = get_hw_temp_tex(fp);
+			cs->temps[REG_GET_INDEX(dest)].reg = hwdest;
+		}
+	} else {
+		/* KIL has no destination and no texture unit. */
+		hwdest = 0;
+		unit = 0;
+		hwsrc = t_hw_src(fp, coord, GL_TRUE);
+	}
+
+	/* Indirection if source has been written in this node, or if the
+	 * dest has been read/written in this node
+	 */
+	if ((REG_GET_TYPE(coord) != REG_TYPE_CONST &&
+	     (din & (1 << hwsrc))) || (uin & (1 << hwdest))) {
+
+		/* Finish off current node */
+		/* A node without any ALU instruction gets a NOP so that
+		 * alu_end below does not become negative.
+		 */
+		if (fp->node[fp->cur_node].alu_offset == cs->nrslots)
+			emit_nop(fp);
+
+		fp->node[fp->cur_node].alu_end =
+		    cs->nrslots - fp->node[fp->cur_node].alu_offset - 1;
+		assert(fp->node[fp->cur_node].alu_end >= 0);
+
+		if (++fp->cur_node >= PFS_MAX_TEX_INDIRECT) {
+			ERROR("too many levels of texture indirection\n");
+			return;
+		}
+
+		/* Start new node */
+		fp->node[fp->cur_node].tex_offset = fp->tex.length;
+		fp->node[fp->cur_node].alu_offset = cs->nrslots;
+		fp->node[fp->cur_node].tex_end = -1;
+		fp->node[fp->cur_node].alu_end = -1;
+		fp->node[fp->cur_node].flags = 0;
+		cs->used_in_node = 0;
+		cs->dest_in_node = 0;
+	}
+
+	if (fp->cur_node == 0)
+		fp->first_node_has_tex = 1;
+
+	/* NOTE(review): fp->tex.length is incremented without a visible
+	 * bounds check against the size of fp->tex.inst - confirm that the
+	 * number of texture instructions is limited elsewhere.
+	 */
+	fp->tex.inst[fp->tex.length++] = 0 | (hwsrc << R300_FPITX_SRC_SHIFT)
+	    | (hwdest << R300_FPITX_DST_SHIFT)
+	    | (unit << R300_FPITX_IMAGE_SHIFT)
+	    /* not entirely sure about this */
+	    | (opcode << R300_FPITX_OPCODE_SHIFT);
+
+	/* Record this instruction's registers for later indirection checks. */
+	cs->dest_in_node |= (1 << hwdest);
+	if (REG_GET_TYPE(coord) != REG_TYPE_CONST)
+		cs->used_in_node |= (1 << hwsrc);
+
+	fp->node[fp->cur_node].tex_end++;
+
+	/* Copy from temp to output if needed */
+	if (REG_GET_VALID(rdest)) {
+		emit_arith(fp, PFS_OP_MAD, rdest, WRITEMASK_XYZW, dest,
+			   pfs_one, pfs_zero, 0);
+		free_temp(fp, dest);
+	}
+
+	/* Free temp register */
+	if (tempreg != 0)
+		free_temp(fp, tempreg);
+}
+
+/**
+ * Compute the first ALU slot in which register allocation permits a write
+ * to \p dest.
+ *
+ * \p dest Destination register; must be valid.
+ * \p mask Write mask; selects whether the vector (XYZ) and/or scalar (W)
+ *         last-read positions are taken into account.
+ *
+ * \return 0 for outputs, for temporaries without a hardware register and
+ * (after reporting an error) for other register types. Otherwise the
+ * largest of the hardware register's reserved slot and the relevant
+ * last-read slots.
+ */
+static int get_earliest_allowed_write(struct r300_fragment_program *fp,
+				      GLuint dest, int mask)
+{
+	COMPILE_STATE;
+	int hwreg;
+	int earliest;
+	GLuint index = REG_GET_INDEX(dest);
+	assert(REG_GET_VALID(dest));
+
+	if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT)
+		return 0;
+
+	if (REG_GET_TYPE(dest) != REG_TYPE_TEMP) {
+		ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
+		return 0;
+	}
+
+	hwreg = cs->temps[index].reg;
+	if (hwreg == -1)
+		return 0;
+
+	earliest = cs->hwtemps[hwreg].reserved;
+	if ((mask & WRITEMASK_XYZ) &&
+	    earliest < cs->hwtemps[hwreg].vector_lastread)
+		earliest = cs->hwtemps[hwreg].vector_lastread;
+	if ((mask & WRITEMASK_W) &&
+	    earliest < cs->hwtemps[hwreg].scalar_lastread)
+		earliest = cs->hwtemps[hwreg].scalar_lastread;
+
+	return earliest;
+}
+
+/**
+ * Allocates a slot for an ALU instruction that can consist of
+ * a vector part or a scalar part or both.
+ *
+ * Sources from src (src[0] to src[argc-1]) are added to the slot in the
+ * appropriate position (vector and/or scalar), and their positions are
+ * recorded in the srcpos array.
+ *
+ * This function emits instruction code for the source fetch and the
+ * argument selection. It does not emit instruction code for the
+ * opcode or the destination selection.
+ *
+ * \p emit_vop GL_TRUE if the vector half of the slot is needed.
+ * \p emit_sop GL_TRUE if the scalar half of the slot is needed.
+ * \p argc     Number of entries of \p src that are used (at most 3).
+ * \p src      Source registers.
+ * \p dest     Destination register, used only to determine the earliest
+ *             slot that may write it.
+ * \p mask     Write mask of the instruction.
+ *
+ * @return the index of the slot, or -1 (after reporting an error) when no
+ * ALU slot is left
+ */
+static int find_and_prepare_slot(struct r300_fragment_program *fp,
+				 GLboolean emit_vop,
+				 GLboolean emit_sop,
+				 int argc, GLuint * src, GLuint dest, int mask)
+{
+	COMPILE_STATE;
+	int hwsrc[3];
+	int srcpos[3];
+	unsigned int used;
+	int tempused;
+	int tempvsrc[3];
+	int tempssrc[3];
+	int pos;
+	int regnr;
+	int i, j;
+
+	// Determine instruction slots, whether sources are required on
+	// vector or scalar side, and the smallest slot number where
+	// all source registers are available
+	used = 0;
+	if (emit_vop)
+		used |= SLOT_OP_VECTOR;
+	if (emit_sop)
+		used |= SLOT_OP_SCALAR;
+
+	pos = get_earliest_allowed_write(fp, dest, mask);
+
+	// Never place the instruction before the start of the current node
+	if (fp->node[fp->cur_node].alu_offset > pos)
+		pos = fp->node[fp->cur_node].alu_offset;
+	for (i = 0; i < argc; ++i) {
+		if (!REG_GET_BUILTIN(src[i])) {
+			if (emit_vop)
+				used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
+			if (emit_sop)
+				used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
+		}
+
+		hwsrc[i] = t_hw_src(fp, src[i], GL_FALSE);	/* Note: sideeffects wrt refcounting! */
+		regnr = hwsrc[i] & 31;
+
+		// A temporary can only be read once its last write is done
+		if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
+			if (used & (SLOT_SRC_VECTOR << i)) {
+				if (cs->hwtemps[regnr].vector_valid > pos)
+					pos = cs->hwtemps[regnr].vector_valid;
+			}
+			if (used & (SLOT_SRC_SCALAR << i)) {
+				if (cs->hwtemps[regnr].scalar_valid > pos)
+					pos = cs->hwtemps[regnr].scalar_valid;
+			}
+		}
+	}
+
+	// Find a slot that fits
+	// NOTE(review): cs->slot[pos] is read before pos is compared with
+	// cs->nrslots, and nrslots only grows by one per iteration - confirm
+	// that pos can never run past the slot/instruction arrays here.
+	for (;; ++pos) {
+		if (cs->slot[pos].used & used & SLOT_OP_BOTH)
+			continue;
+
+		if (pos >= cs->nrslots) {
+			if (cs->nrslots >= PFS_MAX_ALU_INST) {
+				ERROR("Out of ALU instruction slots\n");
+				return -1;
+			}
+
+			fp->alu.inst[pos].inst0 = NOP_INST0;
+			fp->alu.inst[pos].inst1 = NOP_INST1;
+			fp->alu.inst[pos].inst2 = NOP_INST2;
+			fp->alu.inst[pos].inst3 = NOP_INST3;
+
+			cs->nrslots++;
+		}
+		// Note: When we need both parts (vector and scalar) of a source,
+		// we always try to put them into the same position. This makes the
+		// code easier to read, and it is optimal (i.e. one doesn't gain
+		// anything by splitting the parts).
+		// It also avoids headaches with swizzles that access both parts (i.e WXY)
+
+		// Work on a copy of the slot state; it is only committed
+		// once every source has found a position.
+		tempused = cs->slot[pos].used;
+		for (i = 0; i < 3; ++i) {
+			tempvsrc[i] = cs->slot[pos].vsrc[i];
+			tempssrc[i] = cs->slot[pos].ssrc[i];
+		}
+
+		for (i = 0; i < argc; ++i) {
+			int flags = (used >> i) & SLOT_SRC_BOTH;
+
+			if (!flags) {
+				srcpos[i] = 0;
+				continue;
+			}
+
+			// Look for a source position that is either unused
+			// or already fetches the same hardware register
+			for (j = 0; j < 3; ++j) {
+				if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
+					if (tempvsrc[j] != hwsrc[i])
+						continue;
+				}
+
+				if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
+					if (tempssrc[j] != hwsrc[i])
+						continue;
+				}
+
+				break;
+			}
+
+			if (j == 3)
+				break;
+
+			srcpos[i] = j;
+			tempused |= flags << j;
+			if (flags & SLOT_SRC_VECTOR)
+				tempvsrc[j] = hwsrc[i];
+			if (flags & SLOT_SRC_SCALAR)
+				tempssrc[j] = hwsrc[i];
+		}
+
+		if (i == argc)
+			break;
+	}
+
+	// Found a slot, reserve it
+	cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
+	for (i = 0; i < 3; ++i) {
+		cs->slot[pos].vsrc[i] = tempvsrc[i];
+		cs->slot[pos].ssrc[i] = tempssrc[i];
+	}
+
+	// Remember the latest slot that reads each temporary
+	for (i = 0; i < argc; ++i) {
+		if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
+			regnr = hwsrc[i] & 31;
+
+			if (used & (SLOT_SRC_VECTOR << i)) {
+				if (cs->hwtemps[regnr].vector_lastread < pos)
+					cs->hwtemps[regnr].vector_lastread =
+					    pos;
+			}
+			if (used & (SLOT_SRC_SCALAR << i)) {
+				if (cs->hwtemps[regnr].scalar_lastread < pos)
+					cs->hwtemps[regnr].scalar_lastread =
+					    pos;
+			}
+		}
+	}
+
+	// Emit the source fetch code
+	fp->alu.inst[pos].inst1 &= ~R300_FPI1_SRC_MASK;
+	fp->alu.inst[pos].inst1 |=
+	    ((cs->slot[pos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) |
+	     (cs->slot[pos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) |
+	     (cs->slot[pos].vsrc[2] << R300_FPI1_SRC2C_SHIFT));
+
+	fp->alu.inst[pos].inst3 &= ~R300_FPI3_SRC_MASK;
+	fp->alu.inst[pos].inst3 |=
+	    ((cs->slot[pos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) |
+	     (cs->slot[pos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) |
+	     (cs->slot[pos].ssrc[2] << R300_FPI3_SRC2A_SHIFT));
+
+	// Emit the argument selection code
+	if (emit_vop) {
+		int swz[3];
+
+		for (i = 0; i < 3; ++i) {
+			if (i < argc) {
+				// Vector arguments honour the vector negate flag
+				swz[i] =
+				    (v_swiz[REG_GET_VSWZ(src[i])].base +
+				     (srcpos[i] *
+				      v_swiz[REG_GET_VSWZ(src[i])].stride)) |
+				    ((src[i] & REG_NEGV_MASK) ? ARG_NEG : 0) |
+				    ((src[i] & REG_ABS_MASK) ? ARG_ABS : 0);
+			} else {
+				swz[i] = R300_FPI0_ARGC_ZERO;
+			}
+		}
+
+		fp->alu.inst[pos].inst0 &=
+		    ~(R300_FPI0_ARG0C_MASK | R300_FPI0_ARG1C_MASK |
+		      R300_FPI0_ARG2C_MASK);
+		fp->alu.inst[pos].inst0 |=
+		    (swz[0] << R300_FPI0_ARG0C_SHIFT) |
+		    (swz[1] << R300_FPI0_ARG1C_SHIFT) |
+		    (swz[2] << R300_FPI0_ARG2C_SHIFT);
+	}
+
+	if (emit_sop) {
+		int swz[3];
+
+		for (i = 0; i < 3; ++i) {
+			if (i < argc) {
+				// Scalar arguments must honour the scalar
+				// negate flag (REG_NEGS), which swz_native,
+				// swz_emit_partial and negate() maintain
+				// separately from the vector flag. Testing
+				// REG_NEGV here dropped a W-only negation
+				// and leaked the XYZ negation into W.
+				swz[i] =
+				    (s_swiz[REG_GET_SSWZ(src[i])].base +
+				     (srcpos[i] *
+				      s_swiz[REG_GET_SSWZ(src[i])].stride)) |
+				    ((src[i] & REG_NEGS_MASK) ? ARG_NEG : 0) |
+				    ((src[i] & REG_ABS_MASK) ? ARG_ABS : 0);
+			} else {
+				swz[i] = R300_FPI2_ARGA_ZERO;
+			}
+		}
+
+		fp->alu.inst[pos].inst2 &=
+		    ~(R300_FPI2_ARG0A_MASK | R300_FPI2_ARG1A_MASK |
+		      R300_FPI2_ARG2A_MASK);
+		fp->alu.inst[pos].inst2 |=
+		    (swz[0] << R300_FPI2_ARG0A_SHIFT) |
+		    (swz[1] << R300_FPI2_ARG1A_SHIFT) |
+		    (swz[2] << R300_FPI2_ARG2A_SHIFT);
+	}
+
+	return pos;
+}
+
+/**
+ * Append an ALU instruction to the instruction list.
+ *
+ * \p op     Index into r300_fpop (PFS_OP_*), which supplies the vector
+ *           opcode, the scalar opcode and the argument count.
+ * \p dest   Destination register.
+ * \p mask   Write mask (WRITEMASK_*).
+ * \p src0, \p src1, \p src2  Source registers; only the first argc are used.
+ * \p flags  PFS_FLAG_SAT to saturate the result, otherwise 0.
+ *
+ * Nothing is emitted when the destination is the depth output and
+ * \p mask does not include Z, or when no ALU slot could be found.
+ */
+static void emit_arith(struct r300_fragment_program *fp,
+		       int op,
+		       GLuint dest,
+		       int mask,
+		       GLuint src0, GLuint src1, GLuint src2, int flags)
+{
+	COMPILE_STATE;
+	GLuint src[3] = { src0, src1, src2 };
+	int hwdest;
+	GLboolean emit_vop, emit_sop;
+	int vop, sop, argc;
+	int pos;
+
+	vop = r300_fpop[op].v_op;
+	sop = r300_fpop[op].s_op;
+	argc = r300_fpop[op].argc;
+
+	/* Depth output: a write to Z is redirected to the W (scalar) channel;
+	 * writes that do not touch Z are dropped.
+	 */
+	if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
+	    REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
+		if (mask & WRITEMASK_Z) {
+			mask = WRITEMASK_W;
+		} else {
+			return;
+		}
+	}
+
+	/* Decide which halves of the slot are needed. DP3 and REPL_ALPHA
+	 * force the vector resp. scalar half regardless of the write mask.
+	 */
+	emit_vop = GL_FALSE;
+	emit_sop = GL_FALSE;
+	if ((mask & WRITEMASK_XYZ) || vop == R300_FPI0_OUTC_DP3)
+		emit_vop = GL_TRUE;
+	if ((mask & WRITEMASK_W) || vop == R300_FPI0_OUTC_REPL_ALPHA)
+		emit_sop = GL_TRUE;
+
+	pos =
+	    find_and_prepare_slot(fp, emit_vop, emit_sop, argc, src, dest,
+				  mask);
+	if (pos < 0)
+		return;
+
+	hwdest = t_hw_dst(fp, dest, GL_FALSE, pos);	/* Note: Side effects wrt register allocation */
+
+	if (flags & PFS_FLAG_SAT) {
+		vop |= R300_FPI0_OUTC_SAT;
+		sop |= R300_FPI2_OUTA_SAT;
+	}
+
+	/* Throw the pieces together and get FPI0/1 */
+	if (emit_vop) {
+		fp->alu.inst[pos].inst0 |= vop;
+
+		fp->alu.inst[pos].inst1 |= hwdest << R300_FPI1_DSTC_SHIFT;
+
+		if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
+			if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
+				fp->alu.inst[pos].inst1 |=
+				    (mask & WRITEMASK_XYZ) <<
+				    R300_FPI1_DSTC_OUTPUT_MASK_SHIFT;
+			} else
+				assert(0);
+		} else {
+			fp->alu.inst[pos].inst1 |=
+			    (mask & WRITEMASK_XYZ) <<
+			    R300_FPI1_DSTC_REG_MASK_SHIFT;
+
+			/* The vector part of hwdest can be read from the
+			 * next slot on.
+			 */
+			cs->hwtemps[hwdest].vector_valid = pos + 1;
+		}
+	}
+
+	/* And now FPI2/3 */
+	if (emit_sop) {
+		fp->alu.inst[pos].inst2 |= sop;
+
+		/* A destination is only encoded when W is actually written. */
+		if (mask & WRITEMASK_W) {
+			if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
+				if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
+					fp->alu.inst[pos].inst3 |=
+					    (hwdest << R300_FPI3_DSTA_SHIFT) |
+					    R300_FPI3_DSTA_OUTPUT;
+				} else if (REG_GET_INDEX(dest) ==
+					   FRAG_RESULT_DEPR) {
+					fp->alu.inst[pos].inst3 |=
+					    R300_FPI3_DSTA_DEPTH;
+				} else
+					assert(0);
+			} else {
+				fp->alu.inst[pos].inst3 |=
+				    (hwdest << R300_FPI3_DSTA_SHIFT) |
+				    R300_FPI3_DSTA_REG;
+
+				/* The scalar part of hwdest can be read from
+				 * the next slot on.
+				 */
+				cs->hwtemps[hwdest].scalar_valid = pos + 1;
+			}
+		}
+	}
+
+	return;
+}
+
+#if 0
+/* Disabled code (compiled out by the surrounding #if 0).
+ *
+ * Builds a REG_TYPE_INPUT register for fragment attribute \p attr.
+ * Reports an error and returns undef when the program's InputsRead mask
+ * does not contain the attribute.
+ */
+static GLuint get_attrib(struct r300_fragment_program *fp, GLuint attr)
+{
+	struct gl_fragment_program *mp = &fp->mesa_program;
+	GLuint r = undef;
+
+	if (!(mp->Base.InputsRead & (1 << attr))) {
+		ERROR("Attribute %d was not provided!\n", attr);
+		return undef;
+	}
+
+	REG_SET_TYPE(r, REG_TYPE_INPUT);
+	REG_SET_INDEX(r, attr);
+	REG_SET_VALID(r, GL_TRUE);
+	return r;
+}
+#endif
+
+/* Constants for the parabola-based sine/cosine approximation. Each row is
+ * bound to one hardware constant with emit_const4fv() (see the COS case
+ * in parse_program).
+ */
+static GLfloat SinCosConsts[2][4] = {
+	{
+	 1.273239545,		// 4/PI
+	 -0.405284735,		// -4/(PI*PI)
+	 3.141592654,		// PI
+	 0.2225			// weight
+	 },
+	{
+	 0.75,			// offset added after scaling by 1/(2*PI) for COS
+	 0.0,
+	 0.159154943,		// 1/(2*PI)
+	 6.283185307		// 2*PI
+	 }
+};
+
+/**
+ * Emit a LIT instruction.
+ * \p flags may be PFS_FLAG_SAT
+ *
+ * \p dest  Destination register.
+ * \p mask  Write mask of the LIT instruction.
+ * \p src   Source register (already translated).
+ *
+ * Definition of LIT (from ARB_fragment_program):
+ * tmp = VectorLoad(op0);
+ * if (tmp.x < 0) tmp.x = 0;
+ * if (tmp.y < 0) tmp.y = 0;
+ * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
+ * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
+ * result.x = 1.0;
+ * result.y = tmp.x;
+ * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
+ * result.w = 1.0;
+ *
+ * The longest path of computation is the one leading to result.z,
+ * consisting of 5 operations. This implementation of LIT takes
+ * 5 slots. So unless there's some special undocumented opcode,
+ * this implementation is potentially optimal. Unfortunately,
+ * emit_arith is a bit too conservative because it doesn't understand
+ * partial writes to the vector component.
+ */
+/* Clamp limits for the exponent: the first three components hold the
+ * upper bound, the fourth the lower bound.
+ */
+static const GLfloat LitConst[4] =
+    { 127.999999, 127.999999, 127.999999, -127.999999 };
+
+static void emit_lit(struct r300_fragment_program *fp,
+		     GLuint dest, int mask, GLuint src, int flags)
+{
+	COMPILE_STATE;
+	GLuint cnst;
+	int needTemporary;
+	GLuint temp;
+
+	cnst = emit_const4fv(fp, LitConst);
+
+	/* The computation uses all four components as scratch space, so a
+	 * temporary is needed for partial writes and for output registers.
+	 */
+	needTemporary = 0;
+	if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
+		needTemporary = 1;
+	} else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
+		// LIT is typically followed by DP3/DP4, so there's no point
+		// in creating special code for this case
+		needTemporary = 1;
+	}
+
+	if (needTemporary) {
+		temp = keep(get_temp_reg(fp));
+	} else {
+		temp = keep(dest);
+	}
+
+	// Note: The order of emit_arith inside the slots is relevant,
+	// because emit_arith only looks at scalar vs. vector when resolving
+	// dependencies, and it does not consider individual vector components,
+	// so swizzling between the two parts can create fake dependencies.
+
+	// First slot
+	// temp.xy = max(src.xy, 0); temp.w = max(src.w, lower bound)
+	emit_arith(fp, PFS_OP_MAX, temp, WRITEMASK_XY,
+		   keep(src), pfs_zero, undef, 0);
+	emit_arith(fp, PFS_OP_MAX, temp, WRITEMASK_W, src, cnst, undef, 0);
+
+	// Second slot
+	// temp.z = min(temp.w, upper bound); temp.w = log2(temp.y)
+	emit_arith(fp, PFS_OP_MIN, temp, WRITEMASK_Z,
+		   swizzle(temp, W, W, W, W), cnst, undef, 0);
+	emit_arith(fp, PFS_OP_LG2, temp, WRITEMASK_W,
+		   swizzle(temp, Y, Y, Y, Y), undef, undef, 0);
+
+	// Third slot
+	// If desired, we saturate the y result here.
+	// This does not affect the use as a condition variable in the CMP later
+	// temp.w = temp.w * temp.z; temp.y = temp.x
+	emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_W,
+		   temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0);
+	emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_Y,
+		   swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags);
+
+	// Fourth slot
+	// temp.x = 1.0; temp.w = exp2(temp.w)
+	emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_X,
+		   pfs_one, pfs_one, pfs_zero, 0);
+	emit_arith(fp, PFS_OP_EX2, temp, WRITEMASK_W, temp, undef, undef, 0);
+
+	// Fifth slot
+	// temp.z = CMP selecting between 0 and temp.w on -temp.y; temp.w = 1.0
+	emit_arith(fp, PFS_OP_CMP, temp, WRITEMASK_Z,
+		   pfs_zero, swizzle(temp, W, W, W, W),
+		   negate(swizzle(temp, Y, Y, Y, Y)), flags);
+	emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_W, pfs_one, pfs_one,
+		   pfs_zero, 0);
+
+	if (needTemporary) {
+		// Copy the requested components to the real destination
+		emit_arith(fp, PFS_OP_MAD, dest, mask,
+			   temp, pfs_one, pfs_zero, flags);
+		free_temp(fp, temp);
+	} else {
+		// Decrease refcount of the destination
+		t_hw_dst(fp, dest, GL_FALSE, cs->nrslots);
+	}
+}
+
+static GLboolean parse_program(struct r300_fragment_program *fp)
+{
+	struct gl_fragment_program *mp = &fp->mesa_program;
+	const struct prog_instruction *inst = mp->Base.Instructions;
+	struct prog_instruction *fpi;
+	GLuint src[3], dest, temp[2];
+	int flags, mask = 0;
+	int const_sin[2];
+
+	if (!inst || inst[0].Opcode == OPCODE_END) {
+		ERROR("empty program?\n");
+		return GL_FALSE;
+	}
+
+	for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
+		if (fpi->SaturateMode == SATURATE_ZERO_ONE)
+			flags = PFS_FLAG_SAT;
+		else
+			flags = 0;
+
+		if (fpi->Opcode != OPCODE_KIL) {
+			dest = t_dst(fp, fpi->DstReg);
+			mask = fpi->DstReg.WriteMask;
+		}
+
+		switch (fpi->Opcode) {
+		case OPCODE_ABS:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			emit_arith(fp, PFS_OP_MAD, dest, mask,
+				   absolute(src[0]), pfs_one, pfs_zero, flags);
+			break;
+		case OPCODE_ADD:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			emit_arith(fp, PFS_OP_MAD, dest, mask,
+				   src[0], pfs_one, src[1], flags);
+			break;
+		case OPCODE_CMP:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			src[2] = t_src(fp, fpi->SrcReg[2]);
+			/* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
+			 *    r300 - if src2.c < 0.0 ? src1.c : src0.c
+			 */
+			emit_arith(fp, PFS_OP_CMP, dest, mask,
+				   src[2], src[1], src[0], flags);
+			break;
+		case OPCODE_COS:
+			/*
+			 * cos using a parabola (see SIN):
+			 * cos(x):
+			 *   x = (x/(2*PI))+0.75
+			 *   x = frac(x)
+			 *   x = (x*2*PI)-PI
+			 *   result = sin(x)
+			 */
+			temp[0] = get_temp_reg(fp);
+			const_sin[0] = emit_const4fv(fp, SinCosConsts[0]);
+			const_sin[1] = emit_const4fv(fp, SinCosConsts[1]);
+			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
+
+			/* add 0.5*PI and do range reduction */
+
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
+				   swizzle(src[0], X, X, X, X),
+				   swizzle(const_sin[1], Z, Z, Z, Z),
+				   swizzle(const_sin[1], X, X, X, X), 0);
+
+			emit_arith(fp, PFS_OP_FRC, temp[0], WRITEMASK_X,
+				   swizzle(temp[0], X, X, X, X),
+				   undef, undef, 0);
+
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W),	//2*PI
+				   negate(swizzle(const_sin[0], Z, Z, Z, Z)),	//-PI
+				   0);
+
+			/* SIN */
+
+			emit_arith(fp, PFS_OP_MAD, temp[0],
+				   WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
+								      Z, Z, Z,
+								      Z),
+				   const_sin[0], pfs_zero, 0);
+
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
+				   swizzle(temp[0], Y, Y, Y, Y),
+				   absolute(swizzle(temp[0], Z, Z, Z, Z)),
+				   swizzle(temp[0], X, X, X, X), 0);
+
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
+				   swizzle(temp[0], X, X, X, X),
+				   absolute(swizzle(temp[0], X, X, X, X)),
+				   negate(swizzle(temp[0], X, X, X, X)), 0);
+
+			emit_arith(fp, PFS_OP_MAD, dest, mask,
+				   swizzle(temp[0], Y, Y, Y, Y),
+				   swizzle(const_sin[0], W, W, W, W),
+				   swizzle(temp[0], X, X, X, X), flags);
+
+			free_temp(fp, temp[0]);
+			break;
+		case OPCODE_DP3:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			emit_arith(fp, PFS_OP_DP3, dest, mask,
+				   src[0], src[1], undef, flags);
+			break;
+		case OPCODE_DP4:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			emit_arith(fp, PFS_OP_DP4, dest, mask,
+				   src[0], src[1], undef, flags);
+			break;
+		case OPCODE_DPH:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			/* src0.xyz1 -> temp
+			 * DP4 dest, temp, src1
+			 */
+#if 0
+			temp[0] = get_temp_reg(fp);
+			src[0].s_swz = SWIZZLE_ONE;
+			emit_arith(fp, PFS_OP_MAD, temp[0], mask,
+				   src[0], pfs_one, pfs_zero, 0);
+			emit_arith(fp, PFS_OP_DP4, dest, mask,
+				   temp[0], src[1], undef, flags);
+			free_temp(fp, temp[0]);
+#else
+			emit_arith(fp, PFS_OP_DP4, dest, mask,
+				   swizzle(src[0], X, Y, Z, ONE), src[1],
+				   undef, flags);
+#endif
+			break;
+		case OPCODE_DST:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			/* dest.y = src0.y * src1.y */
+			if (mask & WRITEMASK_Y)
+				emit_arith(fp, PFS_OP_MAD, dest, WRITEMASK_Y,
+					   keep(src[0]), keep(src[1]),
+					   pfs_zero, flags);
+			/* dest.z = src0.z */
+			if (mask & WRITEMASK_Z)
+				emit_arith(fp, PFS_OP_MAD, dest, WRITEMASK_Z,
+					   src[0], pfs_one, pfs_zero, flags);
+			/* result.x = 1.0
+			 * result.w = src1.w */
+			if (mask & WRITEMASK_XW) {
+				REG_SET_VSWZ(src[1], SWIZZLE_111);	/*Cheat */
+				emit_arith(fp, PFS_OP_MAD, dest,
+					   mask & WRITEMASK_XW,
+					   src[1], pfs_one, pfs_zero, flags);
+			}
+			break;
+		case OPCODE_EX2:
+			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
+			emit_arith(fp, PFS_OP_EX2, dest, mask,
+				   src[0], undef, undef, flags);
+			break;
+		case OPCODE_FLR:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			temp[0] = get_temp_reg(fp);
+			/* FRC temp, src0
+			 * MAD dest, src0, 1.0, -temp
+			 */
+			emit_arith(fp, PFS_OP_FRC, temp[0], mask,
+				   keep(src[0]), undef, undef, 0);
+			emit_arith(fp, PFS_OP_MAD, dest, mask,
+				   src[0], pfs_one, negate(temp[0]), flags);
+			free_temp(fp, temp[0]);
+			break;
+		case OPCODE_FRC:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			emit_arith(fp, PFS_OP_FRC, dest, mask,
+				   src[0], undef, undef, flags);
+			break;
+		case OPCODE_KIL:
+			emit_tex(fp, fpi, R300_FPITX_OP_KIL);
+			break;
+		case OPCODE_LG2:
+			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
+			emit_arith(fp, PFS_OP_LG2, dest, mask,
+				   src[0], undef, undef, flags);
+			break;
+		case OPCODE_LIT:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			emit_lit(fp, dest, mask, src[0], flags);
+			break;
+		case OPCODE_LRP:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			src[2] = t_src(fp, fpi->SrcReg[2]);
+			/* result = tmp0tmp1 + (1 - tmp0)tmp2
+			 *        = tmp0tmp1 + tmp2 + (-tmp0)tmp2
+			 *     MAD temp, -tmp0, tmp2, tmp2
+			 *     MAD result, tmp0, tmp1, temp
+			 */
+			temp[0] = get_temp_reg(fp);
+			emit_arith(fp, PFS_OP_MAD, temp[0], mask,
+				   negate(keep(src[0])), keep(src[2]), src[2],
+				   0);
+			emit_arith(fp, PFS_OP_MAD, dest, mask,
+				   src[0], src[1], temp[0], flags);
+			free_temp(fp, temp[0]);
+			break;
+		case OPCODE_MAD:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			src[2] = t_src(fp, fpi->SrcReg[2]);
+			emit_arith(fp, PFS_OP_MAD, dest, mask,
+				   src[0], src[1], src[2], flags);
+			break;
+		case OPCODE_MAX:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			emit_arith(fp, PFS_OP_MAX, dest, mask,
+				   src[0], src[1], undef, flags);
+			break;
+		case OPCODE_MIN:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			emit_arith(fp, PFS_OP_MIN, dest, mask,
+				   src[0], src[1], undef, flags);
+			break;
+		case OPCODE_MOV:
+		case OPCODE_SWZ:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			emit_arith(fp, PFS_OP_MAD, dest, mask,
+				   src[0], pfs_one, pfs_zero, flags);
+			break;
+		case OPCODE_MUL:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			emit_arith(fp, PFS_OP_MAD, dest, mask,
+				   src[0], src[1], pfs_zero, flags);
+			break;
+		case OPCODE_POW:
+			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
+			src[1] = t_scalar_src(fp, fpi->SrcReg[1]);
+			temp[0] = get_temp_reg(fp);
+			emit_arith(fp, PFS_OP_LG2, temp[0], WRITEMASK_W,
+				   src[0], undef, undef, 0);
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_W,
+				   temp[0], src[1], pfs_zero, 0);
+			emit_arith(fp, PFS_OP_EX2, dest, fpi->DstReg.WriteMask,
+				   temp[0], undef, undef, 0);
+			free_temp(fp, temp[0]);
+			break;
+		case OPCODE_RCP:
+			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
+			emit_arith(fp, PFS_OP_RCP, dest, mask,
+				   src[0], undef, undef, flags);
+			break;
+		case OPCODE_RSQ:
+			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
+			emit_arith(fp, PFS_OP_RSQ, dest, mask,
+				   absolute(src[0]), pfs_zero, pfs_zero, flags);
+			break;
+		case OPCODE_SCS:
+			/*
+			 * scs using a parabola :
+			 * scs(x):
+			 *   result.x = sin(-abs(x)+0.5*PI)  (cos)
+			 *   result.y = sin(x)               (sin)
+			 *
+			 */
+			temp[0] = get_temp_reg(fp);
+			temp[1] = get_temp_reg(fp);
+			const_sin[0] = emit_const4fv(fp, SinCosConsts[0]);
+			const_sin[1] = emit_const4fv(fp, SinCosConsts[1]);
+			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
+
+			/* x = -abs(x)+0.5*PI */
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(const_sin[0], Z, Z, Z, Z),	//PI
+				   pfs_half,
+				   negate(abs
+					  (swizzle(keep(src[0]), X, X, X, X))),
+				   0);
+
+			/* C*x (sin) */
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_W,
+				   swizzle(const_sin[0], Y, Y, Y, Y),
+				   swizzle(keep(src[0]), X, X, X, X),
+				   pfs_zero, 0);
+
+			/* B*x, C*x (cos) */
+			emit_arith(fp, PFS_OP_MAD, temp[0],
+				   WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
+								      Z, Z, Z,
+								      Z),
+				   const_sin[0], pfs_zero, 0);
+
+			/* B*x (sin) */
+			emit_arith(fp, PFS_OP_MAD, temp[1], WRITEMASK_W,
+				   swizzle(const_sin[0], X, X, X, X),
+				   keep(src[0]), pfs_zero, 0);
+
+			/* y = B*x + C*x*abs(x) (sin) */
+			emit_arith(fp, PFS_OP_MAD, temp[1], WRITEMASK_Z,
+				   absolute(src[0]),
+				   swizzle(temp[0], W, W, W, W),
+				   swizzle(temp[1], W, W, W, W), 0);
+
+			/* y = B*x + C*x*abs(x) (cos) */
+			emit_arith(fp, PFS_OP_MAD, temp[1], WRITEMASK_W,
+				   swizzle(temp[0], Y, Y, Y, Y),
+				   absolute(swizzle(temp[0], Z, Z, Z, Z)),
+				   swizzle(temp[0], X, X, X, X), 0);
+
+			/* y*abs(y) - y (cos), y*abs(y) - y (sin) */
+			emit_arith(fp, PFS_OP_MAD, temp[0],
+				   WRITEMASK_X | WRITEMASK_Y, swizzle(temp[1],
+								      W, Z, Y,
+								      X),
+				   absolute(swizzle(temp[1], W, Z, Y, X)),
+				   negate(swizzle(temp[1], W, Z, Y, X)), 0);
+
+			/* dest.xy = mad(temp.xy, P, temp2.wz) */
+			emit_arith(fp, PFS_OP_MAD, dest,
+				   mask & (WRITEMASK_X | WRITEMASK_Y), temp[0],
+				   swizzle(const_sin[0], W, W, W, W),
+				   swizzle(temp[1], W, Z, Y, X), flags);
+
+			free_temp(fp, temp[0]);
+			free_temp(fp, temp[1]);
+			break;
+		case OPCODE_SGE:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			temp[0] = get_temp_reg(fp);
+			/* temp = src0 - src1
+			 * dest.c = (temp.c < 0.0) ? 0 : 1
+			 */
+			emit_arith(fp, PFS_OP_MAD, temp[0], mask,
+				   src[0], pfs_one, negate(src[1]), 0);
+			emit_arith(fp, PFS_OP_CMP, dest, mask,
+				   pfs_one, pfs_zero, temp[0], 0);
+			free_temp(fp, temp[0]);
+			break;
+		case OPCODE_SIN:
+			/*
+			 *  using a parabola:
+			 * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
+			 * extra precision is obtained by weighting against
+			 * itself squared.
+			 */
+
+			temp[0] = get_temp_reg(fp);
+			const_sin[0] = emit_const4fv(fp, SinCosConsts[0]);
+			const_sin[1] = emit_const4fv(fp, SinCosConsts[1]);
+			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
+
+			/* do range reduction */
+
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
+				   swizzle(keep(src[0]), X, X, X, X),
+				   swizzle(const_sin[1], Z, Z, Z, Z),
+				   pfs_half, 0);
+
+			emit_arith(fp, PFS_OP_FRC, temp[0], WRITEMASK_X,
+				   swizzle(temp[0], X, X, X, X),
+				   undef, undef, 0);
+
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W),	//2*PI
+				   negate(swizzle(const_sin[0], Z, Z, Z, Z)),	//PI
+				   0);
+
+			/* SIN */
+
+			emit_arith(fp, PFS_OP_MAD, temp[0],
+				   WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
+								      Z, Z, Z,
+								      Z),
+				   const_sin[0], pfs_zero, 0);
+
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
+				   swizzle(temp[0], Y, Y, Y, Y),
+				   absolute(swizzle(temp[0], Z, Z, Z, Z)),
+				   swizzle(temp[0], X, X, X, X), 0);
+
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
+				   swizzle(temp[0], X, X, X, X),
+				   absolute(swizzle(temp[0], X, X, X, X)),
+				   negate(swizzle(temp[0], X, X, X, X)), 0);
+
+			emit_arith(fp, PFS_OP_MAD, dest, mask,
+				   swizzle(temp[0], Y, Y, Y, Y),
+				   swizzle(const_sin[0], W, W, W, W),
+				   swizzle(temp[0], X, X, X, X), flags);
+
+			free_temp(fp, temp[0]);
+			break;
+		case OPCODE_SLT:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			temp[0] = get_temp_reg(fp);
+			/* temp = src0 - src1
+			 * dest.c = (temp.c < 0.0) ? 1 : 0
+			 */
+			emit_arith(fp, PFS_OP_MAD, temp[0], mask,
+				   src[0], pfs_one, negate(src[1]), 0);
+			emit_arith(fp, PFS_OP_CMP, dest, mask,
+				   pfs_zero, pfs_one, temp[0], 0);
+			free_temp(fp, temp[0]);
+			break;
+		case OPCODE_SUB:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			emit_arith(fp, PFS_OP_MAD, dest, mask,
+				   src[0], pfs_one, negate(src[1]), flags);
+			break;
+		case OPCODE_TEX:
+			emit_tex(fp, fpi, R300_FPITX_OP_TEX);
+			break;
+		case OPCODE_TXB:
+			emit_tex(fp, fpi, R300_FPITX_OP_TXB);
+			break;
+		case OPCODE_TXP:
+			emit_tex(fp, fpi, R300_FPITX_OP_TXP);
+			break;
+		case OPCODE_XPD:{
+				src[0] = t_src(fp, fpi->SrcReg[0]);
+				src[1] = t_src(fp, fpi->SrcReg[1]);
+				temp[0] = get_temp_reg(fp);
+				/* temp = src0.zxy * src1.yzx */
+				emit_arith(fp, PFS_OP_MAD, temp[0],
+					   WRITEMASK_XYZ, swizzle(keep(src[0]),
+								  Z, X, Y, W),
+					   swizzle(keep(src[1]), Y, Z, X, W),
+					   pfs_zero, 0);
+				/* dest.xyz = src0.yzx * src1.zxy - temp
+				 * dest.w       = undefined
+				 * */
+				emit_arith(fp, PFS_OP_MAD, dest,
+					   mask & WRITEMASK_XYZ, swizzle(src[0],
+									 Y, Z,
+									 X, W),
+					   swizzle(src[1], Z, X, Y, W),
+					   negate(temp[0]), flags);
+				/* cleanup */
+				free_temp(fp, temp[0]);
+				break;
+			}
+		default:
+			ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
+			break;
+		}
+
+		if (fp->error)
+			return GL_FALSE;
+
+	}
+
+	return GL_TRUE;
+}
+
+/* Prepend RCP/MUL/MAD instructions that turn FRAG_ATTRIB_WPOS into window
+ * coordinates in a newly reserved temporary, then redirect all later WPOS
+ * reads to it.  If the larger array cannot be allocated, prog is untouched.
+ */
+static void insert_wpos(struct gl_program *prog)
+{
+	static gl_state_index tokens[STATE_LENGTH] = {
+		STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
+	};
+	const GLuint num_new = 3;	/* RCP, MUL and MAD emitted below */
+	struct prog_instruction *fpi;
+	GLuint window_index, tempregi;
+	int i = 0;
+
+	fpi = _mesa_alloc_instructions(prog->NumInstructions + num_new);
+	if (!fpi) {
+		fprintf(stderr, "insert_wpos: out of memory\n");
+		return;
+	}
+	_mesa_init_instructions(fpi, prog->NumInstructions + num_new);
+
+	/* should do something else if no temps left... */
+	tempregi = prog->NumTemporaries++;
+
+	/* perspective divide */
+	fpi[i].Opcode = OPCODE_RCP;
+	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+	fpi[i].DstReg.Index = tempregi;
+	fpi[i].DstReg.WriteMask = WRITEMASK_W;
+	fpi[i].DstReg.CondMask = COND_TR;
+	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
+	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
+	fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
+	i++;
+
+	fpi[i].Opcode = OPCODE_MUL;
+	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+	fpi[i].DstReg.Index = tempregi;
+	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
+	fpi[i].DstReg.CondMask = COND_TR;
+	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
+	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
+	fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
+	fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
+	fpi[i].SrcReg[1].Index = tempregi;
+	fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
+	i++;
+
+	/* viewport transformation */
+	window_index = _mesa_add_state_reference(prog->Parameters, tokens);
+
+	fpi[i].Opcode = OPCODE_MAD;
+	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+	fpi[i].DstReg.Index = tempregi;
+	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
+	fpi[i].DstReg.CondMask = COND_TR;
+	fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
+	fpi[i].SrcReg[0].Index = tempregi;
+	fpi[i].SrcReg[0].Swizzle =
+	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+	fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
+	fpi[i].SrcReg[1].Index = window_index;
+	fpi[i].SrcReg[1].Swizzle =
+	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+	fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
+	fpi[i].SrcReg[2].Index = window_index;
+	fpi[i].SrcReg[2].Swizzle =
+	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+	i++;
+	assert((GLuint) i == num_new);
+
+	_mesa_copy_instructions(&fpi[i], prog->Instructions,
+				prog->NumInstructions);
+	_mesa_free(prog->Instructions);
+	prog->Instructions = fpi;
+	prog->NumInstructions += num_new;
+
+	fpi = &prog->Instructions[prog->NumInstructions - 1];
+	assert(fpi->Opcode == OPCODE_END);
+
+	/* Redirect WPOS reads in the original instructions to the new temp. */
+	for (fpi = &prog->Instructions[num_new];
+	     fpi->Opcode != OPCODE_END; fpi++) {
+		for (i = 0; i < 3; i++)
+			if (fpi->SrcReg[i].File == PROGRAM_INPUT &&
+			    fpi->SrcReg[i].Index == FRAG_ATTRIB_WPOS) {
+				fpi->SrcReg[i].File = PROGRAM_TEMPORARY;
+				fpi->SrcReg[i].Index = tempregi;
+			}
+	}
+}
+
+/* - Init structures
+ * - Determine what hwregs each input corresponds to
+ * - Pre-count the references to every Mesa input and temporary register
+ *
+ * Reports through ERROR() and returns early when the program has no
+ * instructions or references a temporary that temps_used cannot track.
+ */
+static void init_program(r300ContextPtr r300, struct r300_fragment_program *fp)
+{
+	struct r300_pfs_compile_state *cs = NULL;
+	struct gl_fragment_program *mp = &fp->mesa_program;
+	struct prog_instruction *fpi;
+	GLuint InputsRead = mp->Base.InputsRead;
+	GLuint temps_used = 0;	/* one bit per Mesa temporary seen so far */
+	const int max_temps = 8 * (int)sizeof(temps_used);	/* bits in mask */
+	int i, j;
+
+	/* New compile, reset tracking data */
+	fp->optimization =
+	    driQueryOptioni(&r300->radeon.optionCache, "fp_optimization");
+	fp->translated = GL_FALSE;
+	fp->error = GL_FALSE;
+	fp->cs = cs = &(R300_CONTEXT(fp->ctx)->state.pfs_compile);
+	fp->tex.length = 0;
+	fp->cur_node = 0;
+	fp->first_node_has_tex = 0;
+	fp->const_nr = 0;
+	fp->max_temp_idx = 0;
+	fp->node[0].alu_end = -1;
+	fp->node[0].tex_end = -1;
+
+	_mesa_memset(cs, 0, sizeof(*fp->cs));
+	for (i = 0; i < PFS_MAX_ALU_INST; i++) {
+		for (j = 0; j < 3; j++) {
+			cs->slot[i].vsrc[j] = SRC_CONST;
+			cs->slot[i].ssrc[j] = SRC_CONST;
+		}
+	}
+
+	/* Work out what temps the Mesa inputs correspond to, this must match
+	 * what setup_rs_unit does, which shouldn't be a problem as rs_unit
+	 * configures itself based on the fragprog's InputsRead
+	 *
+	 * NOTE: this depends on get_hw_temp() allocating registers in order,
+	 * starting from register 0.
+	 */
+
+	/* Texcoords come first */
+	for (i = 0; i < fp->ctx->Const.MaxTextureUnits; i++) {
+		if (InputsRead & (FRAG_BIT_TEX0 << i)) {
+			cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0;
+			cs->inputs[FRAG_ATTRIB_TEX0 + i].reg =
+			    get_hw_temp(fp, 0);
+		}
+	}
+	InputsRead &= ~FRAG_BITS_TEX_ANY;
+
+	/* fragment position treated as a texcoord */
+	if (InputsRead & FRAG_BIT_WPOS) {
+		cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
+		cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(fp, 0);
+		insert_wpos(&mp->Base);
+	}
+	InputsRead &= ~FRAG_BIT_WPOS;
+
+	/* Then primary colour */
+	if (InputsRead & FRAG_BIT_COL0) {
+		cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
+		cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(fp, 0);
+	}
+	InputsRead &= ~FRAG_BIT_COL0;
+
+	/* Secondary color */
+	if (InputsRead & FRAG_BIT_COL1) {
+		cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
+		cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(fp, 0);
+	}
+	InputsRead &= ~FRAG_BIT_COL1;
+
+	/* Anything else */
+	if (InputsRead) {
+		WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead);
+		/* force read from hwreg 0 for now */
+		for (i = 0; i < 32; i++)
+			if (InputsRead & (1U << i))
+				cs->inputs[i].reg = 0;
+	}
+
+	/* Pre-parse the mesa program, grabbing refcounts on input/temp regs.
+	 * That way, we can free up the reg when it's no longer needed
+	 */
+	if (!mp->Base.Instructions) {
+		ERROR("No instructions found in program\n");
+		return;
+	}
+
+	for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
+		/* i = 0..2 walks the source operands, i = 3 the destination */
+		for (i = 0; i <= 3; i++) {
+			int is_src = (i < 3);
+			int file = is_src ? fpi->SrcReg[i].File : fpi->DstReg.File;
+			int idx = is_src ? fpi->SrcReg[i].Index : fpi->DstReg.Index;
+
+			if (is_src && file == PROGRAM_INPUT) {
+				cs->inputs[idx].refcount++;
+				continue;
+			}
+			if (file != PROGRAM_TEMPORARY)
+				continue;
+
+			/* shifting by idx is undefined outside the mask width */
+			if (idx < 0 || idx >= max_temps) {
+				ERROR("temporary index %d out of range\n", idx);
+				return;
+			}
+			if (!(temps_used & (1U << idx))) {
+				cs->temps[idx].reg = -1;
+				cs->temps[idx].refcount = 1;
+				temps_used |= (1U << idx);
+			} else
+				cs->temps[idx].refcount++;
+		}
+	}
+	cs->temp_in_use = temps_used;
+}
+
+static void update_params(struct r300_fragment_program *fp)
+{
+	/* A program without a parameter list has nothing to refresh. */
+	if (!fp->mesa_program.Base.Parameters)
+		return;
+	/* Have Mesa (re)load the state-derived parameter values. */
+	_mesa_load_state_parameters(fp->ctx, fp->mesa_program.Base.Parameters);
+}
+
+/* Translate fp for the hardware if needed, then refresh its parameters. */
+void r300TranslateFragmentShader(r300ContextPtr r300,
+				 struct r300_fragment_program *fp)
+{
+	struct r300_pfs_compile_state *cs;
+
+	if (!fp->translated) {
+		init_program(r300, fp);
+		cs = fp->cs;
+		/* Skip parsing if init_program() flagged fp->error (via ERROR, presumably). */
+		if (fp->error || parse_program(fp) == GL_FALSE) {
+			dump_program(fp);
+			return;
+		}
+
+		/* Finish off */
+		fp->node[fp->cur_node].alu_end =
+		    cs->nrslots - fp->node[fp->cur_node].alu_offset - 1;
+		if (fp->node[fp->cur_node].tex_end < 0)
+			fp->node[fp->cur_node].tex_end = 0;
+		fp->alu_offset = 0;
+		fp->alu_end = cs->nrslots - 1;
+		fp->tex_offset = 0;
+		fp->tex_end = fp->tex.length ? fp->tex.length - 1 : 0;
+		assert(fp->node[fp->cur_node].alu_end >= 0);
+		assert(fp->alu_end >= 0);
+
+		fp->translated = GL_TRUE;
+		if (RADEON_DEBUG & DEBUG_PIXEL)
+			dump_program(fp);
+		r300UpdateStateParameters(fp->ctx, _NEW_PROGRAM);
+	}
+
+	update_params(fp);
+}
+
+/* Debug aid: print the Mesa program, then each node's TEX/ALU words, to stderr. */
+static void dump_program(struct r300_fragment_program *fp)
+{
+	int n, i, j;
+	static int pc = 0;	/* incremented on every call; shown in the banner */
+
+	fprintf(stderr, "pc=%d*************************************\n", pc++);
+
+	fprintf(stderr, "Mesa program:\n");
+	fprintf(stderr, "-------------\n");
+	_mesa_print_program(&fp->mesa_program.Base);
+	fflush(stdout);	/* NOTE(review): presumably _mesa_print_program() writes to stdout -- confirm */
+
+	fprintf(stderr, "Hardware program\n");
+	fprintf(stderr, "----------------\n");
+	/* One pass per node, 0..fp->cur_node. */
+	for (n = 0; n < (fp->cur_node + 1); n++) {
+		fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "
+			"alu_end: %d, tex_end: %d\n", n,
+			fp->node[n].alu_offset,
+			fp->node[n].tex_offset,
+			fp->node[n].alu_end, fp->node[n].tex_end);
+
+		if (fp->tex.length) {
+			fprintf(stderr, "  TEX:\n");
+			for (i = fp->node[n].tex_offset;
+			     i <= fp->node[n].tex_offset + fp->node[n].tex_end;
+			     ++i) {
+				const char *instr;
+
+				switch ((fp->tex.
+					 inst[i] >> R300_FPITX_OPCODE_SHIFT) &
+					15) {
+				case R300_FPITX_OP_TEX:
+					instr = "TEX";
+					break;
+				case R300_FPITX_OP_KIL:
+					instr = "KIL";
+					break;
+				case R300_FPITX_OP_TXP:
+					instr = "TXP";
+					break;
+				case R300_FPITX_OP_TXB:
+					instr = "TXB";
+					break;
+				default:
+					instr = "UNKNOWN";
+				}
+
+				fprintf(stderr,
+					"    %s t%i, %c%i, texture[%i]   (%08x)\n",
+					instr,
+					(fp->tex.
+					 inst[i] >> R300_FPITX_DST_SHIFT) & 31,
+					(fp->tex.
+					 inst[i] & R300_FPITX_SRC_CONST) ? 'c' :
+					't',
+					(fp->tex.
+					 inst[i] >> R300_FPITX_SRC_SHIFT) & 31,
+					(fp->tex.
+					 inst[i] & R300_FPITX_IMAGE_MASK) >>
+					R300_FPITX_IMAGE_SHIFT,
+					fp->tex.inst[i]);
+			}
+		}
+		/* ALU instructions of this node, decoded from the inst0..inst3 words. */
+		for (i = fp->node[n].alu_offset;
+		     i <= fp->node[n].alu_offset + fp->node[n].alu_end; ++i) {
+			char srcc[3][10], dstc[20];	/* xyz sources / destination */
+			char srca[3][10], dsta[20];	/* w sources / destination */
+			char argc[3][20];	/* xyz arguments */
+			char arga[3][20];	/* w arguments */
+			char flags[5], tmp[10];
+			/* Source selectors: 6 bits each; bit 5 picks the 'c' prefix over 't'. */
+			for (j = 0; j < 3; ++j) {
+				int regc = fp->alu.inst[i].inst1 >> (j * 6);
+				int rega = fp->alu.inst[i].inst3 >> (j * 6);
+
+				sprintf(srcc[j], "%c%i",
+					(regc & 32) ? 'c' : 't', regc & 31);
+				sprintf(srca[j], "%c%i",
+					(rega & 32) ? 'c' : 't', rega & 31);
+			}
+			/* xyz destination: register ("t") and/or output ("o") with its write mask. */
+			dstc[0] = 0;
+			sprintf(flags, "%s%s%s",
+				(fp->alu.inst[i].
+				 inst1 & R300_FPI1_DSTC_REG_X) ? "x" : "",
+				(fp->alu.inst[i].
+				 inst1 & R300_FPI1_DSTC_REG_Y) ? "y" : "",
+				(fp->alu.inst[i].
+				 inst1 & R300_FPI1_DSTC_REG_Z) ? "z" : "");
+			if (flags[0] != 0) {
+				sprintf(dstc, "t%i.%s ",
+					(fp->alu.inst[i].
+					 inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
+					flags);
+			}
+			sprintf(flags, "%s%s%s",
+				(fp->alu.inst[i].
+				 inst1 & R300_FPI1_DSTC_OUTPUT_X) ? "x" : "",
+				(fp->alu.inst[i].
+				 inst1 & R300_FPI1_DSTC_OUTPUT_Y) ? "y" : "",
+				(fp->alu.inst[i].
+				 inst1 & R300_FPI1_DSTC_OUTPUT_Z) ? "z" : "");
+			if (flags[0] != 0) {
+				sprintf(tmp, "o%i.%s",
+					(fp->alu.inst[i].
+					 inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
+					flags);
+				strcat(dstc, tmp);
+			}
+			/* w destination: register, output and/or depth ("Z") writes. */
+			dsta[0] = 0;
+			if (fp->alu.inst[i].inst3 & R300_FPI3_DSTA_REG) {
+				sprintf(dsta, "t%i.w ",
+					(fp->alu.inst[i].
+					 inst3 >> R300_FPI3_DSTA_SHIFT) & 31);
+			}
+			if (fp->alu.inst[i].inst3 & R300_FPI3_DSTA_OUTPUT) {
+				sprintf(tmp, "o%i.w ",
+					(fp->alu.inst[i].
+					 inst3 >> R300_FPI3_DSTA_SHIFT) & 31);
+				strcat(dsta, tmp);
+			}
+			if (fp->alu.inst[i].inst3 & R300_FPI3_DSTA_DEPTH) {
+				strcat(dsta, "Z");
+			}
+
+			fprintf(stderr,
+				"%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
+				"       w: %3s %3s %3s -> %-20s (%08x)\n", i,
+				srcc[0], srcc[1], srcc[2], dstc,
+				fp->alu.inst[i].inst1, srca[0], srca[1],
+				srca[2], dsta, fp->alu.inst[i].inst3);
+			/* Arguments: 7 bits each; bit 5 prints "-", bit 6 wraps the name in "|". */
+			for (j = 0; j < 3; ++j) {
+				int regc = fp->alu.inst[i].inst0 >> (j * 7);
+				int rega = fp->alu.inst[i].inst2 >> (j * 7);
+				int d;	/* low 5 bits of the argument: the selector being named */
+				char buf[20];
+
+				d = regc & 31;
+				if (d < 12) {
+					switch (d % 4) {
+					case R300_FPI0_ARGC_SRC0C_XYZ:
+						sprintf(buf, "%s.xyz",
+							srcc[d / 4]);
+						break;
+					case R300_FPI0_ARGC_SRC0C_XXX:
+						sprintf(buf, "%s.xxx",
+							srcc[d / 4]);
+						break;
+					case R300_FPI0_ARGC_SRC0C_YYY:
+						sprintf(buf, "%s.yyy",
+							srcc[d / 4]);
+						break;
+					case R300_FPI0_ARGC_SRC0C_ZZZ:
+						sprintf(buf, "%s.zzz",
+							srcc[d / 4]);
+						break;
+					}
+				} else if (d < 15) {
+					sprintf(buf, "%s.www", srca[d - 12]);
+				} else if (d == 20) {
+					sprintf(buf, "0.0");
+				} else if (d == 21) {
+					sprintf(buf, "1.0");
+				} else if (d == 22) {
+					sprintf(buf, "0.5");
+				} else if (d >= 23 && d < 32) {
+					d -= 23;
+					switch (d / 3) {
+					case 0:
+						sprintf(buf, "%s.yzx",
+							srcc[d % 3]);
+						break;
+					case 1:
+						sprintf(buf, "%s.zxy",
+							srcc[d % 3]);
+						break;
+					case 2:
+						sprintf(buf, "%s.Wzy",
+							srcc[d % 3]);
+						break;
+					}
+				} else {
+					sprintf(buf, "%i", d);	/* selector without a symbolic name here */
+				}
+
+				sprintf(argc[j], "%s%s%s%s",
+					(regc & 32) ? "-" : "",
+					(regc & 64) ? "|" : "",
+					buf, (regc & 64) ? "|" : "");
+
+				d = rega & 31;
+				if (d < 9) {
+					sprintf(buf, "%s.%c", srcc[d / 3],
+						'x' + (char)(d % 3));
+				} else if (d < 12) {
+					sprintf(buf, "%s.w", srca[d - 9]);
+				} else if (d == 16) {
+					sprintf(buf, "0.0");
+				} else if (d == 17) {
+					sprintf(buf, "1.0");
+				} else if (d == 18) {
+					sprintf(buf, "0.5");
+				} else {
+					sprintf(buf, "%i", d);	/* selector without a symbolic name here */
+				}
+
+				sprintf(arga[j], "%s%s%s%s",
+					(rega & 32) ? "-" : "",
+					(rega & 64) ? "|" : "",
+					buf, (rega & 64) ? "|" : "");
+			}
+
+			fprintf(stderr, "     xyz: %8s %8s %8s    op: %08x\n"
+				"       w: %8s %8s %8s    op: %08x\n",
+				argc[0], argc[1], argc[2],
+				fp->alu.inst[i].inst0, arga[0], arga[1],
+				arga[2], fp->alu.inst[i].inst2);
+		}
+	}
+}
diff --git a/r300/r300_fragprog.h b/r300/r300_fragprog.h
new file mode 100644
index 0000000..72fca77
--- /dev/null
+++ b/r300/r300_fragprog.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2005 Ben Skeggs.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/*
+ * Authors:
+ *   Ben Skeggs <darktama@iinet.net.au>
+ *   Jerome Glisse <j.glisse@gmail.com>
+ */
+#ifndef __R300_FRAGPROG_H_
+#define __R300_FRAGPROG_H_
+
+#include "glheader.h"
+#include "macros.h"
+#include "enums.h"
+#include "shader/program.h"
+#include "shader/prog_instruction.h"
+
+#include "r300_context.h"
+
+typedef struct r300_fragment_program_swizzle {	/* NOTE(review): no user of this type is visible next to the translator -- confirm it is still needed */
+	GLuint length;	/* presumably the number of inst[] entries in use -- TODO confirm */
+	GLuint src[4];
+	GLuint inst[8];
+} r300_fragment_program_swizzle_t;
+
+/* supported hw opcodes; parse_program() passes these to emit_arith() */
+#define PFS_OP_MAD 0	/* also emitted for MOV/SWZ, MUL and SUB in parse_program() */
+#define PFS_OP_DP3 1
+#define PFS_OP_DP4 2
+#define PFS_OP_MIN 3
+#define PFS_OP_MAX 4
+#define PFS_OP_CMP 5
+#define PFS_OP_FRC 6
+#define PFS_OP_EX2 7
+#define PFS_OP_LG2 8
+#define PFS_OP_RCP 9
+#define PFS_OP_RSQ 10
+#define PFS_OP_REPL_ALPHA 11
+#define PFS_OP_CMPH 12
+#define MAX_PFS_OP 12	/* highest PFS_OP_* value above */
+
+#define PFS_FLAG_SAT	(1 << 0)
+#define PFS_FLAG_ABS	(1 << 1)
+/* Argument fields are 7 bits wide, source-selector fields 6 bits wide; dump_program() decodes the same bits. */
+#define ARG_NEG			(1 << 5)	/* dump_program() prints this bit as "-" */
+#define ARG_ABS			(1 << 6)	/* dump_program() prints this bit as "|...|" */
+#define ARG_MASK		(127 << 0)
+#define ARG_STRIDE		7
+#define SRC_CONST		(1 << 5)	/* dump_program() prints 'c' when set, 't' otherwise */
+#define SRC_MASK		(63 << 0)
+#define SRC_STRIDE		6
+/* NOP_INST0..3: MAD opcode with all three arguments ZERO; all three sources are index 0 | SRC_CONST. */
+#define NOP_INST0 (						 \
+		(R300_FPI0_OUTC_MAD) |				 \
+		(R300_FPI0_ARGC_ZERO << R300_FPI0_ARG0C_SHIFT) | \
+		(R300_FPI0_ARGC_ZERO << R300_FPI0_ARG1C_SHIFT) | \
+		(R300_FPI0_ARGC_ZERO << R300_FPI0_ARG2C_SHIFT))
+#define NOP_INST1 (					     \
+		((0 | SRC_CONST) << R300_FPI1_SRC0C_SHIFT) | \
+		((0 | SRC_CONST) << R300_FPI1_SRC1C_SHIFT) | \
+		((0 | SRC_CONST) << R300_FPI1_SRC2C_SHIFT))
+#define NOP_INST2 ( \
+		(R300_FPI2_OUTA_MAD) |				 \
+		(R300_FPI2_ARGA_ZERO << R300_FPI2_ARG0A_SHIFT) | \
+		(R300_FPI2_ARGA_ZERO << R300_FPI2_ARG1A_SHIFT) | \
+		(R300_FPI2_ARGA_ZERO << R300_FPI2_ARG2A_SHIFT))
+#define NOP_INST3 (					     \
+		((0 | SRC_CONST) << R300_FPI3_SRC0A_SHIFT) | \
+		((0 | SRC_CONST) << R300_FPI3_SRC1A_SHIFT) | \
+		((0 | SRC_CONST) << R300_FPI3_SRC2A_SHIFT))
+/* presumably the values of the "fp_optimization" option that init_program() reads -- confirm */
+#define DRI_CONF_FP_OPTIMIZATION_SPEED   0
+#define DRI_CONF_FP_OPTIMIZATION_QUALITY 1
+
+struct r300_fragment_program;
+/* Translates fp unless fp->translated is already set, then refreshes its parameter values. */
+extern void r300TranslateFragmentShader(r300ContextPtr r300,
+					struct r300_fragment_program *fp);
+
+#endif
diff --git a/r300/r300_ioctl.c b/r300/r300_ioctl.c
new file mode 100644
index 0000000..ea94ce2
--- /dev/null
+++ b/r300/r300_ioctl.c
@@ -0,0 +1,719 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.
+Copyright (C) 2004 Nicolai Haehnle.
+All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ *
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#include <sched.h>
+#include <errno.h>
+
+#include "glheader.h"
+#include "imports.h"
+#include "macros.h"
+#include "context.h"
+#include "swrast/swrast.h"
+
+#include "r300_context.h"
+#include "radeon_ioctl.h"
+#include "r300_ioctl.h"
+#include "r300_cmdbuf.h"
+#include "r300_state.h"
+#include "r300_program.h"
+#include "radeon_reg.h"
+#include "r300_emit.h"
+
+#include "vblank.h"
+
+#define CLEARBUFFER_COLOR	0x1
+#define CLEARBUFFER_DEPTH	0x2
+#define CLEARBUFFER_STENCIL	0x4
+
+/**
+ * Queue the commands that clear one colour buffer and, optionally, the
+ * depth and/or stencil buffer.
+ *
+ * \param r300   context whose command buffer receives the commands
+ * \param flags  mask of CLEARBUFFER_COLOR, CLEARBUFFER_DEPTH and
+ *               CLEARBUFFER_STENCIL selecting what is actually written
+ * \param buffer non-zero uses the screen's back buffer offset/pitch,
+ *               zero uses the front buffer's
+ *
+ * NOTE(review): cmd, cmd_reserved and cmd_written are never referenced
+ * directly in this function; presumably the reg_start()/e32() macros expand
+ * to code that uses them -- confirm in r300_emit.h before touching them.
+ */
+static void r300ClearBuffer(r300ContextPtr r300, int flags, int buffer)
+{
+	GLcontext *ctx = r300->radeon.glCtx;
+	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
+	GLuint cboffset, cbpitch;
+	drm_r300_cmd_header_t *cmd2;
+	int cmd_reserved = 0;
+	int cmd_written = 0;
+	drm_radeon_cmd_header_t *cmd = NULL;
+	r300ContextPtr rmesa = r300;
+
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "%s: %s buffer (%i,%i %ix%i)\n",
+			__FUNCTION__, buffer ? "back" : "front",
+			dPriv->x, dPriv->y, dPriv->w, dPriv->h);
+
+	/* Pick the colour buffer to clear. */
+	if (buffer) {
+		cboffset = r300->radeon.radeonScreen->backOffset;
+		cbpitch = r300->radeon.radeonScreen->backPitch;
+	} else {
+		cboffset = r300->radeon.radeonScreen->frontOffset;
+		cbpitch = r300->radeon.radeonScreen->frontPitch;
+	}
+
+	cboffset += r300->radeon.radeonScreen->fbLocation;
+
+	cp_wait(r300, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+	end_3d(rmesa);
+
+	R300_STATECHANGE(r300, cb);
+	reg_start(R300_RB3D_COLOROFFSET0, 0);
+	e32(cboffset);
+
+	/* The pitch register also carries the pixel format and tiling bits. */
+	if (r300->radeon.radeonScreen->cpp == 4)
+		cbpitch |= R300_COLOR_FORMAT_ARGB8888;
+	else
+		cbpitch |= R300_COLOR_FORMAT_RGB565;
+
+	if (r300->radeon.sarea->tiling_enabled)
+		cbpitch |= R300_COLOR_TILE_ENABLE;
+
+	reg_start(R300_RB3D_COLORPITCH0, 0);
+	e32(cbpitch);
+
+	R300_STATECHANGE(r300, cmk);
+	reg_start(R300_RB3D_COLORMASK, 0);
+
+	/* Colour writes honour the GL colour mask, or are fully masked off
+	 * when only depth/stencil is being cleared. */
+	if (flags & CLEARBUFFER_COLOR) {
+		e32((ctx->Color.ColorMask[BCOMP] ? R300_COLORMASK0_B : 0) |
+		    (ctx->Color.ColorMask[GCOMP] ? R300_COLORMASK0_G : 0) |
+		    (ctx->Color.ColorMask[RCOMP] ? R300_COLORMASK0_R : 0) |
+		    (ctx->Color.ColorMask[ACOMP] ? R300_COLORMASK0_A : 0));
+	} else {
+		e32(0x0);
+	}
+
+	/* Three consecutive values follow: t1, t2 and the stencil clear value. */
+	R300_STATECHANGE(r300, zs);
+	reg_start(R300_RB3D_ZSTENCIL_CNTL_0, 2);
+
+	{
+		uint32_t t1, t2;
+
+		t1 = 0x0;
+		t2 = 0x0;
+
+		if (flags & CLEARBUFFER_DEPTH) {
+			t1 |= R300_RB3D_Z_WRITE_ONLY;
+			t2 |=
+			    (R300_ZS_ALWAYS << R300_RB3D_ZS1_DEPTH_FUNC_SHIFT);
+		} else {
+			t1 |= R300_RB3D_Z_DISABLED_1;	// disable
+		}
+
+		/* Stencil clear: both faces always pass and every op is REPLACE. */
+		if (flags & CLEARBUFFER_STENCIL) {
+			t1 |= R300_RB3D_STENCIL_ENABLE;
+			t2 |=
+			    (R300_ZS_ALWAYS <<
+			     R300_RB3D_ZS1_FRONT_FUNC_SHIFT) |
+			    (R300_ZS_REPLACE <<
+			     R300_RB3D_ZS1_FRONT_FAIL_OP_SHIFT) |
+			    (R300_ZS_REPLACE <<
+			     R300_RB3D_ZS1_FRONT_ZPASS_OP_SHIFT) |
+			    (R300_ZS_REPLACE <<
+			     R300_RB3D_ZS1_FRONT_ZFAIL_OP_SHIFT) |
+			    (R300_ZS_ALWAYS <<
+			     R300_RB3D_ZS1_BACK_FUNC_SHIFT) |
+			    (R300_ZS_REPLACE <<
+			     R300_RB3D_ZS1_BACK_FAIL_OP_SHIFT) |
+			    (R300_ZS_REPLACE <<
+			     R300_RB3D_ZS1_BACK_ZPASS_OP_SHIFT) |
+			    (R300_ZS_REPLACE <<
+			     R300_RB3D_ZS1_BACK_ZFAIL_OP_SHIFT);
+		}
+
+		e32(t1);
+		e32(t2);
+		e32(r300->state.stencil.clear);
+	}
+
+	/* Nine dwords: the packet header followed by eight packed floats
+	 * (half width, half height, depth clear value, 1.0 and the four
+	 * clear colour components). */
+	cmd2 = (drm_r300_cmd_header_t *) r300AllocCmdBuf(r300, 9, __FUNCTION__);
+	cmd2[0].packet3.cmd_type = R300_CMD_PACKET3;
+	cmd2[0].packet3.packet = R300_CMD_PACKET3_CLEAR;
+	cmd2[1].u = r300PackFloat32(dPriv->w / 2.0);
+	cmd2[2].u = r300PackFloat32(dPriv->h / 2.0);
+	cmd2[3].u = r300PackFloat32(ctx->Depth.Clear);
+	cmd2[4].u = r300PackFloat32(1.0);
+	cmd2[5].u = r300PackFloat32(ctx->Color.ClearColor[0]);
+	cmd2[6].u = r300PackFloat32(ctx->Color.ClearColor[1]);
+	cmd2[7].u = r300PackFloat32(ctx->Color.ClearColor[2]);
+	cmd2[8].u = r300PackFloat32(ctx->Color.ClearColor[3]);
+
+	reg_start(R300_RB3D_DSTCACHE_CTLSTAT, 0);
+	e32(R300_RB3D_DSTCACHE_UNKNOWN_0A);
+
+	reg_start(R300_RB3D_ZCACHE_CTLSTAT, 0);
+	e32(R300_RB3D_ZCACHE_UNKNOWN_03);
+	cp_wait(rmesa, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+}
+
+/**
+ * Emit the fixed pipeline state used by the hardware clear path.
+ *
+ * r300Clear() calls this once before any r300ClearBuffer() call.  It
+ * disables fog, texturing, alpha test and blending, sets a unit-scale
+ * viewport offset by the drawable position, sets a point size derived from
+ * the drawable size, and loads a one-instruction fragment program that
+ * passes the incoming colour through.  On chips with TCL it also loads a
+ * two-instruction vertex program that copies inputs 0 and 1 to outputs 0
+ * and 1.
+ *
+ * Throughout this function reg_start(reg, n) is followed by n + 1 values
+ * (n is the number of extra registers after the first one).
+ *
+ * NOTE(review): cmd, cmd_reserved, cmd_written and rmesa are presumably
+ * consumed by the reg_start()/e32()/efloat()/vsf_start_fragment() macros --
+ * confirm in r300_emit.h before removing any of them.
+ *
+ * \param ctx GL context whose r300 command buffer receives the state
+ */
+static void r300EmitClearState(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	r300ContextPtr rmesa = r300;
+	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
+	int i;
+	int cmd_reserved = 0;
+	int cmd_written = 0;
+	drm_radeon_cmd_header_t *cmd = NULL;
+	int has_tcl = 1;
+
+	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+		has_tcl = 0;
+
+	/* FIXME: the values written to R300_VAP_INPUT_ROUTE_0_0 and
+	 * R300_VAP_INPUT_ROUTE_0_1 are in fact known, however, the values are
+	 * quite complex; see the functions in r300_emit.c.
+	 *
+	 * I believe it would be a good idea to extend the functions in
+	 * r300_emit.c so that they can be used to setup the default values for
+	 * these registers, as well as the actual values used for rendering.
+	 */
+	R300_STATECHANGE(r300, vir[0]);
+	reg_start(R300_VAP_INPUT_ROUTE_0_0, 0);
+	if (!has_tcl)
+		e32(0x22030003);
+	else
+		e32(0x21030003);
+
+	/* disable fog */
+	R300_STATECHANGE(r300, fogs);
+	reg_start(R300_RE_FOG_STATE, 0);
+	e32(0x0);
+
+	R300_STATECHANGE(r300, vir[1]);
+	reg_start(R300_VAP_INPUT_ROUTE_1_0, 0);
+	e32(0xF688F688);
+
+	/* R300_VAP_INPUT_CNTL_0, R300_VAP_INPUT_CNTL_1 */
+	R300_STATECHANGE(r300, vic);
+	reg_start(R300_VAP_INPUT_CNTL_0, 1);
+	e32(R300_INPUT_CNTL_0_COLOR);
+	e32(R300_INPUT_CNTL_POS | R300_INPUT_CNTL_COLOR | R300_INPUT_CNTL_TC0);
+
+	if (!has_tcl) {
+		R300_STATECHANGE(r300, vte);
+		/* comes from fglrx startup of clear */
+		reg_start(R300_SE_VTE_CNTL, 1);
+		e32(R300_VTX_W0_FMT | R300_VPORT_X_SCALE_ENA |
+		    R300_VPORT_X_OFFSET_ENA | R300_VPORT_Y_SCALE_ENA |
+		    R300_VPORT_Y_OFFSET_ENA | R300_VPORT_Z_SCALE_ENA |
+		    R300_VPORT_Z_OFFSET_ENA);
+		e32(0x8);
+
+		reg_start(0x21dc, 0);
+		e32(0xaaaaaaaa);
+	}
+
+	R300_STATECHANGE(r300, vof);
+	reg_start(R300_VAP_OUTPUT_VTX_FMT_0, 1);
+	e32(R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT |
+	    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_PRESENT);
+	e32(0x0);			/* no textures */
+
+	R300_STATECHANGE(r300, txe);
+	reg_start(R300_TX_ENABLE, 0);
+	e32(0x0);
+
+	/* Viewport: scale 1.0 on every axis, offset by the drawable origin. */
+	R300_STATECHANGE(r300, vpt);
+	reg_start(R300_SE_VPORT_XSCALE, 5);
+	efloat(1.0);
+	efloat(dPriv->x);
+	efloat(1.0);
+	efloat(dPriv->y);
+	efloat(1.0);
+	efloat(0.0);
+
+	R300_STATECHANGE(r300, at);
+	reg_start(R300_PP_ALPHA_TEST, 0);
+	e32(0x0);
+
+	R300_STATECHANGE(r300, bld);
+	reg_start(R300_RB3D_CBLEND, 1);
+	e32(0x0);
+	e32(0x0);
+
+	R300_STATECHANGE(r300, vap_clip_cntl);
+	reg_start(R300_VAP_CLIP_CNTL, 0);
+	e32(R300_221C_CLEAR);
+
+	R300_STATECHANGE(r300, ps);
+	reg_start(R300_RE_POINTSIZE, 0);
+	e32(((dPriv->w * 6) << R300_POINTSIZE_X_SHIFT) |
+	    ((dPriv->h * 6) << R300_POINTSIZE_Y_SHIFT));
+
+	/* Eight interpolator registers are written, so the extra-register
+	 * count is 7.  The previous count of 8 reserved a ninth dword that
+	 * was never written and went out with whatever the buffer held. */
+	R300_STATECHANGE(r300, ri);
+	reg_start(R300_RS_INTERP_0, 7);
+	for (i = 0; i < 8; ++i) {
+		e32(R300_RS_INTERP_USED);
+	}
+
+	R300_STATECHANGE(r300, rc);
+	/* The second constant is needed to get glxgears display anything .. */
+	reg_start(R300_RS_CNTL_0, 1);
+	e32((1 << R300_RS_CNTL_CI_CNT_SHIFT) | R300_RS_CNTL_0_UNKNOWN_18);
+	e32(0x0);
+
+	R300_STATECHANGE(r300, rr);
+	reg_start(R300_RS_ROUTE_0, 0);
+	e32(R300_RS_ROUTE_0_COLOR);
+
+	R300_STATECHANGE(r300, fp);
+	reg_start(R300_PFS_CNTL_0, 2);
+	e32(0x0);
+	e32(0x0);
+	e32(0x0);
+	reg_start(R300_PFS_NODE_0, 3);
+	e32(0x0);
+	e32(0x0);
+	e32(0x0);
+	e32(R300_PFS_NODE_OUTPUT_COLOR);
+
+	/* Single fragment instruction: colour = src0 * 1 + 0, emitted as
+	 * separate RGB and alpha MAD words. */
+	R300_STATECHANGE(r300, fpi[0]);
+	R300_STATECHANGE(r300, fpi[1]);
+	R300_STATECHANGE(r300, fpi[2]);
+	R300_STATECHANGE(r300, fpi[3]);
+
+	reg_start(R300_PFS_INSTR0_0, 0);
+	e32(FP_INSTRC(MAD, FP_ARGC(SRC0C_XYZ), FP_ARGC(ONE), FP_ARGC(ZERO)));
+
+	reg_start(R300_PFS_INSTR1_0, 0);
+	e32(FP_SELC(0, NO, XYZ, FP_TMP(0), 0, 0));
+
+	reg_start(R300_PFS_INSTR2_0, 0);
+	e32(FP_INSTRA(MAD, FP_ARGA(SRC0A), FP_ARGA(ONE), FP_ARGA(ZERO)));
+
+	reg_start(R300_PFS_INSTR3_0, 0);
+	e32(FP_SELA(0, NO, W, FP_TMP(0), 0, 0));
+
+	if (has_tcl) {
+		R300_STATECHANGE(r300, pvs);
+		reg_start(R300_VAP_PVS_CNTL_1, 2);
+		e32((0 << R300_PVS_CNTL_1_PROGRAM_START_SHIFT) |
+		    (0 << R300_PVS_CNTL_1_POS_END_SHIFT) |
+		    (1 << R300_PVS_CNTL_1_PROGRAM_END_SHIFT));
+		e32(0x0);
+		e32(1 << R300_PVS_CNTL_3_PROGRAM_UNKNOWN_SHIFT);
+
+		/* Two vertex instructions of four dwords each:
+		 * out[n] = in[n] + 0 for n = 0 and n = 1. */
+		R300_STATECHANGE(r300, vpi);
+		vsf_start_fragment(0x0, 8);
+		e32(VP_OUT(ADD, OUT, 0, XYZW));
+		e32(VP_IN(IN, 0));
+		e32(VP_ZERO());
+		e32(0x0);
+
+		e32(VP_OUT(ADD, OUT, 1, XYZW));
+		e32(VP_IN(IN, 1));
+		e32(VP_ZERO());
+		e32(0x0);
+	}
+}
+
+/**
+ * Driver hook for glClear().
+ *
+ * Front/back colour, depth and (when hardware stencil is available) stencil
+ * are cleared by the hardware path; any buffer left in \p mask after those
+ * are removed is handed to swrast.  Depth/stencil bits ride along with the
+ * first colour buffer that is cleared, or get a clear of their own when no
+ * colour buffer was requested.
+ *
+ * \param ctx  GL context
+ * \param mask BUFFER_BIT_* mask of the buffers to clear
+ */
+static void r300Clear(GLcontext * ctx, GLbitfield mask)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	__DRIdrawablePrivate *drawable = r300->radeon.dri.drawable;
+	int color_bufs = 0;	/* BUFFER_BIT_{FRONT,BACK}_LEFT cleared in hw */
+	int aux_bits = 0;	/* CLEARBUFFER_DEPTH / CLEARBUFFER_STENCIL */
+	int page_swapped;
+
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "r300Clear\n");
+
+	{
+		/* Take and drop the lock before looking at the cliprect
+		 * count of the drawable. */
+		LOCK_HARDWARE(&r300->radeon);
+		UNLOCK_HARDWARE(&r300->radeon);
+		if (!drawable->numClipRects)
+			return;
+	}
+
+	/* Move everything the hardware handles out of the mask. */
+	if (mask & BUFFER_BIT_FRONT_LEFT) {
+		mask &= ~BUFFER_BIT_FRONT_LEFT;
+		color_bufs |= BUFFER_BIT_FRONT_LEFT;
+	}
+
+	if (mask & BUFFER_BIT_BACK_LEFT) {
+		mask &= ~BUFFER_BIT_BACK_LEFT;
+		color_bufs |= BUFFER_BIT_BACK_LEFT;
+	}
+
+	if (mask & BUFFER_BIT_DEPTH) {
+		mask &= ~BUFFER_BIT_DEPTH;
+		aux_bits |= CLEARBUFFER_DEPTH;
+	}
+
+	if (r300->state.stencil.hw_stencil && (mask & BUFFER_BIT_STENCIL)) {
+		mask &= ~BUFFER_BIT_STENCIL;
+		aux_bits |= CLEARBUFFER_STENCIL;
+	}
+
+	/* Whatever is left goes through the software rasterizer. */
+	if (mask) {
+		if (RADEON_DEBUG & DEBUG_FALLBACKS)
+			fprintf(stderr, "%s: swrast clear, mask: %x\n",
+				__FUNCTION__, mask);
+		_swrast_Clear(ctx, mask);
+	}
+
+	page_swapped = (r300->radeon.sarea->pfCurrentPage == 1);
+
+	/* Make sure it fits there. */
+	r300EnsureCmdBufSpace(r300, 421 * 3, __FUNCTION__);
+	if (color_bufs || aux_bits)
+		r300EmitClearState(ctx);
+
+	if (color_bufs & BUFFER_BIT_FRONT_LEFT) {
+		r300ClearBuffer(r300, CLEARBUFFER_COLOR | aux_bits,
+				page_swapped);
+		aux_bits = 0;
+	}
+
+	if (color_bufs & BUFFER_BIT_BACK_LEFT) {
+		r300ClearBuffer(r300, CLEARBUFFER_COLOR | aux_bits,
+				page_swapped ^ 1);
+		aux_bits = 0;
+	}
+
+	/* Depth/stencil only: no colour buffer carried them. */
+	if (aux_bits)
+		r300ClearBuffer(r300, aux_bits, 0);
+
+}
+
+/**
+ * Driver hook for glFlush(): submit the command buffer if it holds more
+ * than cmdbuf.count_reemit dwords.
+ *
+ * \param ctx GL context
+ */
+void r300Flush(GLcontext * ctx)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "%s\n", __FUNCTION__);
+
+	/* Nothing was queued past the count_reemit mark. */
+	if (rmesa->cmdbuf.count_used <= rmesa->cmdbuf.count_reemit)
+		return;
+
+	r300FlushCmdBuf(rmesa, __FUNCTION__);
+}
+
+#ifdef USER_BUFFERS
+#include "r300_mem.h"
+
+/**
+ * Replace rmesa->dma.current with a freshly allocated buffer obtained from
+ * r300_mem_alloc() (USER_BUFFERS build).
+ *
+ * Pending DMA work is flushed and the previous current region is released
+ * first.  If the first allocation attempt fails, the command buffer is
+ * flushed and the engine idled under the hardware lock before one retry;
+ * a second failure terminates the process.
+ *
+ * \param rmesa context owning the DMA state
+ * \param size  minimum size in bytes; raised to RADEON_BUFFER_SIZE * 16
+ */
+static void r300RefillCurrentDmaRegion(r300ContextPtr rmesa, int size)
+{
+	struct r300_dma_buffer *dmabuf;
+	size = MAX2(size, RADEON_BUFFER_SIZE * 16);
+
+	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
+		fprintf(stderr, "%s\n", __FUNCTION__);
+
+	if (rmesa->dma.flush) {
+		rmesa->dma.flush(rmesa);
+	}
+
+	if (rmesa->dma.current.buf)
+		r300ReleaseDmaRegion(rmesa, &rmesa->dma.current, __FUNCTION__);
+
+	if (rmesa->dma.nr_released_bufs > 4)
+		r300FlushCmdBuf(rmesa, __FUNCTION__);
+
+	dmabuf = CALLOC_STRUCT(r300_dma_buffer);
+	if (dmabuf == NULL) {
+		/* Fail the same way as a GART allocation failure below
+		 * instead of dereferencing a NULL pointer. */
+		fprintf(stderr,
+			"Error: Could not allocate dma buffer record... exiting\n");
+		_mesa_exit(-1);
+	}
+	dmabuf->buf = (void *)1;	/* hack */
+	dmabuf->refcount = 1;
+
+	dmabuf->id = r300_mem_alloc(rmesa, 4, size);
+	if (dmabuf->id == 0) {
+		LOCK_HARDWARE(&rmesa->radeon);	/* no need to validate */
+
+		r300FlushCmdBufLocked(rmesa, __FUNCTION__);
+		radeonWaitForIdleLocked(&rmesa->radeon);
+
+		dmabuf->id = r300_mem_alloc(rmesa, 4, size);
+
+		UNLOCK_HARDWARE(&rmesa->radeon);
+
+		if (dmabuf->id == 0) {
+			fprintf(stderr,
+				"Error: Could not get dma buffer... exiting\n");
+			_mesa_exit(-1);
+		}
+	}
+
+	rmesa->dma.current.buf = dmabuf;
+	rmesa->dma.current.address = r300_mem_ptr(rmesa, dmabuf->id);
+	rmesa->dma.current.end = size;
+	rmesa->dma.current.start = 0;
+	rmesa->dma.current.ptr = 0;
+}
+
+/**
+ * Drop \p region's reference on its DMA buffer (USER_BUFFERS build).
+ *
+ * When the last reference goes away the backing memory is handed to
+ * r300_mem_free(), the buffer record is freed and dma.nr_released_bufs is
+ * incremented.  The region is left detached (buf and start cleared).
+ *
+ * \param rmesa  context owning the DMA state
+ * \param region region to release; a region without a buffer is a no-op
+ * \param caller name of the calling function, for debug output only
+ */
+void r300ReleaseDmaRegion(r300ContextPtr rmesa,
+			  struct r300_dma_region *region, const char *caller)
+{
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "%s from %s\n", __FUNCTION__, caller);
+
+	if (region->buf == NULL)
+		return;
+
+	/* Flush pending DMA work before the reference is dropped. */
+	if (rmesa->dma.flush)
+		rmesa->dma.flush(rmesa);
+
+	region->buf->refcount--;
+	if (region->buf->refcount == 0) {
+		r300_mem_free(rmesa, region->buf->id);
+		FREE(region->buf);
+		rmesa->dma.nr_released_bufs++;
+	}
+
+	region->start = 0;
+	region->buf = 0;
+}
+
+/**
+ * Carve \p bytes out of rmesa->dma.current for \p region (USER_BUFFERS
+ * build).  If the current buffer cannot hold the request a new one is
+ * obtained and whatever was left of the old one is discarded.
+ *
+ * \param rmesa     context owning the DMA state
+ * \param region    receives the allocation; an existing buffer is released
+ * \param bytes     number of bytes wanted
+ * \param alignment start alignment; used as a bit mask after subtracting 1
+ *
+ * NOTE: the original author flagged the final bookkeeping as buggy when
+ * alignment > 7; that behaviour is unchanged here.
+ */
+void r300AllocDmaRegion(r300ContextPtr rmesa,
+			struct r300_dma_region *region,
+			int bytes, int alignment)
+{
+	const int align_mask = alignment - 1;
+
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
+
+	if (rmesa->dma.flush)
+		rmesa->dma.flush(rmesa);
+
+	if (region->buf)
+		r300ReleaseDmaRegion(rmesa, region, __FUNCTION__);
+
+	/* Round the write position of the current buffer up to the alignment. */
+	rmesa->dma.current.ptr =
+	    (rmesa->dma.current.ptr + align_mask) & ~align_mask;
+	rmesa->dma.current.start = rmesa->dma.current.ptr;
+
+	if (rmesa->dma.current.ptr + bytes > rmesa->dma.current.end)
+		r300RefillCurrentDmaRegion(rmesa, (bytes + 0x7) & ~0x7);
+
+	/* Hand the front of the current buffer to the caller. */
+	region->buf = rmesa->dma.current.buf;
+	region->buf->refcount++;
+	region->address = rmesa->dma.current.address;
+	region->start = rmesa->dma.current.start;
+	region->ptr = rmesa->dma.current.start;
+	region->end = rmesa->dma.current.start + bytes;
+
+	/* Consume the bytes and keep the next position 8-byte aligned. */
+	rmesa->dma.current.ptr += bytes;	/* bug - if alignment > 7 */
+	rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;
+	rmesa->dma.current.start = rmesa->dma.current.ptr;
+
+	assert(rmesa->dma.current.ptr <= rmesa->dma.current.end);
+}
+
+#else
+/**
+ * Replace rmesa->dma.current with a fresh DMA buffer requested from the
+ * DRM through drmDMA() (build without USER_BUFFERS).
+ *
+ * Pending DMA work is flushed and the previous current region is released
+ * first.  If the kernel has no buffer available, released buffers are
+ * flushed, the engine is idled and the request is retried once; a second
+ * failure terminates the process.
+ *
+ * \param rmesa context owning the DMA state
+ */
+static void r300RefillCurrentDmaRegion(r300ContextPtr rmesa)
+{
+	struct r300_dma_buffer *dmabuf;
+	int fd = rmesa->radeon.dri.fd;
+	int index = 0;
+	int size = 0;
+	drmDMAReq dma;
+	int ret;
+
+	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
+		fprintf(stderr, "%s\n", __FUNCTION__);
+
+	if (rmesa->dma.flush) {
+		rmesa->dma.flush(rmesa);
+	}
+
+	if (rmesa->dma.current.buf)
+		r300ReleaseDmaRegion(rmesa, &rmesa->dma.current, __FUNCTION__);
+
+	if (rmesa->dma.nr_released_bufs > 4)
+		r300FlushCmdBuf(rmesa, __FUNCTION__);
+
+	/* Ask for exactly one buffer of RADEON_BUFFER_SIZE bytes; the index
+	 * and size granted come back through request_list/request_sizes. */
+	dma.context = rmesa->radeon.dri.hwContext;
+	dma.send_count = 0;
+	dma.send_list = NULL;
+	dma.send_sizes = NULL;
+	dma.flags = 0;
+	dma.request_count = 1;
+	dma.request_size = RADEON_BUFFER_SIZE;
+	dma.request_list = &index;
+	dma.request_sizes = &size;
+	dma.granted_count = 0;
+
+	LOCK_HARDWARE(&rmesa->radeon);	/* no need to validate */
+
+	ret = drmDMA(fd, &dma);
+
+	if (ret != 0) {
+		/* Try to release some buffers and wait until we can't get any more */
+		if (rmesa->dma.nr_released_bufs) {
+			r300FlushCmdBufLocked(rmesa, __FUNCTION__);
+		}
+
+		if (RADEON_DEBUG & DEBUG_DMA)
+			fprintf(stderr, "Waiting for buffers\n");
+
+		radeonWaitForIdleLocked(&rmesa->radeon);
+		ret = drmDMA(fd, &dma);
+
+		if (ret != 0) {
+			UNLOCK_HARDWARE(&rmesa->radeon);
+			fprintf(stderr,
+				"Error: Could not get dma buffer... exiting\n");
+			_mesa_exit(-1);
+		}
+	}
+
+	UNLOCK_HARDWARE(&rmesa->radeon);
+
+	if (RADEON_DEBUG & DEBUG_DMA)
+		fprintf(stderr, "Allocated buffer %d\n", index);
+
+	dmabuf = CALLOC_STRUCT(r300_dma_buffer);
+	if (dmabuf == NULL) {
+		/* Fail the same way as a drmDMA() failure above instead of
+		 * dereferencing a NULL pointer. */
+		fprintf(stderr,
+			"Error: Could not allocate dma buffer record... exiting\n");
+		_mesa_exit(-1);
+	}
+	dmabuf->buf = &rmesa->radeon.radeonScreen->buffers->list[index];
+	dmabuf->refcount = 1;
+
+	rmesa->dma.current.buf = dmabuf;
+	rmesa->dma.current.address = dmabuf->buf->address;
+	rmesa->dma.current.end = dmabuf->buf->total;
+	rmesa->dma.current.start = 0;
+	rmesa->dma.current.ptr = 0;
+}
+
+/**
+ * Drop \p region's reference on its DMA buffer (build without
+ * USER_BUFFERS).
+ *
+ * When the last reference goes away an R300_CMD_DMA_DISCARD command naming
+ * the buffer index is queued, the buffer record is freed and
+ * dma.nr_released_bufs is incremented.  The region is left detached (buf
+ * and start cleared).
+ *
+ * \param rmesa  context owning the DMA state
+ * \param region region to release; a region without a buffer is a no-op
+ * \param caller name of the calling function, for debug output only
+ */
+void r300ReleaseDmaRegion(r300ContextPtr rmesa,
+			  struct r300_dma_region *region, const char *caller)
+{
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "%s from %s\n", __FUNCTION__, caller);
+
+	if (region->buf == NULL)
+		return;
+
+	/* Flush pending DMA work before the reference is dropped. */
+	if (rmesa->dma.flush)
+		rmesa->dma.flush(rmesa);
+
+	region->buf->refcount--;
+	if (region->buf->refcount == 0) {
+		drm_radeon_cmd_header_t *discard;
+
+		if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
+			fprintf(stderr, "%s -- DISCARD BUF %d\n",
+				__FUNCTION__, region->buf->buf->idx);
+
+		/* One command header, sized in dwords. */
+		discard = (drm_radeon_cmd_header_t *)
+		    r300AllocCmdBuf(rmesa, sizeof(*discard) / 4, __FUNCTION__);
+		discard->dma.cmd_type = R300_CMD_DMA_DISCARD;
+		discard->dma.buf_idx = region->buf->buf->idx;
+
+		FREE(region->buf);
+		rmesa->dma.nr_released_bufs++;
+	}
+
+	region->start = 0;
+	region->buf = 0;
+}
+
+/**
+ * Carve \p bytes out of rmesa->dma.current for \p region (build without
+ * USER_BUFFERS).  If the current buffer cannot hold the request a new one
+ * is obtained and whatever was left of the old one is discarded.
+ *
+ * \param rmesa     context owning the DMA state
+ * \param region    receives the allocation; an existing buffer is released
+ * \param bytes     number of bytes wanted
+ * \param alignment start alignment; used as a bit mask after subtracting 1
+ *
+ * NOTE: the original author flagged the final bookkeeping as buggy when
+ * alignment > 7; that behaviour is unchanged here.
+ */
+void r300AllocDmaRegion(r300ContextPtr rmesa,
+			struct r300_dma_region *region,
+			int bytes, int alignment)
+{
+	const int align_mask = alignment - 1;
+
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
+
+	if (rmesa->dma.flush)
+		rmesa->dma.flush(rmesa);
+
+	if (region->buf)
+		r300ReleaseDmaRegion(rmesa, region, __FUNCTION__);
+
+	/* Round the write position of the current buffer up to the alignment. */
+	rmesa->dma.current.ptr =
+	    (rmesa->dma.current.ptr + align_mask) & ~align_mask;
+	rmesa->dma.current.start = rmesa->dma.current.ptr;
+
+	if (rmesa->dma.current.ptr + bytes > rmesa->dma.current.end)
+		r300RefillCurrentDmaRegion(rmesa);
+
+	/* Hand the front of the current buffer to the caller. */
+	region->buf = rmesa->dma.current.buf;
+	region->buf->refcount++;
+	region->address = rmesa->dma.current.address;
+	region->start = rmesa->dma.current.start;
+	region->ptr = rmesa->dma.current.start;
+	region->end = rmesa->dma.current.start + bytes;
+
+	/* Consume the bytes and keep the next position 8-byte aligned. */
+	rmesa->dma.current.ptr += bytes;	/* bug - if alignment > 7 */
+	rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;
+	rmesa->dma.current.start = rmesa->dma.current.ptr;
+
+	assert(rmesa->dma.current.ptr <= rmesa->dma.current.end);
+}
+
+#endif
+
+/**
+ * Tell whether the \p size bytes starting at \p pointer lie inside the
+ * mapped GART texture aperture.
+ *
+ * The pointer difference is kept in a long: storing it in an int truncated
+ * it on 64-bit builds, so an unrelated pointer far from the mapping could
+ * alias to a small offset and be reported as GART memory.  The range check
+ * is written as a subtraction so that offset + size cannot overflow.  The
+ * accepted range is otherwise unchanged (offset + size must be strictly
+ * below the aperture size).
+ *
+ * \param rmesa   context whose screen holds the gartTextures mapping
+ * \param pointer start of the range to test
+ * \param size    length of the range in bytes; negative is never valid
+ * \return non-zero when the whole range is inside the aperture
+ */
+GLboolean r300IsGartMemory(r300ContextPtr rmesa, const GLvoid * pointer,
+			   GLint size)
+{
+	const unsigned long total =
+	    rmesa->radeon.radeonScreen->gartTextures.size;
+	long offset =
+	    (const char *)pointer -
+	    (const char *)rmesa->radeon.radeonScreen->gartTextures.map;
+	int valid = (size >= 0 && offset >= 0
+		     && (unsigned long)offset <= total
+		     && (unsigned long)size < total - (unsigned long)offset);
+
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "r300IsGartMemory( %p ) : %d\n", pointer,
+			valid);
+
+	return valid;
+}
+
+/**
+ * Translate a CPU pointer into the GART texture mapping to the matching
+ * card-side offset.
+ *
+ * Two defects of the previous version are fixed:
+ *  - the pointer difference was stored in an int, which truncates on
+ *    64-bit builds and let unrelated pointers alias into the aperture;
+ *  - offset == aperture size (one byte past the end) was accepted.
+ *
+ * \param rmesa   context whose screen holds the gartTextures mapping
+ * \param pointer address inside the mapping
+ * \return gart_texture_offset + byte offset, or ~0 when \p pointer is
+ *         outside the mapping
+ */
+GLuint r300GartOffsetFromVirtual(r300ContextPtr rmesa, const GLvoid * pointer)
+{
+	long offset =
+	    (const char *)pointer -
+	    (const char *)rmesa->radeon.radeonScreen->gartTextures.map;
+
+	//fprintf(stderr, "offset=%08x\n", offset);
+
+	if (offset < 0
+	    || (unsigned long)offset >=
+	    rmesa->radeon.radeonScreen->gartTextures.size)
+		return ~0;
+
+	return rmesa->radeon.radeonScreen->gart_texture_offset +
+	    (GLuint) offset;
+}
+
+/**
+ * Plug the clear/flush/finish entry points into the driver function table.
+ *
+ * \param functions table to fill in; only Clear, Flush and Finish are set
+ */
+void r300InitIoctlFuncs(struct dd_function_table *functions)
+{
+	functions->Flush = r300Flush;
+	functions->Finish = radeonFinish;
+	functions->Clear = r300Clear;
+}
diff --git a/r300/r300_ioctl.h b/r300/r300_ioctl.h
new file mode 100644
index 0000000..7a19a2c
--- /dev/null
+++ b/r300/r300_ioctl.h
@@ -0,0 +1,59 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *   Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __R300_IOCTL_H__
+#define __R300_IOCTL_H__
+
+#include "r300_context.h"
+#include "radeon_drm.h"
+
+extern GLboolean r300IsGartMemory(r300ContextPtr rmesa,
+				  const GLvoid * pointer, GLint size);
+
+extern GLuint r300GartOffsetFromVirtual(r300ContextPtr rmesa,
+					const GLvoid * pointer);
+
+extern void r300Flush(GLcontext * ctx);
+
+extern void r300ReleaseDmaRegion(r300ContextPtr rmesa,
+				 struct r300_dma_region *region,
+				 const char *caller);
+extern void r300AllocDmaRegion(r300ContextPtr rmesa,
+			       struct r300_dma_region *region, int bytes,
+			       int alignment);
+
+extern void r300InitIoctlFuncs(struct dd_function_table *functions);
+
+#endif				/* __R300_IOCTL_H__ */
diff --git a/r300/r300_mem.c b/r300/r300_mem.c
new file mode 100644
index 0000000..f8f9d4f
--- /dev/null
+++ b/r300/r300_mem.c
@@ -0,0 +1,385 @@
+/*
+ * Copyright (C) 2005 Aapo Tahkola.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * \file
+ *
+ * \author Aapo Tahkola <aet@rasterburn.org>
+ */
+
+#include <unistd.h>
+
+#include "r300_context.h"
+#include "r300_cmdbuf.h"
+#include "r300_ioctl.h"
+#include "r300_mem.h"
+#include "radeon_ioctl.h"
+
+#ifdef USER_BUFFERS
+
+/**
+ * Double the capacity of the slot table rmesa->rmm->u_list.
+ *
+ * The new table is zero-filled and the old entries are copied into its
+ * front.  When an old table exists the command buffer is flushed before it
+ * is freed.  NOTE(review): presumably this is because queued commands can
+ * carry addresses of old entries (r300_mem_use() embeds &u_list[id].age) --
+ * confirm against the kernel scratch handling.
+ *
+ * Running out of memory is fatal: callers have no way to report it, and
+ * this file already exits on unrecoverable allocation errors.
+ *
+ * \param rmesa context owning the memory manager
+ */
+static void resize_u_list(r300ContextPtr rmesa)
+{
+	void *temp;
+	int nsize;
+
+	temp = rmesa->rmm->u_list;
+	nsize = rmesa->rmm->u_size * 2;
+
+	rmesa->rmm->u_list = _mesa_malloc(nsize * sizeof(*rmesa->rmm->u_list));
+	if (rmesa->rmm->u_list == NULL) {
+		/* Previously the NULL pointer went straight into memset. */
+		fprintf(stderr, "%s: out of memory (%d slots)\n",
+			__FUNCTION__, nsize);
+		exit(1);
+	}
+	_mesa_memset(rmesa->rmm->u_list, 0,
+		     nsize * sizeof(*rmesa->rmm->u_list));
+
+	if (temp) {
+		r300FlushCmdBuf(rmesa, __FUNCTION__);
+
+		_mesa_memcpy(rmesa->rmm->u_list, temp,
+			     rmesa->rmm->u_size * sizeof(*rmesa->rmm->u_list));
+		_mesa_free(temp);
+	}
+
+	rmesa->rmm->u_size = nsize;
+}
+
+/**
+ * Create the per-context memory manager and its initial slot table.
+ *
+ * u_size is seeded with 128 and resize_u_list() doubles it, so the first
+ * table has 256 entries.  Running out of memory is fatal, as elsewhere in
+ * this file.
+ *
+ * \param rmesa context that receives the new manager in rmesa->rmm
+ */
+void r300_mem_init(r300ContextPtr rmesa)
+{
+	rmesa->rmm = malloc(sizeof(struct r300_memory_manager));
+	if (rmesa->rmm == NULL) {
+		/* Previously the NULL pointer went straight into memset. */
+		fprintf(stderr, "%s: out of memory\n", __FUNCTION__);
+		exit(1);
+	}
+	memset(rmesa->rmm, 0, sizeof(struct r300_memory_manager));
+
+	rmesa->rmm->u_size = 128;
+	resize_u_list(rmesa);
+}
+
+/**
+ * Free the slot table and the memory manager itself, leaving rmesa->rmm
+ * NULL.  Buffers still recorded in the table are not released here.
+ *
+ * \param rmesa context whose memory manager is torn down
+ */
+void r300_mem_destroy(r300ContextPtr rmesa)
+{
+	struct r300_memory_manager *rmm = rmesa->rmm;
+
+	_mesa_free(rmm->u_list);
+	rmm->u_list = NULL;
+
+	_mesa_free(rmm);
+	rmesa->rmm = NULL;
+}
+
+/**
+ * Return the CPU address recorded for slot \p id.
+ *
+ * \param rmesa context owning the memory manager
+ * \param id    slot id; asserted to be no larger than u_last
+ * \return the slot's ptr field (NULL for a free slot)
+ */
+void *r300_mem_ptr(r300ContextPtr rmesa, int id)
+{
+	struct r300_memory_manager *rmm = rmesa->rmm;
+
+	assert(id <= rmm->u_last);
+
+	return rmm->u_list[id].ptr;
+}
+
+/**
+ * Find the slot whose buffer contains \p ptr.
+ *
+ * The table holds u_size entries (indices 0 .. u_size - 1) and slot 0 is
+ * never handed out, so the scan covers 1 .. u_size - 1.  The previous loop
+ * ran up to index u_size inclusive and read one element past the end of
+ * the table.  Address comparisons are done on char pointers instead of
+ * relying on arithmetic on void pointers.
+ *
+ * \param rmesa context owning the memory manager
+ * \param ptr   address to look up
+ * \return the slot id, or 0 (after printing a message) when no slot
+ *         contains \p ptr
+ */
+int r300_mem_find(r300ContextPtr rmesa, void *ptr)
+{
+	const char *addr = (const char *)ptr;
+	int i;
+
+	for (i = 1; i < rmesa->rmm->u_size; i++) {
+		const char *base = (const char *)rmesa->rmm->u_list[i].ptr;
+
+		if (base && addr >= base &&
+		    addr < base + rmesa->rmm->u_list[i].size)
+			return i;
+	}
+
+	fprintf(stderr, "%p failed\n", ptr);
+	return 0;
+}
+
+//#define MM_DEBUG
+/**
+ * Allocate \p size bytes of GART memory and record them in a free slot.
+ *
+ * Before allocating, the slot table is scanned from u_last + 1 down to 1.
+ * Slots that were passed to r300_mem_free() (pending), have no outstanding
+ * use (h_pending == 0) and whose age is not newer than the current
+ * radeonGetAge() value are really released with DRM_RADEON_FREE during
+ * that scan.  The lowest-numbered free slot seen is used.
+ *
+ * \param rmesa     context owning the memory manager
+ * \param alignment alignment passed through to DRM_RADEON_ALLOC
+ * \param size      number of bytes to allocate
+ * \return slot id (always > 0), or 0 when DRM_RADEON_ALLOC fails
+ *
+ * The process exits if DRM_RADEON_FREE fails or if no free slot turns up
+ * after more than 100 flush-and-rescan attempts.
+ */
+int r300_mem_alloc(r300ContextPtr rmesa, int alignment, int size)
+{
+	drm_radeon_mem_alloc_t alloc;
+	int offset = 0, ret;
+	int i, free = -1;	/* NB: 'free' shadows the libc function in this scope */
+	int done_age;
+	drm_radeon_mem_free_t memfree;
+	int tries = 0;
+	/* Statistics only; they are read solely by the disabled (#if 0)
+	 * debug print below and are updated before the allocation is known
+	 * to succeed. */
+	static int bytes_wasted = 0, allocated = 0;
+
+	if (size < 4096)
+		bytes_wasted += 4096 - size;
+
+	allocated += size;
+
+#if 0
+	static int t = 0;
+	if (t != time(NULL)) {
+		t = time(NULL);
+		fprintf(stderr, "slots used %d, wasted %d kb, allocated %d\n",
+			rmesa->rmm->u_last, bytes_wasted / 1024,
+			allocated / 1024);
+	}
+#endif
+
+	memfree.region = RADEON_MEM_REGION_GART;
+
+      again:
+
+	done_age = radeonGetAge((radeonContextPtr) rmesa);
+
+	/* Make sure index u_last + 1 exists before the scan touches it. */
+	if (rmesa->rmm->u_last + 1 >= rmesa->rmm->u_size)
+		resize_u_list(rmesa);
+
+	/* Scan downwards so that 'free' ends up as the lowest free index. */
+	for (i = rmesa->rmm->u_last + 1; i > 0; i--) {
+		if (rmesa->rmm->u_list[i].ptr == NULL) {
+			free = i;
+			continue;
+		}
+
+		/* Reclaim a slot that was freed by the client and is no
+		 * longer referenced by queued or in-flight commands. */
+		if (rmesa->rmm->u_list[i].h_pending == 0 &&
+		    rmesa->rmm->u_list[i].pending
+		    && rmesa->rmm->u_list[i].age <= done_age) {
+			memfree.region_offset =
+			    (char *)rmesa->rmm->u_list[i].ptr -
+			    (char *)rmesa->radeon.radeonScreen->gartTextures.
+			    map;
+
+			ret =
+			    drmCommandWrite(rmesa->radeon.radeonScreen->
+					    driScreen->fd, DRM_RADEON_FREE,
+					    &memfree, sizeof(memfree));
+
+			if (ret) {
+				fprintf(stderr, "Failed to free at %p\n",
+					rmesa->rmm->u_list[i].ptr);
+				fprintf(stderr, "ret = %s\n", strerror(-ret));
+				exit(1);
+			} else {
+#ifdef MM_DEBUG
+				fprintf(stderr, "really freed %d at age %x\n",
+					i,
+					radeonGetAge((radeonContextPtr) rmesa));
+#endif
+				if (i == rmesa->rmm->u_last)
+					rmesa->rmm->u_last--;
+
+				if (rmesa->rmm->u_list[i].size < 4096)
+					bytes_wasted -=
+					    4096 - rmesa->rmm->u_list[i].size;
+
+				allocated -= rmesa->rmm->u_list[i].size;
+				rmesa->rmm->u_list[i].pending = 0;
+				rmesa->rmm->u_list[i].ptr = NULL;
+				free = i;
+			}
+		}
+	}
+	/* The loop has no break, so i is always 0 here. */
+	rmesa->rmm->u_head = i;
+
+	/* No slot available: flush so that ages can advance, then rescan. */
+	if (free == -1) {
+		WARN_ONCE("Ran out of slots!\n");
+		//usleep(100);
+		r300FlushCmdBuf(rmesa, __FUNCTION__);
+		tries++;
+		if (tries > 100) {
+			WARN_ONCE("Ran out of slots!\n");
+			exit(1);
+		}
+		goto again;
+	}
+
+	/* The kernel writes the offset inside the GART region to 'offset'. */
+	alloc.region = RADEON_MEM_REGION_GART;
+	alloc.alignment = alignment;
+	alloc.size = size;
+	alloc.region_offset = &offset;
+
+	ret =
+	    drmCommandWriteRead(rmesa->radeon.dri.fd, DRM_RADEON_ALLOC, &alloc,
+				sizeof(alloc));
+	if (ret) {
+#if 0
+		WARN_ONCE("Ran out of mem!\n");
+		r300FlushCmdBuf(rmesa, __FUNCTION__);
+		//usleep(100);
+		tries2++;
+		tries = 0;
+		if (tries2 > 100) {
+			WARN_ONCE("Ran out of GART memory!\n");
+			exit(1);
+		}
+		goto again;
+#else
+		WARN_ONCE
+		    ("Ran out of GART memory (for %d)!\nPlease consider adjusting GARTSize option.\n",
+		     size);
+		return 0;
+#endif
+	}
+
+	i = free;
+
+	if (i > rmesa->rmm->u_last)
+		rmesa->rmm->u_last = i;
+
+	/* Record the new buffer; pending, h_pending and mapped keep whatever
+	 * value the slot had before. */
+	rmesa->rmm->u_list[i].ptr =
+	    ((GLubyte *) rmesa->radeon.radeonScreen->gartTextures.map) + offset;
+	rmesa->rmm->u_list[i].size = size;
+	rmesa->rmm->u_list[i].age = 0;
+	//fprintf(stderr, "alloc %p at id %d\n", rmesa->rmm->u_list[i].ptr, i);
+
+#ifdef MM_DEBUG
+	fprintf(stderr, "allocated %d at age %x\n", i,
+		radeonGetAge((radeonContextPtr) rmesa));
+#endif
+
+	return i;
+}
+
+/**
+ * Queue an R300_CMD_SCRATCH command that references slot \p id and count
+ * the slot as having one more outstanding use (h_pending).
+ *
+ * The command is a header dword, the 64-bit address of the slot's age
+ * field, and one trailing dword set to 0.
+ *
+ * \param rmesa context owning the memory manager and command buffer
+ * \param id    slot id; 0 is ignored
+ */
+void r300_mem_use(r300ContextPtr rmesa, int id)
+{
+	drm_r300_cmd_header_t *cmd;
+	uint64_t age_addr;
+	const int addr_dwords = sizeof(age_addr) / 4;
+
+#ifdef MM_DEBUG
+	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
+		radeonGetAge((radeonContextPtr) rmesa));
+#endif
+
+	assert(id <= rmesa->rmm->u_last);
+
+	if (id == 0)
+		return;
+
+	cmd = (drm_r300_cmd_header_t *)
+	    r300AllocCmdBuf(rmesa, 2 + addr_dwords, __FUNCTION__);
+
+	/* dword 0: scratch command header */
+	cmd[0].scratch.cmd_type = R300_CMD_SCRATCH;
+	cmd[0].scratch.reg = R300_MEM_SCRATCH;
+	cmd[0].scratch.n_bufs = 1;
+	cmd[0].scratch.flags = 0;
+
+	/* dwords 1..2: address of this slot's age field */
+	age_addr = (uint64_t) (intptr_t) & rmesa->rmm->u_list[id].age;
+	_mesa_memcpy(&cmd[1], &age_addr, sizeof(age_addr));
+
+	/* last dword */
+	cmd[1 + addr_dwords].u = /*id */ 0;
+
+	LOCK_HARDWARE(&rmesa->radeon);	/* Protect from DRM. */
+	rmesa->rmm->u_list[id].h_pending++;
+	UNLOCK_HARDWARE(&rmesa->radeon);
+}
+
+/**
+ * Return the card-side offset of slot \p id: its byte distance from the
+ * start of the gartTextures mapping plus gart_texture_offset.
+ *
+ * \param rmesa context owning the memory manager
+ * \param id    slot id; asserted to be no larger than u_last
+ */
+unsigned long r300_mem_offset(r300ContextPtr rmesa, int id)
+{
+	const char *gart_base;
+	const char *buf;
+
+	assert(id <= rmesa->rmm->u_last);
+
+	buf = (char *)rmesa->rmm->u_list[id].ptr;
+	gart_base = (char *)rmesa->radeon.radeonScreen->gartTextures.map;
+
+	return (unsigned long)(buf - gart_base) +
+	    rmesa->radeon.radeonScreen->gart_texture_offset;
+}
+
+/**
+ * Map slot \p id for CPU access and return its address.
+ *
+ * A read-only mapping (access == R300_MEM_R) is granted immediately.  Any
+ * other access first flushes the command buffer if the slot still has
+ * outstanding uses, then polls until radeonGetAge() has caught up with the
+ * slot's age, sleeping 10 microseconds at most 1000 times.
+ *
+ * The previous loop counted sleeps and tested the counter afterwards, so a
+ * buffer that became idle right after the 1000th sleep was still reported
+ * as "Idling failed".  The timeout is now only reported while the buffer
+ * is actually still busy; the number of sleeps is unchanged.
+ *
+ * \param rmesa  context owning the memory manager
+ * \param id     slot id; asserted to be no larger than u_last
+ * \param access R300_MEM_R, R300_MEM_W or R300_MEM_RW
+ * \return the buffer address, or NULL when the slot is still referenced by
+ *         queued commands after the flush or the wait timed out
+ */
+void *r300_mem_map(r300ContextPtr rmesa, int id, int access)
+{
+#ifdef MM_DEBUG
+	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
+		radeonGetAge((radeonContextPtr) rmesa));
+#endif
+	void *ptr;
+	int tries = 0;
+
+	assert(id <= rmesa->rmm->u_last);
+
+	if (access == R300_MEM_R) {
+
+		if (rmesa->rmm->u_list[id].mapped == 1)
+			WARN_ONCE("buffer %d already mapped\n", id);
+
+		rmesa->rmm->u_list[id].mapped = 1;
+		ptr = r300_mem_ptr(rmesa, id);
+
+		return ptr;
+	}
+
+	if (rmesa->rmm->u_list[id].h_pending)
+		r300FlushCmdBuf(rmesa, __FUNCTION__);
+
+	if (rmesa->rmm->u_list[id].h_pending) {
+		return NULL;
+	}
+
+	/* Wait for the hardware to pass this buffer's age. */
+	while (rmesa->rmm->u_list[id].age >
+	       radeonGetAge((radeonContextPtr) rmesa)) {
+		if (tries++ >= 1000) {
+			fprintf(stderr, "Idling failed (%x vs %x)\n",
+				rmesa->rmm->u_list[id].age,
+				radeonGetAge((radeonContextPtr) rmesa));
+			return NULL;
+		}
+		usleep(10);
+	}
+
+	if (rmesa->rmm->u_list[id].mapped == 1)
+		WARN_ONCE("buffer %d already mapped\n", id);
+
+	rmesa->rmm->u_list[id].mapped = 1;
+	ptr = r300_mem_ptr(rmesa, id);
+
+	return ptr;
+}
+
+/**
+ * Mark slot \p id as no longer mapped, warning once if it was not mapped
+ * in the first place.
+ *
+ * \param rmesa context owning the memory manager
+ * \param id    slot id; asserted to be no larger than u_last
+ */
+void r300_mem_unmap(r300ContextPtr rmesa, int id)
+{
+	struct r300_memory_manager *rmm = rmesa->rmm;
+
+#ifdef MM_DEBUG
+	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
+		radeonGetAge((radeonContextPtr) rmesa));
+#endif
+
+	assert(id <= rmm->u_last);
+
+	if (!rmm->u_list[id].mapped) {
+		WARN_ONCE("buffer %d not mapped\n", id);
+	}
+
+	rmm->u_list[id].mapped = 0;
+}
+
+/**
+ * Mark slot \p id as freed by the client.
+ *
+ * Only the pending flag is set here; the memory is really released later,
+ * inside r300_mem_alloc(), which reclaims pending slots during its scan.
+ * Slot 0, an unallocated slot and an already-pending slot are left alone
+ * (the latter two with a one-time warning).
+ *
+ * \param rmesa context owning the memory manager
+ * \param id    slot id; asserted to be no larger than u_last
+ */
+void r300_mem_free(r300ContextPtr rmesa, int id)
+{
+	struct r300_memory_manager *rmm = rmesa->rmm;
+
+#ifdef MM_DEBUG
+	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
+		radeonGetAge((radeonContextPtr) rmesa));
+#endif
+
+	assert(id <= rmm->u_last);
+
+	if (id == 0)
+		return;
+
+	if (!rmm->u_list[id].ptr) {
+		WARN_ONCE("Not allocated!\n");
+		return;
+	}
+
+	if (rmm->u_list[id].pending) {
+		WARN_ONCE("%p already pended!\n", rmm->u_list[id].ptr);
+		return;
+	}
+
+	rmm->u_list[id].pending = 1;
+}
+#endif
diff --git a/r300/r300_mem.h b/r300/r300_mem.h
new file mode 100644
index 0000000..625a7f6
--- /dev/null
+++ b/r300/r300_mem.h
@@ -0,0 +1,37 @@
+#ifndef __R300_MEM_H__
+#define __R300_MEM_H__
+
+//#define R300_MEM_PDL 0
+#define R300_MEM_UL 1
+
+#define R300_MEM_R 1
+#define R300_MEM_W 2
+#define R300_MEM_RW (R300_MEM_R | R300_MEM_W)
+
+#define R300_MEM_SCRATCH 2
+
+struct r300_memory_manager {	/* buffer table reached through rmesa->rmm by the r300_mem_* functions */
+	struct {
+		void *ptr;		/* NULL makes r300_mem_free() warn "Not allocated!" and bail out */
+		uint32_t size;		/* presumably the buffer size -- TODO confirm in r300_mem_alloc() */
+		uint32_t age;		/* non-read-only r300_mem_map() polls radeonGetAge() while it is below this */
+		uint32_t h_pending;	/* nonzero: non-read-only r300_mem_map() flushes the command buffer first */
+		int pending;		/* set to 1 by r300_mem_free() */
+		int mapped;		/* set to 1 by r300_mem_map(), reset to 0 by r300_mem_unmap() */
+	} *u_list;			/* entries are indexed by buffer id */
+	int u_head, u_size, u_last;	/* ids are asserted <= u_last; u_head/u_size usage not visible here */
+
+};
+
+extern void r300_mem_init(r300ContextPtr rmesa);
+extern void r300_mem_destroy(r300ContextPtr rmesa);
+extern void *r300_mem_ptr(r300ContextPtr rmesa, int id);
+extern int r300_mem_find(r300ContextPtr rmesa, void *ptr);
+extern int r300_mem_alloc(r300ContextPtr rmesa, int alignment, int size);
+extern void r300_mem_use(r300ContextPtr rmesa, int id);
+extern unsigned long r300_mem_offset(r300ContextPtr rmesa, int id);
+extern void *r300_mem_map(r300ContextPtr rmesa, int id, int access);
+extern void r300_mem_unmap(r300ContextPtr rmesa, int id);
+extern void r300_mem_free(r300ContextPtr rmesa, int id);
+
+#endif
diff --git a/r300/r300_program.h b/r300/r300_program.h
new file mode 100644
index 0000000..eddd783
--- /dev/null
+++ b/r300/r300_program.h
@@ -0,0 +1,150 @@
+/*
+Copyright (C) 2004 Nicolai Haehnle.  All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __R300_PROGRAM_H__
+#define __R300_PROGRAM_H__
+
+#include "r300_reg.h"
+
+/**
+ * Vertex program helper macros
+ */
+
+/* Produce out dword */
+#define VP_OUTCLASS_TMP		R300_VPI_OUT_REG_CLASS_TEMPORARY
+#define VP_OUTCLASS_OUT		R300_VPI_OUT_REG_CLASS_RESULT
+
+#define VP_OUTMASK_X	R300_VPI_OUT_WRITE_X
+#define VP_OUTMASK_Y	R300_VPI_OUT_WRITE_Y
+#define VP_OUTMASK_Z	R300_VPI_OUT_WRITE_Z
+#define VP_OUTMASK_W	R300_VPI_OUT_WRITE_W
+#define VP_OUTMASK_XY	(VP_OUTMASK_X|VP_OUTMASK_Y)
+#define VP_OUTMASK_XZ	(VP_OUTMASK_X|VP_OUTMASK_Z)
+#define VP_OUTMASK_XW	(VP_OUTMASK_X|VP_OUTMASK_W)
+#define VP_OUTMASK_XYZ	(VP_OUTMASK_XY|VP_OUTMASK_Z)
+#define VP_OUTMASK_XYW	(VP_OUTMASK_XY|VP_OUTMASK_W)
+#define VP_OUTMASK_XZW	(VP_OUTMASK_XZ|VP_OUTMASK_W)
+#define VP_OUTMASK_XYZW	(VP_OUTMASK_XYZ|VP_OUTMASK_W)
+#define VP_OUTMASK_YZ	(VP_OUTMASK_Y|VP_OUTMASK_Z)
+#define VP_OUTMASK_YW	(VP_OUTMASK_Y|VP_OUTMASK_W)
+#define VP_OUTMASK_YZW	(VP_OUTMASK_YZ|VP_OUTMASK_W)
+#define VP_OUTMASK_ZW	(VP_OUTMASK_Z|VP_OUTMASK_W)
+
+#define VP_OUT(instr,outclass,outidx,outmask) \
+	(R300_VPI_OUT_OP_##instr |				\
+	((outidx) << R300_VPI_OUT_REG_INDEX_SHIFT) |		\
+	VP_OUTCLASS_##outclass |				\
+	VP_OUTMASK_##outmask)
+
+/* Produce in dword */
+#define VP_INCLASS_TMP		R300_VPI_IN_REG_CLASS_TEMPORARY
+#define VP_INCLASS_IN		R300_VPI_IN_REG_CLASS_ATTRIBUTE
+#define VP_INCLASS_CONST	R300_VPI_IN_REG_CLASS_PARAMETER
+
+#define VP_IN(class,idx) \
+	(((idx) << R300_VPI_IN_REG_INDEX_SHIFT) |		\
+	VP_INCLASS_##class |					\
+	(R300_VPI_IN_SELECT_X << R300_VPI_IN_X_SHIFT) |		\
+	(R300_VPI_IN_SELECT_Y << R300_VPI_IN_Y_SHIFT) |		\
+	(R300_VPI_IN_SELECT_Z << R300_VPI_IN_Z_SHIFT) |		\
+	(R300_VPI_IN_SELECT_W << R300_VPI_IN_W_SHIFT))
+#define VP_ZERO() \
+	((R300_VPI_IN_SELECT_ZERO << R300_VPI_IN_X_SHIFT) |	\
+	(R300_VPI_IN_SELECT_ZERO << R300_VPI_IN_Y_SHIFT) |	\
+	(R300_VPI_IN_SELECT_ZERO << R300_VPI_IN_Z_SHIFT) |	\
+	(R300_VPI_IN_SELECT_ZERO << R300_VPI_IN_W_SHIFT))
+#define VP_ONE() \
+	((R300_VPI_IN_SELECT_ONE << R300_VPI_IN_X_SHIFT) |	\
+	(R300_VPI_IN_SELECT_ONE << R300_VPI_IN_Y_SHIFT) |	\
+	(R300_VPI_IN_SELECT_ONE << R300_VPI_IN_Z_SHIFT) |	\
+	(R300_VPI_IN_SELECT_ONE << R300_VPI_IN_W_SHIFT))
+
+#define VP_NEG(in,comp)		((in) ^ (R300_VPI_IN_NEG_##comp))	/* XOR-toggles R300_VPI_IN_NEG_<comp> in `in` */
+#define VP_NEGALL(in,comp)	VP_NEG(VP_NEG(VP_NEG(VP_NEG((in),X),Y),Z),W)	/* toggles X, Y, Z and W; `comp` is ignored */
+
+/**
+ * Fragment program helper macros
+ */
+
+/* Produce unshifted source selectors */
+#define FP_TMP(idx) (idx)
+#define FP_CONST(idx) ((idx) | (1 << 5))
+
+/* Produce source/dest selector dword */
+#define FP_SELC_MASK_NO		0
+#define FP_SELC_MASK_X		1
+#define FP_SELC_MASK_Y		2
+#define FP_SELC_MASK_XY		3
+#define FP_SELC_MASK_Z		4
+#define FP_SELC_MASK_XZ		5
+#define FP_SELC_MASK_YZ		6
+#define FP_SELC_MASK_XYZ	7
+
+#define FP_SELC(destidx,regmask,outmask,src0,src1,src2) \
+	(((destidx) << R300_FPI1_DSTC_SHIFT) |		\
+	 (FP_SELC_MASK_##regmask << 23) |		\
+	 (FP_SELC_MASK_##outmask << 26) |		\
+	 ((src0) << R300_FPI1_SRC0C_SHIFT) |		\
+	 ((src1) << R300_FPI1_SRC1C_SHIFT) |		\
+	 ((src2) << R300_FPI1_SRC2C_SHIFT))
+
+#define FP_SELA_MASK_NO		0
+#define FP_SELA_MASK_W		1
+
+#define FP_SELA(destidx,regmask,outmask,src0,src1,src2) \
+	(((destidx) << R300_FPI3_DSTA_SHIFT) |		\
+	 (FP_SELA_MASK_##regmask << 23) |		\
+	 (FP_SELA_MASK_##outmask << 24) |		\
+	 ((src0) << R300_FPI3_SRC0A_SHIFT) |		\
+	 ((src1) << R300_FPI3_SRC1A_SHIFT) |		\
+	 ((src2) << R300_FPI3_SRC2A_SHIFT))
+
+/* Produce unshifted argument selectors */
+#define FP_ARGC(source)	R300_FPI0_ARGC_##source
+#define FP_ARGA(source) R300_FPI2_ARGA_##source
+#define FP_ABS(arg) ((arg) | (1 << 6))
+#define FP_NEG(arg) ((arg) ^ (1 << 5))
+
+/* Produce instruction dword */
+#define FP_INSTRC(opcode,arg0,arg1,arg2) \
+	(R300_FPI0_OUTC_##opcode | 		\
+	((arg0) << R300_FPI0_ARG0C_SHIFT) |	\
+	((arg1) << R300_FPI0_ARG1C_SHIFT) |	\
+	((arg2) << R300_FPI0_ARG2C_SHIFT))
+
+#define FP_INSTRA(opcode,arg0,arg1,arg2) \
+	(R300_FPI2_OUTA_##opcode | 		\
+	((arg0) << R300_FPI2_ARG0A_SHIFT) |	\
+	((arg1) << R300_FPI2_ARG1A_SHIFT) |	\
+	((arg2) << R300_FPI2_ARG2A_SHIFT))
+
+extern void debug_vp(GLcontext * ctx, struct gl_vertex_program *vp);
+
+#endif				/* __R300_PROGRAM_H__ */
diff --git a/r300/r300_reg.h b/r300/r300_reg.h
new file mode 100644
index 0000000..e5501b6
--- /dev/null
+++ b/r300/r300_reg.h
@@ -0,0 +1,1635 @@
+/**************************************************************************
+
+Copyright (C) 2004-2005 Nicolai Haehnle et al.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/* *INDENT-OFF* */
+
+#ifndef _R300_REG_H
+#define _R300_REG_H
+
+#define R300_MC_INIT_MISC_LAT_TIMER	0x180
+#	define R300_MC_MISC__MC_CPR_INIT_LAT_SHIFT	0
+#	define R300_MC_MISC__MC_VF_INIT_LAT_SHIFT	4
+#	define R300_MC_MISC__MC_DISP0R_INIT_LAT_SHIFT	8
+#	define R300_MC_MISC__MC_DISP1R_INIT_LAT_SHIFT	12
+#	define R300_MC_MISC__MC_FIXED_INIT_LAT_SHIFT	16
+#	define R300_MC_MISC__MC_E2R_INIT_LAT_SHIFT	20
+#	define R300_MC_MISC__MC_SAME_PAGE_PRIO_SHIFT	24
+#	define R300_MC_MISC__MC_GLOBW_INIT_LAT_SHIFT	28
+
+
+#define R300_MC_INIT_GFX_LAT_TIMER	0x154
+#	define R300_MC_MISC__MC_G3D0R_INIT_LAT_SHIFT	0
+#	define R300_MC_MISC__MC_G3D1R_INIT_LAT_SHIFT	4
+#	define R300_MC_MISC__MC_G3D2R_INIT_LAT_SHIFT	8
+#	define R300_MC_MISC__MC_G3D3R_INIT_LAT_SHIFT	12
+#	define R300_MC_MISC__MC_TX0R_INIT_LAT_SHIFT	16
+#	define R300_MC_MISC__MC_TX1R_INIT_LAT_SHIFT	20
+#	define R300_MC_MISC__MC_GLOBR_INIT_LAT_SHIFT	24
+#	define R300_MC_MISC__MC_GLOBW_FULL_LAT_SHIFT	28
+
+/*
+ * This file contains registers and constants for the R300. They have been
+ * found mostly by examining command buffers captured using glxtest, as well
+ * as by extrapolating some known registers and constants from the R200.
+ * I am fairly certain that they are correct unless stated otherwise
+ * in comments.
+ */
+
+#define R300_SE_VPORT_XSCALE                0x1D98
+#define R300_SE_VPORT_XOFFSET               0x1D9C
+#define R300_SE_VPORT_YSCALE                0x1DA0
+#define R300_SE_VPORT_YOFFSET               0x1DA4
+#define R300_SE_VPORT_ZSCALE                0x1DA8
+#define R300_SE_VPORT_ZOFFSET               0x1DAC
+
+
+/*
+ * Vertex Array Processing (VAP) Control
+ * Stolen from r200 code from Christoph Brill (It's a guess!)
+ */
+#define R300_VAP_CNTL	0x2080
+
+/* This register is written directly and also starts data section
+ * in many 3d CP_PACKET3's
+ */
+#define R300_VAP_VF_CNTL	0x2084
+#	define	R300_VAP_VF_CNTL__PRIM_TYPE__SHIFT              0
+#	define  R300_VAP_VF_CNTL__PRIM_NONE                     (0<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_POINTS                   (1<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_LINES                    (2<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_LINE_STRIP               (3<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_TRIANGLES                (4<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_TRIANGLE_FAN             (5<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_TRIANGLE_STRIP           (6<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_LINE_LOOP                (12<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_QUADS                    (13<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_QUAD_STRIP               (14<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_POLYGON                  (15<<0)
+
+#	define	R300_VAP_VF_CNTL__PRIM_WALK__SHIFT              4
+	/* State based - direct writes to registers trigger vertex
+           generation */
+#	define	R300_VAP_VF_CNTL__PRIM_WALK_STATE_BASED         (0<<4)
+#	define	R300_VAP_VF_CNTL__PRIM_WALK_INDICES             (1<<4)
+#	define	R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST         (2<<4)
+#	define	R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_EMBEDDED     (3<<4)
+
+	/* I don't think I saw these three used.. */
+#	define	R300_VAP_VF_CNTL__COLOR_ORDER__SHIFT            6
+#	define	R300_VAP_VF_CNTL__TCL_OUTPUT_CTL_ENA__SHIFT     9
+#	define	R300_VAP_VF_CNTL__PROG_STREAM_ENA__SHIFT        10
+
+	/* index size - when not set the indices are assumed to be 16 bit */
+#	define	R300_VAP_VF_CNTL__INDEX_SIZE_32bit              (1<<11)
+	/* number of vertices */
+#	define	R300_VAP_VF_CNTL__NUM_VERTICES__SHIFT           16
+
+/* BEGIN: Wild guesses */
+#define R300_VAP_OUTPUT_VTX_FMT_0           0x2090
+#       define R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT     (1<<0)
+#       define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_PRESENT   (1<<1)
+#       define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT (1<<2)  /* GUESS */
+#       define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_2_PRESENT (1<<3)  /* GUESS */
+#       define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_3_PRESENT (1<<4)  /* GUESS */
+#       define R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT (1<<16) /* GUESS */
+
+#define R300_VAP_OUTPUT_VTX_FMT_1           0x2094
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_0_COMP_CNT_SHIFT 0
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_1_COMP_CNT_SHIFT 3
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_2_COMP_CNT_SHIFT 6
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_3_COMP_CNT_SHIFT 9
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_4_COMP_CNT_SHIFT 12
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_5_COMP_CNT_SHIFT 15
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_6_COMP_CNT_SHIFT 18
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_7_COMP_CNT_SHIFT 21
+/* END: Wild guesses */
+
+#define R300_SE_VTE_CNTL                  0x20b0
+#	define     R300_VPORT_X_SCALE_ENA                0x00000001
+#	define     R300_VPORT_X_OFFSET_ENA               0x00000002
+#	define     R300_VPORT_Y_SCALE_ENA                0x00000004
+#	define     R300_VPORT_Y_OFFSET_ENA               0x00000008
+#	define     R300_VPORT_Z_SCALE_ENA                0x00000010
+#	define     R300_VPORT_Z_OFFSET_ENA               0x00000020
+#	define     R300_VTX_XY_FMT                       0x00000100
+#	define     R300_VTX_Z_FMT                        0x00000200
+#	define     R300_VTX_W0_FMT                       0x00000400
+#	define     R300_VTX_W0_NORMALIZE                 0x00000800
+#	define     R300_VTX_ST_DENORMALIZED              0x00001000
+
+/* BEGIN: Vertex data assembly - lots of uncertainties */
+
+/* gap */
+
+#define R300_VAP_CNTL_STATUS              0x2140
+#	define R300_VC_NO_SWAP                  (0 << 0)
+#	define R300_VC_16BIT_SWAP               (1 << 0)
+#	define R300_VC_32BIT_SWAP               (2 << 0)
+#	define R300_VAP_TCL_BYPASS		(1 << 8)
+
+/* gap */
+
+/* Where do we get our vertex data?
+ *
+ * Vertex data comes either from immediate mode registers or from
+ * vertex arrays.
+ * There appears to be no mixed mode (though we can force the pitch of
+ * vertex arrays to 0, effectively reusing the same element over and over
+ * again).
+ *
+ * Immediate mode is controlled by the INPUT_CNTL registers. I am not sure
+ * if these registers influence vertex array processing.
+ *
+ * Vertex arrays are controlled via the 3D_LOAD_VBPNTR packet3.
+ *
+ * In both cases, vertex attributes are then passed through INPUT_ROUTE.
+ *
+ * Beginning with INPUT_ROUTE_0_0 is a list of WORDs that route vertex data
+ * into the vertex processor's input registers.
+ * The first word routes the first input, the second word the second, etc.
+ * The corresponding input is routed into the register with the given index.
+ * The list is ended by a word with INPUT_ROUTE_END set.
+ *
+ * Always set COMPONENTS_4 in immediate mode.
+ */
+
+#define R300_VAP_INPUT_ROUTE_0_0            0x2150
+#       define R300_INPUT_ROUTE_COMPONENTS_1     (0 << 0)
+#       define R300_INPUT_ROUTE_COMPONENTS_2     (1 << 0)
+#       define R300_INPUT_ROUTE_COMPONENTS_3     (2 << 0)
+#       define R300_INPUT_ROUTE_COMPONENTS_4     (3 << 0)
+#       define R300_INPUT_ROUTE_COMPONENTS_RGBA  (4 << 0) /* GUESS */
+#       define R300_VAP_INPUT_ROUTE_IDX_SHIFT    8
+#       define R300_VAP_INPUT_ROUTE_IDX_MASK     (31 << 8) /* GUESS */
+#       define R300_VAP_INPUT_ROUTE_END          (1 << 13)
+#       define R300_INPUT_ROUTE_IMMEDIATE_MODE   (0 << 14) /* GUESS */
+#       define R300_INPUT_ROUTE_FLOAT            (1 << 14) /* GUESS */
+#       define R300_INPUT_ROUTE_UNSIGNED_BYTE    (2 << 14) /* GUESS */
+#       define R300_INPUT_ROUTE_FLOAT_COLOR      (3 << 14) /* GUESS */
+#define R300_VAP_INPUT_ROUTE_0_1            0x2154
+#define R300_VAP_INPUT_ROUTE_0_2            0x2158
+#define R300_VAP_INPUT_ROUTE_0_3            0x215C
+#define R300_VAP_INPUT_ROUTE_0_4            0x2160
+#define R300_VAP_INPUT_ROUTE_0_5            0x2164
+#define R300_VAP_INPUT_ROUTE_0_6            0x2168
+#define R300_VAP_INPUT_ROUTE_0_7            0x216C
+
+/* gap */
+
+/* Notes:
+ *  - always set up to produce at least two attributes:
+ *    if vertex program uses only position, fglrx will set normal, too
+ *  - INPUT_CNTL_0_COLOR and INPUT_CNTL_COLOR bits are always equal.
+ */
+#define R300_VAP_INPUT_CNTL_0               0x2180
+#       define R300_INPUT_CNTL_0_COLOR           0x00000001
+#define R300_VAP_INPUT_CNTL_1               0x2184
+#       define R300_INPUT_CNTL_POS               0x00000001
+#       define R300_INPUT_CNTL_NORMAL            0x00000002
+#       define R300_INPUT_CNTL_COLOR             0x00000004
+#       define R300_INPUT_CNTL_TC0               0x00000400
+#       define R300_INPUT_CNTL_TC1               0x00000800
+#       define R300_INPUT_CNTL_TC2               0x00001000 /* GUESS */
+#       define R300_INPUT_CNTL_TC3               0x00002000 /* GUESS */
+#       define R300_INPUT_CNTL_TC4               0x00004000 /* GUESS */
+#       define R300_INPUT_CNTL_TC5               0x00008000 /* GUESS */
+#       define R300_INPUT_CNTL_TC6               0x00010000 /* GUESS */
+#       define R300_INPUT_CNTL_TC7               0x00020000 /* GUESS */
+
+/* gap */
+
+/* Words parallel to INPUT_ROUTE_0; All words that are active in INPUT_ROUTE_0
+ * are set to a swizzling bit pattern, other words are 0.
+ *
+ * In immediate mode, the pattern is always set to xyzw. In vertex array
+ * mode, the swizzling pattern is e.g. used to set zw components in texture
+ * coordinates with only two components.
+ */
+#define R300_VAP_INPUT_ROUTE_1_0            0x21E0
+#       define R300_INPUT_ROUTE_SELECT_X    0
+#       define R300_INPUT_ROUTE_SELECT_Y    1
+#       define R300_INPUT_ROUTE_SELECT_Z    2
+#       define R300_INPUT_ROUTE_SELECT_W    3
+#       define R300_INPUT_ROUTE_SELECT_ZERO 4
+#       define R300_INPUT_ROUTE_SELECT_ONE  5
+#       define R300_INPUT_ROUTE_SELECT_MASK 7
+#       define R300_INPUT_ROUTE_X_SHIFT     0
+#       define R300_INPUT_ROUTE_Y_SHIFT     3
+#       define R300_INPUT_ROUTE_Z_SHIFT     6
+#       define R300_INPUT_ROUTE_W_SHIFT     9
+#       define R300_INPUT_ROUTE_ENABLE      (15 << 12)
+#define R300_VAP_INPUT_ROUTE_1_1            0x21E4
+#define R300_VAP_INPUT_ROUTE_1_2            0x21E8
+#define R300_VAP_INPUT_ROUTE_1_3            0x21EC
+#define R300_VAP_INPUT_ROUTE_1_4            0x21F0
+#define R300_VAP_INPUT_ROUTE_1_5            0x21F4
+#define R300_VAP_INPUT_ROUTE_1_6            0x21F8
+#define R300_VAP_INPUT_ROUTE_1_7            0x21FC
+
+/* END: Vertex data assembly */
+
+/* gap */
+
+/* BEGIN: Upload vertex program and data */
+
+/*
+ * The programmable vertex shader unit has a memory bank of unknown size
+ * that can be written to in 16 byte units by writing the address into
+ * UPLOAD_ADDRESS, followed by data in UPLOAD_DATA (multiples of 4 DWORDs).
+ *
+ * Pointers into the memory bank are always in multiples of 16 bytes.
+ *
+ * The memory bank is divided into areas with fixed meaning.
+ *
+ * Starting at address UPLOAD_PROGRAM: Vertex program instructions.
+ * Native limits reported by drivers from ATI suggest size 256 (i.e. 4KB),
+ * whereas the difference between known addresses suggests size 512.
+ *
+ * Starting at address UPLOAD_PARAMETERS: Vertex program parameters.
+ * Native reported limits and the VPI layout suggest size 256, whereas
+ * difference between known addresses suggests size 512.
+ *
+ * At address UPLOAD_POINTSIZE is a vector (0, 0, ps, 0), where ps is the
+ * floating point pointsize. The exact purpose of this state is uncertain,
+ * as there is also the R300_RE_POINTSIZE register.
+ *
+ * Multiple vertex programs and parameter sets can be loaded at once,
+ * which could explain the size discrepancy.
+ */
+#define R300_VAP_PVS_UPLOAD_ADDRESS         0x2200
+#       define R300_PVS_UPLOAD_PROGRAM           0x00000000
+#       define R300_PVS_UPLOAD_PARAMETERS        0x00000200
+#       define R300_PVS_UPLOAD_CLIP_PLANE0       0x00000400
+#       define R300_PVS_UPLOAD_CLIP_PLANE1       0x00000401
+#       define R300_PVS_UPLOAD_CLIP_PLANE2       0x00000402
+#       define R300_PVS_UPLOAD_CLIP_PLANE3       0x00000403
+#       define R300_PVS_UPLOAD_CLIP_PLANE4       0x00000404
+#       define R300_PVS_UPLOAD_CLIP_PLANE5       0x00000405
+#       define R300_PVS_UPLOAD_POINTSIZE         0x00000406
+
+#       define R500_PVS_UPLOAD_CLIP_PLANE0       0x00000600
+#       define R500_PVS_UPLOAD_CLIP_PLANE1       0x00000601
+#       define R500_PVS_UPLOAD_CLIP_PLANE2       0x00000602
+#       define R500_PVS_UPLOAD_CLIP_PLANE3       0x00000603
+#       define R500_PVS_UPLOAD_CLIP_PLANE4       0x00000604
+#       define R500_PVS_UPLOAD_CLIP_PLANE5       0x00000605
+
+/* gap */
+
+#define R300_VAP_PVS_UPLOAD_DATA            0x2208
+
+/* END: Upload vertex program and data */
+
+/* gap */
+
+/* I do not know the purpose of this register. However, I do know that
+ * it is set to 221C_CLEAR for clear operations and to 221C_NORMAL
+ * for normal rendering.
+ *
+ * 2007-11-05: This register is the user clip plane control register, but there
+ * also seems to be a rendering mode control; the NORMAL/CLEAR defines.
+ *
+ * See bug #9871. http://bugs.freedesktop.org/attachment.cgi?id=10672&action=view
+ */
+#define R300_VAP_CLIP_CNTL                       0x221C
+#       define R300_221C_NORMAL                  0x00000000
+#       define R300_221C_CLEAR                   0x0001C000
+#define R300_VAP_UCP_ENABLE_0 (1 << 0)
+
+/* gap */
+
+/* Sometimes, END_OF_PKT and 0x2284=0 are the only commands sent between
+ * rendering commands and overwriting vertex program parameters.
+ * Therefore, I suspect writing zero to 0x2284 synchronizes the engine and
+ * avoids bugs caused by still running shaders reading bad data from memory.
+ */
+#define R300_VAP_PVS_WAITIDLE               0x2284 /* GUESS */
+
+/* Absolutely no clue what this register is about. */
+#define R300_VAP_UNKNOWN_2288               0x2288
+#       define R300_2288_R300                    0x00750000 /* -- nh */
+#       define R300_2288_RV350                   0x0000FFFF /* -- Vladimir */
+
+/* gap */
+
+/* Addresses are relative to the vertex program instruction area of the
+ * memory bank. PROGRAM_END points to the last instruction of the active
+ * program
+ *
+ * The meaning of the two UNKNOWN fields is obviously not known. However,
+ * experiments so far have shown that both *must* point to an instruction
+ * inside the vertex program, otherwise the GPU locks up.
+ * fglrx usually sets CNTL_3_UNKNOWN to the end of the program and
+ * CNTL_1_UNKNOWN points to instruction where last write to position takes
+ * place.
+ * Most likely this is used to ignore the rest of the program in cases
+ * where a group of verts isn't visible. For some reason this "section"
+ * sometimes accepts other instructions that have no relationship with
+ * position calculations.
+ */
+#define R300_VAP_PVS_CNTL_1                 0x22D0
+#       define R300_PVS_CNTL_1_PROGRAM_START_SHIFT   0
+#       define R300_PVS_CNTL_1_POS_END_SHIFT         10
+#       define R300_PVS_CNTL_1_PROGRAM_END_SHIFT     20
+/* Addresses are relative to the vertex program parameters area. */
+#define R300_VAP_PVS_CNTL_2                 0x22D4
+#       define R300_PVS_CNTL_2_PARAM_OFFSET_SHIFT 0
+#       define R300_PVS_CNTL_2_PARAM_COUNT_SHIFT  16
+#define R300_VAP_PVS_CNTL_3	           0x22D8
+#       define R300_PVS_CNTL_3_PROGRAM_UNKNOWN_SHIFT 10
+#       define R300_PVS_CNTL_3_PROGRAM_UNKNOWN2_SHIFT 0
+
+/* The entire range from 0x2300 to 0x24AC inclusive seems to be used for
+ * immediate vertices
+ */
+#define R300_VAP_VTX_COLOR_R                0x2464
+#define R300_VAP_VTX_COLOR_G                0x2468
+#define R300_VAP_VTX_COLOR_B                0x246C
+#define R300_VAP_VTX_POS_0_X_1              0x2490 /* used for glVertex2*() */
+#define R300_VAP_VTX_POS_0_Y_1              0x2494
+#define R300_VAP_VTX_COLOR_PKD              0x249C /* RGBA */
+#define R300_VAP_VTX_POS_0_X_2              0x24A0 /* used for glVertex3*() */
+#define R300_VAP_VTX_POS_0_Y_2              0x24A4
+#define R300_VAP_VTX_POS_0_Z_2              0x24A8
+/* write 0 to indicate end of packet? */
+#define R300_VAP_VTX_END_OF_PKT             0x24AC
+
+/* gap */
+
+/* These are values from r300_reg/r300_reg.h - they are known to be correct
+ * and are here so we can use one register file instead of several
+ * - Vladimir
+ */
+#define R300_GB_VAP_RASTER_VTX_FMT_0	0x4000
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__POS_PRESENT	(1<<0)
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_0_PRESENT	(1<<1)
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_1_PRESENT	(1<<2)
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_2_PRESENT	(1<<3)
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_3_PRESENT	(1<<4)
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_SPACE	(0xf<<5)
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__PT_SIZE_PRESENT	(0x1<<16)
+
+#define R300_GB_VAP_RASTER_VTX_FMT_1	0x4004
+	/* each of the following is 3 bits wide, specifies number
+	   of components */
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_0_COMP_CNT_SHIFT	0
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_1_COMP_CNT_SHIFT	3
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_2_COMP_CNT_SHIFT	6
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_3_COMP_CNT_SHIFT	9
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_4_COMP_CNT_SHIFT	12
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_5_COMP_CNT_SHIFT	15
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_6_COMP_CNT_SHIFT	18
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_7_COMP_CNT_SHIFT	21
+
+/* UNK30 seems to enable point to quad transformation on textures (or
+ * something closely related to that). NOTE(review): only UNK31 is defined
+ * below -- confirm which bit is meant. This bit is rather fatal for the
+ * time being due to shortcomings on the pixel shader side.
+ */
+#define R300_GB_ENABLE	0x4008
+#	define R300_GB_POINT_STUFF_ENABLE	(1<<0)
+#	define R300_GB_LINE_STUFF_ENABLE	(1<<1)
+#	define R300_GB_TRIANGLE_STUFF_ENABLE	(1<<2)
+#	define R300_GB_STENCIL_AUTO_ENABLE	(1<<4)
+#	define R300_GB_UNK31			(1U<<31)	/* unsigned: 1<<31 overflows a signed int */
+	/* each of the following is 2 bits wide */
+#define R300_GB_TEX_REPLICATE	0
+#define R300_GB_TEX_ST		1
+#define R300_GB_TEX_STR		2
+#	define R300_GB_TEX0_SOURCE_SHIFT	16
+#	define R300_GB_TEX1_SOURCE_SHIFT	18
+#	define R300_GB_TEX2_SOURCE_SHIFT	20
+#	define R300_GB_TEX3_SOURCE_SHIFT	22
+#	define R300_GB_TEX4_SOURCE_SHIFT	24
+#	define R300_GB_TEX5_SOURCE_SHIFT	26
+#	define R300_GB_TEX6_SOURCE_SHIFT	28
+#	define R300_GB_TEX7_SOURCE_SHIFT	30
+
+/* MSPOS - positions for multisample antialiasing (?) */
+#define R300_GB_MSPOS0	0x4010
+	/* shifts - each of the fields is 4 bits */
+#	define R300_GB_MSPOS0__MS_X0_SHIFT	0
+#	define R300_GB_MSPOS0__MS_Y0_SHIFT	4
+#	define R300_GB_MSPOS0__MS_X1_SHIFT	8
+#	define R300_GB_MSPOS0__MS_Y1_SHIFT	12
+#	define R300_GB_MSPOS0__MS_X2_SHIFT	16
+#	define R300_GB_MSPOS0__MS_Y2_SHIFT	20
+#	define R300_GB_MSPOS0__MSBD0_Y		24
+#	define R300_GB_MSPOS0__MSBD0_X		28
+
+#define R300_GB_MSPOS1	0x4014
+#	define R300_GB_MSPOS1__MS_X3_SHIFT	0
+#	define R300_GB_MSPOS1__MS_Y3_SHIFT	4
+#	define R300_GB_MSPOS1__MS_X4_SHIFT	8
+#	define R300_GB_MSPOS1__MS_Y4_SHIFT	12
+#	define R300_GB_MSPOS1__MS_X5_SHIFT	16
+#	define R300_GB_MSPOS1__MS_Y5_SHIFT	20
+#	define R300_GB_MSPOS1__MSBD1		24
+
+
+#define R300_GB_TILE_CONFIG	0x4018
+#	define R300_GB_TILE_ENABLE	(1<<0)
+#	define R300_GB_TILE_PIPE_COUNT_RV300	0
+#	define R300_GB_TILE_PIPE_COUNT_R300	(3<<1)
+#	define R300_GB_TILE_PIPE_COUNT_R420	(7<<1)
+#	define R300_GB_TILE_PIPE_COUNT_RV410	(3<<1)
+#	define R300_GB_TILE_SIZE_8		0
+#	define R300_GB_TILE_SIZE_16		(1<<4)
+#	define R300_GB_TILE_SIZE_32		(2<<4)
+#	define R300_GB_SUPER_SIZE_1		(0<<6)
+#	define R300_GB_SUPER_SIZE_2		(1<<6)
+#	define R300_GB_SUPER_SIZE_4		(2<<6)
+#	define R300_GB_SUPER_SIZE_8		(3<<6)
+#	define R300_GB_SUPER_SIZE_16		(4<<6)
+#	define R300_GB_SUPER_SIZE_32		(5<<6)
+#	define R300_GB_SUPER_SIZE_64		(6<<6)
+#	define R300_GB_SUPER_SIZE_128		(7<<6)
+#	define R300_GB_SUPER_X_SHIFT		9	/* 3 bits wide */
+#	define R300_GB_SUPER_Y_SHIFT		12	/* 3 bits wide */
+#	define R300_GB_SUPER_TILE_A		0
+#	define R300_GB_SUPER_TILE_B		(1<<15)
+#	define R300_GB_SUBPIXEL_1_12		0
+#	define R300_GB_SUBPIXEL_1_16		(1<<16)
+
+#define R300_GB_FIFO_SIZE	0x4024
+	/* each of the following is 2 bits wide */
+#define R300_GB_FIFO_SIZE_32	0
+#define R300_GB_FIFO_SIZE_64	1
+#define R300_GB_FIFO_SIZE_128	2
+#define R300_GB_FIFO_SIZE_256	3
+#	define R300_SC_IFIFO_SIZE_SHIFT	0
+#	define R300_SC_TZFIFO_SIZE_SHIFT	2
+#	define R300_SC_BFIFO_SIZE_SHIFT	4
+
+#	define R300_US_OFIFO_SIZE_SHIFT	12
+#	define R300_US_WFIFO_SIZE_SHIFT	14
+	/* the following use the same constants as above, but the meaning
+	   is times 2 (i.e. instead of 32 words it means 64) */
+#	define R300_RS_TFIFO_SIZE_SHIFT	6
+#	define R300_RS_CFIFO_SIZE_SHIFT	8
+#	define R300_US_RAM_SIZE_SHIFT		10
+	/* watermarks, 3 bits wide */
+#	define R300_RS_HIGHWATER_COL_SHIFT	16
+#	define R300_RS_HIGHWATER_TEX_SHIFT	19
+#	define R300_OFIFO_HIGHWATER_SHIFT	22	/* two bits only */
+#	define R300_CUBE_FIFO_HIGHWATER_COL_SHIFT	24
+
+#define R300_GB_SELECT	0x401C
+#	define R300_GB_FOG_SELECT_C0A		0
+#	define R300_GB_FOG_SELECT_C1A		1
+#	define R300_GB_FOG_SELECT_C2A		2
+#	define R300_GB_FOG_SELECT_C3A		3
+#	define R300_GB_FOG_SELECT_1_1_W	4
+#	define R300_GB_FOG_SELECT_Z		5
+#	define R300_GB_DEPTH_SELECT_Z		0
+#	define R300_GB_DEPTH_SELECT_1_1_W	(1<<3)
+#	define R300_GB_W_SELECT_1_W		0
+#	define R300_GB_W_SELECT_1		(1<<4)
+
+#define R300_GB_AA_CONFIG		0x4020
+#	define R300_AA_DISABLE			0x00
+#	define R300_AA_ENABLE			0x01
+#	define R300_AA_SUBSAMPLES_2		0
+#	define R300_AA_SUBSAMPLES_3		(1<<1)
+#	define R300_AA_SUBSAMPLES_4		(2<<1)
+#	define R300_AA_SUBSAMPLES_6		(3<<1)
+
+/* gap */
+
+/* Zero to flush caches. */
+#define R300_TX_CNTL                        0x4100
+#define R300_TX_FLUSH                       0x0
+
+/* The upper enable bits are guessed, based on fglrx reported limits. */
+#define R300_TX_ENABLE                      0x4104
+#       define R300_TX_ENABLE_0                  (1 << 0)
+#       define R300_TX_ENABLE_1                  (1 << 1)
+#       define R300_TX_ENABLE_2                  (1 << 2)
+#       define R300_TX_ENABLE_3                  (1 << 3)
+#       define R300_TX_ENABLE_4                  (1 << 4)
+#       define R300_TX_ENABLE_5                  (1 << 5)
+#       define R300_TX_ENABLE_6                  (1 << 6)
+#       define R300_TX_ENABLE_7                  (1 << 7)
+#       define R300_TX_ENABLE_8                  (1 << 8)
+#       define R300_TX_ENABLE_9                  (1 << 9)
+#       define R300_TX_ENABLE_10                 (1 << 10)
+#       define R300_TX_ENABLE_11                 (1 << 11)
+#       define R300_TX_ENABLE_12                 (1 << 12)
+#       define R300_TX_ENABLE_13                 (1 << 13)
+#       define R300_TX_ENABLE_14                 (1 << 14)
+#       define R300_TX_ENABLE_15                 (1 << 15)
+
+/* The pointsize is given in multiples of 6. The pointsize can be
+ * enormous: Clear() renders a single point that fills the entire
+ * framebuffer.
+ */
+#define R300_RE_POINTSIZE                   0x421C
+#       define R300_POINTSIZE_Y_SHIFT            0
+#       define R300_POINTSIZE_Y_MASK             (0xFFFF << 0) /* GUESS */
+#       define R300_POINTSIZE_X_SHIFT            16
+#       define R300_POINTSIZE_X_MASK             (0xFFFFU << 16) /* GUESS; unsigned so the shift cannot overflow int */
+#       define R300_POINTSIZE_MAX             (R300_POINTSIZE_Y_MASK / 6)
+
+/* The line width is given in multiples of 6.
+ * In default mode lines are classified as vertical lines.
+ * HO: horizontal
+ * VE: vertical or horizontal
+ * HO & VE: no classification
+ */
+#define R300_RE_LINE_CNT                      0x4234
+#       define R300_LINESIZE_SHIFT            0
+#       define R300_LINESIZE_MASK             (0xFFFF << 0) /* GUESS */
+#       define R300_LINESIZE_MAX             (R300_LINESIZE_MASK / 6)
+#       define R300_LINE_CNT_HO               (1 << 16)
+#       define R300_LINE_CNT_VE               (1 << 17)
+
+/* Some sort of scale or clamp value for texcoordless textures. */
+#define R300_RE_UNK4238                       0x4238
+
+/* Something shade related */
+#define R300_RE_SHADE                         0x4274
+
+#define R300_RE_SHADE_MODEL                   0x4278
+#	define R300_RE_SHADE_MODEL_SMOOTH     0x3aaaa
+#	define R300_RE_SHADE_MODEL_FLAT       0x39595
+
+/* Dangerous */
+#define R300_RE_POLYGON_MODE                  0x4288
+#	define R300_PM_ENABLED                (1 << 0)
+#	define R300_PM_FRONT_POINT            (0 << 0)
+#	define R300_PM_BACK_POINT             (0 << 0)
+#	define R300_PM_FRONT_LINE             (1 << 4)
+#	define R300_PM_FRONT_FILL             (1 << 5)
+#	define R300_PM_BACK_LINE              (1 << 7)
+#	define R300_PM_BACK_FILL              (1 << 8)
+
+/* Fog parameters */
+#define R300_RE_FOG_SCALE                     0x4294
+#define R300_RE_FOG_START                     0x4298
+
+/* Not sure why there are duplicates of the factor and constant values.
+ * My best guess so far is that there are separate zbiases for test and write.
+ * Ordering might be wrong.
+ * Some of the tests indicate that fgl has a fallback implementation of zbias
+ * via pixel shaders.
+ */
+#define R300_RE_ZBIAS_CNTL                    0x42A0 /* GUESS */
+#define R300_RE_ZBIAS_T_FACTOR                0x42A4
+#define R300_RE_ZBIAS_T_CONSTANT              0x42A8
+#define R300_RE_ZBIAS_W_FACTOR                0x42AC
+#define R300_RE_ZBIAS_W_CONSTANT              0x42B0
+
+/* This register needs to be set to (1<<1) for RV350 to correctly
+ * perform depth test (see --vb-triangles in r300_demo)
+ * Don't know about other chips. - Vladimir
+ * This is set to 3 when GL_POLYGON_OFFSET_FILL is on.
+ * My guess is that there are two bits for each zbias primitive
+ * (FILL, LINE, POINT).
+ *  One to enable depth test and one for depth write.
+ * Yet this doesn't explain why depth writes work ...
+ */
+#define R300_RE_OCCLUSION_CNTL		    0x42B4
+#	define R300_OCCLUSION_ON		(1<<1)
+
+#define R300_RE_CULL_CNTL                   0x42B8
+#       define R300_CULL_FRONT                   (1 << 0)
+#       define R300_CULL_BACK                    (1 << 1)
+#       define R300_FRONT_FACE_CCW               (0 << 2)
+#       define R300_FRONT_FACE_CW                (1 << 2)
+
+
+/* BEGIN: Rasterization / Interpolators - many guesses */
+
+/* 0_UNKNOWN_18 has always been set except for clear operations.
+ * TC_CNT is the number of incoming texture coordinate sets (i.e. it depends
+ * on the vertex program, *not* the fragment program)
+ */
+#define R300_RS_CNTL_0                      0x4300
+#       define R300_RS_CNTL_TC_CNT_SHIFT         2
+#       define R300_RS_CNTL_TC_CNT_MASK          (7 << 2)
+	/* number of color interpolators used */
+#	define R300_RS_CNTL_CI_CNT_SHIFT         7
+#       define R300_RS_CNTL_0_UNKNOWN_18         (1 << 18)
+	/* Guess: RS_CNTL_1 holds the index of the highest used RS_ROUTE_n
+	   register. */
+#define R300_RS_CNTL_1                      0x4304
+
+/* gap */
+
+/* Only used for texture coordinates.
+ * Use the source field to route texture coordinate input from the
+ * vertex program to the desired interpolator. Note that the source
+ * field is relative to the outputs the vertex program *actually*
+ * writes. If a vertex program only writes texcoord[1], this will
+ * be source index 0.
+ * Set INTERP_USED on all interpolators that produce data used by
+ * the fragment program. INTERP_USED looks like a swizzling mask,
+ * but I haven't seen it used that way.
+ *
+ * Note: The _UNKNOWN constants are always set in their respective
+ * register. I don't know if this is necessary.
+ */
+#define R300_RS_INTERP_0                    0x4310
+#define R300_RS_INTERP_1                    0x4314
+#       define R300_RS_INTERP_1_UNKNOWN          0x40
+#define R300_RS_INTERP_2                    0x4318
+#       define R300_RS_INTERP_2_UNKNOWN          0x80
+#define R300_RS_INTERP_3                    0x431C
+#       define R300_RS_INTERP_3_UNKNOWN          0xC0
+#define R300_RS_INTERP_4                    0x4320
+#define R300_RS_INTERP_5                    0x4324
+#define R300_RS_INTERP_6                    0x4328
+#define R300_RS_INTERP_7                    0x432C
+#       define R300_RS_INTERP_SRC_SHIFT          2
+#       define R300_RS_INTERP_SRC_MASK           (7 << 2)
+#       define R300_RS_INTERP_USED               0x00D10000
+
+/* These DWORDs control how vertex data is routed into fragment program
+ * registers, after interpolators.
+ */
+#define R300_RS_ROUTE_0                     0x4330
+#define R300_RS_ROUTE_1                     0x4334
+#define R300_RS_ROUTE_2                     0x4338
+#define R300_RS_ROUTE_3                     0x433C /* GUESS */
+#define R300_RS_ROUTE_4                     0x4340 /* GUESS */
+#define R300_RS_ROUTE_5                     0x4344 /* GUESS */
+#define R300_RS_ROUTE_6                     0x4348 /* GUESS */
+#define R300_RS_ROUTE_7                     0x434C /* GUESS */
+#       define R300_RS_ROUTE_SOURCE_INTERP_0     0
+#       define R300_RS_ROUTE_SOURCE_INTERP_1     1
+#       define R300_RS_ROUTE_SOURCE_INTERP_2     2
+#       define R300_RS_ROUTE_SOURCE_INTERP_3     3
+#       define R300_RS_ROUTE_SOURCE_INTERP_4     4
+#       define R300_RS_ROUTE_SOURCE_INTERP_5     5 /* GUESS */
+#       define R300_RS_ROUTE_SOURCE_INTERP_6     6 /* GUESS */
+#       define R300_RS_ROUTE_SOURCE_INTERP_7     7 /* GUESS */
+#       define R300_RS_ROUTE_ENABLE              (1 << 3) /* GUESS */
+#       define R300_RS_ROUTE_DEST_SHIFT          6
+#       define R300_RS_ROUTE_DEST_MASK           (31 << 6) /* GUESS */
+
+/* Special handling for color: When the fragment program uses color,
+ * the ROUTE_0_COLOR bit is set and ROUTE_0_COLOR_DEST contains the
+ * color register index.
+ *
+ * Apparently you may set the R300_RS_ROUTE_0_COLOR bit, but not provide any
+ * R300_RS_ROUTE_0_COLOR_DEST value; this setup is used for clearing the state.
+ * See r300_ioctl.c:r300EmitClearState. I'm not sure if this setup is strictly
+ * correct or not. - Oliver.
+ */
+#       define R300_RS_ROUTE_0_COLOR             (1 << 14)
+#       define R300_RS_ROUTE_0_COLOR_DEST_SHIFT  17
+#       define R300_RS_ROUTE_0_COLOR_DEST_MASK   (31 << 17) /* GUESS */
+/* As above, but for secondary color */
+#		define R300_RS_ROUTE_1_COLOR1            (1 << 14)
+#		define R300_RS_ROUTE_1_COLOR1_DEST_SHIFT 17
+#		define R300_RS_ROUTE_1_COLOR1_DEST_MASK  (31 << 17)
+#		define R300_RS_ROUTE_1_UNKNOWN11         (1 << 11)
+/* END: Rasterization / Interpolators - many guesses */
+
+/* BEGIN: Scissors and cliprects */
+
+/* There are four clipping rectangles. Their corner coordinates are inclusive.
+ * Every pixel is assigned a number from 0 to 15 by setting bits 0-3 depending
+ * on whether the pixel is inside cliprects 0-3, respectively. For example,
+ * if a pixel is inside cliprects 0 and 1, but outside 2 and 3, it is assigned
+ * the number 3 (binary 0011).
+ * Iff the bit corresponding to the pixel's number in RE_CLIPRECT_CNTL is set,
+ * the pixel is rasterized.
+ *
+ * In addition to this, there is a scissors rectangle. Only pixels inside the
+ * scissors rectangle are drawn. (coordinates are inclusive)
+ *
+ * For some reason, the top-left corner of the framebuffer is at (1440, 1440)
+ * for the purpose of clipping and scissors.
+ */
+#define R300_RE_CLIPRECT_TL_0               0x43B0
+#define R300_RE_CLIPRECT_BR_0               0x43B4
+#define R300_RE_CLIPRECT_TL_1               0x43B8
+#define R300_RE_CLIPRECT_BR_1               0x43BC
+#define R300_RE_CLIPRECT_TL_2               0x43C0
+#define R300_RE_CLIPRECT_BR_2               0x43C4
+#define R300_RE_CLIPRECT_TL_3               0x43C8
+#define R300_RE_CLIPRECT_BR_3               0x43CC
+#       define R300_CLIPRECT_OFFSET              1440
+#       define R300_CLIPRECT_MASK                0x1FFF
+#       define R300_CLIPRECT_X_SHIFT             0
+#       define R300_CLIPRECT_X_MASK              (0x1FFF << 0)
+#       define R300_CLIPRECT_Y_SHIFT             13
+#       define R300_CLIPRECT_Y_MASK              (0x1FFF << 13)
+#define R300_RE_CLIPRECT_CNTL               0x43D0
+#       define R300_CLIP_OUT                     (1 << 0)
+#       define R300_CLIP_0                       (1 << 1)
+#       define R300_CLIP_1                       (1 << 2)
+#       define R300_CLIP_10                      (1 << 3)
+#       define R300_CLIP_2                       (1 << 4)
+#       define R300_CLIP_20                      (1 << 5)
+#       define R300_CLIP_21                      (1 << 6)
+#       define R300_CLIP_210                     (1 << 7)
+#       define R300_CLIP_3                       (1 << 8)
+#       define R300_CLIP_30                      (1 << 9)
+#       define R300_CLIP_31                      (1 << 10)
+#       define R300_CLIP_310                     (1 << 11)
+#       define R300_CLIP_32                      (1 << 12)
+#       define R300_CLIP_320                     (1 << 13)
+#       define R300_CLIP_321                     (1 << 14)
+#       define R300_CLIP_3210                    (1 << 15)
+
+/* gap */
+
+#define R300_RE_SCISSORS_TL                 0x43E0
+#define R300_RE_SCISSORS_BR                 0x43E4
+#       define R300_SCISSORS_OFFSET              1440
+#       define R300_SCISSORS_X_SHIFT             0
+#       define R300_SCISSORS_X_MASK              (0x1FFF << 0)
+#       define R300_SCISSORS_Y_SHIFT             13
+#       define R300_SCISSORS_Y_MASK              (0x1FFF << 13)
+/* END: Scissors and cliprects */
+
+/* BEGIN: Texture specification */
+
+/*
+ * The texture specification dwords are grouped by meaning and not by texture
+ * unit. This means that e.g. the offset for texture image unit N is found in
+ * register TX_OFFSET_0 + (4*N)
+ */
+#define R300_TX_FILTER_0                    0x4400
+#       define R300_TX_REPEAT                    0
+#       define R300_TX_MIRRORED                  1
+#       define R300_TX_CLAMP                     4
+#       define R300_TX_CLAMP_TO_EDGE             2
+#       define R300_TX_CLAMP_TO_BORDER           6
+#       define R300_TX_WRAP_S_SHIFT              0
+#       define R300_TX_WRAP_S_MASK               (7 << 0)
+#       define R300_TX_WRAP_T_SHIFT              3
+#       define R300_TX_WRAP_T_MASK               (7 << 3)
+#       define R300_TX_WRAP_Q_SHIFT              6
+#       define R300_TX_WRAP_Q_MASK               (7 << 6)
+#       define R300_TX_MAG_FILTER_NEAREST        (1 << 9)
+#       define R300_TX_MAG_FILTER_LINEAR         (2 << 9)
+#       define R300_TX_MAG_FILTER_MASK           (3 << 9)
+#       define R300_TX_MIN_FILTER_NEAREST        (1 << 11)
+#       define R300_TX_MIN_FILTER_LINEAR         (2 << 11)
+#	define R300_TX_MIN_FILTER_NEAREST_MIP_NEAREST       (5  <<  11)
+#	define R300_TX_MIN_FILTER_NEAREST_MIP_LINEAR        (9  <<  11)
+#	define R300_TX_MIN_FILTER_LINEAR_MIP_NEAREST        (6  <<  11)
+#	define R300_TX_MIN_FILTER_LINEAR_MIP_LINEAR         (10 <<  11)
+
+/* NOTE: NEAREST doesn't seem to exist.
+ * I'm not setting MAG_FILTER_MASK and (3 << 11) on for all
+ * anisotropy modes because that would void selected mag filter
+ */
+#	define R300_TX_MIN_FILTER_ANISO_NEAREST             (0 << 13)
+#	define R300_TX_MIN_FILTER_ANISO_LINEAR              (0 << 13)
+#	define R300_TX_MIN_FILTER_ANISO_NEAREST_MIP_NEAREST (1 << 13)
+#	define R300_TX_MIN_FILTER_ANISO_NEAREST_MIP_LINEAR  (2 << 13)
+#       define R300_TX_MIN_FILTER_MASK   ( (15 << 11) | (3 << 13) )
+#	define R300_TX_MAX_ANISO_1_TO_1  (0 << 21)
+#	define R300_TX_MAX_ANISO_2_TO_1  (2 << 21)
+#	define R300_TX_MAX_ANISO_4_TO_1  (4 << 21)
+#	define R300_TX_MAX_ANISO_8_TO_1  (6 << 21)
+#	define R300_TX_MAX_ANISO_16_TO_1 (8 << 21)
+#	define R300_TX_MAX_ANISO_MASK    (14 << 21)
+
+#define R300_TX_FILTER1_0                      0x4440
+#	define R300_CHROMA_KEY_MODE_DISABLE    0
+#	define R300_CHROMA_KEY_FORCE	       1
+#	define R300_CHROMA_KEY_BLEND           2
+#	define R300_MC_ROUND_NORMAL            (0<<2)
+#	define R300_MC_ROUND_MPEG4             (1<<2)
+#	define R300_LOD_BIAS_MASK	    0x1fff
+#	define R300_EDGE_ANISO_EDGE_DIAG       (0<<13)
+#	define R300_EDGE_ANISO_EDGE_ONLY       (1<<13)
+#	define R300_MC_COORD_TRUNCATE_DISABLE  (0<<14)
+#	define R300_MC_COORD_TRUNCATE_MPEG     (1<<14)
+#	define R300_TX_TRI_PERF_0_8            (0<<15)
+#	define R300_TX_TRI_PERF_1_8            (1<<15)
+#	define R300_TX_TRI_PERF_1_4            (2<<15)
+#	define R300_TX_TRI_PERF_3_8            (3<<15)
+#	define R300_ANISO_THRESHOLD_MASK       (7<<17)
+
+#define R300_TX_SIZE_0                      0x4480 /* per-unit texture size word; field layout below */
+#       define R300_TX_WIDTHMASK_SHIFT           0
+#       define R300_TX_WIDTHMASK_MASK            (2047 << 0)
+#       define R300_TX_HEIGHTMASK_SHIFT          11
+#       define R300_TX_HEIGHTMASK_MASK           (2047 << 11)
+#       define R300_TX_UNK23                     (1 << 23)
+#       define R300_TX_MAX_MIP_LEVEL_SHIFT       26
+#       define R300_TX_MAX_MIP_LEVEL_MASK        (0xf << 26)
+#       define R300_TX_SIZE_PROJECTED            (1<<30)
+#       define R300_TX_SIZE_TXPITCH_EN           (1U<<31) /* unsigned: 1<<31 overflows a signed int */
+#define R300_TX_FORMAT_0                    0x44C0
+	/* The interpretation of the format word by Wladimir van der Laan */
+	/* The X, Y, Z and W refer to the layout of the components.
+	   They are given meanings as R, G, B and Alpha by the swizzle
+	   specification */
+#	define R300_TX_FORMAT_X8		    0x0
+#	define R300_TX_FORMAT_X16		    0x1
+#	define R300_TX_FORMAT_Y4X4		    0x2
+#	define R300_TX_FORMAT_Y8X8		    0x3
+#	define R300_TX_FORMAT_Y16X16		    0x4
+#	define R300_TX_FORMAT_Z3Y3X2		    0x5
+#	define R300_TX_FORMAT_Z5Y6X5		    0x6
+#	define R300_TX_FORMAT_Z6Y5X5		    0x7
+#	define R300_TX_FORMAT_Z11Y11X10		    0x8
+#	define R300_TX_FORMAT_Z10Y11X11		    0x9
+#	define R300_TX_FORMAT_W4Z4Y4X4		    0xA
+#	define R300_TX_FORMAT_W1Z5Y5X5		    0xB
+#	define R300_TX_FORMAT_W8Z8Y8X8		    0xC
+#	define R300_TX_FORMAT_W2Z10Y10X10	    0xD
+#	define R300_TX_FORMAT_W16Z16Y16X16	    0xE
+#	define R300_TX_FORMAT_DXT1	    	    0xF
+#	define R300_TX_FORMAT_DXT3	    	    0x10
+#	define R300_TX_FORMAT_DXT5	    	    0x11
+#	define R300_TX_FORMAT_D3DMFT_CxV8U8	    0x12     /* no swizzle */
+#	define R300_TX_FORMAT_A8R8G8B8	    	    0x13     /* no swizzle */
+#	define R300_TX_FORMAT_B8G8_B8G8	    	    0x14     /* no swizzle */
+#	define R300_TX_FORMAT_G8R8_G8B8	    	    0x15     /* no swizzle */
+	/* 0x16 - some 16 bit green format.. ?? */
+#	define R300_TX_FORMAT_UNK25		   (1 << 25) /* no swizzle */
+#	define R300_TX_FORMAT_CUBIC_MAP		   (1 << 26)
+
+	/* gap */
+	/* Floating point formats */
+	/* Note - hardware supports both 16 and 32 bit floating point */
+#	define R300_TX_FORMAT_FL_I16	    	    0x18
+#	define R300_TX_FORMAT_FL_I16A16	    	    0x19
+#	define R300_TX_FORMAT_FL_R16G16B16A16	    0x1A
+#	define R300_TX_FORMAT_FL_I32	    	    0x1B
+#	define R300_TX_FORMAT_FL_I32A32	    	    0x1C
+#	define R300_TX_FORMAT_FL_R32G32B32A32	    0x1D
+	/* alpha modes, convenience mostly */
+	/* if you have alpha, pick constant appropriate to the
+	   number of channels (1 for I8, 2 for I8A8, 4 for R8G8B8A8, etc.) */
+# 	define R300_TX_FORMAT_ALPHA_1CH		    0x000
+# 	define R300_TX_FORMAT_ALPHA_2CH		    0x200
+# 	define R300_TX_FORMAT_ALPHA_4CH		    0x600
+# 	define R300_TX_FORMAT_ALPHA_NONE	    0xA00
+	/* Swizzling */
+	/* constants */
+#	define R300_TX_FORMAT_X		0
+#	define R300_TX_FORMAT_Y		1
+#	define R300_TX_FORMAT_Z		2
+#	define R300_TX_FORMAT_W		3
+#	define R300_TX_FORMAT_ZERO	4
+#	define R300_TX_FORMAT_ONE	5
+	/* 2.0*Z, everything above 1.0 is set to 0.0 */
+#	define R300_TX_FORMAT_CUT_Z	6
+	/* 2.0*W, everything above 1.0 is set to 0.0 */
+#	define R300_TX_FORMAT_CUT_W	7
+
+#	define R300_TX_FORMAT_B_SHIFT	18
+#	define R300_TX_FORMAT_G_SHIFT	15
+#	define R300_TX_FORMAT_R_SHIFT	12
+#	define R300_TX_FORMAT_A_SHIFT	9
+	/* Convenience macro: pastes B, G, R, A and FMT onto R300_TX_FORMAT_, shifts the four swizzle selectors into place and ORs in the format */
+#	define R300_EASY_TX_FORMAT(B, G, R, A, FMT)	(		\
+		((R300_TX_FORMAT_##B)<<R300_TX_FORMAT_B_SHIFT)		\
+		| ((R300_TX_FORMAT_##G)<<R300_TX_FORMAT_G_SHIFT)	\
+		| ((R300_TX_FORMAT_##R)<<R300_TX_FORMAT_R_SHIFT)	\
+		| ((R300_TX_FORMAT_##A)<<R300_TX_FORMAT_A_SHIFT)	\
+		| (R300_TX_FORMAT_##FMT)				\
+		)
+	/* These can be ORed with result of R300_EASY_TX_FORMAT()
+	   We don't really know what they do. Take values from a
+           constant color ? */
+#	define R300_TX_FORMAT_CONST_X		(1<<5)
+#	define R300_TX_FORMAT_CONST_Y		(2<<5)
+#	define R300_TX_FORMAT_CONST_Z		(4<<5)
+#	define R300_TX_FORMAT_CONST_W		(8<<5)
+
+#	define R300_TX_FORMAT_YUV_MODE		0x00800000
+
+#define R300_TX_PITCH_0			    0x4500 /* obvious missing in gap */
+#define R300_TX_OFFSET_0                    0x4540
+	/* BEGIN: Guess from R200 */
+#       define R300_TXO_ENDIAN_NO_SWAP           (0 << 0)
+#       define R300_TXO_ENDIAN_BYTE_SWAP         (1 << 0)
+#       define R300_TXO_ENDIAN_WORD_SWAP         (2 << 0)
+#       define R300_TXO_ENDIAN_HALFDW_SWAP       (3 << 0)
+#       define R300_TXO_MACRO_TILE               (1 << 2)
+#       define R300_TXO_MICRO_TILE               (1 << 3)
+#       define R300_TXO_OFFSET_MASK              0xffffffe0
+#       define R300_TXO_OFFSET_SHIFT             5
+	/* END: Guess from R200 */
+
+/* 32 bit chroma key */
+#define R300_TX_CHROMA_KEY_0                      0x4580
+/* ff00ff00 == { 0, 1.0, 0, 1.0 } */
+#define R300_TX_BORDER_COLOR_0              0x45C0
+
+/* END: Texture specification */
+
+/* BEGIN: Fragment program instruction set */
+
+/* Fragment programs are written directly into register space.
+ * There are separate instruction streams for texture instructions and ALU
+ * instructions.
+ * In order to synchronize these streams, the program is divided into up
+ * to 4 nodes. Each node begins with a number of TEX operations, followed
+ * by a number of ALU operations.
+ * The first node can have zero TEX ops;
+ * all subsequent nodes must have at least
+ * one TEX op.
+ * All nodes must have at least one ALU op.
+ *
+ * The index of the last node is stored in PFS_CNTL_0: A value of 0 means
+ * 1 node, a value of 3 means 4 nodes.
+ * The total amount of instructions is defined in PFS_CNTL_2. The offsets are
+ * offsets into the respective instruction streams, while *_END points to the
+ * last instruction relative to this offset.
+ */
+#define R300_PFS_CNTL_0                     0x4600
+#       define R300_PFS_CNTL_LAST_NODES_SHIFT    0
+#       define R300_PFS_CNTL_LAST_NODES_MASK     (3 << 0)
+#       define R300_PFS_CNTL_FIRST_NODE_HAS_TEX  (1 << 3)
+#define R300_PFS_CNTL_1                     0x4604
+/* There is an unshifted value here which has so far always been equal to the
+ * index of the highest used temporary register.
+ */
+#define R300_PFS_CNTL_2                     0x4608
+#       define R300_PFS_CNTL_ALU_OFFSET_SHIFT    0
+#       define R300_PFS_CNTL_ALU_OFFSET_MASK     (63 << 0)
+#       define R300_PFS_CNTL_ALU_END_SHIFT       6
+#       define R300_PFS_CNTL_ALU_END_MASK        (63 << 6)
+#       define R300_PFS_CNTL_TEX_OFFSET_SHIFT    12
+#       define R300_PFS_CNTL_TEX_OFFSET_MASK     (31 << 12) /* GUESS */
+#       define R300_PFS_CNTL_TEX_END_SHIFT       18
+#       define R300_PFS_CNTL_TEX_END_MASK        (31 << 18) /* GUESS */
+
+/* gap */
+
+/* Nodes are stored backwards. The last active node is always stored in
+ * PFS_NODE_3.
+ * Example: In a 2-node program, NODE_0 and NODE_1 are set to 0. The
+ * first node is stored in NODE_2, the second node is stored in NODE_3.
+ *
+ * Offsets are relative to the master offset from PFS_CNTL_2.
+ * LAST_NODE is set for the last node, and only for the last node.
+ */
+#define R300_PFS_NODE_0                     0x4610
+#define R300_PFS_NODE_1                     0x4614
+#define R300_PFS_NODE_2                     0x4618
+#define R300_PFS_NODE_3                     0x461C
+#       define R300_PFS_NODE_ALU_OFFSET_SHIFT    0
+#       define R300_PFS_NODE_ALU_OFFSET_MASK     (63 << 0)
+#       define R300_PFS_NODE_ALU_END_SHIFT       6
+#       define R300_PFS_NODE_ALU_END_MASK        (63 << 6)
+#       define R300_PFS_NODE_TEX_OFFSET_SHIFT    12
+#       define R300_PFS_NODE_TEX_OFFSET_MASK     (31 << 12)
+#       define R300_PFS_NODE_TEX_END_SHIFT       17
+#       define R300_PFS_NODE_TEX_END_MASK        (31 << 17)
+/*#       define R300_PFS_NODE_LAST_NODE           (1 << 22) */
+#		define R300_PFS_NODE_OUTPUT_COLOR        (1 << 22)
+#		define R300_PFS_NODE_OUTPUT_DEPTH        (1 << 23)
+
+/* TEX
+ * As far as I can tell, texture instructions cannot write into output
+ * registers directly. A subsequent ALU instruction is always necessary,
+ * even if it's just MAD o0, r0, 1, 0
+ */
+#define R300_PFS_TEXI_0                     0x4620
+#	define R300_FPITX_SRC_SHIFT              0
+#	define R300_FPITX_SRC_MASK               (31 << 0)
+	/* GUESS */
+#	define R300_FPITX_SRC_CONST              (1 << 5)
+#	define R300_FPITX_DST_SHIFT              6
+#	define R300_FPITX_DST_MASK               (31 << 6)
+#	define R300_FPITX_IMAGE_SHIFT            11
+	/* GUESS based on layout and native limits */
+#       define R300_FPITX_IMAGE_MASK             (15 << 11)
+/* Unsure if these are opcodes, or some kind of bitfield, but this is how
+ * they were set when I checked
+ */
+#	define R300_FPITX_OPCODE_SHIFT		15
+#		define R300_FPITX_OP_TEX	1
+#		define R300_FPITX_OP_KIL	2
+#		define R300_FPITX_OP_TXP	3
+#		define R300_FPITX_OP_TXB	4
+#	define R300_FPITX_OPCODE_MASK           (7 << 15)
+
+/* ALU
+ * The ALU instruction register blocks are enumerated according to the order
+ * in which fglrx (presumably) writes them. I assume there is space for 64
+ * instructions, since each block has space for a maximum of 64 DWORDs, and
+ * this matches reported native limits.
+ *
+ * The basic functional block seems to be one MAD for each color and alpha,
+ * and an adder that adds all components after the MUL.
+ *  - ADD, MUL, MAD etc.: use MAD with appropriate neutral operands
+ *  - DP4: Use OUTC_DP4, OUTA_DP4
+ *  - DP3: Use OUTC_DP3, OUTA_DP4, appropriate alpha operands
+ *  - DPH: Use OUTC_DP4, OUTA_DP4, appropriate alpha operands
+ *  - CMPH: If ARG2 > 0.5, return ARG0, else return ARG1
+ *  - CMP: If ARG2 < 0, return ARG1, else return ARG0
+ *  - FLR: use FRC+MAD
+ *  - XPD: use MAD+MAD
+ *  - SGE, SLT: use MAD+CMP
+ *  - RSQ: use ABS modifier for argument
+ *  - Use OUTC_REPL_ALPHA to write results of an alpha-only operation
+ *    (e.g. RCP) into color register
+ *  - apparently, there's no quick DST operation
+ *  - fglrx set FPI2_UNKNOWN_31 on a "MAD fragment.color, tmp0, tmp1, tmp2"
+ *  - fglrx set FPI2_UNKNOWN_31 on a "MAX r2, r1, c0"
+ *  - fglrx once set FPI0_UNKNOWN_31 on a "FRC r1, r1"
+ *
+ * Operand selection
+ * First stage selects three sources from the available registers and
+ * constant parameters. This is defined in INSTR1 (color) and INSTR3 (alpha).
+ * fglrx sorts the three source fields: Registers before constants,
+ * lower indices before higher indices; I do not know whether this is
+ * necessary.
+ *
+ * fglrx fills unused sources with "read constant 0"
+ * According to specs, you cannot select more than two different constants.
+ *
+ * Second stage selects the operands from the sources. This is defined in
+ * INSTR0 (color) and INSTR2 (alpha). You can also select the special constants
+ * zero and one.
+ * Swizzling and negation happens in this stage, as well.
+ *
+ * Important: Color and alpha seem to be mostly separate, i.e. their sources
+ * selection appears to be fully independent (the register storage is probably
+ * physically split into a color and an alpha section).
+ * However (because of the apparent physical split), there is some interaction
+ * WRT swizzling. If, for example, you want to load an R component into an
+ * Alpha operand, this R component is taken from a *color* source, not from
+ * an alpha source. The corresponding register doesn't even have to appear in
+ * the alpha sources list. (I hope this all makes sense to you)
+ *
+ * Destination selection
+ * The destination register index is in FPI1 (color) and FPI3 (alpha)
+ * together with enable bits.
+ * There are separate enable bits for writing into temporary registers
+ * (DSTC_REG_* /DSTA_REG) and program output registers (DSTC_OUTPUT_*
+ * /DSTA_OUTPUT). You can write to both at once, or not write at all (the
+ * same index must be used for both).
+ *
+ * Note: There is a special form for LRP
+ *  - Argument order is the same as in ARB_fragment_program.
+ *  - Operation is MAD
+ *  - ARG1 is set to ARGC_SRC1C_LRP/ARGC_SRC1A_LRP
+ *  - Set FPI0/FPI2_SPECIAL_LRP
+ * Arbitrary LRP (including support for swizzling) requires vanilla MAD+MAD
+ */
+#define R300_PFS_INSTR1_0                   0x46C0
+#       define R300_FPI1_SRC0C_SHIFT             0
+#       define R300_FPI1_SRC0C_MASK              (31 << 0)
+#       define R300_FPI1_SRC0C_CONST             (1 << 5)
+#       define R300_FPI1_SRC1C_SHIFT             6
+#       define R300_FPI1_SRC1C_MASK              (31 << 6)
+#       define R300_FPI1_SRC1C_CONST             (1 << 11)
+#       define R300_FPI1_SRC2C_SHIFT             12
+#       define R300_FPI1_SRC2C_MASK              (31 << 12)
+#       define R300_FPI1_SRC2C_CONST             (1 << 17)
+#       define R300_FPI1_SRC_MASK                0x0003ffff
+#       define R300_FPI1_DSTC_SHIFT              18
+#       define R300_FPI1_DSTC_MASK               (31 << 18)
+#		define R300_FPI1_DSTC_REG_MASK_SHIFT     23
+#       define R300_FPI1_DSTC_REG_X              (1 << 23)
+#       define R300_FPI1_DSTC_REG_Y              (1 << 24)
+#       define R300_FPI1_DSTC_REG_Z              (1 << 25)
+#		define R300_FPI1_DSTC_OUTPUT_MASK_SHIFT  26
+#       define R300_FPI1_DSTC_OUTPUT_X           (1 << 26)
+#       define R300_FPI1_DSTC_OUTPUT_Y           (1 << 27)
+#       define R300_FPI1_DSTC_OUTPUT_Z           (1 << 28)
+
+#define R300_PFS_INSTR3_0                   0x47C0
+#       define R300_FPI3_SRC0A_SHIFT             0
+#       define R300_FPI3_SRC0A_MASK              (31 << 0)
+#       define R300_FPI3_SRC0A_CONST             (1 << 5)
+#       define R300_FPI3_SRC1A_SHIFT             6
+#       define R300_FPI3_SRC1A_MASK              (31 << 6)
+#       define R300_FPI3_SRC1A_CONST             (1 << 11)
+#       define R300_FPI3_SRC2A_SHIFT             12
+#       define R300_FPI3_SRC2A_MASK              (31 << 12)
+#       define R300_FPI3_SRC2A_CONST             (1 << 17)
+#       define R300_FPI3_SRC_MASK                0x0003ffff
+#       define R300_FPI3_DSTA_SHIFT              18
+#       define R300_FPI3_DSTA_MASK               (31 << 18)
+#       define R300_FPI3_DSTA_REG                (1 << 23)
+#       define R300_FPI3_DSTA_OUTPUT             (1 << 24)
+#		define R300_FPI3_DSTA_DEPTH              (1 << 27)
+
+#define R300_PFS_INSTR0_0                   0x48C0
+#       define R300_FPI0_ARGC_SRC0C_XYZ          0
+#       define R300_FPI0_ARGC_SRC0C_XXX          1
+#       define R300_FPI0_ARGC_SRC0C_YYY          2
+#       define R300_FPI0_ARGC_SRC0C_ZZZ          3
+#       define R300_FPI0_ARGC_SRC1C_XYZ          4
+#       define R300_FPI0_ARGC_SRC1C_XXX          5
+#       define R300_FPI0_ARGC_SRC1C_YYY          6
+#       define R300_FPI0_ARGC_SRC1C_ZZZ          7
+#       define R300_FPI0_ARGC_SRC2C_XYZ          8
+#       define R300_FPI0_ARGC_SRC2C_XXX          9
+#       define R300_FPI0_ARGC_SRC2C_YYY          10
+#       define R300_FPI0_ARGC_SRC2C_ZZZ          11
+#       define R300_FPI0_ARGC_SRC0A              12
+#       define R300_FPI0_ARGC_SRC1A              13
+#       define R300_FPI0_ARGC_SRC2A              14
+#       define R300_FPI0_ARGC_SRC1C_LRP          15
+#       define R300_FPI0_ARGC_ZERO               20
+#       define R300_FPI0_ARGC_ONE                21
+	/* GUESS */
+#       define R300_FPI0_ARGC_HALF               22
+#       define R300_FPI0_ARGC_SRC0C_YZX          23
+#       define R300_FPI0_ARGC_SRC1C_YZX          24
+#       define R300_FPI0_ARGC_SRC2C_YZX          25
+#       define R300_FPI0_ARGC_SRC0C_ZXY          26
+#       define R300_FPI0_ARGC_SRC1C_ZXY          27
+#       define R300_FPI0_ARGC_SRC2C_ZXY          28
+#       define R300_FPI0_ARGC_SRC0CA_WZY         29
+#       define R300_FPI0_ARGC_SRC1CA_WZY         30
+#       define R300_FPI0_ARGC_SRC2CA_WZY         31
+
+#       define R300_FPI0_ARG0C_SHIFT             0
+#       define R300_FPI0_ARG0C_MASK              (31 << 0) /* 5-bit field; wide enough for the R300_FPI0_ARGC_* values */
+#       define R300_FPI0_ARG0C_NEG               (1 << 5)
+#       define R300_FPI0_ARG0C_ABS               (1 << 6)
+#       define R300_FPI0_ARG1C_SHIFT             7
+#       define R300_FPI0_ARG1C_MASK              (31 << 7)
+#       define R300_FPI0_ARG1C_NEG               (1 << 12)
+#       define R300_FPI0_ARG1C_ABS               (1 << 13)
+#       define R300_FPI0_ARG2C_SHIFT             14
+#       define R300_FPI0_ARG2C_MASK              (31 << 14)
+#       define R300_FPI0_ARG2C_NEG               (1 << 19)
+#       define R300_FPI0_ARG2C_ABS               (1 << 20)
+#       define R300_FPI0_SPECIAL_LRP             (1 << 21)
+#       define R300_FPI0_OUTC_MAD                (0 << 23)
+#       define R300_FPI0_OUTC_DP3                (1 << 23)
+#       define R300_FPI0_OUTC_DP4                (2 << 23)
+#       define R300_FPI0_OUTC_MIN                (4 << 23)
+#       define R300_FPI0_OUTC_MAX                (5 << 23)
+#       define R300_FPI0_OUTC_CMPH               (7 << 23)
+#       define R300_FPI0_OUTC_CMP                (8 << 23)
+#       define R300_FPI0_OUTC_FRC                (9 << 23)
+#       define R300_FPI0_OUTC_REPL_ALPHA         (10 << 23)
+#       define R300_FPI0_OUTC_SAT                (1 << 30)
+#       define R300_FPI0_INSERT_NOP              (1U << 31) /* unsigned: 1 << 31 overflows a signed int */
+
+#define R300_PFS_INSTR2_0                   0x49C0
+#       define R300_FPI2_ARGA_SRC0C_X            0
+#       define R300_FPI2_ARGA_SRC0C_Y            1
+#       define R300_FPI2_ARGA_SRC0C_Z            2
+#       define R300_FPI2_ARGA_SRC1C_X            3
+#       define R300_FPI2_ARGA_SRC1C_Y            4
+#       define R300_FPI2_ARGA_SRC1C_Z            5
+#       define R300_FPI2_ARGA_SRC2C_X            6
+#       define R300_FPI2_ARGA_SRC2C_Y            7
+#       define R300_FPI2_ARGA_SRC2C_Z            8
+#       define R300_FPI2_ARGA_SRC0A              9
+#       define R300_FPI2_ARGA_SRC1A              10
+#       define R300_FPI2_ARGA_SRC2A              11
+#       define R300_FPI2_ARGA_SRC1A_LRP          15
+#       define R300_FPI2_ARGA_ZERO               16
+#       define R300_FPI2_ARGA_ONE                17
+	/* GUESS */
+#       define R300_FPI2_ARGA_HALF               18
+#       define R300_FPI2_ARG0A_SHIFT             0
+#       define R300_FPI2_ARG0A_MASK              (31 << 0)
+#       define R300_FPI2_ARG0A_NEG               (1 << 5)
+	/* GUESS */
+#	define R300_FPI2_ARG0A_ABS		 (1 << 6)
+#       define R300_FPI2_ARG1A_SHIFT             7
+#       define R300_FPI2_ARG1A_MASK              (31 << 7)
+#       define R300_FPI2_ARG1A_NEG               (1 << 12)
+	/* GUESS */
+#	define R300_FPI2_ARG1A_ABS		 (1 << 13)
+#       define R300_FPI2_ARG2A_SHIFT             14
+#       define R300_FPI2_ARG2A_MASK              (31 << 14)
+#       define R300_FPI2_ARG2A_NEG               (1 << 19)
+	/* GUESS */
+#	define R300_FPI2_ARG2A_ABS		 (1 << 20)
+#       define R300_FPI2_SPECIAL_LRP             (1 << 21)
+#       define R300_FPI2_OUTA_MAD                (0 << 23) /* alpha output opcodes start at bit 23 */
+#       define R300_FPI2_OUTA_DP4                (1 << 23)
+#       define R300_FPI2_OUTA_MIN                (2 << 23)
+#       define R300_FPI2_OUTA_MAX                (3 << 23)
+#       define R300_FPI2_OUTA_CMP                (6 << 23)
+#       define R300_FPI2_OUTA_FRC                (7 << 23)
+#       define R300_FPI2_OUTA_EX2                (8 << 23)
+#       define R300_FPI2_OUTA_LG2                (9 << 23)
+#       define R300_FPI2_OUTA_RCP                (10 << 23)
+#       define R300_FPI2_OUTA_RSQ                (11 << 23)
+#       define R300_FPI2_OUTA_SAT                (1 << 30)
+#       define R300_FPI2_UNKNOWN_31              (1U << 31) /* unsigned: 1 << 31 overflows a signed int */
+/* END: Fragment program instruction set */
+
+/* Fog state and color */
+#define R300_RE_FOG_STATE                   0x4BC0
+#       define R300_FOG_ENABLE                   (1 << 0)
+#	define R300_FOG_MODE_LINEAR              (0 << 1)
+#	define R300_FOG_MODE_EXP                 (1 << 1)
+#	define R300_FOG_MODE_EXP2                (2 << 1)
+#	define R300_FOG_MODE_MASK                (3 << 1)
+#define R300_FOG_COLOR_R                    0x4BC8
+#define R300_FOG_COLOR_G                    0x4BCC
+#define R300_FOG_COLOR_B                    0x4BD0
+
+#define R300_PP_ALPHA_TEST                  0x4BD4
+#       define R300_REF_ALPHA_MASK               0x000000ff
+#       define R300_ALPHA_TEST_FAIL              (0 << 8)
+#       define R300_ALPHA_TEST_LESS              (1 << 8)
+#       define R300_ALPHA_TEST_LEQUAL            (3 << 8)
+#       define R300_ALPHA_TEST_EQUAL             (2 << 8)
+#       define R300_ALPHA_TEST_GEQUAL            (6 << 8)
+#       define R300_ALPHA_TEST_GREATER           (4 << 8)
+#       define R300_ALPHA_TEST_NEQUAL            (5 << 8)
+#       define R300_ALPHA_TEST_PASS              (7 << 8)
+#       define R300_ALPHA_TEST_OP_MASK           (7 << 8)
+#       define R300_ALPHA_TEST_ENABLE            (1 << 11)
+
+/* gap */
+
+/* Fragment program parameters in 7.16 floating point */
+#define R300_PFS_PARAM_0_X                  0x4C00
+#define R300_PFS_PARAM_0_Y                  0x4C04
+#define R300_PFS_PARAM_0_Z                  0x4C08
+#define R300_PFS_PARAM_0_W                  0x4C0C
+/* GUESS: PARAM_31 is last, based on native limits reported by fglrx */
+#define R300_PFS_PARAM_31_X                 0x4DF0
+#define R300_PFS_PARAM_31_Y                 0x4DF4
+#define R300_PFS_PARAM_31_Z                 0x4DF8
+#define R300_PFS_PARAM_31_W                 0x4DFC
+
+/* Notes:
+ * - AFAIK fglrx always sets BLEND_UNKNOWN when blending is used in
+ *   the application
+ * - AFAIK fglrx always sets BLEND_NO_SEPARATE when CBLEND and ABLEND
+ *   are set to the same function (both registers are always set up
+ *   completely in any case)
+ * - Most blend flags are simply copied from R200 and not tested yet
+ */
+#define R300_RB3D_CBLEND                    0x4E04
+#define R300_RB3D_ABLEND                    0x4E08
+/* the following only appear in CBLEND */
+#       define R300_BLEND_ENABLE                     (1 << 0)
+#       define R300_BLEND_UNKNOWN                    (3 << 1)
+#       define R300_BLEND_NO_SEPARATE                (1 << 3)
+/* the following are shared between CBLEND and ABLEND */
+#       define R300_FCN_MASK                         (3  << 12) /* NOTE(review): 2-bit mask, but the COMB_FCN values below go up to 7 -- confirm */
+#       define R300_COMB_FCN_ADD_CLAMP               (0  << 12)
+#       define R300_COMB_FCN_ADD_NOCLAMP             (1  << 12)
+#       define R300_COMB_FCN_SUB_CLAMP               (2  << 12)
+#       define R300_COMB_FCN_SUB_NOCLAMP             (3  << 12)
+#       define R300_COMB_FCN_MIN                     (4  << 12)
+#       define R300_COMB_FCN_MAX                     (5  << 12)
+#       define R300_COMB_FCN_RSUB_CLAMP              (6  << 12)
+#       define R300_COMB_FCN_RSUB_NOCLAMP            (7  << 12)
+#       define R300_BLEND_GL_ZERO                    (32)
+#       define R300_BLEND_GL_ONE                     (33)
+#       define R300_BLEND_GL_SRC_COLOR               (34)
+#       define R300_BLEND_GL_ONE_MINUS_SRC_COLOR     (35)
+#       define R300_BLEND_GL_DST_COLOR               (36)
+#       define R300_BLEND_GL_ONE_MINUS_DST_COLOR     (37)
+#       define R300_BLEND_GL_SRC_ALPHA               (38)
+#       define R300_BLEND_GL_ONE_MINUS_SRC_ALPHA     (39)
+#       define R300_BLEND_GL_DST_ALPHA               (40)
+#       define R300_BLEND_GL_ONE_MINUS_DST_ALPHA     (41)
+#       define R300_BLEND_GL_SRC_ALPHA_SATURATE      (42)
+#       define R300_BLEND_GL_CONST_COLOR             (43)
+#       define R300_BLEND_GL_ONE_MINUS_CONST_COLOR   (44)
+#       define R300_BLEND_GL_CONST_ALPHA             (45)
+#       define R300_BLEND_GL_ONE_MINUS_CONST_ALPHA   (46)
+#       define R300_BLEND_MASK                       (63)
+#       define R300_SRC_BLEND_SHIFT                  (16)
+#       define R300_DST_BLEND_SHIFT                  (24)
+#define R300_RB3D_BLEND_COLOR               0x4E10
+#define R300_RB3D_COLORMASK                 0x4E0C
+#       define R300_COLORMASK0_B                 (1<<0)
+#       define R300_COLORMASK0_G                 (1<<1)
+#       define R300_COLORMASK0_R                 (1<<2)
+#       define R300_COLORMASK0_A                 (1<<3)
+
+/* gap */
+
+#define R300_RB3D_COLOROFFSET0              0x4E28
+#       define R300_COLOROFFSET_MASK             0xFFFFFFF0 /* GUESS */
+#define R300_RB3D_COLOROFFSET1              0x4E2C /* GUESS */
+#define R300_RB3D_COLOROFFSET2              0x4E30 /* GUESS */
+#define R300_RB3D_COLOROFFSET3              0x4E34 /* GUESS */
+
+/* gap */
+
+/* Bit 16: Larger tiles
+ * Bit 17: 4x2 tiles
+ * Bit 18: Extremely weird tile-like layout, with some pixels duplicated?
+ */
+#define R300_RB3D_COLORPITCH0               0x4E38
+#       define R300_COLORPITCH_MASK              0x00001FF8 /* GUESS */
+#       define R300_COLOR_TILE_ENABLE            (1 << 16) /* GUESS */
+#       define R300_COLOR_MICROTILE_ENABLE       (1 << 17) /* GUESS */
+#       define R300_COLOR_ENDIAN_NO_SWAP         (0 << 18) /* GUESS */
+#       define R300_COLOR_ENDIAN_WORD_SWAP       (1 << 18) /* GUESS */
+#       define R300_COLOR_ENDIAN_DWORD_SWAP      (2 << 18) /* GUESS */
+#       define R300_COLOR_FORMAT_RGB565          (2 << 22)
+#       define R300_COLOR_FORMAT_ARGB8888        (3 << 22)
+#define R300_RB3D_COLORPITCH1               0x4E3C /* GUESS */
+#define R300_RB3D_COLORPITCH2               0x4E40 /* GUESS */
+#define R300_RB3D_COLORPITCH3               0x4E44 /* GUESS */
+
+/* gap */
+
+/* Guess by Vladimir.
+ * Set to 0A before 3D operations, set to 02 afterwards.
+ */
+#define R300_RB3D_DSTCACHE_CTLSTAT          0x4E4C
+#       define R300_RB3D_DSTCACHE_UNKNOWN_02             0x00000002
+#       define R300_RB3D_DSTCACHE_UNKNOWN_0A             0x0000000A
+
+/* gap */
+/* There seems to be no "write only" setting, so use Z-test = ALWAYS
+ * for this.
+ * Bit (1<<8) is the "test" bit. so plain write is 6  - vd
+ */
+#define R300_RB3D_ZSTENCIL_CNTL_0                   0x4F00
+#       define R300_RB3D_Z_DISABLED_1            0x00000010
+#       define R300_RB3D_Z_DISABLED_2            0x00000014
+#       define R300_RB3D_Z_TEST                  0x00000012
+#       define R300_RB3D_Z_TEST_AND_WRITE        0x00000016
+#       define R300_RB3D_Z_WRITE_ONLY        	 0x00000006
+	/* NOTE: the next three defines repeat, value for value, the ones above */
+#       define R300_RB3D_Z_TEST                  0x00000012
+#       define R300_RB3D_Z_TEST_AND_WRITE        0x00000016
+#       define R300_RB3D_Z_WRITE_ONLY        	 0x00000006
+#	define R300_RB3D_STENCIL_ENABLE		 0x00000001
+
+#define R300_RB3D_ZSTENCIL_CNTL_1                   0x4F04
+	/* functions */
+#	define R300_ZS_NEVER			0
+#	define R300_ZS_LESS			1
+#	define R300_ZS_LEQUAL			2
+#	define R300_ZS_EQUAL			3
+#	define R300_ZS_GEQUAL			4
+#	define R300_ZS_GREATER			5
+#	define R300_ZS_NOTEQUAL			6
+#	define R300_ZS_ALWAYS			7
+#       define R300_ZS_MASK                     7
+	/* operations */
+#	define R300_ZS_KEEP			0
+#	define R300_ZS_ZERO			1
+#	define R300_ZS_REPLACE			2
+#	define R300_ZS_INCR			3
+#	define R300_ZS_DECR			4
+#	define R300_ZS_INVERT			5
+#	define R300_ZS_INCR_WRAP		6
+#	define R300_ZS_DECR_WRAP		7
+	/* front and back refer to operations done for front
+	   and back faces, i.e. separate stencil function support */
+#	define R300_RB3D_ZS1_DEPTH_FUNC_SHIFT		0
+#	define R300_RB3D_ZS1_FRONT_FUNC_SHIFT		3
+#	define R300_RB3D_ZS1_FRONT_FAIL_OP_SHIFT	6
+#	define R300_RB3D_ZS1_FRONT_ZPASS_OP_SHIFT	9
+#	define R300_RB3D_ZS1_FRONT_ZFAIL_OP_SHIFT      12
+#	define R300_RB3D_ZS1_BACK_FUNC_SHIFT           15
+#	define R300_RB3D_ZS1_BACK_FAIL_OP_SHIFT        18
+#	define R300_RB3D_ZS1_BACK_ZPASS_OP_SHIFT       21
+#	define R300_RB3D_ZS1_BACK_ZFAIL_OP_SHIFT       24
+
+#define R300_RB3D_ZSTENCIL_CNTL_2                   0x4F08
+#	define R300_RB3D_ZS2_STENCIL_REF_SHIFT		0
+#	define R300_RB3D_ZS2_STENCIL_MASK		0xFF
+#	define R300_RB3D_ZS2_STENCIL_MASK_SHIFT	        8
+#	define R300_RB3D_ZS2_STENCIL_WRITE_MASK_SHIFT	16
+
+/* gap */
+
+#define R300_RB3D_ZSTENCIL_FORMAT                   0x4F10
+#	define R300_DEPTH_FORMAT_16BIT_INT_Z     (0 << 0)
+#	define R300_DEPTH_FORMAT_24BIT_INT_Z     (2 << 0)
+	/* 16 bit format or some additional bit ? */
+#	define R300_DEPTH_FORMAT_UNK32          (32 << 0)
+
+#define R300_RB3D_EARLY_Z                           0x4F14
+#	define R300_EARLY_Z_DISABLE              (0 << 0)
+#	define R300_EARLY_Z_ENABLE               (1 << 0)
+
+/* gap */
+
+#define R300_RB3D_ZCACHE_CTLSTAT            0x4F18 /* GUESS */
+#       define R300_RB3D_ZCACHE_UNKNOWN_01  0x1
+#       define R300_RB3D_ZCACHE_UNKNOWN_03  0x3
+
+/* gap */
+
+#define R300_RB3D_DEPTHOFFSET               0x4F20
+#define R300_RB3D_DEPTHPITCH                0x4F24
+#       define R300_DEPTHPITCH_MASK              0x00001FF8 /* GUESS */
+#       define R300_DEPTH_TILE_ENABLE            (1 << 16) /* GUESS */
+#       define R300_DEPTH_MICROTILE_ENABLE       (1 << 17) /* GUESS */
+#       define R300_DEPTH_ENDIAN_NO_SWAP         (0 << 18) /* GUESS */
+#       define R300_DEPTH_ENDIAN_WORD_SWAP       (1 << 18) /* GUESS */
+#       define R300_DEPTH_ENDIAN_DWORD_SWAP      (2 << 18) /* GUESS */
+
+/* BEGIN: Vertex program instruction set */
+
+/* Every instruction is four dwords long:
+ *  DWORD 0: output and opcode
+ *  DWORD 1: first argument
+ *  DWORD 2: second argument
+ *  DWORD 3: third argument
+ *
+ * Notes:
+ *  - ABS r, a is implemented as MAX r, a, -a
+ *  - MOV is implemented as ADD to zero
+ *  - XPD is implemented as MUL + MAD
+ *  - FLR is implemented as FRC + ADD
+ *  - apparently, fglrx tries to schedule instructions so that there is at
+ *    least one instruction between the write to a temporary and the first
+ *    read from said temporary; however, violations of this scheduling are
+ *    allowed
+ *  - register indices seem to be unrelated to the OpenGL aliasing of
+ *    conventional state
+ *  - only one attribute and one parameter can be loaded at a time; however,
+ *    the same attribute/parameter can be used for more than one argument
+ *  - the second software argument for POW is the third hardware argument
+ *    (no idea why)
+ *  - MAD with only temporaries as input seems to use VPI_OUT_SELECT_MAD_2
+ *
+ * There is some magic surrounding LIT:
+ *   The single argument is replicated across all three inputs, but swizzled:
+ *     First argument: xyzy
+ *     Second argument: xyzx
+ *     Third argument: xyzw
+ *   Whenever the result is used later in the fragment program, fglrx forces
+ *   x and w to be 1.0 in the input selection; I don't know whether this is
+ *   strictly necessary
+ */
+#define R300_VPI_OUT_OP_DOT                     (1 << 0)
+#define R300_VPI_OUT_OP_MUL                     (2 << 0)
+#define R300_VPI_OUT_OP_ADD                     (3 << 0)
+#define R300_VPI_OUT_OP_MAD                     (4 << 0)
+#define R300_VPI_OUT_OP_DST                     (5 << 0)
+#define R300_VPI_OUT_OP_FRC                     (6 << 0)
+#define R300_VPI_OUT_OP_MAX                     (7 << 0)
+#define R300_VPI_OUT_OP_MIN                     (8 << 0)
+#define R300_VPI_OUT_OP_SGE                     (9 << 0)
+#define R300_VPI_OUT_OP_SLT                     (10 << 0)
+	/* Used in GL_POINT_DISTANCE_ATTENUATION_ARB, vector(scalar, vector) */
+#define R300_VPI_OUT_OP_UNK12                   (12 << 0)
+#define R300_VPI_OUT_OP_ARL                     (13 << 0)
+#define R300_VPI_OUT_OP_EXP                     (65 << 0)
+#define R300_VPI_OUT_OP_LOG                     (66 << 0)
+	/* Used in fog computations, scalar(scalar) */
+#define R300_VPI_OUT_OP_UNK67                   (67 << 0)
+#define R300_VPI_OUT_OP_LIT                     (68 << 0)
+#define R300_VPI_OUT_OP_POW                     (69 << 0)
+#define R300_VPI_OUT_OP_RCP                     (70 << 0)
+#define R300_VPI_OUT_OP_RSQ                     (72 << 0)
+	/* Used in GL_POINT_DISTANCE_ATTENUATION_ARB, scalar(scalar) */
+#define R300_VPI_OUT_OP_UNK73                   (73 << 0)
+#define R300_VPI_OUT_OP_EX2                     (75 << 0)
+#define R300_VPI_OUT_OP_LG2                     (76 << 0)
+#define R300_VPI_OUT_OP_MAD_2                   (128 << 0)
+	/* all temps, vector(scalar, vector, vector) */
+#define R300_VPI_OUT_OP_UNK129                  (129 << 0)
+
+#define R300_VPI_OUT_REG_CLASS_TEMPORARY        (0 << 8)
+#define R300_VPI_OUT_REG_CLASS_ADDR             (1 << 8)
+#define R300_VPI_OUT_REG_CLASS_RESULT           (2 << 8)
+#define R300_VPI_OUT_REG_CLASS_MASK             (31 << 8)
+
+#define R300_VPI_OUT_REG_INDEX_SHIFT            13
+	/* GUESS based on fglrx native limits */
+#define R300_VPI_OUT_REG_INDEX_MASK             (31 << 13)
+
+#define R300_VPI_OUT_WRITE_X                    (1 << 20)
+#define R300_VPI_OUT_WRITE_Y                    (1 << 21)
+#define R300_VPI_OUT_WRITE_Z                    (1 << 22)
+#define R300_VPI_OUT_WRITE_W                    (1 << 23)
+
+#define R300_VPI_IN_REG_CLASS_TEMPORARY         (0 << 0)
+#define R300_VPI_IN_REG_CLASS_ATTRIBUTE         (1 << 0)
+#define R300_VPI_IN_REG_CLASS_PARAMETER         (2 << 0)
+#define R300_VPI_IN_REG_CLASS_NONE              (9 << 0)
+#define R300_VPI_IN_REG_CLASS_MASK              (31 << 0)
+
+#define R300_VPI_IN_REG_INDEX_SHIFT             5
+	/* GUESS based on fglrx native limits */
+#define R300_VPI_IN_REG_INDEX_MASK              (255 << 5)
+
+/* The R300 can select components from the input register arbitrarily.
+ * Use the following constants, shifted by the component shift you
+ * want to select
+ */
+#define R300_VPI_IN_SELECT_X    0
+#define R300_VPI_IN_SELECT_Y    1
+#define R300_VPI_IN_SELECT_Z    2
+#define R300_VPI_IN_SELECT_W    3
+#define R300_VPI_IN_SELECT_ZERO 4
+#define R300_VPI_IN_SELECT_ONE  5
+#define R300_VPI_IN_SELECT_MASK 7
+
+#define R300_VPI_IN_X_SHIFT                     13
+#define R300_VPI_IN_Y_SHIFT                     16
+#define R300_VPI_IN_Z_SHIFT                     19
+#define R300_VPI_IN_W_SHIFT                     22
+
+#define R300_VPI_IN_NEG_X                       (1 << 25)
+#define R300_VPI_IN_NEG_Y                       (1 << 26)
+#define R300_VPI_IN_NEG_Z                       (1 << 27)
+#define R300_VPI_IN_NEG_W                       (1 << 28)
+/* END: Vertex program instruction set */
+
+/* BEGIN: Packet 3 commands */
+
+/* A primitive emission dword. */
+#define R300_PRIM_TYPE_NONE                     (0 << 0)
+#define R300_PRIM_TYPE_POINT                    (1 << 0)
+#define R300_PRIM_TYPE_LINE                     (2 << 0)
+#define R300_PRIM_TYPE_LINE_STRIP               (3 << 0)
+#define R300_PRIM_TYPE_TRI_LIST                 (4 << 0)
+#define R300_PRIM_TYPE_TRI_FAN                  (5 << 0)
+#define R300_PRIM_TYPE_TRI_STRIP                (6 << 0)
+#define R300_PRIM_TYPE_TRI_TYPE2                (7 << 0)
+#define R300_PRIM_TYPE_RECT_LIST                (8 << 0)
+#define R300_PRIM_TYPE_3VRT_POINT_LIST          (9 << 0)
+#define R300_PRIM_TYPE_3VRT_LINE_LIST           (10 << 0)
+	/* GUESS (based on r200) */
+#define R300_PRIM_TYPE_POINT_SPRITES            (11 << 0)
+#define R300_PRIM_TYPE_LINE_LOOP                (12 << 0)
+#define R300_PRIM_TYPE_QUADS                    (13 << 0)
+#define R300_PRIM_TYPE_QUAD_STRIP               (14 << 0)
+#define R300_PRIM_TYPE_POLYGON                  (15 << 0)
+#define R300_PRIM_TYPE_MASK                     0xF
+#define R300_PRIM_WALK_IND                      (1 << 4)
+#define R300_PRIM_WALK_LIST                     (2 << 4)
+#define R300_PRIM_WALK_RING                     (3 << 4)
+#define R300_PRIM_WALK_MASK                     (3 << 4)
+	/* GUESS (based on r200) */
+#define R300_PRIM_COLOR_ORDER_BGRA              (0 << 6)
+#define R300_PRIM_COLOR_ORDER_RGBA              (1 << 6)
+#define R300_PRIM_NUM_VERTICES_SHIFT            16
+#define R300_PRIM_NUM_VERTICES_MASK             0xffff
+
+/* Draw a primitive from vertex data in arrays loaded via 3D_LOAD_VBPNTR.
+ * Two parameter dwords:
+ * 0. The first parameter appears to be always 0
+ * 1. The second parameter is a standard primitive emission dword.
+ */
+#define R300_PACKET3_3D_DRAW_VBUF           0x00002800
+
+/* Specify the full set of vertex arrays as (address, stride).
+ * The first parameter is the number of vertex arrays specified.
+ * The rest of the command is a variable length list of blocks, where
+ * each block is three dwords long and specifies two arrays.
+ * The first dword of a block is split into two words: the less significant
+ * word refers to the first array, the more significant word to the second
+ * array in the block.
+ * The low byte of each word contains the size of an array entry in dwords,
+ * the high byte contains the stride of the array.
+ * The second dword of a block contains the pointer to the first array,
+ * the third dword of a block contains the pointer to the second array.
+ * Note that if the total number of arrays is odd, the third dword of
+ * the last block is omitted.
+ */
+#define R300_PACKET3_3D_LOAD_VBPNTR         0x00002F00
+
+#define R300_PACKET3_INDX_BUFFER            0x00003300
+#    define R300_EB_UNK1_SHIFT                      24
+#    define R300_EB_UNK1                    (0x80<<24)
+#    define R300_EB_UNK2                        0x0810
+#define R300_PACKET3_3D_DRAW_INDX_2         0x00003600
+
+/* END: Packet 3 commands */
+
+
+/* Color formats for 2d packets
+ */
+#define R300_CP_COLOR_FORMAT_CI8	2
+#define R300_CP_COLOR_FORMAT_ARGB1555	3
+#define R300_CP_COLOR_FORMAT_RGB565	4
+#define R300_CP_COLOR_FORMAT_ARGB8888	6
+#define R300_CP_COLOR_FORMAT_RGB332	7
+#define R300_CP_COLOR_FORMAT_RGB8	9
+#define R300_CP_COLOR_FORMAT_ARGB4444	15
+
+/*
+ * CP type-3 packets
+ */
+#define R300_CP_CMD_BITBLT_MULTI	0xC0009B00
+
+#endif /* _R300_REG_H */
+
+/* *INDENT-ON* */
diff --git a/r300/r300_render.c b/r300/r300_render.c
new file mode 100644
index 0000000..cc13e9a
--- /dev/null
+++ b/r300/r300_render.c
@@ -0,0 +1,536 @@
+/**************************************************************************
+
+Copyright (C) 2004 Nicolai Haehnle.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ATI, VA LINUX SYSTEMS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \brief R300 Render (Vertex Buffer Implementation)
+ *
+ * The immediate implementation has been removed from CVS in favor of the vertex
+ * buffer implementation.
+ *
+ * The render functions are called by the pipeline manager to render a batch of
+ * primitives. They return TRUE to pass on to the next stage (i.e. software
+ * rasterization) or FALSE to indicate that the pipeline has finished after
+ * rendering something.
+ *
+ * When falling back to software TCL, still attempt to use hardware
+ * rasterization.
+ *
+ * I am not sure that the cache-related registers are set up correctly, but
+ * obviously this does work... Further investigation is needed.
+ *
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#include "glheader.h"
+#include "state.h"
+#include "imports.h"
+#include "enums.h"
+#include "macros.h"
+#include "context.h"
+#include "dd.h"
+#include "simple_list.h"
+#include "api_arrayelt.h"
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+#include "vbo/vbo.h"
+#include "tnl/tnl.h"
+#include "tnl/t_vp_build.h"
+#include "radeon_reg.h"
+#include "radeon_macros.h"
+#include "radeon_ioctl.h"
+#include "radeon_state.h"
+#include "r300_context.h"
+#include "r300_ioctl.h"
+#include "r300_state.h"
+#include "r300_reg.h"
+#include "r300_tex.h"
+#include "r300_emit.h"
+extern int future_hw_tcl_on;
+
+/**
+ * \brief Convert an OpenGL primitive type into a R300 primitive type.
+ *
+ * Only the mode bits of \p prim (those selected by PRIM_MODE_MASK) are
+ * examined; \p rmesa and \p ctx are accepted but never read.
+ *
+ * \return the R300_VAP_VF_CNTL__PRIM_* value for the mode, or -1 (after
+ * assert(0)) when the mode is not one of the ten handled below.
+ */
+static int r300PrimitiveType(r300ContextPtr rmesa, GLcontext * ctx, int prim)
+{
+	switch (prim & PRIM_MODE_MASK) {
+	/* points and lines */
+	case GL_POINTS:
+		return R300_VAP_VF_CNTL__PRIM_POINTS;
+	case GL_LINES:
+		return R300_VAP_VF_CNTL__PRIM_LINES;
+	case GL_LINE_STRIP:
+		return R300_VAP_VF_CNTL__PRIM_LINE_STRIP;
+	case GL_LINE_LOOP:
+		return R300_VAP_VF_CNTL__PRIM_LINE_LOOP;
+	/* triangles */
+	case GL_TRIANGLES:
+		return R300_VAP_VF_CNTL__PRIM_TRIANGLES;
+	case GL_TRIANGLE_STRIP:
+		return R300_VAP_VF_CNTL__PRIM_TRIANGLE_STRIP;
+	case GL_TRIANGLE_FAN:
+		return R300_VAP_VF_CNTL__PRIM_TRIANGLE_FAN;
+	/* quads and polygons */
+	case GL_QUADS:
+		return R300_VAP_VF_CNTL__PRIM_QUADS;
+	case GL_QUAD_STRIP:
+		return R300_VAP_VF_CNTL__PRIM_QUAD_STRIP;
+	case GL_POLYGON:
+		return R300_VAP_VF_CNTL__PRIM_POLYGON;
+	/* anything else is a caller error */
+	default:
+		assert(0);
+		/* reached only if the assertion does not abort */
+		return -1;
+	}
+}
+
+/*
+ * Trim a vertex count to what the primitive mode can actually consume.
+ *
+ * Independent primitives (lines, triangles, quads) lose the vertices of a
+ * trailing incomplete primitive.  Strips, loops, fans and polygons yield 0
+ * when there are too few vertices for a first primitive; quad strips also
+ * lose an odd trailing vertex.
+ *
+ * Returns the usable vertex count, or -1 (after assert(0)) for a mode not
+ * handled here.  rmesa is not used.
+ */
+static int r300NumVerts(r300ContextPtr rmesa, int num_verts, int prim)
+{
+	int excess;
+
+	switch (prim & PRIM_MODE_MASK) {
+	case GL_POINTS:
+		excess = 0;
+		break;
+	case GL_LINES:
+		excess = num_verts % 2;
+		break;
+	/* connected lines need two vertices before anything is drawn */
+	case GL_LINE_STRIP:
+	case GL_LINE_LOOP:
+		excess = (num_verts < 2) ? num_verts : 0;
+		break;
+	case GL_TRIANGLES:
+		excess = num_verts % 3;
+		break;
+	/* connected triangles and polygons need three */
+	case GL_TRIANGLE_STRIP:
+	case GL_TRIANGLE_FAN:
+	case GL_POLYGON:
+		excess = (num_verts < 3) ? num_verts : 0;
+		break;
+	case GL_QUADS:
+		excess = num_verts % 4;
+		break;
+	/* the first quad needs four vertices, each further quad two more */
+	case GL_QUAD_STRIP:
+		excess = (num_verts < 4) ? num_verts : num_verts % 2;
+		break;
+	/* unhandled mode */
+	default:
+		assert(0);
+		return -1;
+	}
+
+	return num_verts - excess;
+}
+
+/* Publish the index data at elts through rmesa->state.elt_dma.  A buffer
+ * that r300IsGartMemory() accepts in full is referenced in place, one that
+ * only starts inside it is fatal, anything else is copied to a DMA region. */
+static void r300EmitElts(GLcontext * ctx, void *elts, unsigned long n_elts,
+			 int elt_size)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct r300_dma_region *rvb = &rmesa->state.elt_dma;
+	const unsigned long nbytes = n_elts * elt_size;
+
+	assert(elt_size == 2 || elt_size == 4);
+	if (r300IsGartMemory(rmesa, elts, nbytes)) {
+		rvb->address = rmesa->radeon.radeonScreen->gartTextures.map;
+		rvb->start = ((char *)elts) - rvb->address;
+		rvb->aos_offset = rvb->start +
+		    rmesa->radeon.radeonScreen->gart_texture_offset;
+		return;
+	}
+	if (r300IsGartMemory(rmesa, elts, 1)) {
+		WARN_ONCE("Pointer not within GART memory!\n");
+		_mesa_exit(-1);
+	}
+
+	r300AllocDmaRegion(rmesa, rvb, nbytes, elt_size);
+	rvb->aos_offset = GET_START(rvb);
+	memcpy(rvb->address + rvb->start, elts, nbytes);
+}
+
+static void r300FireEB(r300ContextPtr rmesa, unsigned long addr,
+		       int vertex_count, int type, int elt_size)
+{
+	int cmd_reserved = 0;
+	int cmd_written = 0;
+	drm_radeon_cmd_header_t *cmd = NULL;	/* cmd* locals: not referenced directly; presumably used by the start_packet3()/e32() macros -- confirm */
+	unsigned long t_addr;
+	unsigned long magic_1, magic_2;
+	/* Emit an indexed draw of vertex_count indices, elt_size bytes each, located at addr. */
+	assert(elt_size == 2 || elt_size == 4);
+
+	if (addr & (elt_size - 1)) {
+		WARN_ONCE("Badly aligned buffer\n");
+		return;
+	}
+	/* the three values below are only read when OPTIMIZE_ELTS is defined */
+	magic_1 = (addr % 32) / 4;
+	t_addr = addr & ~0x1d;
+	magic_2 = (vertex_count + 1 + (t_addr & 0x2)) / 2 + magic_1;
+
+	start_packet3(RADEON_CP_PACKET3_3D_DRAW_INDX_2, 0);
+	if (elt_size == 4) {
+		e32(R300_VAP_VF_CNTL__PRIM_WALK_INDICES |
+		    (vertex_count << 16) | type |
+		    R300_VAP_VF_CNTL__INDEX_SIZE_32bit);
+	} else {
+		e32(R300_VAP_VF_CNTL__PRIM_WALK_INDICES |
+		    (vertex_count << 16) | type);
+	}
+	/* second packet: where the index buffer lives, followed by its size */
+	start_packet3(RADEON_CP_PACKET3_INDX_BUFFER, 2);
+#ifdef OPTIMIZE_ELTS
+	if (elt_size == 4) {
+		e32(R300_EB_UNK1 | (0 << 16) | R300_EB_UNK2);
+		e32(addr);
+	} else {
+		e32(R300_EB_UNK1 | (magic_1 << 16) | R300_EB_UNK2);
+		e32(t_addr);
+	}
+#else
+	e32(R300_EB_UNK1 | (0 << 16) | R300_EB_UNK2);
+	e32(addr);
+#endif
+	/* size dword: vertex_count for 4-byte indices; for 2-byte ones half of it, rounded up (or magic_2) */
+	if (elt_size == 4) {
+		e32(vertex_count);
+	} else {
+#ifdef OPTIMIZE_ELTS
+		e32(magic_2);
+#else
+		e32((vertex_count + 1) / 2);
+#endif
+	}
+}
+
+static void r300EmitAOS(r300ContextPtr rmesa, GLuint nr, GLuint offset)
+{
+	int sz = 1 + (nr >> 1) * 3 + (nr & 1) * 2;	/* number of dwords passed to e32() below */
+	int i;
+	int cmd_reserved = 0;
+	int cmd_written = 0;
+	drm_radeon_cmd_header_t *cmd = NULL;
+	/* Emit a 3D_LOAD_VBPNTR packet describing rmesa->state.aos[0 .. nr-1]. */
+	if (RADEON_DEBUG & DEBUG_VERTS)
+		fprintf(stderr, "%s: nr=%d, ofs=0x%08x\n", __FUNCTION__, nr,
+			offset);
+	/* arrays go out in pairs: one packed size/stride dword, then both addresses */
+	start_packet3(RADEON_CP_PACKET3_3D_LOAD_VBPNTR, sz - 1);
+	e32(nr);
+	for (i = 0; i + 1 < nr; i += 2) {
+		e32((rmesa->state.aos[i].aos_size << 0)
+		    | (rmesa->state.aos[i].aos_stride << 8)
+		    | (rmesa->state.aos[i + 1].aos_size << 16)
+		    | (rmesa->state.aos[i + 1].aos_stride << 24)
+		    );
+		e32(rmesa->state.aos[i].aos_offset +
+		    offset * 4 * rmesa->state.aos[i].aos_stride);	/* presumably stride is in dwords, hence the * 4 -- confirm */
+		e32(rmesa->state.aos[i + 1].aos_offset +
+		    offset * 4 * rmesa->state.aos[i + 1].aos_stride);
+	}
+	/* an odd count leaves a last array without a partner: two dwords only */
+	if (nr & 1) {
+		e32((rmesa->state.aos[nr - 1].aos_size << 0)
+		    | (rmesa->state.aos[nr - 1].aos_stride << 8)
+		    );
+		e32(rmesa->state.aos[nr - 1].aos_offset +
+		    offset * 4 * rmesa->state.aos[nr - 1].aos_stride);
+	}
+}
+
+static void r300FireAOS(r300ContextPtr rmesa, int vertex_count, int type)
+{
+	int cmd_reserved = 0;
+	int cmd_written = 0;
+	drm_radeon_cmd_header_t *cmd = NULL;
+	/* Draw without indices: vertex count in bits 16 and up, OR-ed with the primitive type. */
+	start_packet3(RADEON_CP_PACKET3_3D_DRAW_VBUF_2, 0);
+	e32(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (vertex_count << 16)
+	    | type);
+}
+
+/* Emit one primitive over the vertices [start, end).  Unknown primitive
+ * types and primitives left with no usable vertices are silently dropped.
+ */
+static void r300RunRenderPrimitive(r300ContextPtr rmesa, GLcontext * ctx,
+				   int start, int end, int prim)
+{
+	const int type = r300PrimitiveType(rmesa, ctx, prim);
+	const int num_verts = r300NumVerts(rmesa, end - start, prim);
+
+	if (type < 0 || num_verts <= 0)
+		return;
+	/* both draw paths start by loading the vertex array pointers */
+	r300EmitAOS(rmesa, rmesa->state.aos_count, start);
+	if (!rmesa->state.VB.Elts) {
+		r300FireAOS(rmesa, num_verts, type);
+		return;
+	}
+	if (num_verts > 65535) {
+		/* not implemented yet */
+		WARN_ONCE("Too many elts\n");
+		return;
+	}
+	r300EmitElts(ctx, rmesa->state.VB.Elts, num_verts,
+		     rmesa->state.VB.elt_size);
+	r300FireEB(rmesa, rmesa->state.elt_dma.aos_offset,
+		   num_verts, type, rmesa->state.VB.elt_size);
+}
+
+#define CONV_VB(a, b) rvb->AttribPtr[(a)].size = vb->b->size, \
+			rvb->AttribPtr[(a)].type = GL_FLOAT, \
+			rvb->AttribPtr[(a)].stride = vb->b->stride, \
+			rvb->AttribPtr[(a)].data = vb->b->data
+/* Rebuild *rvb from the tnl vertex buffer *vb; CONV_VB copies one attribute's size/stride/data and tags it GL_FLOAT. */
+static void radeon_vb_to_rvb(r300ContextPtr rmesa,
+			     struct radeon_vertex_buffer *rvb,
+			     struct vertex_buffer *vb)
+{
+	int i;
+	GLcontext *ctx;
+	ctx = rmesa->radeon.glCtx;
+
+	memset(rvb, 0, sizeof(*rvb));
+	/* indices are passed through unchanged and declared to be 4 bytes wide */
+	rvb->Elts = vb->Elts;
+	rvb->elt_size = 4;
+	rvb->elt_min = 0;
+	rvb->elt_max = vb->Count;
+
+	rvb->Count = vb->Count;
+	/* position comes from ObjPtr when hw_tcl_on is set, otherwise from ClipPtr (which must then exist) */
+	if (hw_tcl_on) {
+		CONV_VB(VERT_ATTRIB_POS, ObjPtr);
+	} else {
+		assert(vb->ClipPtr);
+		CONV_VB(VERT_ATTRIB_POS, ClipPtr);
+	}
+
+	CONV_VB(VERT_ATTRIB_NORMAL, NormalPtr);
+	CONV_VB(VERT_ATTRIB_COLOR0, ColorPtr[0]);
+	CONV_VB(VERT_ATTRIB_COLOR1, SecondaryColorPtr[0]);
+	CONV_VB(VERT_ATTRIB_FOG, FogCoordPtr);
+
+	for (i = 0; i < ctx->Const.MaxTextureCoordUnits; i++)
+		CONV_VB(VERT_ATTRIB_TEX0 + i, TexCoordPtr[i]);
+
+	for (i = 0; i < MAX_VERTEX_PROGRAM_ATTRIBS; i++)
+		CONV_VB(VERT_ATTRIB_GENERIC0 + i,
+			AttribPtr[VERT_ATTRIB_GENERIC0 + i]);
+
+	rvb->Primitive = vb->Primitive;
+	rvb->PrimitiveCount = vb->PrimitiveCount;
+	rvb->LockFirst = rvb->LockCount = 0;
+	rvb->lock_uptodate = GL_FALSE;
+}
+
+static GLboolean r300RunRender(GLcontext * ctx,
+			       struct tnl_pipeline_stage *stage)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct radeon_vertex_buffer *VB = &rmesa->state.VB;
+	int i;
+	int cmd_reserved = 0;
+	int cmd_written = 0;
+	drm_radeon_cmd_header_t *cmd = NULL;
+	/* Common body of r300RunNonTCLRender and r300RunTCLRender: emit arrays, state and every primitive. */
+	if (RADEON_DEBUG & DEBUG_PRIMS)
+		fprintf(stderr, "%s\n", __FUNCTION__);
+	/* with a stage, rmesa->state.VB is first refreshed from the tnl vertex buffer */
+	if (stage) {
+		TNLcontext *tnl = TNL_CONTEXT(ctx);
+		radeon_vb_to_rvb(rmesa, VB, &tnl->vb);
+	}
+
+	r300UpdateShaders(rmesa);
+	if (r300EmitArrays(ctx))
+		return GL_TRUE;	/* nonzero result: pass the batch on to the next stage */
+
+	r300UpdateShaderStates(rmesa);
+	/* cache control writes ahead of the draw commands */
+	reg_start(R300_RB3D_DSTCACHE_CTLSTAT, 0);
+	e32(R300_RB3D_DSTCACHE_UNKNOWN_0A);
+
+	reg_start(R300_RB3D_ZCACHE_CTLSTAT, 0);
+	e32(R300_RB3D_ZCACHE_UNKNOWN_03);
+
+	r300EmitState(rmesa);
+
+	for (i = 0; i < VB->PrimitiveCount; i++) {
+		GLuint prim = _tnl_translate_prim(&VB->Primitive[i]);
+		GLuint start = VB->Primitive[i].start;
+		GLuint end = VB->Primitive[i].start + VB->Primitive[i].count;
+		r300RunRenderPrimitive(rmesa, ctx, start, end, prim);
+	}
+	/* NOTE(review): the register header says DSTCACHE is set to 02 after 3D operations, yet 0A is written again -- confirm */
+	reg_start(R300_RB3D_DSTCACHE_CTLSTAT, 0);
+	e32(R300_RB3D_DSTCACHE_UNKNOWN_0A);
+
+	reg_start(R300_RB3D_ZCACHE_CTLSTAT, 0);
+	e32(R300_RB3D_ZCACHE_UNKNOWN_03);
+
+#ifdef USER_BUFFERS
+	r300UseArrays(ctx);
+#endif
+
+	r300ReleaseArrays(ctx);
+
+	return GL_FALSE;	/* rendered in hardware: the pipeline ends here */
+}
+
+#define FALLBACK_IF(expr)						\
+	do {								\
+		if (expr) {						\
+			if (1 || RADEON_DEBUG & DEBUG_FALLBACKS)	\
+				WARN_ONCE("Software fallback:%s\n",	\
+					  #expr);			\
+			return R300_FALLBACK_RAST;			\
+		}							\
+	} while(0)
+/* Returns R300_FALLBACK_RAST when the current GL state cannot be rasterized here, else R300_FALLBACK_NONE. */
+static int r300Fallback(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_fragment_program *fp = (struct r300_fragment_program *)
+	    (char *)ctx->FragmentProgram._Current;
+	/* a current fragment program that still fails to translate forces the fallback */
+	if (fp) {
+		if (!fp->translated)
+			r300TranslateFragmentShader(r300, fp);
+		FALLBACK_IF(!fp->translated);
+	}
+	/* NOTE: FALLBACK_IF warns unconditionally, because its debug test starts with "1 ||" */
+	FALLBACK_IF(ctx->RenderMode != GL_RENDER);
+
+	FALLBACK_IF(ctx->Stencil._TestTwoSide
+		    && (ctx->Stencil.Ref[0] != ctx->Stencil.Ref[1]
+			|| ctx->Stencil.ValueMask[0] !=
+			ctx->Stencil.ValueMask[1]
+			|| ctx->Stencil.WriteMask[0] !=
+			ctx->Stencil.WriteMask[1]));
+
+	FALLBACK_IF(ctx->Color.ColorLogicOpEnabled);
+
+	if (ctx->Extensions.NV_point_sprite || ctx->Extensions.ARB_point_sprite)
+		FALLBACK_IF(ctx->Point.PointSprite);
+	/* the checks below are skipped when disable_lowimpact_fallback is set */
+	if (!r300->disable_lowimpact_fallback) {
+		FALLBACK_IF(ctx->Polygon.OffsetPoint);
+		FALLBACK_IF(ctx->Polygon.OffsetLine);
+		FALLBACK_IF(ctx->Polygon.StippleFlag);
+		FALLBACK_IF(ctx->Multisample.Enabled);
+		FALLBACK_IF(ctx->Line.StippleFlag);
+		FALLBACK_IF(ctx->Line.SmoothFlag);
+		FALLBACK_IF(ctx->Point.SmoothFlag);
+	}
+
+	return R300_FALLBACK_NONE;
+}
+
+static GLboolean r300RunNonTCLRender(GLcontext * ctx,
+				     struct tnl_pipeline_stage *stage)
+{
+	/* stage entry for the non-TCL path; GL_TRUE passes the batch onwards */
+	if (RADEON_DEBUG & DEBUG_PRIMS)
+		fprintf(stderr, "%s\n", __FUNCTION__);
+
+	if (r300Fallback(ctx) < R300_FALLBACK_RAST)
+		return r300RunRender(ctx, stage);
+	return GL_TRUE;
+}
+
+static GLboolean r300RunTCLRender(GLcontext * ctx,
+				  struct tnl_pipeline_stage *stage)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct r300_vertex_program *vp;
+	/* TCL stage entry: every GL_TRUE return below leaves the batch to the following stages. */
+	hw_tcl_on = future_hw_tcl_on;
+
+	if (RADEON_DEBUG & DEBUG_PRIMS)
+		fprintf(stderr, "%s\n", __FUNCTION__);
+
+	if (hw_tcl_on == GL_FALSE)
+		return GL_TRUE;
+	/* a fallback of level R300_FALLBACK_TCL or above switches hw_tcl_on off until the next call */
+	if (r300Fallback(ctx) >= R300_FALLBACK_TCL) {
+		hw_tcl_on = GL_FALSE;
+		return GL_TRUE;
+	}
+
+	r300UpdateShaders(rmesa);
+	/* likewise give up on hardware TCL when the current vertex shader is not native */
+	vp = (struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx);
+	if (vp->native == GL_FALSE) {
+		hw_tcl_on = GL_FALSE;
+		return GL_TRUE;
+	}
+
+	return r300RunRender(ctx, stage);
+}
+
+const struct tnl_pipeline_stage _r300_render_stage = {
+	"r300 Hardware Rasterization",
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	r300RunNonTCLRender	/* presumably the stage's run hook (last member) -- confirm against struct tnl_pipeline_stage */
+};
+
+const struct tnl_pipeline_stage _r300_tcl_stage = {
+	"r300 Hardware Transform, Clipping and Lighting",
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	r300RunTCLRender	/* presumably the stage's run hook (last member) -- confirm against struct tnl_pipeline_stage */
+};
diff --git a/r300/r300_shader.c b/r300/r300_shader.c
new file mode 100644
index 0000000..59fe17b
--- /dev/null
+++ b/r300/r300_shader.c
@@ -0,0 +1,73 @@
+#include "glheader.h"
+#include "macros.h"
+#include "enums.h"
+
+#include "program.h"
+#include "tnl/tnl.h"
+#include "r300_context.h"
+#include "r300_fragprog.h"
+
+/* NewProgram hook: allocate the r300 wrapper for `target` and let Mesa
+ * initialise it; returns NULL on a bad target or a failed allocation. */
+static struct gl_program *r300NewProgram(GLcontext * ctx, GLenum target,
+					 GLuint id)
+{
+	struct r300_vertex_program_cont *vp;
+	struct r300_fragment_program *fp;
+
+	switch (target) {
+	case GL_VERTEX_STATE_PROGRAM_NV:
+	case GL_VERTEX_PROGRAM_ARB:
+		vp = CALLOC_STRUCT(r300_vertex_program_cont);
+		if (!vp)
+			return NULL;
+		return _mesa_init_vertex_program(ctx, &vp->mesa_program, target, id);
+	case GL_FRAGMENT_PROGRAM_ARB:
+	case GL_FRAGMENT_PROGRAM_NV:
+		fp = CALLOC_STRUCT(r300_fragment_program);
+		if (!fp)	/* never write the back-pointer through NULL */
+			return NULL;
+		fp->ctx = ctx;
+		return _mesa_init_fragment_program(ctx, &fp->mesa_program, target, id);
+	default:
+		_mesa_problem(ctx, "Bad target in r300NewProgram");
+		return NULL;
+	}
+}
+
+static void r300DeleteProgram(GLcontext * ctx, struct gl_program *prog)
+{
+	_mesa_delete_program(ctx, prog);	/* plain pass-through to Mesa */
+}
+
+/* ProgramStringNotify hook: reset the `progs` pointer (vertex) or the
+ * `translated` flag (fragment), then pass the call on to the tnl module. */
+static void
+r300ProgramStringNotify(GLcontext * ctx, GLenum target, struct gl_program *prog)
+{
+	if (target == GL_VERTEX_PROGRAM_ARB) {
+		struct r300_vertex_program_cont *cont = (void *)prog;
+
+		cont->progs = NULL;
+	} else if (target == GL_FRAGMENT_PROGRAM_ARB) {
+		struct r300_fragment_program *frag = (void *)prog;
+
+		frag->translated = GL_FALSE;
+	}
+	/* need this for tcl fallbacks */
+	_tnl_program_string(ctx, target, prog);
+}
+
+static GLboolean
+r300IsProgramNative(GLcontext * ctx, GLenum target, struct gl_program *prog)
+{
+	return GL_TRUE;	/* every program is reported as native, unconditionally */
+}
+
+void r300InitShaderFuncs(struct dd_function_table *functions)
+{
+	functions->NewProgram = r300NewProgram;	/* install the r300 program hooks */
+	functions->DeleteProgram = r300DeleteProgram;
+	functions->ProgramStringNotify = r300ProgramStringNotify;
+	functions->IsProgramNative = r300IsProgramNative;
+}
diff --git a/r300/r300_state.c b/r300/r300_state.c
new file mode 100644
index 0000000..2aaf041
--- /dev/null
+++ b/r300/r300_state.c
@@ -0,0 +1,2375 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.
+Copyright (C) 2004 Nicolai Haehnle.
+All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#include "glheader.h"
+#include "state.h"
+#include "imports.h"
+#include "enums.h"
+#include "macros.h"
+#include "context.h"
+#include "dd.h"
+#include "simple_list.h"
+
+#include "api_arrayelt.h"
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_statevars.h"
+#include "vbo/vbo.h"
+#include "tnl/tnl.h"
+#include "texformat.h"
+
+#include "radeon_ioctl.h"
+#include "radeon_state.h"
+#include "r300_context.h"
+#include "r300_ioctl.h"
+#include "r300_state.h"
+#include "r300_reg.h"
+#include "r300_emit.h"
+#include "r300_fragprog.h"
+#include "r300_tex.h"
+
+#include "drirenderbuffer.h"
+
+/* BlendColor hook: clamp the four floats to bytes and store them in the
+ * blend_color state, handed to PACK_COLOR_8888 in A, R, G, B order. */
+static void r300BlendColor(GLcontext * ctx, const GLfloat cf[4])
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	GLubyte red, green, blue, alpha;
+
+	R300_STATECHANGE(r300, blend_color);
+	CLAMPED_FLOAT_TO_UBYTE(red, cf[0]);
+	CLAMPED_FLOAT_TO_UBYTE(green, cf[1]);
+	CLAMPED_FLOAT_TO_UBYTE(blue, cf[2]);
+	CLAMPED_FLOAT_TO_UBYTE(alpha, cf[3]);
+	r300->hw.blend_color.cmd[1] =
+	    PACK_COLOR_8888(alpha, red, green, blue);
+}
+
+/**
+ * Calculate the hardware blend factor setting.  This same function is used
+ * for source and destination of both alpha and RGB.
+ *
+ * \returns
+ * The hardware register value for the specified blend factor.  This value
+ * will need to be shifted into the correct position for either source or
+ * destination factor.
+ *
+ * \todo
+ * Since the two cases where source and destination are handled differently
+ * are essentially error cases, they should never happen.  Determine if these
+ * cases can be removed.
+ */
+static int blend_factor(GLenum factor, GLboolean is_src)
+{
+	switch (factor) {
+		/* constant factors */
+	case GL_ZERO:
+		return R300_BLEND_GL_ZERO;
+	case GL_ONE:
+		return R300_BLEND_GL_ONE;
+
+		/* colour factors */
+	case GL_DST_COLOR:
+		return R300_BLEND_GL_DST_COLOR;
+	case GL_ONE_MINUS_DST_COLOR:
+		return R300_BLEND_GL_ONE_MINUS_DST_COLOR;
+	case GL_SRC_COLOR:
+		return R300_BLEND_GL_SRC_COLOR;
+	case GL_ONE_MINUS_SRC_COLOR:
+		return R300_BLEND_GL_ONE_MINUS_SRC_COLOR;
+
+		/* alpha factors */
+	case GL_SRC_ALPHA:
+		return R300_BLEND_GL_SRC_ALPHA;
+	case GL_ONE_MINUS_SRC_ALPHA:
+		return R300_BLEND_GL_ONE_MINUS_SRC_ALPHA;
+	case GL_DST_ALPHA:
+		return R300_BLEND_GL_DST_ALPHA;
+	case GL_ONE_MINUS_DST_ALPHA:
+		return R300_BLEND_GL_ONE_MINUS_DST_ALPHA;
+
+		/* saturate is passed through for the source side only;
+		 * on the destination side it is replaced by zero */
+	case GL_SRC_ALPHA_SATURATE:
+		if (is_src)
+			return R300_BLEND_GL_SRC_ALPHA_SATURATE;
+		return R300_BLEND_GL_ZERO;
+
+		/* blend-colour (constant) factors */
+	case GL_CONSTANT_COLOR:
+		return R300_BLEND_GL_CONST_COLOR;
+	case GL_ONE_MINUS_CONSTANT_COLOR:
+		return R300_BLEND_GL_ONE_MINUS_CONST_COLOR;
+	case GL_CONSTANT_ALPHA:
+		return R300_BLEND_GL_CONST_ALPHA;
+	case GL_ONE_MINUS_CONSTANT_ALPHA:
+		return R300_BLEND_GL_ONE_MINUS_CONST_ALPHA;
+
+		/* anything else: report it, then use ONE for a source
+		 * factor and ZERO for a destination factor */
+	default:
+		fprintf(stderr, "unknown blend factor %x\n", factor);
+		if (is_src)
+			return R300_BLEND_GL_ONE;
+		return R300_BLEND_GL_ZERO;
+	}
+}
+
+/**
+ * Sets both the blend equation and the blend function.
+ * This is done in a single
+ * function because some blend equations (i.e., \c GL_MIN and \c GL_MAX)
+ * change the interpretation of the blend function.
+ * Also, make sure that blend function and blend equation are set to their
+ * default value if color blending is not enabled, since at least blend
+ * equations GL_MIN and GL_FUNC_REVERSE_SUBTRACT will cause wrong results
+ * otherwise for unknown reasons.
+ */
+
+/* helper function */
+static void r300SetBlendCntl(r300ContextPtr r300, int func, int eqn,
+			     int cbits, int funcA, int eqnA)
+{
+	GLuint ablend = eqnA | funcA;
+	GLuint cblend = eqn | func;
+
+#if 0
+	fprintf(stderr,
+		"eqnA=%08x funcA=%08x eqn=%08x func=%08x cbits=%08x\n",
+		eqnA, funcA, eqn, func, cbits);
+#endif
+	/* Some blend factor combinations don't seem to work when the
+	 * BLEND_NO_SEPARATE bit is set.
+	 *
+	 * Especially problematic candidates are the ONE_MINUS_* flags,
+	 * but I can't see a real pattern.
+	 */
+#if 0
+	if (ablend == cblend) {
+		cblend |= R300_BLEND_NO_SEPARATE;
+	}
+#endif
+	cblend |= cbits;
+
+	/* leave the bld state untouched when neither word changes */
+	if (ablend == r300->hw.bld.cmd[R300_BLD_ABLEND] &&
+	    cblend == r300->hw.bld.cmd[R300_BLD_CBLEND])
+		return;
+
+	R300_STATECHANGE(r300, bld);
+	r300->hw.bld.cmd[R300_BLD_ABLEND] = ablend;
+	r300->hw.bld.cmd[R300_BLD_CBLEND] = cblend;
+}
+
+static void r300SetBlendState(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	int func = (R300_BLEND_GL_ONE << R300_SRC_BLEND_SHIFT) |
+	    (R300_BLEND_GL_ZERO << R300_DST_BLEND_SHIFT);
+	int eqn = R300_COMB_FCN_ADD_CLAMP;
+	int funcA = (R300_BLEND_GL_ONE << R300_SRC_BLEND_SHIFT) |
+	    (R300_BLEND_GL_ZERO << R300_DST_BLEND_SHIFT);
+	int eqnA = R300_COMB_FCN_ADD_CLAMP;
+
+	if (RGBA_LOGICOP_ENABLED(ctx) || !ctx->Color.BlendEnabled) {
+		r300SetBlendCntl(r300, func, eqn, 0, func, eqn);	/* defaults, no control bits */
+		return;
+	}
+
+	func =
+	    (blend_factor(ctx->Color.BlendSrcRGB, GL_TRUE) <<
+	     R300_SRC_BLEND_SHIFT) | (blend_factor(ctx->Color.BlendDstRGB,
+						   GL_FALSE) <<
+				      R300_DST_BLEND_SHIFT);
+
+	switch (ctx->Color.BlendEquationRGB) {
+	case GL_FUNC_ADD:
+		eqn = R300_COMB_FCN_ADD_CLAMP;
+		break;
+
+	case GL_FUNC_SUBTRACT:
+		eqn = R300_COMB_FCN_SUB_CLAMP;
+		break;
+
+	case GL_FUNC_REVERSE_SUBTRACT:
+		eqn = R300_COMB_FCN_RSUB_CLAMP;
+		break;
+
+	case GL_MIN:	/* MIN and MAX replace the factors with ONE, ONE */
+		eqn = R300_COMB_FCN_MIN;
+		func = (R300_BLEND_GL_ONE << R300_SRC_BLEND_SHIFT) |
+		    (R300_BLEND_GL_ONE << R300_DST_BLEND_SHIFT);
+		break;
+
+	case GL_MAX:
+		eqn = R300_COMB_FCN_MAX;
+		func = (R300_BLEND_GL_ONE << R300_SRC_BLEND_SHIFT) |
+		    (R300_BLEND_GL_ONE << R300_DST_BLEND_SHIFT);
+		break;
+
+	default:
+		fprintf(stderr,
+			"[%s:%u] Invalid RGB blend equation (0x%04x).\n",
+			__FUNCTION__, __LINE__, ctx->Color.BlendEquationRGB);
+		return;	/* the bld state is left exactly as it was */
+	}
+
+	funcA =
+	    (blend_factor(ctx->Color.BlendSrcA, GL_TRUE) <<
+	     R300_SRC_BLEND_SHIFT) | (blend_factor(ctx->Color.BlendDstA,
+						   GL_FALSE) <<
+				      R300_DST_BLEND_SHIFT);
+
+	switch (ctx->Color.BlendEquationA) {
+	case GL_FUNC_ADD:
+		eqnA = R300_COMB_FCN_ADD_CLAMP;
+		break;
+
+	case GL_FUNC_SUBTRACT:
+		eqnA = R300_COMB_FCN_SUB_CLAMP;
+		break;
+
+	case GL_FUNC_REVERSE_SUBTRACT:
+		eqnA = R300_COMB_FCN_RSUB_CLAMP;
+		break;
+
+	case GL_MIN:	/* same ONE, ONE override for the alpha channel */
+		eqnA = R300_COMB_FCN_MIN;
+		funcA = (R300_BLEND_GL_ONE << R300_SRC_BLEND_SHIFT) |
+		    (R300_BLEND_GL_ONE << R300_DST_BLEND_SHIFT);
+		break;
+
+	case GL_MAX:
+		eqnA = R300_COMB_FCN_MAX;
+		funcA = (R300_BLEND_GL_ONE << R300_SRC_BLEND_SHIFT) |
+		    (R300_BLEND_GL_ONE << R300_DST_BLEND_SHIFT);
+		break;
+
+	default:
+		fprintf(stderr,
+			"[%s:%u] Invalid A blend equation (0x%04x).\n",
+			__FUNCTION__, __LINE__, ctx->Color.BlendEquationA);
+		return;	/* the bld state is left exactly as it was */
+	}
+
+	r300SetBlendCntl(r300,
+			 func, eqn,
+			 R300_BLEND_UNKNOWN | R300_BLEND_ENABLE, funcA, eqnA);
+}
+
+static void r300BlendEquationSeparate(GLcontext * ctx,
+				      GLenum modeRGB, GLenum modeA)
+{
+	r300SetBlendState(ctx);	/* the equations are re-read from ctx->Color */
+}
+
+static void r300BlendFuncSeparate(GLcontext * ctx,
+				  GLenum sfactorRGB, GLenum dfactorRGB,
+				  GLenum sfactorA, GLenum dfactorA)
+{
+	r300SetBlendState(ctx);	/* the factors are re-read from ctx->Color */
+}
+
+/**
+ * Update our tracked culling state based on Mesa's state.
+ */
+static void r300UpdateCulling(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	uint32_t cull = 0;	/* stays 0 while culling is disabled */
+	R300_STATECHANGE(r300, cul);
+	if (ctx->Polygon.CullFlag) {
+		switch (ctx->Polygon.CullFaceMode) {
+		case GL_FRONT_AND_BACK:
+			cull = R300_CULL_FRONT | R300_CULL_BACK;
+			break;
+		case GL_FRONT:
+			cull = R300_CULL_FRONT;
+			break;
+		default:
+			cull = R300_CULL_BACK;
+		}
+		cull |= (ctx->Polygon.FrontFace == GL_CW) ?
+		    R300_FRONT_FACE_CW : R300_FRONT_FACE_CCW;
+	}
+	r300->hw.cul.cmd[R300_CUL_CULL] = cull;
+}
+
+/*
+ * Program the early-Z word of the zstencil_format state (register
+ * R300_RB3D_EARLY_Z, 0x4F14).  Early Z is enabled only while the depth
+ * test is effectively on (enabled, function not GL_NEVER) and the alpha
+ * test is effectively off (disabled, or GL_ALWAYS); in every other case
+ * it is disabled.
+ */
+static void r300SetEarlyZState(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	const GLboolean alpha_on =
+	    ctx->Color.AlphaEnabled && ctx->Color.AlphaFunc != GL_ALWAYS;
+	const GLboolean depth_on =
+	    ctx->Depth.Test && ctx->Depth.Func != GL_NEVER;
+
+	R300_STATECHANGE(r300, zstencil_format);
+
+	if (depth_on && !alpha_on)
+		r300->hw.zstencil_format.cmd[2] = R300_EARLY_Z_ENABLE;
+	else
+		r300->hw.zstencil_format.cmd[2] = R300_EARLY_Z_DISABLE;
+}
+
+/* Rebuild the alpha-test word from ctx->Color, then refresh early Z. */
+static void r300SetAlphaState(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	GLboolean test_on = ctx->Color.AlphaEnabled;
+	uint32_t test_bits = 0x0;
+	GLubyte ref;
+
+	CLAMPED_FLOAT_TO_UBYTE(ref, ctx->Color.AlphaRef);
+
+	switch (ctx->Color.AlphaFunc) {
+	case GL_NEVER:
+		test_bits = R300_ALPHA_TEST_FAIL;
+		break;
+	case GL_LESS:
+		test_bits = R300_ALPHA_TEST_LESS;
+		break;
+	case GL_EQUAL:
+		test_bits = R300_ALPHA_TEST_EQUAL;
+		break;
+	case GL_LEQUAL:
+		test_bits = R300_ALPHA_TEST_LEQUAL;
+		break;
+	case GL_GREATER:
+		test_bits = R300_ALPHA_TEST_GREATER;
+		break;
+	case GL_NOTEQUAL:
+		test_bits = R300_ALPHA_TEST_NEQUAL;
+		break;
+	case GL_GEQUAL:
+		test_bits = R300_ALPHA_TEST_GEQUAL;
+		break;
+	case GL_ALWAYS:
+		/* a test that always passes is programmed as no test */
+		test_on = GL_FALSE;
+		break;
+	}
+
+	if (test_on)
+		test_bits |= R300_ALPHA_TEST_ENABLE |
+		    (ref & R300_REF_ALPHA_MASK);
+	else
+		test_bits = 0x0;
+
+	R300_STATECHANGE(r300, at);
+	r300->hw.at.cmd[R300_AT_ALPHA_TEST] = test_bits;
+
+	r300SetEarlyZState(ctx);
+}
+
+static void r300AlphaFunc(GLcontext * ctx, GLenum func, GLfloat ref)
+{
+	(void)func;	/* both values are re-read from ctx->Color instead */
+	(void)ref;
+	r300SetAlphaState(ctx);
+}
+
+static int translate_func(int func)
+{
+	switch (func) {	/* GL comparison function -> R300_ZS_* code */
+	case GL_NEVER:
+		return R300_ZS_NEVER;
+	case GL_LESS:
+		return R300_ZS_LESS;
+	case GL_EQUAL:
+		return R300_ZS_EQUAL;
+	case GL_LEQUAL:
+		return R300_ZS_LEQUAL;
+	case GL_GREATER:
+		return R300_ZS_GREATER;
+	case GL_NOTEQUAL:
+		return R300_ZS_NOTEQUAL;
+	case GL_GEQUAL:
+		return R300_ZS_GEQUAL;
+	case GL_ALWAYS:
+		return R300_ZS_ALWAYS;
+	}
+	return 0;	/* func was none of the eight cases above */
+}
+
+/* Rewrite the depth part of the zs state (Z mode bits in CNTL_0, depth
+ * function in CNTL_1) from ctx->Depth, then refresh the early-Z state. */
+static void r300SetDepthState(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	GLenum func = GL_NEVER;
+	uint32_t z_mode = R300_RB3D_Z_DISABLED_1;
+
+	/* An enabled test with GL_NEVER is programmed like a disabled one. */
+	if (ctx->Depth.Test && ctx->Depth.Func != GL_NEVER) {
+		func = ctx->Depth.Func;
+		z_mode = ctx->Depth.Mask ?
+		    R300_RB3D_Z_TEST_AND_WRITE : R300_RB3D_Z_TEST;
+	}
+
+	R300_STATECHANGE(r300, zs);
+	/* keep only the stencil enable bit of CNTL_0 ... */
+	r300->hw.zs.cmd[R300_ZS_CNTL_0] &= R300_RB3D_STENCIL_ENABLE;
+	r300->hw.zs.cmd[R300_ZS_CNTL_0] |= z_mode;
+	/* ... and replace just the depth function field of CNTL_1 */
+	r300->hw.zs.cmd[R300_ZS_CNTL_1] &=
+	    ~(R300_ZS_MASK << R300_RB3D_ZS1_DEPTH_FUNC_SHIFT);
+	r300->hw.zs.cmd[R300_ZS_CNTL_1] |=
+	    translate_func(func) << R300_RB3D_ZS1_DEPTH_FUNC_SHIFT;
+
+	r300SetEarlyZState(ctx);
+}
+
+static void r300ClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq );
+
+/**
+ * Handle glEnable()/glDisable().
+ *
+ * \note Mesa already filters redundant calls to glEnable/glDisable.
+ */
+static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	GLuint p;
+	if (RADEON_DEBUG & DEBUG_STATE)
+		fprintf(stderr, "%s( %s = %s )\n", __FUNCTION__,
+			_mesa_lookup_enum_by_nr(cap),
+			state ? "GL_TRUE" : "GL_FALSE");
+
+	switch (cap) {
+		/* Fast track this one...
+		 */
+	case GL_TEXTURE_1D:
+	case GL_TEXTURE_2D:
+	case GL_TEXTURE_3D:
+		break;	/* nothing is programmed here for these */
+
+	case GL_FOG:
+		R300_STATECHANGE(r300, fogs);
+		if (state) {
+			r300->hw.fogs.cmd[R300_FOGS_STATE] |= R300_FOG_ENABLE;
+
+			ctx->Driver.Fogfv(ctx, GL_FOG_MODE, NULL);	/* NULL: presumably r300Fogfv, which ignores param */
+			ctx->Driver.Fogfv(ctx, GL_FOG_DENSITY,
+					  &ctx->Fog.Density);
+			ctx->Driver.Fogfv(ctx, GL_FOG_START, &ctx->Fog.Start);
+			ctx->Driver.Fogfv(ctx, GL_FOG_END, &ctx->Fog.End);
+			ctx->Driver.Fogfv(ctx, GL_FOG_COLOR, ctx->Fog.Color);
+		} else {
+			r300->hw.fogs.cmd[R300_FOGS_STATE] &= ~R300_FOG_ENABLE;
+		}
+
+		break;
+
+	case GL_ALPHA_TEST:
+		r300SetAlphaState(ctx);
+		break;
+
+	case GL_BLEND:
+	case GL_COLOR_LOGIC_OP:
+		r300SetBlendState(ctx);
+		break;
+
+
+	case GL_CLIP_PLANE0:
+	case GL_CLIP_PLANE1:
+	case GL_CLIP_PLANE2:
+	case GL_CLIP_PLANE3:
+	case GL_CLIP_PLANE4:
+	case GL_CLIP_PLANE5:
+		/* no VAP UCP on non-TCL chipsets */
+		if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+			return;
+
+		p = cap-GL_CLIP_PLANE0;	/* plane index, used as a bit offset below */
+		R300_STATECHANGE( r300, vap_clip_cntl );
+		if (state) {
+			r300->hw.vap_clip_cntl.cmd[1] |= (R300_VAP_UCP_ENABLE_0<<p);
+			r300ClipPlane( ctx, cap, NULL );
+		}
+		else {
+			r300->hw.vap_clip_cntl.cmd[1] &= ~(R300_VAP_UCP_ENABLE_0<<p);
+		}
+		break;
+	case GL_DEPTH_TEST:
+		r300SetDepthState(ctx);
+		break;
+
+	case GL_STENCIL_TEST:
+		if (r300->state.stencil.hw_stencil) {
+			R300_STATECHANGE(r300, zs);
+			if (state) {
+				r300->hw.zs.cmd[R300_ZS_CNTL_0] |=
+				    R300_RB3D_STENCIL_ENABLE;
+			} else {
+				r300->hw.zs.cmd[R300_ZS_CNTL_0] &=
+				    ~R300_RB3D_STENCIL_ENABLE;
+			}
+		} else {
+#if R200_MERGED
+			FALLBACK(&r300->radeon, RADEON_FALLBACK_STENCIL, state);
+#endif
+		}
+		break;
+
+	case GL_CULL_FACE:
+		r300UpdateCulling(ctx);
+		break;
+
+	case GL_POLYGON_OFFSET_POINT:
+	case GL_POLYGON_OFFSET_LINE:
+		break;	/* no register is touched for these two */
+
+	case GL_POLYGON_OFFSET_FILL:
+		R300_STATECHANGE(r300, occlusion_cntl);
+		if (state) {
+			r300->hw.occlusion_cntl.cmd[1] |= (3 << 0);	/* NOTE(review): meaning of the two low bits is not documented here */
+		} else {
+			r300->hw.occlusion_cntl.cmd[1] &= ~(3 << 0);
+		}
+		break;
+	default:
+		radeonEnable(ctx, cap, state);	/* everything else goes to the radeon handler */
+		return;
+	}
+}
+
+/*
+ * Rebuild the polygon_mode word from Mesa's polygon modes.  It stays 0
+ * while both faces are GL_FILL.  Otherwise Mesa's front and back modes
+ * fill the FRONT and BACK fields directly when the front face is GL_CCW,
+ * and swapped when it is not.
+ */
+static void r300UpdatePolygonMode(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	const GLboolean ccw = (ctx->Polygon.FrontFace == GL_CCW);
+	GLenum front = ccw ? ctx->Polygon.FrontMode : ctx->Polygon.BackMode;
+	GLenum back = ccw ? ctx->Polygon.BackMode : ctx->Polygon.FrontMode;
+	uint32_t mode_bits = 0;
+
+	if (front != GL_FILL || back != GL_FILL) {
+		mode_bits = R300_PM_ENABLED;
+
+		switch (front) {
+		case GL_LINE:
+			mode_bits |= R300_PM_FRONT_LINE;
+			break;
+		case GL_POINT:
+			mode_bits |= R300_PM_FRONT_POINT;
+			break;
+		case GL_FILL:
+			mode_bits |= R300_PM_FRONT_FILL;
+			break;
+		}
+
+		switch (back) {
+		case GL_LINE:
+			mode_bits |= R300_PM_BACK_LINE;
+			break;
+		case GL_POINT:
+			mode_bits |= R300_PM_BACK_POINT;
+			break;
+		case GL_FILL:
+			mode_bits |= R300_PM_BACK_FILL;
+			break;
+		}
+	}
+
+	/* nothing to flag when the word is already up to date */
+	if (r300->hw.polygon_mode.cmd[1] == mode_bits)
+		return;
+
+	R300_STATECHANGE(r300, polygon_mode);
+	r300->hw.polygon_mode.cmd[1] = mode_bits;
+}
+
+/**
+ * Change the culling mode.
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r300CullFace(GLcontext * ctx, GLenum mode)
+{
+	(void)mode;	/* the cull mode is re-read from ctx->Polygon instead */
+
+	r300UpdateCulling(ctx);
+}
+
+/**
+ * Change the polygon orientation.
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r300FrontFace(GLcontext * ctx, GLenum mode)
+{
+	(void)mode;
+
+	r300UpdateCulling(ctx);	/* both helpers read ctx->Polygon.FrontFace */
+	r300UpdatePolygonMode(ctx);
+}
+
+/**
+ * Change the depth testing function.
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r300DepthFunc(GLcontext * ctx, GLenum func)
+{
+	(void)func;	/* r300SetDepthState reads ctx->Depth.Func itself */
+	r300SetDepthState(ctx);
+}
+
+/**
+ * Enable/Disable depth writing.
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r300DepthMask(GLcontext * ctx, GLboolean mask)
+{
+	(void)mask;	/* r300SetDepthState reads ctx->Depth.Mask itself */
+	r300SetDepthState(ctx);
+}
+
+/**
+ * Handle glColorMask()
+ */
+static void r300ColorMask(GLcontext * ctx,
+			  GLboolean r, GLboolean g, GLboolean b, GLboolean a)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	int bits = (a ? R300_COLORMASK0_A : 0) | (b ? R300_COLORMASK0_B : 0) |
+	    (g ? R300_COLORMASK0_G : 0) | (r ? R300_COLORMASK0_R : 0);
+
+	/* only flag the cmk state when the mask really changes */
+	if (bits != r300->hw.cmk.cmd[R300_CMK_COLORMASK]) {
+		R300_STATECHANGE(r300, cmk);
+		r300->hw.cmk.cmd[R300_CMK_COLORMASK] = bits;
+	}
+}
+
+/* =============================================================
+ * Fog
+ */
+/* Fogfv hook: mirror Mesa's fog state into hw.fogs (mode), hw.fogp
+ * (scale/start) and hw.fogc (colour).  Every value is read from ctx->Fog,
+ * so `param` is unused; r300Enable passes NULL for GL_FOG_MODE. */
+static void r300Fogfv(GLcontext * ctx, GLenum pname, const GLfloat * param)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	union {
+		int i;
+		float f;
+	} fogScale, fogStart;
+
+	(void)param;
+	fogScale.i = r300->hw.fogp.cmd[R300_FOGP_SCALE];
+	fogStart.i = r300->hw.fogp.cmd[R300_FOGP_START];
+
+	switch (pname) {
+	case GL_FOG_MODE:
+		if (!ctx->Fog.Enabled)
+			return;
+		switch (ctx->Fog.Mode) {
+		case GL_LINEAR:
+			R300_STATECHANGE(r300, fogs);
+			r300->hw.fogs.cmd[R300_FOGS_STATE] =
+			    (r300->hw.fogs.cmd[R300_FOGS_STATE] &
+			     ~R300_FOG_MODE_MASK) | R300_FOG_MODE_LINEAR;
+			if (ctx->Fog.Start == ctx->Fog.End) {
+				fogScale.f = -1.0;
+				fogStart.f = 1.0;
+			} else {
+				fogScale.f =
+				    1.0 / (ctx->Fog.End - ctx->Fog.Start);
+				fogStart.f =
+				    -ctx->Fog.Start / (ctx->Fog.End -
+						       ctx->Fog.Start);
+			}
+			break;
+		case GL_EXP:
+			R300_STATECHANGE(r300, fogs);
+			r300->hw.fogs.cmd[R300_FOGS_STATE] =
+			    (r300->hw.fogs.cmd[R300_FOGS_STATE] &
+			     ~R300_FOG_MODE_MASK) | R300_FOG_MODE_EXP;
+			fogScale.f = 0.0933 * ctx->Fog.Density;
+			fogStart.f = 0.0;
+			break;
+		case GL_EXP2:
+			R300_STATECHANGE(r300, fogs);
+			r300->hw.fogs.cmd[R300_FOGS_STATE] =
+			    (r300->hw.fogs.cmd[R300_FOGS_STATE] &
+			     ~R300_FOG_MODE_MASK) | R300_FOG_MODE_EXP2;
+			fogScale.f = 0.3 * ctx->Fog.Density;
+			fogStart.f = 0.0;
+			break;	/* must reach the fogp update below */
+		default:
+			return;
+		}
+		break;
+	case GL_FOG_DENSITY:
+		switch (ctx->Fog.Mode) {
+		case GL_EXP:
+			fogScale.f = 0.0933 * ctx->Fog.Density;
+			fogStart.f = 0.0;
+			break;
+		case GL_EXP2:
+			fogScale.f = 0.3 * ctx->Fog.Density;
+			fogStart.f = 0.0;
+			break;
+		default:
+			break;
+		}
+		break;
+	case GL_FOG_START:
+	case GL_FOG_END:
+		if (ctx->Fog.Mode == GL_LINEAR) {
+			if (ctx->Fog.Start == ctx->Fog.End) {
+				fogScale.f = -1.0;
+				fogStart.f = 1.0;
+			} else {
+				fogScale.f =
+				    1.0 / (ctx->Fog.End - ctx->Fog.Start);
+				fogStart.f =
+				    -ctx->Fog.Start / (ctx->Fog.End -
+						       ctx->Fog.Start);
+			}
+		}
+		break;
+	case GL_FOG_COLOR:
+		R300_STATECHANGE(r300, fogc);
+		r300->hw.fogc.cmd[R300_FOGC_R] =
+		    (GLuint) (ctx->Fog.Color[0] * 1023.0F) & 0x3FF;
+		r300->hw.fogc.cmd[R300_FOGC_G] =
+		    (GLuint) (ctx->Fog.Color[1] * 1023.0F) & 0x3FF;
+		r300->hw.fogc.cmd[R300_FOGC_B] =
+		    (GLuint) (ctx->Fog.Color[2] * 1023.0F) & 0x3FF;
+		break;
+	case GL_FOG_COORD_SRC:
+		break;
+	default:
+		return;
+	}
+
+	if (fogScale.i != r300->hw.fogp.cmd[R300_FOGP_SCALE] ||
+	    fogStart.i != r300->hw.fogp.cmd[R300_FOGP_START]) {
+		R300_STATECHANGE(r300, fogp);
+		r300->hw.fogp.cmd[R300_FOGP_SCALE] = fogScale.i;
+		r300->hw.fogp.cmd[R300_FOGP_START] = fogStart.i;
+	}
+}
+
+/* =============================================================
+ * Point state
+ */
+/* PointSize hook: program ctx->Point._Size * 6 as both X and Y size. */
+static void r300PointSize(GLcontext * ctx, GLfloat size)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	const int scaled = (int)(ctx->Point._Size * 6);
+
+	R300_STATECHANGE(r300, ps);
+	r300->hw.ps.cmd[R300_PS_POINTSIZE] =
+	    (scaled << R300_POINTSIZE_X_SHIFT) |
+	    (scaled << R300_POINTSIZE_Y_SHIFT);
+}
+
+/* =============================================================
+ * Line state
+ */
+/* LineWidth hook: program ctx->Line._Width * 6 plus the R300_LINE_CNT_VE
+ * flag; the `widthf` argument itself is not used. */
+static void r300LineWidth(GLcontext * ctx, GLfloat widthf)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	const int scaled = (int)(ctx->Line._Width * 6.0);
+
+	R300_STATECHANGE(r300, lcntl);
+	r300->hw.lcntl.cmd[1] = scaled | R300_LINE_CNT_VE;
+}
+
+static void r300PolygonMode(GLcontext * ctx, GLenum face, GLenum mode)
+{
+	(void)face;	/* both modes are re-read from ctx->Polygon instead */
+	(void)mode;
+
+	r300UpdatePolygonMode(ctx);
+}
+
+/* =============================================================
+ * Stencil
+ */
+
+/* Map a GL stencil op to R300_ZS_*; unknown ops warn and act as KEEP. */
+static int translate_stencil_op(int op)
+{
+	switch (op) {
+	case GL_KEEP:
+		return R300_ZS_KEEP;
+	case GL_ZERO:
+		return R300_ZS_ZERO;
+	case GL_REPLACE:
+		return R300_ZS_REPLACE;
+	case GL_INCR:
+		return R300_ZS_INCR;
+	case GL_DECR:
+		return R300_ZS_DECR;
+	case GL_INCR_WRAP_EXT:
+		return R300_ZS_INCR_WRAP;
+	case GL_DECR_WRAP_EXT:
+		return R300_ZS_DECR_WRAP;
+	case GL_INVERT:
+		return R300_ZS_INVERT;
+	default:
+		WARN_ONCE("Do not know how to translate stencil op");
+		return R300_ZS_KEEP;
+	}
+}
+
+/*
+ * ShadeModel hook: select flat or smooth shading in the shade state.
+ * The state is flagged as changed for every call, including one with a
+ * mode that is neither GL_FLAT nor GL_SMOOTH; in that case the stored
+ * value itself is left alone.
+ */
+static void r300ShadeModel(GLcontext * ctx, GLenum mode)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+
+	R300_STATECHANGE(r300, shade);
+	if (mode == GL_FLAT)
+		r300->hw.shade.cmd[2] = R300_RE_SHADE_MODEL_FLAT;
+	else if (mode == GL_SMOOTH)
+		r300->hw.shade.cmd[2] = R300_RE_SHADE_MODEL_SMOOTH;
+}
+
+/*
+ * StencilFuncSeparate hook.  Rewrites the front and back compare
+ * functions in ZS_CNTL_1 and the reference / value-mask fields in
+ * ZS_CNTL_2, reading everything from ctx->Stencil instead of the
+ * arguments.  Reference and mask always come from face 0.
+ */
+static void r300StencilFuncSeparate(GLcontext * ctx, GLenum face,
+				    GLenum func, GLint ref, GLuint mask)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	const GLuint ref_bits = (ctx->Stencil.Ref[0] & 0xff)
+	    << R300_RB3D_ZS2_STENCIL_REF_SHIFT;
+	const GLuint mask_bits = (ctx->Stencil.ValueMask[0] & 0xff)
+	    << R300_RB3D_ZS2_STENCIL_MASK_SHIFT;
+	GLuint front_func;
+	GLuint back_func;
+
+	/* without two-sided stencil the back face reuses the front function */
+	front_func = translate_func(ctx->Stencil.Function[0]);
+	back_func = ctx->Stencil._TestTwoSide ?
+	    translate_func(ctx->Stencil.Function[1]) : front_func;
+
+	R300_STATECHANGE(rmesa, zs);
+
+	/* clear both function fields, then the ref and value-mask fields */
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] &=
+	    ~((R300_ZS_MASK << R300_RB3D_ZS1_FRONT_FUNC_SHIFT) |
+	      (R300_ZS_MASK << R300_RB3D_ZS1_BACK_FUNC_SHIFT));
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_2] &=
+	    ~((R300_RB3D_ZS2_STENCIL_MASK << R300_RB3D_ZS2_STENCIL_REF_SHIFT) |
+	      (R300_RB3D_ZS2_STENCIL_MASK << R300_RB3D_ZS2_STENCIL_MASK_SHIFT));
+
+	/* install the freshly translated fields */
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
+	    (front_func << R300_RB3D_ZS1_FRONT_FUNC_SHIFT) |
+	    (back_func << R300_RB3D_ZS1_BACK_FUNC_SHIFT);
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_2] |= ref_bits | mask_bits;
+
+}
+
+/* Replace the ZS_CNTL_2 write-mask field with ctx->Stencil.WriteMask[0]. */
+static void r300StencilMaskSeparate(GLcontext * ctx, GLenum face, GLuint mask)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	const GLuint shift = R300_RB3D_ZS2_STENCIL_WRITE_MASK_SHIFT;
+	const GLuint write_mask = ctx->Stencil.WriteMask[0] & 0xff;
+
+	R300_STATECHANGE(rmesa, zs);
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_2] &=
+	    ~(R300_RB3D_ZS2_STENCIL_MASK << shift);
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_2] |= write_mask << shift;
+}
+
+/*
+ * StencilOpSeparate hook.  Clears every field of ZS_CNTL_1 except the
+ * depth, front and back compare functions, then installs the fail /
+ * zfail / zpass operations taken from ctx->Stencil (the arguments are
+ * not read).  The back face gets the face-1 operations only when
+ * ctx->Stencil._TestTwoSide is set; otherwise it mirrors face 0.
+ */
+static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
+				  GLenum fail, GLenum zfail, GLenum zpass)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	const int back = ctx->Stencil._TestTwoSide ? 1 : 0;
+
+	R300_STATECHANGE(rmesa, zs);
+	/* It is easier to mask what's left.. */
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] &=
+	    (R300_ZS_MASK << R300_RB3D_ZS1_DEPTH_FUNC_SHIFT) |
+	    (R300_ZS_MASK << R300_RB3D_ZS1_FRONT_FUNC_SHIFT) |
+	    (R300_ZS_MASK << R300_RB3D_ZS1_BACK_FUNC_SHIFT);
+
+	/* front face */
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
+	    (translate_stencil_op(ctx->Stencil.FailFunc[0]) <<
+	     R300_RB3D_ZS1_FRONT_FAIL_OP_SHIFT)
+	    | (translate_stencil_op(ctx->Stencil.ZFailFunc[0]) <<
+	       R300_RB3D_ZS1_FRONT_ZFAIL_OP_SHIFT)
+	    | (translate_stencil_op(ctx->Stencil.ZPassFunc[0]) <<
+	       R300_RB3D_ZS1_FRONT_ZPASS_OP_SHIFT);
+
+	/* back face */
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
+	    (translate_stencil_op(ctx->Stencil.FailFunc[back]) <<
+	     R300_RB3D_ZS1_BACK_FAIL_OP_SHIFT)
+	    | (translate_stencil_op(ctx->Stencil.ZFailFunc[back]) <<
+	       R300_RB3D_ZS1_BACK_ZFAIL_OP_SHIFT)
+	    | (translate_stencil_op(ctx->Stencil.ZPassFunc[back]) <<
+	       R300_RB3D_ZS1_BACK_ZPASS_OP_SHIFT);
+}
+
+/* Cache the stencil clear word built from ctx->Stencil; `s` is not read. */
+static void r300ClearStencil(GLcontext * ctx, GLint s)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	const GLuint value = (GLuint) (ctx->Stencil.Clear & 0xff);
+	const GLuint write_mask = ctx->Stencil.WriteMask[0] & 0xff;
+
+	rmesa->state.stencil.clear = value |
+	    (R300_RB3D_ZS2_STENCIL_MASK << R300_RB3D_ZS2_STENCIL_MASK_SHIFT) |
+	    (write_mask << R300_RB3D_ZS2_STENCIL_WRITE_MASK_SHIFT);
+}
+
+/* =============================================================
+ * Window position and viewport transformation
+ */
+
+/*
+ * To correctly position primitives:
+ */
+#define SUBPIXEL_X 0.125
+#define SUBPIXEL_Y 0.125
+
+static void r300UpdateWindow(GLcontext * ctx)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	__DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+	GLfloat xoffset = dPriv ? (GLfloat) dPriv->x : 0;	/* 0 without a drawable */
+	GLfloat yoffset = dPriv ? (GLfloat) dPriv->y + dPriv->h : 0;
+	const GLfloat *v = ctx->Viewport._WindowMap.m;
+
+	GLfloat sx = v[MAT_SX];
+	GLfloat tx = v[MAT_TX] + xoffset + SUBPIXEL_X;
+	GLfloat sy = -v[MAT_SY];	/* y is flipped: negated scale and translation ... */
+	GLfloat ty = (-v[MAT_TY]) + yoffset + SUBPIXEL_Y;	/* ... offset by y + h */
+	GLfloat sz = v[MAT_SZ] * rmesa->state.depth.scale;
+	GLfloat tz = v[MAT_TZ] * rmesa->state.depth.scale;
+
+	R300_FIREVERTICES(rmesa);
+	R300_STATECHANGE(rmesa, vpt);
+
+	rmesa->hw.vpt.cmd[R300_VPT_XSCALE] = r300PackFloat32(sx);
+	rmesa->hw.vpt.cmd[R300_VPT_XOFFSET] = r300PackFloat32(tx);
+	rmesa->hw.vpt.cmd[R300_VPT_YSCALE] = r300PackFloat32(sy);
+	rmesa->hw.vpt.cmd[R300_VPT_YOFFSET] = r300PackFloat32(ty);
+	rmesa->hw.vpt.cmd[R300_VPT_ZSCALE] = r300PackFloat32(sz);
+	rmesa->hw.vpt.cmd[R300_VPT_ZOFFSET] = r300PackFloat32(tz);
+}
+
+static void r300Viewport(GLcontext * ctx, GLint x, GLint y,
+			 GLsizei width, GLsizei height)
+{
+	/* Don't pipeline viewport changes, conflict with window offset
+	 * setting below.  Could apply deltas to rescue pipelined viewport
+	 * values, or keep the originals hanging around.
+	 */
+	r300UpdateWindow(ctx);	/* reads ctx->Viewport, not x/y/width/height */
+}
+
+static void r300DepthRange(GLcontext * ctx, GLclampd nearval, GLclampd farval)
+{
+	r300UpdateWindow(ctx);	/* nearval/farval are not read here */
+}
+
+void r300UpdateViewportOffset(GLcontext * ctx)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	__DRIdrawablePrivate *dPriv = ((radeonContextPtr) rmesa)->dri.drawable;	/* NOTE(review): used unchecked, unlike r300UpdateWindow */
+	GLfloat xoffset = (GLfloat) dPriv->x;
+	GLfloat yoffset = (GLfloat) dPriv->y + dPriv->h;
+	const GLfloat *v = ctx->Viewport._WindowMap.m;
+
+	GLfloat tx = v[MAT_TX] + xoffset + SUBPIXEL_X;	/* same formulas as r300UpdateWindow */
+	GLfloat ty = (-v[MAT_TY]) + yoffset + SUBPIXEL_Y;
+
+	if (rmesa->hw.vpt.cmd[R300_VPT_XOFFSET] != r300PackFloat32(tx) ||
+	    rmesa->hw.vpt.cmd[R300_VPT_YOFFSET] != r300PackFloat32(ty)) {
+		/* Note: this should also modify whatever data the context reset
+		 * code uses...
+		 */
+		R300_STATECHANGE(rmesa, vpt);
+		rmesa->hw.vpt.cmd[R300_VPT_XOFFSET] = r300PackFloat32(tx);
+		rmesa->hw.vpt.cmd[R300_VPT_YOFFSET] = r300PackFloat32(ty);
+
+	}
+
+	radeonUpdateScissor(ctx);	/* called whether or not the offsets changed */
+}
+
+/**
+ * Tell the card where to render (offset, pitch).
+ * Affected by glDrawBuffer, etc
+ */
+void r300UpdateDrawBuffer(GLcontext * ctx)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	r300ContextPtr r300 = rmesa;	/* second name for the same context */
+	struct gl_framebuffer *fb = ctx->DrawBuffer;
+	driRenderbuffer *drb;
+
+	if (fb->_ColorDrawBufferMask[0] == BUFFER_BIT_FRONT_LEFT) {
+		/* draw to front */
+		drb =
+		    (driRenderbuffer *) fb->Attachment[BUFFER_FRONT_LEFT].
+		    Renderbuffer;
+	} else if (fb->_ColorDrawBufferMask[0] == BUFFER_BIT_BACK_LEFT) {
+		/* draw to back */
+		drb =
+		    (driRenderbuffer *) fb->Attachment[BUFFER_BACK_LEFT].
+		    Renderbuffer;
+	} else {
+		/* drawing to multiple buffers, or none */
+		return;	/* the cb state is left unchanged */
+	}
+
+	assert(drb);
+	assert(drb->flippedPitch);
+
+	R300_STATECHANGE(rmesa, cb);
+
+	r300->hw.cb.cmd[R300_CB_OFFSET] = drb->flippedOffset +	//r300->radeon.state.color.drawOffset +
+	    r300->radeon.radeonScreen->fbLocation;
+	r300->hw.cb.cmd[R300_CB_PITCH] = drb->flippedPitch;	//r300->radeon.state.color.drawPitch;
+
+	if (r300->radeon.radeonScreen->cpp == 4)	/* any other cpp is programmed as RGB565 */
+		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_ARGB8888;
+	else
+		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_RGB565;
+
+	if (r300->radeon.sarea->tiling_enabled)
+		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
+#if 0
+	R200_STATECHANGE(rmesa, ctx);
+
+	/* Note: we used the (possibly) page-flipped values */
+	rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET]
+	    = ((drb->flippedOffset + rmesa->r200Screen->fbLocation)
+	       & R200_COLOROFFSET_MASK);
+	rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = drb->flippedPitch;
+
+	if (rmesa->sarea->tiling_enabled) {
+		rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |=
+		    R200_COLOR_TILE_ENABLE;
+	}
+#endif
+}
+
+/**
+ * Compute the current value of one driver-internal state parameter.
+ *
+ * \param ctx    GL context
+ * \param state  state index tokens; only STATE_INTERNAL entries with a
+ *               known R300 sub-state in state[1] are handled
+ * \param value  receives four components; left untouched for any state
+ *               that is not recognised here
+ */
+static void
+r300FetchStateParameter(GLcontext * ctx,
+			const gl_state_index state[STATE_LENGTH],
+			GLfloat * value)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+
+	if (state[0] != STATE_INTERNAL)
+		return;
+
+	switch (state[1]) {
+	case STATE_R300_WINDOW_DIMENSION:
+		/* half width, half height, 0.5 for moving range
+		 * [-1 1] -> [0 1]; the last component is not used
+		 */
+		value[0] = r300->radeon.dri.drawable->w * 0.5f;
+		value[1] = r300->radeon.dri.drawable->h * 0.5f;
+		value[2] = 0.5F;
+		value[3] = 1.0F;
+		break;
+
+	case STATE_R300_TEXRECT_FACTOR:{
+			/* state[2] selects the texture unit */
+			struct gl_texture_object *rect =
+			    ctx->Texture.Unit[state[2]].CurrentRect;
+
+			/* Reciprocal of the base level size, or 1 when
+			 * there is no rectangle texture / base image.
+			 */
+			value[0] = 1.0;
+			value[1] = 1.0;
+			if (rect && rect->Image[0][rect->BaseLevel]) {
+				struct gl_texture_image *base =
+				    rect->Image[0][rect->BaseLevel];
+				value[0] = 1.0 / base->Width2;
+				value[1] = 1.0 / base->Height2;
+			}
+			value[2] = 1.0;
+			value[3] = 1.0;
+			break;
+		}
+
+	default:
+		break;
+	}
+}
+
+/**
+ * Refresh the driver-internal state parameters of the current fragment
+ * program by running r300FetchStateParameter() on every parameter of
+ * type PROGRAM_STATE_VAR.
+ *
+ * Returns early unless new_state contains _NEW_BUFFERS or _NEW_PROGRAM,
+ * and when there is no current fragment program or no parameter list.
+ */
+void r300UpdateStateParameters(GLcontext * ctx, GLuint new_state)
+{
+	struct r300_fragment_program *fp;
+	struct gl_program_parameter_list *params;
+	GLuint n;
+
+	if ((new_state & (_NEW_BUFFERS | _NEW_PROGRAM)) == 0)
+		return;
+
+	fp = (struct r300_fragment_program *)ctx->FragmentProgram._Current;
+	if (!fp || !fp->mesa_program.Base.Parameters)
+		return;
+
+	params = fp->mesa_program.Base.Parameters;
+
+	for (n = 0; n < params->NumParameters; n++) {
+		if (params->Parameters[n].Type != PROGRAM_STATE_VAR)
+			continue;
+
+		r300FetchStateParameter(ctx,
+					params->Parameters[n].StateIndexes,
+					params->ParameterValues[n]);
+	}
+}
+
+/* =============================================================
+ * Polygon state
+ */
+/**
+ * Program the polygon offset (zbs) factor and constant.
+ *
+ * The constant (units) is scaled by 4 for a 16 bit and by 2 for a 24 bit
+ * depth buffer and left as is for any other depth; the factor is always
+ * scaled by 12.  The same pair is written to the T and the W registers.
+ */
+static void r300PolygonOffset(GLcontext * ctx, GLfloat factor, GLfloat units)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	GLfloat constant = units;
+
+	if (ctx->Visual.depthBits == 16)
+		constant *= 4.0;
+	else if (ctx->Visual.depthBits == 24)
+		constant *= 2.0;
+
+	factor *= 12.0;
+
+	R300_STATECHANGE(rmesa, zbs);
+
+	rmesa->hw.zbs.cmd[R300_ZBS_T_FACTOR] = r300PackFloat32(factor);
+	rmesa->hw.zbs.cmd[R300_ZBS_W_FACTOR] = r300PackFloat32(factor);
+
+	rmesa->hw.zbs.cmd[R300_ZBS_T_CONSTANT] = r300PackFloat32(constant);
+	rmesa->hw.zbs.cmd[R300_ZBS_W_CONSTANT] = r300PackFloat32(constant);
+}
+
+/* Routing and texture-related */
+
+/* r300 doesn't handle GL_CLAMP and GL_MIRROR_CLAMP_EXT correctly when the
+ * filter is NEAREST.  Since texwrap produces the same results for GL_CLAMP
+ * and GL_CLAMP_TO_EDGE in that case, CLAMP_TO_EDGE is used instead.
+ *
+ * Wrap modes have to be recalculated whenever the filter mode changes,
+ * because someone might do:
+ * glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+ * glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP);
+ * glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+ *
+ * r300 completely ignores R300_TX_CLAMP when either min or mag is nearest,
+ * so it can't handle combinations where only one of them is nearest; in
+ * that case the nearest filter is forced to linear.
+ *
+ * Takes a texture filter word and returns the adjusted word.
+ */
+static unsigned long gen_fixed_filter(unsigned long f)
+{
+	/* Positions of the S, T and Q wrap mode fields. */
+	const int wrap_shift[3] = {
+		R300_TX_WRAP_S_SHIFT,
+		R300_TX_WRAP_T_SHIFT,
+		R300_TX_WRAP_Q_SHIFT
+	};
+	unsigned long clamped = 0;	/* bit c set: coordinate c uses CLAMP */
+	int mag_nearest, min_nearest;
+	int c;
+
+	/* The MIRROR bit is masked out so that the plain and the mirrored
+	 * variant are covered by a single comparison.
+	 */
+	for (c = 0; c < 3; c++) {
+		if ((f & ((7 - 1) << wrap_shift[c])) ==
+		    (R300_TX_CLAMP << wrap_shift[c]))
+			clamped |= 1 << c;
+	}
+
+	if (clamped == 0)
+		return f;
+
+	/* TODO: Check for aniso filters too */
+	mag_nearest =
+	    (f & R300_TX_MAG_FILTER_MASK) == R300_TX_MAG_FILTER_NEAREST;
+	min_nearest =
+	    (f & R300_TX_MIN_FILTER_MASK) == R300_TX_MIN_FILTER_NEAREST;
+
+	if (!mag_nearest && !min_nearest)
+		return f;
+
+	if (mag_nearest != min_nearest) {
+		/* Exactly one filter is nearest: r300 can't handle these
+		 * modes, hence force nearest to linear.
+		 */
+		if (mag_nearest) {
+			f &= ~R300_TX_MAG_FILTER_NEAREST;
+			f |= R300_TX_MAG_FILTER_LINEAR;
+		} else {
+			f &= ~R300_TX_MIN_FILTER_NEAREST;
+			f |= R300_TX_MIN_FILTER_LINEAR;
+		}
+		return f;
+	}
+
+	/* Both are nearest: replace CLAMP by CLAMP_TO_EDGE. */
+	for (c = 0; c < 3; c++) {
+		if (clamped & (1 << c)) {
+			f &= ~((7 - 1) << wrap_shift[c]);
+			f |= R300_TX_CLAMP_TO_EDGE << wrap_shift[c];
+		}
+	}
+	return f;
+}
+
+/**
+ * Program the texture state atoms and patch the texture instructions of
+ * the current fragment program accordingly.
+ *
+ * Enabled Mesa texture units are packed onto consecutive hardware units
+ * (hw_tmu), because disabled tmu offsets must not be passed to the DRM.
+ * tmu_mappings[] records for every Mesa unit the hardware unit it was
+ * assigned, or -1 when it has none.  The image field of each fragment
+ * program texture instruction is then rewritten from the Mesa unit to the
+ * hardware unit.
+ *
+ * Exits the process when Const.MaxTextureUnits exceeds
+ * R300_MAX_TEXTURE_UNITS.
+ */
+static void r300SetupTextures(GLcontext * ctx)
+{
+	int i, mtu;
+	struct r300_tex_obj *t;
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	int hw_tmu = 0;
+	int last_hw_tmu = -1;	/* -1 translates into no setup costs for fields */
+	int tmu_mappings[R300_MAX_TEXTURE_UNITS];
+	struct r300_fragment_program *fp = (struct r300_fragment_program *)
+	    (char *)ctx->FragmentProgram._Current;
+
+	/* Start with every unit unmapped.  An initializer of "{ -1, }" only
+	 * sets element 0 and zero-fills the rest, which would silently map
+	 * all other units to hardware unit 0.
+	 */
+	for (i = 0; i < R300_MAX_TEXTURE_UNITS; i++)
+		tmu_mappings[i] = -1;
+
+	R300_STATECHANGE(r300, txe);
+	R300_STATECHANGE(r300, tex.filter);
+	R300_STATECHANGE(r300, tex.filter_1);
+	R300_STATECHANGE(r300, tex.size);
+	R300_STATECHANGE(r300, tex.format);
+	R300_STATECHANGE(r300, tex.pitch);
+	R300_STATECHANGE(r300, tex.offset);
+	R300_STATECHANGE(r300, tex.chroma_key);
+	R300_STATECHANGE(r300, tex.border_color);
+
+	r300->hw.txe.cmd[R300_TXE_ENABLE] = 0x0;
+
+	mtu = r300->radeon.glCtx->Const.MaxTextureUnits;
+	if (RADEON_DEBUG & DEBUG_STATE)
+		fprintf(stderr, "mtu=%d\n", mtu);
+
+	if (mtu > R300_MAX_TEXTURE_UNITS) {
+		fprintf(stderr,
+			"Aiiee ! mtu=%d is greater than R300_MAX_TEXTURE_UNITS=%d\n",
+			mtu, R300_MAX_TEXTURE_UNITS);
+		_mesa_exit(-1);
+	}
+
+	/* We cannot let disabled tmu offsets pass DRM */
+	for (i = 0; i < mtu; i++) {
+		if (ctx->Texture.Unit[i]._ReallyEnabled) {
+
+#if 0				/* Enables old behaviour */
+			hw_tmu = i;
+#endif
+			tmu_mappings[i] = hw_tmu;
+
+			t = r300->state.texture.unit[i].texobj;
+			/* XXX questionable fix for bug 9170: */
+			/* NOTE(review): the mapping above stays in place even
+			 * though no hardware unit is set up for this texture
+			 * and hw_tmu is not advanced.
+			 */
+			if (!t)
+				continue;
+
+			if ((t->format & 0xffffff00) == 0xffffff00) {
+				WARN_ONCE
+				    ("unknown texture format (entry %x) encountered. Help me !\n",
+				     t->format & 0xff);
+			}
+
+			if (RADEON_DEBUG & DEBUG_STATE)
+				fprintf(stderr,
+					"Activating texture unit %d\n", i);
+
+			r300->hw.txe.cmd[R300_TXE_ENABLE] |= (1 << hw_tmu);
+
+			r300->hw.tex.filter.cmd[R300_TEX_VALUE_0 +
+						hw_tmu] =
+			    gen_fixed_filter(t->filter) | (hw_tmu << 28);
+			/* Currently disabled! */
+			r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] = 0x0;	//0x20501f80;
+			r300->hw.tex.size.cmd[R300_TEX_VALUE_0 + hw_tmu] =
+			    t->size;
+			r300->hw.tex.format.cmd[R300_TEX_VALUE_0 +
+						hw_tmu] = t->format;
+			r300->hw.tex.pitch.cmd[R300_TEX_VALUE_0 + hw_tmu] =
+			    t->pitch_reg;
+			r300->hw.tex.offset.cmd[R300_TEX_VALUE_0 +
+						hw_tmu] = t->offset;
+
+			if (t->offset & R300_TXO_MACRO_TILE) {
+				WARN_ONCE("macro tiling enabled!\n");
+			}
+
+			if (t->offset & R300_TXO_MICRO_TILE) {
+				WARN_ONCE("micro tiling enabled!\n");
+			}
+
+			r300->hw.tex.chroma_key.cmd[R300_TEX_VALUE_0 +
+						    hw_tmu] = 0x0;
+			r300->hw.tex.border_color.cmd[R300_TEX_VALUE_0 +
+						      hw_tmu] =
+			    t->pp_border_color;
+
+			last_hw_tmu = hw_tmu;
+
+			hw_tmu++;
+		}
+	}
+
+	/* Each atom emits registers for hardware units 0..last_hw_tmu. */
+	r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_FILTER_0, last_hw_tmu + 1);
+	r300->hw.tex.filter_1.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_FILTER1_0, last_hw_tmu + 1);
+	r300->hw.tex.size.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_SIZE_0, last_hw_tmu + 1);
+	r300->hw.tex.format.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_FORMAT_0, last_hw_tmu + 1);
+	r300->hw.tex.pitch.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_PITCH_0, last_hw_tmu + 1);
+	r300->hw.tex.offset.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_OFFSET_0, last_hw_tmu + 1);
+	r300->hw.tex.chroma_key.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_CHROMA_KEY_0, last_hw_tmu + 1);
+	r300->hw.tex.border_color.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_BORDER_COLOR_0, last_hw_tmu + 1);
+
+	if (!fp)		/* should only happen once, just after context is created */
+		return;
+
+	R300_STATECHANGE(r300, fpt);
+
+	for (i = 0; i < fp->tex.length; i++) {
+		int unit;
+		int opcode;
+		unsigned long val;
+
+		unit = fp->tex.inst[i] >> R300_FPITX_IMAGE_SHIFT;
+		unit &= 15;
+
+		val = fp->tex.inst[i];
+		val &= ~R300_FPITX_IMAGE_MASK;
+
+		opcode =
+		    (val & R300_FPITX_OPCODE_MASK) >> R300_FPITX_OPCODE_SHIFT;
+
+		/* KIL instructions are written with the image field cleared.
+		 * The same goes for instructions whose unit has no mapping,
+		 * which happens when the corresponding texture image is
+		 * incomplete (e.g. incomplete mipmaps etc.).  unit can be up
+		 * to 15 after masking, so it is range checked before it is
+		 * used as an index into tmu_mappings[].
+		 */
+		if (opcode != R300_FPITX_OP_KIL &&
+		    unit < R300_MAX_TEXTURE_UNITS && tmu_mappings[unit] >= 0)
+			val |= tmu_mappings[unit] << R300_FPITX_IMAGE_SHIFT;
+
+		r300->hw.fpt.cmd[R300_FPT_INSTR_0 + i] = val;
+	}
+
+	r300->hw.fpt.cmd[R300_FPT_CMD_0] =
+	    cmdpacket0(R300_PFS_TEXI_0, fp->tex.length);
+
+	if (RADEON_DEBUG & DEBUG_STATE)
+		fprintf(stderr, "TX_ENABLE: %08x  last_hw_tmu=%d\n",
+			r300->hw.txe.cmd[R300_TXE_ENABLE], last_hw_tmu);
+}
+
+/**
+ * The set of outputs written by the vertex stage, kept in one of two
+ * representations.  Which member is valid depends on hw_tcl_on.
+ */
+union r300_outputs_written {
+	GLuint vp_outputs;	/* hw_tcl_on */
+	 DECLARE_RENDERINPUTS(index_bitset);	/* !hw_tcl_on */
+};
+
+/* Test whether an output is present in ow: bit vp_result of vp_outputs
+ * when hw_tcl_on is set, otherwise tnl_attrib in index_bitset.
+ */
+#define R300_OUTPUTS_WRITTEN_TEST(ow, vp_result, tnl_attrib) \
+	((hw_tcl_on) ? (ow).vp_outputs & (1 << (vp_result)) : \
+	RENDERINPUTS_TEST( (ow.index_bitset), (tnl_attrib) ))
+
+/**
+ * Program the ri, rc and rr state atoms from the inputs read by the
+ * current fragment program and the outputs written by the vertex stage.
+ *
+ * Returns without touching any state when there is no current fragment
+ * program.  Exits the process when the fragment program reads WPOS and
+ * no unused texcoord is left to carry it.
+ */
+static void r300SetupRSUnit(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	/* I'm still unsure if these are needed */
+	GLuint interp_magic[8] = {
+		0x00,
+		R300_RS_INTERP_1_UNKNOWN,
+		R300_RS_INTERP_2_UNKNOWN,
+		R300_RS_INTERP_3_UNKNOWN,
+		0x00,
+		0x00,
+		0x00,
+		0x00
+	};
+	union r300_outputs_written OutputsWritten;
+	GLuint InputsRead;
+	/* fp_reg: next destination register to hand out;
+	 * high_rr: highest route entry in use, determines how many route
+	 * registers the rr packet covers.
+	 */
+	int fp_reg, high_rr;
+	/* in_texcoords: texcoords written by the vertex stage;
+	 * col_interp_nr: number of colours routed.
+	 */
+	int in_texcoords, col_interp_nr;
+	int i;
+
+	/* Fill the union member that matches the current TCL mode. */
+	if (hw_tcl_on)
+		OutputsWritten.vp_outputs =
+		    CURRENT_VERTEX_SHADER(ctx)->key.OutputsWritten;
+	else
+		RENDERINPUTS_COPY(OutputsWritten.index_bitset,
+				  r300->state.render_inputs_bitset);
+
+	if (ctx->FragmentProgram._Current)
+		InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
+	else {
+		fprintf(stderr, "No ctx->FragmentProgram._Current!!\n");
+		return;		/* This should only ever happen once.. */
+	}
+
+	R300_STATECHANGE(r300, ri);
+	R300_STATECHANGE(r300, rc);
+	R300_STATECHANGE(r300, rr);
+
+	fp_reg = in_texcoords = col_interp_nr = high_rr = 0;
+
+	r300->hw.rr.cmd[R300_RR_ROUTE_1] = 0;
+
+	/* WPOS is replaced by the first texcoord that the fragment program
+	 * does not read itself.
+	 */
+	if (InputsRead & FRAG_BIT_WPOS) {
+		for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
+			if (!(InputsRead & (FRAG_BIT_TEX0 << i)))
+				break;
+
+		if (i == ctx->Const.MaxTextureUnits) {
+			fprintf(stderr, "\tno free texcoord found...\n");
+			_mesa_exit(-1);
+		}
+
+		InputsRead |= (FRAG_BIT_TEX0 << i);
+		InputsRead &= ~FRAG_BIT_WPOS;
+	}
+
+	/* NOTE(review): interp_magic[] has 8 entries and is indexed up to
+	 * Const.MaxTextureUnits - 1; this assumes MaxTextureUnits <= 8.
+	 */
+	for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+		r300->hw.ri.cmd[R300_RI_INTERP_0 + i] = 0
+		    | R300_RS_INTERP_USED
+		    | (in_texcoords << R300_RS_INTERP_SRC_SHIFT)
+		    | interp_magic[i];
+
+		/* The route entry is cleared first and only enabled when the
+		 * fragment program reads texcoord i.
+		 */
+		r300->hw.rr.cmd[R300_RR_ROUTE_0 + fp_reg] = 0;
+		if (InputsRead & (FRAG_BIT_TEX0 << i)) {
+			//assert(r300->state.texture.tc_count != 0);
+			r300->hw.rr.cmd[R300_RR_ROUTE_0 + fp_reg] |= R300_RS_ROUTE_ENABLE | i	/* source INTERP */
+			    | (fp_reg << R300_RS_ROUTE_DEST_SHIFT);
+			high_rr = fp_reg;
+
+			if (!R300_OUTPUTS_WRITTEN_TEST
+			    (OutputsWritten, VERT_RESULT_TEX0 + i,
+			     _TNL_ATTRIB_TEX(i))) {
+				/* Passing invalid data here can lock the GPU. */
+				WARN_ONCE
+				    ("fragprog wants coords for tex%d, vp doesn't provide them!\n",
+				     i);
+				//_mesa_print_program(&CURRENT_VERTEX_SHADER(ctx)->Base);
+				//_mesa_exit(-1);
+			}
+			InputsRead &= ~(FRAG_BIT_TEX0 << i);
+			fp_reg++;
+		}
+		/* Need to count all coords enabled at vof */
+		if (R300_OUTPUTS_WRITTEN_TEST
+		    (OutputsWritten, VERT_RESULT_TEX0 + i, _TNL_ATTRIB_TEX(i)))
+			in_texcoords++;
+	}
+
+	if (InputsRead & FRAG_BIT_COL0) {
+		if (!R300_OUTPUTS_WRITTEN_TEST
+		    (OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
+			WARN_ONCE
+			    ("fragprog wants col0, vp doesn't provide it\n");
+			/* col0 is not routed; FRAG_BIT_COL0 stays set in
+			 * InputsRead and is reported again at the end.
+			 */
+			goto out;	/* FIXME */
+			//_mesa_print_program(&CURRENT_VERTEX_SHADER(ctx)->Base);
+			//_mesa_exit(-1);
+		}
+
+		r300->hw.rr.cmd[R300_RR_ROUTE_0] |= 0
+		    | R300_RS_ROUTE_0_COLOR
+		    | (fp_reg++ << R300_RS_ROUTE_0_COLOR_DEST_SHIFT);
+		InputsRead &= ~FRAG_BIT_COL0;
+		col_interp_nr++;
+	}
+      out:
+
+	/* Unlike col0, a missing col1 only warns and is routed anyway. */
+	if (InputsRead & FRAG_BIT_COL1) {
+		if (!R300_OUTPUTS_WRITTEN_TEST
+		    (OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
+			WARN_ONCE
+			    ("fragprog wants col1, vp doesn't provide it\n");
+			//_mesa_exit(-1);
+		}
+
+		r300->hw.rr.cmd[R300_RR_ROUTE_1] |=
+		    R300_RS_ROUTE_1_UNKNOWN11 | R300_RS_ROUTE_1_COLOR1 |
+		    (fp_reg++ << R300_RS_ROUTE_1_COLOR1_DEST_SHIFT);
+		InputsRead &= ~FRAG_BIT_COL1;
+		/* col1 lives in route entry 1, so at least two entries
+		 * have to be emitted.
+		 */
+		if (high_rr < 1)
+			high_rr = 1;
+		col_interp_nr++;
+	}
+
+	/* Need at least one. This might still lock as the values are undefined... */
+	if (in_texcoords == 0 && col_interp_nr == 0) {
+		r300->hw.rr.cmd[R300_RR_ROUTE_0] |= 0
+		    | R300_RS_ROUTE_0_COLOR
+		    | (fp_reg++ << R300_RS_ROUTE_0_COLOR_DEST_SHIFT);
+		col_interp_nr++;
+	}
+
+	r300->hw.rc.cmd[1] = 0 | (in_texcoords << R300_RS_CNTL_TC_CNT_SHIFT)
+	    | (col_interp_nr << R300_RS_CNTL_CI_CNT_SHIFT)
+	    | R300_RS_CNTL_0_UNKNOWN_18;
+
+	assert(high_rr >= 0);
+	r300->hw.rr.cmd[R300_RR_CMD_0] =
+	    cmdpacket0(R300_RS_ROUTE_0, high_rr + 1);
+	r300->hw.rc.cmd[2] = 0xC0 | high_rr;
+
+	/* Any bit still set is an input that was not routed above. */
+	if (InputsRead)
+		WARN_ONCE("Don't know how to satisfy InputsRead=0x%08x\n",
+			  InputsRead);
+}
+
+/* Access the vpu.count field of the command header that ptr points to. */
+#define vpucount(ptr) (((drm_r300_cmd_header_t*)(ptr))->vpu.count)
+
+/* Raise the vpu.count of the command header at ptr to new_count / 4 if
+ * that is larger than the current count; the count is never lowered.
+ * Asserts that new_count / 4 is below 256.
+ */
+#define bump_vpu_count(ptr, new_count)   do{\
+	drm_r300_cmd_header_t* _p=((drm_r300_cmd_header_t*)(ptr));\
+	int _nc=(new_count)/4; \
+	assert(_nc < 256); \
+	if(_nc>_p->vpu.count)_p->vpu.count=_nc;\
+	}while(0)
+
+/**
+ * Copy a vertex shader fragment into the state atom selected by dest.
+ *
+ * \param r300  driver context
+ * \param dest  bits 8-11 select the atom (0: vpi, 2: vpp, 4: vps), the
+ *              low 8 bits give the destination slot; every slot spans
+ *              four cmd[] entries
+ * \param vsf   fragment to copy; vsf->length entries of vsf->body.d
+ *
+ * The vpu count of the atom is raised if needed to cover the data that
+ * was written.  Empty fragments are ignored.  A length that is not a
+ * multiple of four, or an unknown destination, exits the process.
+ */
+static inline void setup_vertex_shader_fragment(r300ContextPtr r300, int dest,
+						struct
+						r300_vertex_shader_fragment
+						*vsf)
+{
+	const int offset = 4 * (dest & 0xff);	/* first cmd[] entry of the slot */
+	int i;
+
+	if (vsf->length == 0)
+		return;
+
+	if (vsf->length & 0x3) {
+		fprintf(stderr,
+			"VERTEX_SHADER_FRAGMENT must have length divisible by 4\n");
+		_mesa_exit(-1);
+	}
+
+	switch ((dest >> 8) & 0xf) {
+	case 0:
+		R300_STATECHANGE(r300, vpi);
+		for (i = 0; i < vsf->length; i++)
+			r300->hw.vpi.cmd[R300_VPI_INSTR_0 + i + offset] =
+			    (vsf->body.d[i]);
+		bump_vpu_count(r300->hw.vpi.cmd, vsf->length + offset);
+		break;
+
+	case 2:
+		R300_STATECHANGE(r300, vpp);
+		for (i = 0; i < vsf->length; i++)
+			r300->hw.vpp.cmd[R300_VPP_PARAM_0 + i + offset] =
+			    (vsf->body.d[i]);
+		bump_vpu_count(r300->hw.vpp.cmd, vsf->length + offset);
+		break;
+
+	case 4:
+		R300_STATECHANGE(r300, vps);
+		/* vps data starts right behind the command header at cmd[0] */
+		for (i = 0; i < vsf->length; i++)
+			r300->hw.vps.cmd[1 + i + offset] = (vsf->body.d[i]);
+		bump_vpu_count(r300->hw.vps.cmd, vsf->length + offset);
+		break;
+
+	default:
+		fprintf(stderr,
+			"%s:%s don't know how to handle dest %04x\n",
+			__FILE__, __FUNCTION__, dest);
+		_mesa_exit(-1);
+	}
+}
+
+/* just a skeleton for now.. */
+
+/* Generate a vertex shader that simply transforms vertex and texture coordinates,
+   while leaving colors intact. Nothing fancy (like lights)
+
+   If implementing lights make a copy first, so it is easy to switch between the two versions
+
+   The program is built in r300->state.vertex_shader: one MUL instruction
+   is emitted for every vertex attribute whose sw_tcl_inputs entry is not
+   -1, each writing to the next result register. */
+static void r300GenerateSimpleVertexShader(r300ContextPtr r300)
+{
+	int i;
+	GLuint o_reg = 0;	/* next result register to write */
+
+	/* Allocate parameters */
+	r300->state.vap_param.transform_offset = 0x0;	/* transform matrix */
+	r300->state.vertex_shader.param_offset = 0x0;
+	r300->state.vertex_shader.param_count = 0x4;	/* 4 vector values - 4x4 matrix */
+
+	r300->state.vertex_shader.program_start = 0x0;
+	r300->state.vertex_shader.unknown_ptr1 = 0x4;	/* magic value ? */
+	r300->state.vertex_shader.program_end = 0x0;
+
+	r300->state.vertex_shader.unknown_ptr2 = 0x0;	/* magic value */
+	r300->state.vertex_shader.unknown_ptr3 = 0x4;	/* magic value */
+
+	r300->state.vertex_shader.unknown1.length = 0;
+	r300->state.vertex_shader.unknown2.length = 0;
+
+	/* Append one instruction at program_end and advance program_end.
+	 * NOTE(review): the macro is not #undef'd in this function.
+	 */
+#define WRITE_OP(oper,source1,source2,source3)	{\
+	r300->state.vertex_shader.program.body.i[r300->state.vertex_shader.program_end].op=(oper); \
+	r300->state.vertex_shader.program.body.i[r300->state.vertex_shader.program_end].src[0]=(source1); \
+	r300->state.vertex_shader.program.body.i[r300->state.vertex_shader.program_end].src[1]=(source2); \
+	r300->state.vertex_shader.program.body.i[r300->state.vertex_shader.program_end].src[2]=(source3); \
+	r300->state.vertex_shader.program_end++; \
+	}
+
+	for (i = VERT_ATTRIB_POS; i < VERT_ATTRIB_MAX; i++)
+		if (r300->state.sw_tcl_inputs[i] != -1) {
+			WRITE_OP(EASY_VSF_OP(MUL, o_reg++, ALL, RESULT),
+				 VSF_REG(r300->state.sw_tcl_inputs[i]),
+				 VSF_ATTR_UNITY(r300->state.
+						sw_tcl_inputs[i]),
+				 VSF_UNITY(r300->state.sw_tcl_inputs[i])
+			    )
+
+		}
+
+	/* program_end now counts the instructions; step back so that it is
+	 * the index of the last one.  The length below is in units of four
+	 * entries per instruction.
+	 * NOTE(review): with no enabled input program_end is decremented
+	 * from 0 here - confirm callers never reach this with no inputs.
+	 */
+	r300->state.vertex_shader.program_end--;	/* r300 wants program length to be one more - no idea why */
+	r300->state.vertex_shader.program.length =
+	    (r300->state.vertex_shader.program_end + 1) * 4;
+
+	/* These overwrite the initial magic values assigned above. */
+	r300->state.vertex_shader.unknown_ptr1 = r300->state.vertex_shader.program_end;	/* magic value ? */
+	r300->state.vertex_shader.unknown_ptr2 = r300->state.vertex_shader.program_end;	/* magic value ? */
+	r300->state.vertex_shader.unknown_ptr3 = r300->state.vertex_shader.program_end;	/* magic value ? */
+
+}
+
+/**
+ * Upload the current (translated) vertex program and its parameters and
+ * program the pvs control registers to match.
+ */
+static void r300SetupVertexProgram(r300ContextPtr rmesa)
+{
+	GLcontext *ctx = rmesa->radeon.glCtx;
+	int inst_count;
+	int param_count;
+	struct r300_vertex_program *prog =
+	    (struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx);
+
+	/* Reset the parameter count, then let the program write its
+	 * parameters straight into the vpp atom.
+	 */
+	((drm_r300_cmd_header_t *) rmesa->hw.vpp.cmd)->vpu.count = 0;
+	R300_STATECHANGE(rmesa, vpp);
+	param_count =
+	    r300VertexProgUpdateParams(ctx, (struct r300_vertex_program_cont *)
+				       ctx->VertexProgram._Current /*prog */ ,
+				       (float *)&rmesa->hw.vpp.
+				       cmd[R300_VPP_PARAM_0]);
+	bump_vpu_count(rmesa->hw.vpp.cmd, param_count);
+	/* presumably converts a count of floats into a count of 4-component
+	 * vectors - TODO confirm against r300VertexProgUpdateParams
+	 */
+	param_count /= 4;
+
+	/* Reset state, in case we don't use something */
+	((drm_r300_cmd_header_t *) rmesa->hw.vpi.cmd)->vpu.count = 0;
+	((drm_r300_cmd_header_t *) rmesa->hw.vps.cmd)->vpu.count = 0;
+
+	setup_vertex_shader_fragment(rmesa, VSF_DEST_PROGRAM, &(prog->program));
+
+#if 0
+	setup_vertex_shader_fragment(rmesa, VSF_DEST_UNKNOWN1,
+				     &(rmesa->state.vertex_shader.unknown1));
+	setup_vertex_shader_fragment(rmesa, VSF_DEST_UNKNOWN2,
+				     &(rmesa->state.vertex_shader.unknown2));
+#endif
+
+	/* Index of the last instruction: program.length / 4 instructions,
+	 * minus one.
+	 */
+	inst_count = prog->program.length / 4 - 1;
+
+	/* The program always starts at 0; position end, program end and the
+	 * low field of PVS_CNTL_3 are all set to the last instruction.
+	 */
+	R300_STATECHANGE(rmesa, pvs);
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_1] =
+	    (0 << R300_PVS_CNTL_1_PROGRAM_START_SHIFT)
+	    | (inst_count /*pos_end */  << R300_PVS_CNTL_1_POS_END_SHIFT)
+	    | (inst_count << R300_PVS_CNTL_1_PROGRAM_END_SHIFT);
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_2] =
+	    (0 << R300_PVS_CNTL_2_PARAM_OFFSET_SHIFT)
+	    | (param_count << R300_PVS_CNTL_2_PARAM_COUNT_SHIFT);
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_3] =
+	    (0 /*rmesa->state.vertex_shader.unknown_ptr2 */  <<
+	     R300_PVS_CNTL_3_PROGRAM_UNKNOWN_SHIFT)
+	    | (inst_count /*rmesa->state.vertex_shader.unknown_ptr3 */  <<
+	       0);
+
+	/* This is done for vertex shader fragments, but also needs to be done for vap_pvs,
+	   so I leave it as a reminder */
+#if 0
+	reg_start(R300_VAP_PVS_WAITIDLE, 0);
+	e32(0x00000000);
+#endif
+}
+
+/**
+ * Set up the vertex shader state.
+ *
+ * With hw_tcl_on and a translated current vertex program the work is
+ * handed to r300SetupVertexProgram().  Otherwise a simple shader is
+ * generated with r300GenerateSimpleVertexShader(), uploaded, and the pvs
+ * control registers are programmed from r300->state.vertex_shader.
+ */
+static void r300SetupVertexShader(r300ContextPtr rmesa)
+{
+	GLcontext *ctx = rmesa->radeon.glCtx;
+
+	/* Reset state, in case we don't use something */
+	((drm_r300_cmd_header_t *) rmesa->hw.vpp.cmd)->vpu.count = 0;
+	((drm_r300_cmd_header_t *) rmesa->hw.vpi.cmd)->vpu.count = 0;
+	((drm_r300_cmd_header_t *) rmesa->hw.vps.cmd)->vpu.count = 0;
+
+	/* Not sure why this doesnt work...
+	   0x400 area might have something to do with pixel shaders as it appears right after pfs programming.
+	   0x406 is set to { 0.0, 0.0, 1.0, 0.0 } most of the time but should change with smooth points and in other rare cases. */
+	//setup_vertex_shader_fragment(rmesa, 0x406, &unk4);
+	if (hw_tcl_on
+	    && ((struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx))->
+	    translated) {
+		r300SetupVertexProgram(rmesa);
+		return;
+	}
+
+	/* This needs to be replaced by vertex shader generation code */
+	r300GenerateSimpleVertexShader(rmesa);
+
+	setup_vertex_shader_fragment(rmesa, VSF_DEST_PROGRAM,
+				     &(rmesa->state.vertex_shader.program));
+
+#if 0
+	setup_vertex_shader_fragment(rmesa, VSF_DEST_UNKNOWN1,
+				     &(rmesa->state.vertex_shader.unknown1));
+	setup_vertex_shader_fragment(rmesa, VSF_DEST_UNKNOWN2,
+				     &(rmesa->state.vertex_shader.unknown2));
+#endif
+
+	/* All fields come from the values the generator stored in
+	 * state.vertex_shader.
+	 */
+	R300_STATECHANGE(rmesa, pvs);
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_1] =
+	    (rmesa->state.vertex_shader.
+	     program_start << R300_PVS_CNTL_1_PROGRAM_START_SHIFT)
+	    | (rmesa->state.vertex_shader.
+	       unknown_ptr1 << R300_PVS_CNTL_1_POS_END_SHIFT)
+	    | (rmesa->state.vertex_shader.
+	       program_end << R300_PVS_CNTL_1_PROGRAM_END_SHIFT);
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_2] =
+	    (rmesa->state.vertex_shader.
+	     param_offset << R300_PVS_CNTL_2_PARAM_OFFSET_SHIFT)
+	    | (rmesa->state.vertex_shader.
+	       param_count << R300_PVS_CNTL_2_PARAM_COUNT_SHIFT);
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_3] =
+	    (rmesa->state.vertex_shader.
+	     unknown_ptr2 << R300_PVS_CNTL_3_PROGRAM_UNKNOWN_SHIFT)
+	    | (rmesa->state.vertex_shader.unknown_ptr3 << 0);
+
+	/* This is done for vertex shader fragments, but also needs to be done for vap_pvs,
+	   so I leave it as a reminder */
+#if 0
+	reg_start(R300_VAP_PVS_WAITIDLE, 0);
+	e32(0x00000000);
+#endif
+}
+
+/**
+ * Completely recalculates hardware state based on the Mesa state.
+ *
+ * Registers that have a GL state counterpart are computed by calling the
+ * corresponding state functions; the remaining ("unk*" and magic)
+ * registers are set to fixed values.  All state atoms are marked dirty
+ * at the end.
+ *
+ * Exits the process when the visual's depth is neither 16 nor 24 bits.
+ */
+static void r300ResetHwState(r300ContextPtr r300)
+{
+	GLcontext *ctx = r300->radeon.glCtx;
+	int has_tcl = 1;
+
+	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+		has_tcl = 0;
+
+	if (RADEON_DEBUG & DEBUG_STATE)
+		fprintf(stderr, "%s\n", __FUNCTION__);
+
+	/* This is a place to initialize registers which
+	   have bitfields accessed by different functions
+	   and not all bits are used */
+
+	/* go and compute register values from GL state */
+
+	r300UpdateWindow(ctx);
+
+	r300ColorMask(ctx,
+		      ctx->Color.ColorMask[RCOMP],
+		      ctx->Color.ColorMask[GCOMP],
+		      ctx->Color.ColorMask[BCOMP], ctx->Color.ColorMask[ACOMP]);
+
+	r300Enable(ctx, GL_DEPTH_TEST, ctx->Depth.Test);
+	r300DepthMask(ctx, ctx->Depth.Mask);
+	r300DepthFunc(ctx, ctx->Depth.Func);
+
+	/* stencil */
+	r300Enable(ctx, GL_STENCIL_TEST, ctx->Stencil.Enabled);
+	r300StencilMaskSeparate(ctx, 0, ctx->Stencil.WriteMask[0]);
+	r300StencilFuncSeparate(ctx, 0, ctx->Stencil.Function[0],
+				ctx->Stencil.Ref[0], ctx->Stencil.ValueMask[0]);
+	r300StencilOpSeparate(ctx, 0, ctx->Stencil.FailFunc[0],
+			      ctx->Stencil.ZFailFunc[0],
+			      ctx->Stencil.ZPassFunc[0]);
+
+	r300UpdateCulling(ctx);
+
+	r300UpdateTextureState(ctx);
+
+	r300SetBlendState(ctx);
+
+	r300AlphaFunc(ctx, ctx->Color.AlphaFunc, ctx->Color.AlphaRef);
+	r300Enable(ctx, GL_ALPHA_TEST, ctx->Color.AlphaEnabled);
+
+	/* Initialize magic registers
+	   TODO : learn what they really do, or get rid of
+	   those we don't have to touch */
+	if (!has_tcl)
+		r300->hw.vap_cntl.cmd[1] = 0x0014045a;
+	else
+		r300->hw.vap_cntl.cmd[1] = 0x0030045A;	//0x0030065a /* Dangerous */
+	r300->hw.vte.cmd[1] = R300_VPORT_X_SCALE_ENA
+	    | R300_VPORT_X_OFFSET_ENA
+	    | R300_VPORT_Y_SCALE_ENA
+	    | R300_VPORT_Y_OFFSET_ENA
+	    | R300_VPORT_Z_SCALE_ENA
+	    | R300_VPORT_Z_OFFSET_ENA | R300_VTX_W0_FMT;
+	r300->hw.vte.cmd[2] = 0x00000008;
+
+	r300->hw.unk2134.cmd[1] = 0x00FFFFFF;
+	r300->hw.unk2134.cmd[2] = 0x00000000;
+	/* Swap mode depends on the endianness of the host. */
+	if (_mesa_little_endian())
+		r300->hw.vap_cntl_status.cmd[1] = R300_VC_NO_SWAP;
+	else
+		r300->hw.vap_cntl_status.cmd[1] = R300_VC_32BIT_SWAP;
+
+	/* disable VAP/TCL on non-TCL capable chips */
+	if (!has_tcl)
+		r300->hw.vap_cntl_status.cmd[1] |= R300_VAP_TCL_BYPASS;
+
+	r300->hw.unk21DC.cmd[1] = 0xAAAAAAAA;
+
+	r300->hw.vap_clip_cntl.cmd[1] = R300_221C_NORMAL;
+
+	r300->hw.unk2220.cmd[1] = r300PackFloat32(1.0);
+	r300->hw.unk2220.cmd[2] = r300PackFloat32(1.0);
+	r300->hw.unk2220.cmd[3] = r300PackFloat32(1.0);
+	r300->hw.unk2220.cmd[4] = r300PackFloat32(1.0);
+
+	/* what about other chips than r300 or rv350??? */
+	if (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R300)
+		r300->hw.unk2288.cmd[1] = R300_2288_R300;
+	else
+		r300->hw.unk2288.cmd[1] = R300_2288_RV350;
+
+	r300->hw.gb_enable.cmd[1] = R300_GB_POINT_STUFF_ENABLE
+	    | R300_GB_LINE_STUFF_ENABLE
+	    | R300_GB_TRIANGLE_STUFF_ENABLE /*| R300_GB_UNK31 */ ;
+
+	r300->hw.gb_misc.cmd[R300_GB_MISC_MSPOS_0] = 0x66666666;
+	r300->hw.gb_misc.cmd[R300_GB_MISC_MSPOS_1] = 0x06666666;
+	/* The tile configuration only differs in the pipe count, which is
+	 * chosen by chip family; everything not listed gets the RV300 value.
+	 */
+	if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R300) ||
+	    (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R350))
+		r300->hw.gb_misc.cmd[R300_GB_MISC_TILE_CONFIG] =
+		    R300_GB_TILE_ENABLE | R300_GB_TILE_PIPE_COUNT_R300 |
+		    R300_GB_TILE_SIZE_16;
+	else if (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV410)
+		r300->hw.gb_misc.cmd[R300_GB_MISC_TILE_CONFIG] =
+		    R300_GB_TILE_ENABLE | R300_GB_TILE_PIPE_COUNT_RV410 |
+		    R300_GB_TILE_SIZE_16;
+	else if (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R420)
+		r300->hw.gb_misc.cmd[R300_GB_MISC_TILE_CONFIG] =
+		    R300_GB_TILE_ENABLE | R300_GB_TILE_PIPE_COUNT_R420 |
+		    R300_GB_TILE_SIZE_16;
+	else
+		r300->hw.gb_misc.cmd[R300_GB_MISC_TILE_CONFIG] =
+		    R300_GB_TILE_ENABLE | R300_GB_TILE_PIPE_COUNT_RV300 |
+		    R300_GB_TILE_SIZE_16;
+	/* set to 0 when fog is disabled? */
+	r300->hw.gb_misc.cmd[R300_GB_MISC_SELECT] = R300_GB_FOG_SELECT_1_1_W;
+	r300->hw.gb_misc.cmd[R300_GB_MISC_AA_CONFIG] = R300_AA_DISABLE;	/* No antialiasing */
+
+	r300->hw.unk4200.cmd[1] = r300PackFloat32(0.0);
+	r300->hw.unk4200.cmd[2] = r300PackFloat32(0.0);
+	r300->hw.unk4200.cmd[3] = r300PackFloat32(1.0);
+	r300->hw.unk4200.cmd[4] = r300PackFloat32(1.0);
+
+	r300->hw.unk4214.cmd[1] = 0x00050005;
+
+	r300PointSize(ctx, 0.0);
+
+	r300->hw.unk4230.cmd[1] = 0x18000006;
+	r300->hw.unk4230.cmd[2] = 0x00020006;
+	r300->hw.unk4230.cmd[3] = r300PackFloat32(1.0 / 192.0);
+
+	r300LineWidth(ctx, 0.0);
+
+	r300->hw.unk4260.cmd[1] = 0;
+	r300->hw.unk4260.cmd[2] = r300PackFloat32(0.0);
+	r300->hw.unk4260.cmd[3] = r300PackFloat32(1.0);
+
+	/* Entries 1, 3 and 4 are fixed here; cmd[2] is not written in this
+	 * function (presumably r300ShadeModel does - TODO confirm).
+	 */
+	r300->hw.shade.cmd[1] = 0x00000002;
+	r300ShadeModel(ctx, ctx->Light.ShadeModel);
+	r300->hw.shade.cmd[3] = 0x00000000;
+	r300->hw.shade.cmd[4] = 0x00000000;
+
+	r300PolygonMode(ctx, GL_FRONT, ctx->Polygon.FrontMode);
+	r300PolygonMode(ctx, GL_BACK, ctx->Polygon.BackMode);
+	r300->hw.polygon_mode.cmd[2] = 0x00000001;
+	r300->hw.polygon_mode.cmd[3] = 0x00000000;
+	r300->hw.zbias_cntl.cmd[1] = 0x00000000;
+
+	r300PolygonOffset(ctx, ctx->Polygon.OffsetFactor,
+			  ctx->Polygon.OffsetUnits);
+	r300Enable(ctx, GL_POLYGON_OFFSET_FILL, ctx->Polygon.OffsetFill);
+
+	r300->hw.unk42C0.cmd[1] = 0x4B7FFFFF;
+	r300->hw.unk42C0.cmd[2] = 0x00000000;
+
+	r300->hw.unk43A4.cmd[1] = 0x0000001C;
+	r300->hw.unk43A4.cmd[2] = 0x2DA49525;
+
+	r300->hw.unk43E8.cmd[1] = 0x00FFFFFF;
+
+	r300->hw.unk46A4.cmd[1] = 0x00001B01;
+	r300->hw.unk46A4.cmd[2] = 0x00001B0F;
+	r300->hw.unk46A4.cmd[3] = 0x00001B0F;
+	r300->hw.unk46A4.cmd[4] = 0x00001B0F;
+	r300->hw.unk46A4.cmd[5] = 0x00000001;
+
+	/* Fog state is replayed through the driver's Fogfv hook, one
+	 * parameter at a time.
+	 */
+	r300Enable(ctx, GL_FOG, ctx->Fog.Enabled);
+	ctx->Driver.Fogfv(ctx, GL_FOG_MODE, NULL);
+	ctx->Driver.Fogfv(ctx, GL_FOG_DENSITY, &ctx->Fog.Density);
+	ctx->Driver.Fogfv(ctx, GL_FOG_START, &ctx->Fog.Start);
+	ctx->Driver.Fogfv(ctx, GL_FOG_END, &ctx->Fog.End);
+	ctx->Driver.Fogfv(ctx, GL_FOG_COLOR, ctx->Fog.Color);
+	ctx->Driver.Fogfv(ctx, GL_FOG_COORDINATE_SOURCE_EXT, NULL);
+
+	r300->hw.at.cmd[R300_AT_UNKNOWN] = 0;
+	r300->hw.unk4BD8.cmd[1] = 0;
+
+	r300->hw.unk4E00.cmd[1] = 0;
+
+	r300BlendColor(ctx, ctx->Color.BlendColor);
+	r300->hw.blend_color.cmd[2] = 0;
+	r300->hw.blend_color.cmd[3] = 0;
+
+	/* Again, r300ClearBuffer uses this */
+	/* Unlike r300UpdateDrawBuffer(), this uses the draw offset/pitch
+	 * from radeon.state.color rather than the renderbuffer's flipped
+	 * values.
+	 */
+	r300->hw.cb.cmd[R300_CB_OFFSET] =
+	    r300->radeon.state.color.drawOffset +
+	    r300->radeon.radeonScreen->fbLocation;
+	r300->hw.cb.cmd[R300_CB_PITCH] = r300->radeon.state.color.drawPitch;
+
+	if (r300->radeon.radeonScreen->cpp == 4)
+		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_ARGB8888;
+	else
+		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_RGB565;
+
+	if (r300->radeon.sarea->tiling_enabled)
+		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
+
+	r300->hw.unk4E50.cmd[1] = 0;
+	r300->hw.unk4E50.cmd[2] = 0;
+	r300->hw.unk4E50.cmd[3] = 0;
+	r300->hw.unk4E50.cmd[4] = 0;
+	r300->hw.unk4E50.cmd[5] = 0;
+	r300->hw.unk4E50.cmd[6] = 0;
+	r300->hw.unk4E50.cmd[7] = 0;
+	r300->hw.unk4E50.cmd[8] = 0;
+	r300->hw.unk4E50.cmd[9] = 0;
+
+	r300->hw.unk4E88.cmd[1] = 0;
+
+	r300->hw.unk4EA0.cmd[1] = 0x00000000;
+	r300->hw.unk4EA0.cmd[2] = 0xffffffff;
+
+	/* Only 16 and 24 bit depth buffers are supported. */
+	switch (ctx->Visual.depthBits) {
+	case 16:
+		r300->hw.zstencil_format.cmd[1] = R300_DEPTH_FORMAT_16BIT_INT_Z;
+		break;
+	case 24:
+		r300->hw.zstencil_format.cmd[1] = R300_DEPTH_FORMAT_24BIT_INT_Z;
+		break;
+	default:
+		fprintf(stderr, "Error: Unsupported depth %d... exiting\n",
+			ctx->Visual.depthBits);
+		_mesa_exit(-1);
+
+	}
+	/* z compress? */
+	//r300->hw.zstencil_format.cmd[1] |= R300_DEPTH_FORMAT_UNK32;
+
+	r300->hw.zstencil_format.cmd[3] = 0x00000003;
+	r300->hw.zstencil_format.cmd[4] = 0x00000000;
+
+	r300->hw.zb.cmd[R300_ZB_OFFSET] =
+	    r300->radeon.radeonScreen->depthOffset +
+	    r300->radeon.radeonScreen->fbLocation;
+	r300->hw.zb.cmd[R300_ZB_PITCH] = r300->radeon.radeonScreen->depthPitch;
+
+	if (r300->radeon.sarea->tiling_enabled) {
+		/* Turn off when clearing buffers ? */
+		r300->hw.zb.cmd[R300_ZB_PITCH] |= R300_DEPTH_TILE_ENABLE;
+
+		/* Micro tiling is only enabled for 24 bit depth. */
+		if (ctx->Visual.depthBits == 24)
+			r300->hw.zb.cmd[R300_ZB_PITCH] |=
+			    R300_DEPTH_MICROTILE_ENABLE;
+	}
+
+	r300->hw.unk4F28.cmd[1] = 0;
+
+	r300->hw.unk4F30.cmd[1] = 0;
+	r300->hw.unk4F30.cmd[2] = 0;
+
+	r300->hw.unk4F44.cmd[1] = 0;
+
+	r300->hw.unk4F54.cmd[1] = 0;
+
+	/* The vps atom is only initialised on TCL capable chips. */
+	if (has_tcl) {
+		r300->hw.vps.cmd[R300_VPS_ZERO_0] = 0;
+		r300->hw.vps.cmd[R300_VPS_ZERO_1] = 0;
+		r300->hw.vps.cmd[R300_VPS_POINTSIZE] = r300PackFloat32(1.0);
+		r300->hw.vps.cmd[R300_VPS_ZERO_3] = 0;
+	}
+//END: TODO
+	r300->hw.all_dirty = GL_TRUE;
+}
+
+
+extern void _tnl_UpdateFixedFunctionProgram(GLcontext * ctx);
+
+extern int future_hw_tcl_on;
+
+/**
+ * Bring the vertex shader up to date after a GL state change.
+ *
+ * Only does something when hardware TCL is on and rmesa->NewGLState is
+ * set.  The fixed function program is updated with the material
+ * attribute pointers temporarily redirected to dummy storage, then the
+ * vertex shader is selected.  If the selected shader is not translated,
+ * hardware TCL is switched off (now and for the future) and the hardware
+ * state is reset.
+ */
+void r300UpdateShaders(r300ContextPtr rmesa)
+{
+	GLcontext *ctx = rmesa->radeon.glCtx;
+	struct r300_vertex_program *vp;
+	int attr;
+
+	if (!rmesa->NewGLState || !hw_tcl_on)
+		return;
+
+	rmesa->NewGLState = 0;
+
+	/* Save the material attribute pointers and swap in the dummies... */
+	for (attr = _TNL_FIRST_MAT; attr <= _TNL_LAST_MAT; attr++) {
+		rmesa->temp_attrib[attr] = TNL_CONTEXT(ctx)->vb.AttribPtr[attr];
+		TNL_CONTEXT(ctx)->vb.AttribPtr[attr] =
+		    &rmesa->dummy_attrib[attr];
+	}
+
+	_tnl_UpdateFixedFunctionProgram(ctx);
+
+	/* ...and put the saved pointers back afterwards. */
+	for (attr = _TNL_FIRST_MAT; attr <= _TNL_LAST_MAT; attr++)
+		TNL_CONTEXT(ctx)->vb.AttribPtr[attr] = rmesa->temp_attrib[attr];
+
+	r300SelectVertexShader(rmesa);
+	vp = (struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx);
+
+	if (vp->translated == GL_FALSE) {
+		fprintf(stderr, "Failing back to sw-tcl\n");
+		hw_tcl_on = future_hw_tcl_on = 0;
+		r300ResetHwState(rmesa);
+		return;
+	}
+
+	r300UpdateStateParameters(ctx, _NEW_PROGRAM);
+}
+
+static void r300SetupPixelShader(r300ContextPtr rmesa)
+{
+	GLcontext *ctx = rmesa->radeon.glCtx;
+	struct r300_fragment_program *fp = (struct r300_fragment_program *)
+	    (char *)ctx->FragmentProgram._Current;
+	int i, k;
+	/* Translate the current fragment program and load it into the fpi, fp and fpp atoms. */
+	if (!fp)		/* should only happen once, just after context is created */
+		return;
+
+	r300TranslateFragmentShader(rmesa, fp);
+	if (!fp->translated) {
+		fprintf(stderr, "%s: No valid fragment shader, exiting\n",
+			__FUNCTION__);
+		return;		/* note: only returns to the caller, nothing exits here */
+	}
+#define OUTPUT_FIELD(st, reg, field)  \
+		R300_STATECHANGE(rmesa, st); \
+		for(i=0;i<=fp->alu_end;i++) \
+			rmesa->hw.st.cmd[R300_FPI_INSTR_0+i]=fp->alu.inst[i].field;\
+		rmesa->hw.st.cmd[R300_FPI_CMD_0]=cmdpacket0(reg, fp->alu_end+1);
+	/* One fpi atom per ALU instruction word (inst0..inst3), alu_end+1 entries each. */
+	OUTPUT_FIELD(fpi[0], R300_PFS_INSTR0_0, inst0);
+	OUTPUT_FIELD(fpi[1], R300_PFS_INSTR1_0, inst1);
+	OUTPUT_FIELD(fpi[2], R300_PFS_INSTR2_0, inst2);
+	OUTPUT_FIELD(fpi[3], R300_PFS_INSTR3_0, inst3);
+#undef OUTPUT_FIELD
+
+	R300_STATECHANGE(rmesa, fp);
+	/* Used nodes fill the highest NODE slots (k runs up to 3); the remaining slots are zeroed. */
+	for (i = 0, k = (4 - (fp->cur_node + 1)); i < 4; i++, k++) {
+		if (i < (fp->cur_node + 1)) {
+			rmesa->hw.fp.cmd[R300_FP_NODE0 + k] =
+			    (fp->node[i].
+			     alu_offset << R300_PFS_NODE_ALU_OFFSET_SHIFT)
+			    | (fp->node[i].
+			       alu_end << R300_PFS_NODE_ALU_END_SHIFT)
+			    | (fp->node[i].
+			       tex_offset << R300_PFS_NODE_TEX_OFFSET_SHIFT)
+			    | (fp->node[i].
+			       tex_end << R300_PFS_NODE_TEX_END_SHIFT)
+			    | fp->node[i].flags;	/*  ( (k==3) ? R300_PFS_NODE_LAST_NODE : 0); */
+		} else {
+			rmesa->hw.fp.cmd[R300_FP_NODE0 + (3 - i)] = 0;
+		}
+	}
+
+	/* PFS_CNTL_0: cur_node combined with the first_node_has_tex bit (bit 3) */
+	rmesa->hw.fp.cmd[R300_FP_CNTL0] =
+	    fp->cur_node | (fp->first_node_has_tex << 3);
+	/* PFS_CNTL_1: fp->max_temp_idx */
+	rmesa->hw.fp.cmd[R300_FP_CNTL1] = fp->max_temp_idx;
+	/* PFS_CNTL_2: program-wide ALU/TEX offset and end fields */
+	rmesa->hw.fp.cmd[R300_FP_CNTL2] =
+	    (fp->alu_offset << R300_PFS_CNTL_ALU_OFFSET_SHIFT)
+	    | (fp->alu_end << R300_PFS_CNTL_ALU_END_SHIFT)
+	    | (fp->tex_offset << R300_PFS_CNTL_TEX_OFFSET_SHIFT)
+	    | (fp->tex_end << R300_PFS_CNTL_TEX_END_SHIFT);
+
+	R300_STATECHANGE(rmesa, fpp);
+	for (i = 0; i < fp->const_nr; i++) {	/* four r300PackFloat24() words per constant */
+		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] =
+		    r300PackFloat24(fp->constant[i][0]);
+		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] =
+		    r300PackFloat24(fp->constant[i][1]);
+		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] =
+		    r300PackFloat24(fp->constant[i][2]);
+		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 3] =
+		    r300PackFloat24(fp->constant[i][3]);
+	}
+	rmesa->hw.fpp.cmd[R300_FPP_CMD_0] =
+	    cmdpacket0(R300_PFS_PARAM_0_X, fp->const_nr * 4);
+}
+
+void r300UpdateShaderStates(r300ContextPtr rmesa)
+{
+	GLcontext *gl = rmesa->radeon.glCtx;
+
+	/* Texture state first, then the pixel shader and the texture setup. */
+	r300UpdateTextureState(gl);
+	r300SetupPixelShader(rmesa);
+	r300SetupTextures(gl);
+
+	/* The vertex shader is only set up on chips flagged RADEON_CHIPSET_TCL. */
+	if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)
+		r300SetupVertexShader(rmesa);
+	r300SetupRSUnit(gl);
+}
+
+/**
+ * Called by Mesa after an internal state update.
+ * \param ctx GL context   \param new_state bitmask of _NEW_* flags
+ */
+static void r300InvalidateState(GLcontext * ctx, GLuint new_state)
+{
+	const GLuint drawbuf_bits = _NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL;
+	r300ContextPtr rctx = R300_CONTEXT(ctx);
+
+	/* Forward the dirty bits to each Mesa helper module first. */
+	_swrast_InvalidateState(ctx, new_state);
+	_swsetup_InvalidateState(ctx, new_state);
+	_vbo_InvalidateState(ctx, new_state);
+	_tnl_InvalidateState(ctx, new_state);
+	_ae_invalidate_state(ctx, new_state);
+
+	if ((new_state & drawbuf_bits) != 0)
+		r300UpdateDrawBuffer(ctx);
+	r300UpdateStateParameters(ctx, new_state);
+	rctx->NewGLState |= new_state;	/* accumulated for later processing */
+}
+
+/**
+ * Calculate initial hardware state and register state functions.
+ * Assumes that the command buffer and state atoms have been
+ * initialized already.
+ *
+ * Derives the depth scale and stencil clear value from the visual's
+ * depth size, decides whether hardware stencil is available, clears the
+ * texture state and then calls r300ResetHwState().  A visual whose depth
+ * is neither 16 nor 24 bits is reported and _mesa_exit() is called.
+ */
+void r300InitState(r300ContextPtr r300)
+{
+	GLcontext *ctx = r300->radeon.glCtx;
+
+	radeonInitState(&r300->radeon);
+
+	switch (ctx->Visual.depthBits) {
+	case 16:
+		r300->state.depth.scale = 1.0 / (GLfloat) 0xffff;
+		r300->state.stencil.clear = 0x00000000;
+		break;
+	case 24:
+		r300->state.depth.scale = 1.0 / (GLfloat) 0xffffff;
+		r300->state.stencil.clear = 0x00ff0000;
+		break;
+	default:
+		fprintf(stderr, "Error: Unsupported depth %d... exiting\n",
+			ctx->Visual.depthBits);
+		_mesa_exit(-1);
+	}
+
+	/* Only have hw stencil when depth buffer is 24 bits deep */
+	r300->state.stencil.hw_stencil = (ctx->Visual.stencilBits > 0 &&
+					  ctx->Visual.depthBits == 24);
+
+	memset(&(r300->state.texture), 0, sizeof(r300->state.texture));
+
+	r300ResetHwState(r300);
+}
+
+static void r300RenderMode(GLcontext * ctx, GLenum mode)
+{
+	/* Placeholder callback: both arguments are deliberately ignored. */
+	(void)R300_CONTEXT(ctx);
+	(void)mode;
+}
+
+static void r300ClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq )
+{
+	/* eq is not read: the raw words of Mesa's _ClipUserPlane entry are emitted. */
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	const GLint idx = (GLint) plane - (GLint) GL_CLIP_PLANE0;
+	const GLint *bits = (const GLint *)ctx->Transform._ClipUserPlane[idx];
+	R300_STATECHANGE( r300, vpucp[idx] );
+	r300->hw.vpucp[idx].cmd[R300_VPUCP_X] = bits[0];
+	r300->hw.vpucp[idx].cmd[R300_VPUCP_Y] = bits[1];
+	r300->hw.vpucp[idx].cmd[R300_VPUCP_Z] = bits[2];
+	r300->hw.vpucp[idx].cmd[R300_VPUCP_W] = bits[3];
+}
+
+
+void r300UpdateClipPlanes( GLcontext *ctx )
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	GLuint plane;
+	/* Re-emit every enabled user clip plane; disabled planes are skipped. */
+	for (plane = 0; plane < ctx->Const.MaxClipPlanes; plane++) {
+		const GLint *bits;
+		if (!(ctx->Transform.ClipPlanesEnabled & (1 << plane)))
+			continue;
+		bits = (const GLint *)ctx->Transform._ClipUserPlane[plane];
+		R300_STATECHANGE( r300, vpucp[plane] );
+		r300->hw.vpucp[plane].cmd[R300_VPUCP_X] = bits[0];
+		r300->hw.vpucp[plane].cmd[R300_VPUCP_Y] = bits[1];
+		r300->hw.vpucp[plane].cmd[R300_VPUCP_Z] = bits[2];
+		r300->hw.vpucp[plane].cmd[R300_VPUCP_W] = bits[3];
+	}
+}
+
+/**
+ * Hook the r300 state callbacks into the driver function table.
+ */
+void r300InitStateFuncs(struct dd_function_table *functions)
+{
+	radeonInitStateFuncs(functions);
+
+	/* State tracking and enables */
+	functions->UpdateState = r300InvalidateState;
+	functions->Enable = r300Enable;
+	functions->RenderMode = r300RenderMode;
+
+	/* Alpha test, blending and colour writes */
+	functions->AlphaFunc = r300AlphaFunc;
+	functions->BlendColor = r300BlendColor;
+	functions->BlendEquationSeparate = r300BlendEquationSeparate;
+	functions->BlendFuncSeparate = r300BlendFuncSeparate;
+	functions->ColorMask = r300ColorMask;
+
+	/* Depth and stencil */
+	functions->DepthFunc = r300DepthFunc;
+	functions->DepthMask = r300DepthMask;
+	functions->DepthRange = r300DepthRange;
+	functions->ClearStencil = r300ClearStencil;
+	functions->StencilFuncSeparate = r300StencilFuncSeparate;
+	functions->StencilMaskSeparate = r300StencilMaskSeparate;
+	functions->StencilOpSeparate = r300StencilOpSeparate;
+
+	/* Rasterization, viewport and clipping */
+	functions->CullFace = r300CullFace;
+	functions->FrontFace = r300FrontFace;
+	functions->ShadeModel = r300ShadeModel;
+	functions->Fogfv = r300Fogfv;
+	functions->PointSize = r300PointSize;
+	functions->LineWidth = r300LineWidth;
+	functions->PolygonOffset = r300PolygonOffset;
+	functions->PolygonMode = r300PolygonMode;
+	functions->Viewport = r300Viewport;
+	functions->ClipPlane = r300ClipPlane;
+}
diff --git a/r300/r300_state.h b/r300/r300_state.h
new file mode 100644
index 0000000..21a49b7
--- /dev/null
+++ b/r300/r300_state.h
@@ -0,0 +1,70 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __R300_STATE_H__
+#define __R300_STATE_H__
+
+#include "r300_context.h"
+
+#define R300_STATECHANGE(r300, atom) \
+	do {	/* flag one state atom, and the hw state as a whole, dirty */ \
+		(r300)->hw.atom.dirty = GL_TRUE;	\
+		(r300)->hw.is_dirty = GL_TRUE;		\
+	} while(0)
+
+#define R300_PRINT_STATE(r300, atom) \
+		r300PrintStateAtom((r300), &(r300)->hw.atom)	/* hands the named atom to r300PrintStateAtom() */
+
+/* Fire the buffered vertices: calls r300Flush() when cmdbuf.count_used or
+   dma.flush is non-zero.  TODO(review): an older note here said "not
+   implemented yet" - confirm whether anything is still missing. */
+#define R300_FIREVERTICES( r300 )			\
+do {							\
+    \
+   if ( (r300)->cmdbuf.count_used || (r300)->dma.flush ) {	\
+      r300Flush( (r300)->radeon.glCtx );		\
+   }							\
+    \
+} while (0)
+
+extern void r300UpdateStateParameters(GLcontext * ctx, GLuint new_state);
+extern void r300InitState(r300ContextPtr r300);
+extern void r300InitStateFuncs(struct dd_function_table *functions);
+extern void r300UpdateViewportOffset(GLcontext * ctx);
+extern void r300UpdateDrawBuffer(GLcontext * ctx);
+
+extern void r300UpdateShaders(r300ContextPtr rmesa);
+extern void r300UpdateShaderStates(r300ContextPtr rmesa);
+
+#endif				/* __R300_STATE_H__ */
diff --git a/r300/r300_tex.c b/r300/r300_tex.c
new file mode 100644
index 0000000..2a21c61
--- /dev/null
+++ b/r300/r300_tex.c
@@ -0,0 +1,1166 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "colormac.h"
+#include "context.h"
+#include "enums.h"
+#include "image.h"
+#include "simple_list.h"
+#include "texformat.h"
+#include "texstore.h"
+#include "texmem.h"
+#include "teximage.h"
+#include "texobj.h"
+
+#include "r300_context.h"
+#include "r300_state.h"
+#include "r300_ioctl.h"
+#include "r300_tex.h"
+
+#include "xmlpool.h"
+
+/**
+ * Set the texture wrap modes.
+ *
+ * \param t Texture object whose wrap modes are to be set
+ * \param swrap Wrap mode for the \a s texture coordinate
+ * \param twrap Wrap mode for the \a t texture coordinate
+ * \param rwrap Wrap mode for the \a r texture coordinate
+ */
+static void r300SetTexWrap(r300TexObjPtr t, GLenum swrap, GLenum twrap,
+			   GLenum rwrap)
+{
+	unsigned long hw_swrap = 0, hw_twrap = 0, hw_qwrap = 0;	/* hw_qwrap carries rwrap */
+
+	t->filter &=
+	    ~(R300_TX_WRAP_S_MASK | R300_TX_WRAP_T_MASK | R300_TX_WRAP_Q_MASK);
+
+	switch (swrap) {
+	case GL_REPEAT:
+		hw_swrap |= R300_TX_REPEAT;
+		break;
+	case GL_CLAMP:
+		hw_swrap |= R300_TX_CLAMP;
+		break;
+	case GL_CLAMP_TO_EDGE:
+		hw_swrap |= R300_TX_CLAMP_TO_EDGE;
+		break;
+	case GL_CLAMP_TO_BORDER:
+		hw_swrap |= R300_TX_CLAMP_TO_BORDER;
+		break;
+	case GL_MIRRORED_REPEAT:
+		hw_swrap |= R300_TX_REPEAT | R300_TX_MIRRORED;
+		break;
+	case GL_MIRROR_CLAMP_EXT:
+		hw_swrap |= R300_TX_CLAMP | R300_TX_MIRRORED;
+		break;
+	case GL_MIRROR_CLAMP_TO_EDGE_EXT:
+		hw_swrap |= R300_TX_CLAMP_TO_EDGE | R300_TX_MIRRORED;
+		break;
+	case GL_MIRROR_CLAMP_TO_BORDER_EXT:
+		hw_swrap |= R300_TX_CLAMP_TO_BORDER | R300_TX_MIRRORED;
+		break;
+	default:
+		_mesa_problem(NULL, "bad S wrap mode in %s", __FUNCTION__);	/* hw_swrap stays 0 */
+	}
+
+	switch (twrap) {
+	case GL_REPEAT:
+		hw_twrap |= R300_TX_REPEAT;
+		break;
+	case GL_CLAMP:
+		hw_twrap |= R300_TX_CLAMP;
+		break;
+	case GL_CLAMP_TO_EDGE:
+		hw_twrap |= R300_TX_CLAMP_TO_EDGE;
+		break;
+	case GL_CLAMP_TO_BORDER:
+		hw_twrap |= R300_TX_CLAMP_TO_BORDER;
+		break;
+	case GL_MIRRORED_REPEAT:
+		hw_twrap |= R300_TX_REPEAT | R300_TX_MIRRORED;
+		break;
+	case GL_MIRROR_CLAMP_EXT:
+		hw_twrap |= R300_TX_CLAMP | R300_TX_MIRRORED;
+		break;
+	case GL_MIRROR_CLAMP_TO_EDGE_EXT:
+		hw_twrap |= R300_TX_CLAMP_TO_EDGE | R300_TX_MIRRORED;
+		break;
+	case GL_MIRROR_CLAMP_TO_BORDER_EXT:
+		hw_twrap |= R300_TX_CLAMP_TO_BORDER | R300_TX_MIRRORED;
+		break;
+	default:
+		_mesa_problem(NULL, "bad T wrap mode in %s", __FUNCTION__);	/* hw_twrap stays 0 */
+	}
+
+	switch (rwrap) {
+	case GL_REPEAT:
+		hw_qwrap |= R300_TX_REPEAT;
+		break;
+	case GL_CLAMP:
+		hw_qwrap |= R300_TX_CLAMP;
+		break;
+	case GL_CLAMP_TO_EDGE:
+		hw_qwrap |= R300_TX_CLAMP_TO_EDGE;
+		break;
+	case GL_CLAMP_TO_BORDER:
+		hw_qwrap |= R300_TX_CLAMP_TO_BORDER;
+		break;
+	case GL_MIRRORED_REPEAT:
+		hw_qwrap |= R300_TX_REPEAT | R300_TX_MIRRORED;
+		break;
+	case GL_MIRROR_CLAMP_EXT:
+		hw_qwrap |= R300_TX_CLAMP | R300_TX_MIRRORED;
+		break;
+	case GL_MIRROR_CLAMP_TO_EDGE_EXT:
+		hw_qwrap |= R300_TX_CLAMP_TO_EDGE | R300_TX_MIRRORED;
+		break;
+	case GL_MIRROR_CLAMP_TO_BORDER_EXT:
+		hw_qwrap |= R300_TX_CLAMP_TO_BORDER | R300_TX_MIRRORED;
+		break;
+	default:
+		_mesa_problem(NULL, "bad R wrap mode in %s", __FUNCTION__);	/* hw_qwrap stays 0 */
+	}
+
+	t->filter |= hw_swrap << R300_TX_WRAP_S_SHIFT;
+	t->filter |= hw_twrap << R300_TX_WRAP_T_SHIFT;
+	t->filter |= hw_qwrap << R300_TX_WRAP_Q_SHIFT;
+}
+
+static void r300SetTexMaxAnisotropy(r300TexObjPtr t, GLfloat max)
+{
+	GLuint ratio;	/* R300_TX_MAX_ANISO_* value chosen for max */
+
+	/* Pick the smallest supported ratio that covers max; above 8 use 16:1. */
+	if (max <= 1.0)
+		ratio = R300_TX_MAX_ANISO_1_TO_1;
+	else if (max <= 2.0)
+		ratio = R300_TX_MAX_ANISO_2_TO_1;
+	else if (max <= 4.0)
+		ratio = R300_TX_MAX_ANISO_4_TO_1;
+	else if (max <= 8.0)
+		ratio = R300_TX_MAX_ANISO_8_TO_1;
+	else
+		ratio = R300_TX_MAX_ANISO_16_TO_1;
+	t->filter = (t->filter & ~R300_TX_MAX_ANISO_MASK) | ratio;
+}
+
+/**
+ * Set the texture magnification and minification modes.
+ *
+ * \param t Texture whose filter modes are to be set
+ * \param minf Texture minification mode
+ * \param magf Texture magnification mode
+ * Enums not listed in the switches below leave the cleared field at zero.
+ */
+static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf)
+{
+	GLuint anisotropy = (t->filter & R300_TX_MAX_ANISO_MASK);
+
+	t->filter &= ~(R300_TX_MIN_FILTER_MASK | R300_TX_MAG_FILTER_MASK);
+
+	if (anisotropy == R300_TX_MAX_ANISO_1_TO_1) {	/* plain (non-ANISO) min filters */
+		switch (minf) {
+		case GL_NEAREST:
+			t->filter |= R300_TX_MIN_FILTER_NEAREST;
+			break;
+		case GL_LINEAR:
+			t->filter |= R300_TX_MIN_FILTER_LINEAR;
+			break;
+		case GL_NEAREST_MIPMAP_NEAREST:
+			t->filter |= R300_TX_MIN_FILTER_NEAREST_MIP_NEAREST;
+			break;
+		case GL_NEAREST_MIPMAP_LINEAR:
+			t->filter |= R300_TX_MIN_FILTER_NEAREST_MIP_LINEAR;
+			break;
+		case GL_LINEAR_MIPMAP_NEAREST:
+			t->filter |= R300_TX_MIN_FILTER_LINEAR_MIP_NEAREST;
+			break;
+		case GL_LINEAR_MIPMAP_LINEAR:
+			t->filter |= R300_TX_MIN_FILTER_LINEAR_MIP_LINEAR;
+			break;
+		}
+	} else {	/* any other ratio: use the ANISO min filter variants */
+		switch (minf) {
+		case GL_NEAREST:
+			t->filter |= R300_TX_MIN_FILTER_ANISO_NEAREST;
+			break;
+		case GL_LINEAR:
+			t->filter |= R300_TX_MIN_FILTER_ANISO_LINEAR;
+			break;
+		case GL_NEAREST_MIPMAP_NEAREST:
+		case GL_LINEAR_MIPMAP_NEAREST:
+			t->filter |=
+			    R300_TX_MIN_FILTER_ANISO_NEAREST_MIP_NEAREST;
+			break;
+		case GL_NEAREST_MIPMAP_LINEAR:
+		case GL_LINEAR_MIPMAP_LINEAR:
+			t->filter |=
+			    R300_TX_MIN_FILTER_ANISO_NEAREST_MIP_LINEAR;
+			break;
+		}
+	}
+
+	/* Note we don't have 3D mipmaps so only use the mag filter setting
+	 * to set the 3D texture filter mode.
+	 */
+	switch (magf) {
+	case GL_NEAREST:
+		t->filter |= R300_TX_MAG_FILTER_NEAREST;
+		break;
+	case GL_LINEAR:
+		t->filter |= R300_TX_MAG_FILTER_LINEAR;
+		break;
+	}
+}
+
+static void r300SetTexBorderColor(r300TexObjPtr t, GLubyte c[4])
+{	/* Store the four border channels as one PACK_COLOR_8888() word. */
+	t->pp_border_color = PACK_COLOR_8888(c[0], c[1], c[2], c[3]);
+}
+
+/**
+ * Create the driver-private texture object for \p texObj and seed its
+ * wrap, anisotropy, filter and border-colour state from the GL object.
+ *
+ * \return the new object, or NULL when the allocation fails; either way
+ *         the result is also stored in texObj->DriverData.
+ */
+static r300TexObjPtr r300AllocTexObj(struct gl_texture_object *texObj)
+{
+	r300TexObjPtr obj = CALLOC_STRUCT(r300_tex_obj);
+
+	texObj->DriverData = obj;
+	if (obj == NULL)
+		return NULL;
+
+	if (RADEON_DEBUG & DEBUG_TEXTURE) {
+		fprintf(stderr, "%s( %p, %p )\n", __FUNCTION__,
+			(void *)texObj, (void *)obj);
+	}
+
+	/* Initialize non-image-dependent parts of the state: */
+	obj->base.tObj = texObj;
+	obj->border_fallback = GL_FALSE;
+
+	make_empty_list(&obj->base);
+
+	r300SetTexWrap(obj, texObj->WrapS, texObj->WrapT, texObj->WrapR);
+	r300SetTexMaxAnisotropy(obj, texObj->MaxAnisotropy);
+	r300SetTexFilter(obj, texObj->MinFilter, texObj->MagFilter);
+	r300SetTexBorderColor(obj, texObj->_BorderChan);
+
+	return obj;
+}
+
+/* try to find a format which will only need a memcopy */
+static const struct gl_texture_format *r300Choose8888TexFormat(GLenum srcFormat,
+							       GLenum srcType)
+{
+	const GLuint probe = 1;
+	const GLubyte littleEndian = *((const GLubyte *)&probe);
+	int reversed;	/* 1: treat as *_8_8_8_8_REV packing, 0: as *_8_8_8_8 */
+	/* Byte data is treated like the _REV packing on little-endian hosts. */
+	switch (srcType) {
+	case GL_UNSIGNED_INT_8_8_8_8:
+		reversed = 0;
+		break;
+	case GL_UNSIGNED_INT_8_8_8_8_REV:
+		reversed = 1;
+		break;
+	case GL_UNSIGNED_BYTE:
+		reversed = littleEndian ? 1 : 0;
+		break;
+	default:
+		return _dri_texformat_argb8888;
+	}
+
+	switch (srcFormat) {
+	case GL_RGBA:
+		return reversed ? &_mesa_texformat_rgba8888_rev : &_mesa_texformat_rgba8888;
+	case GL_ABGR_EXT:
+		return reversed ? &_mesa_texformat_rgba8888 : &_mesa_texformat_rgba8888_rev;
+	case GL_BGRA:
+		return reversed ? &_mesa_texformat_argb8888 : &_mesa_texformat_argb8888_rev;
+	default:
+		return _dri_texformat_argb8888;
+	}
+}
+
+static const struct gl_texture_format *r300ChooseTextureFormat(GLcontext * ctx,
+							       GLint
+							       internalFormat,
+							       GLenum format,
+							       GLenum type)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	const GLboolean do32bpt =
+	    (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_32);
+	const GLboolean force16bpt =
+	    (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FORCE_16);
+	(void)format;		/* NOTE(review): format is nevertheless used below */
+	/* Map internalFormat (and type) to a texture format; 16/32 bit picks follow rmesa->texture_depth. */
+#if 0
+	fprintf(stderr, "InternalFormat=%s(%d) type=%s format=%s\n",
+		_mesa_lookup_enum_by_nr(internalFormat), internalFormat,
+		_mesa_lookup_enum_by_nr(type), _mesa_lookup_enum_by_nr(format));
+	fprintf(stderr, "do32bpt=%d force16bpt=%d\n", do32bpt, force16bpt);
+#endif
+
+	switch (internalFormat) {
+	case 4:
+	case GL_RGBA:
+	case GL_COMPRESSED_RGBA:
+		switch (type) {
+		case GL_UNSIGNED_INT_10_10_10_2:
+		case GL_UNSIGNED_INT_2_10_10_10_REV:
+			return do32bpt ? _dri_texformat_argb8888 :
+			    _dri_texformat_argb1555;
+		case GL_UNSIGNED_SHORT_4_4_4_4:
+		case GL_UNSIGNED_SHORT_4_4_4_4_REV:
+			return _dri_texformat_argb4444;
+		case GL_UNSIGNED_SHORT_5_5_5_1:
+		case GL_UNSIGNED_SHORT_1_5_5_5_REV:
+			return _dri_texformat_argb1555;
+		default:
+			return do32bpt ? r300Choose8888TexFormat(format, type) :
+			    _dri_texformat_argb4444;
+		}
+
+	case 3:
+	case GL_RGB:
+	case GL_COMPRESSED_RGB:
+		switch (type) {
+		case GL_UNSIGNED_SHORT_4_4_4_4:
+		case GL_UNSIGNED_SHORT_4_4_4_4_REV:
+			return _dri_texformat_argb4444;
+		case GL_UNSIGNED_SHORT_5_5_5_1:
+		case GL_UNSIGNED_SHORT_1_5_5_5_REV:
+			return _dri_texformat_argb1555;
+		case GL_UNSIGNED_SHORT_5_6_5:
+		case GL_UNSIGNED_SHORT_5_6_5_REV:
+			return _dri_texformat_rgb565;
+		default:
+			return do32bpt ? _dri_texformat_argb8888 :
+			    _dri_texformat_rgb565;
+		}
+
+	case GL_RGBA8:
+	case GL_RGB10_A2:
+	case GL_RGBA12:
+	case GL_RGBA16:
+		return !force16bpt ?
+		    r300Choose8888TexFormat(format,
+					    type) : _dri_texformat_argb4444;
+
+	case GL_RGBA4:
+	case GL_RGBA2:
+		return _dri_texformat_argb4444;
+
+	case GL_RGB5_A1:
+		return _dri_texformat_argb1555;
+
+	case GL_RGB8:
+	case GL_RGB10:
+	case GL_RGB12:
+	case GL_RGB16:
+		return !force16bpt ? _dri_texformat_argb8888 :
+		    _dri_texformat_rgb565;
+
+	case GL_RGB5:
+	case GL_RGB4:
+	case GL_R3_G3_B2:
+		return _dri_texformat_rgb565;
+
+	case GL_ALPHA:
+	case GL_ALPHA4:
+	case GL_ALPHA8:
+	case GL_ALPHA12:
+	case GL_ALPHA16:
+	case GL_COMPRESSED_ALPHA:
+		return _dri_texformat_a8;
+
+	case 1:
+	case GL_LUMINANCE:
+	case GL_LUMINANCE4:
+	case GL_LUMINANCE8:
+	case GL_LUMINANCE12:
+	case GL_LUMINANCE16:
+	case GL_COMPRESSED_LUMINANCE:
+		return _dri_texformat_l8;
+
+	case 2:
+	case GL_LUMINANCE_ALPHA:
+	case GL_LUMINANCE4_ALPHA4:
+	case GL_LUMINANCE6_ALPHA2:
+	case GL_LUMINANCE8_ALPHA8:
+	case GL_LUMINANCE12_ALPHA4:
+	case GL_LUMINANCE12_ALPHA12:
+	case GL_LUMINANCE16_ALPHA16:
+	case GL_COMPRESSED_LUMINANCE_ALPHA:
+		return _dri_texformat_al88;
+
+	case GL_INTENSITY:
+	case GL_INTENSITY4:
+	case GL_INTENSITY8:
+	case GL_INTENSITY12:
+	case GL_INTENSITY16:
+	case GL_COMPRESSED_INTENSITY:
+		return _dri_texformat_i8;
+
+	case GL_YCBCR_MESA:
+		if (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
+		    type == GL_UNSIGNED_BYTE)
+			return &_mesa_texformat_ycbcr;
+		else
+			return &_mesa_texformat_ycbcr_rev;
+
+	case GL_RGB_S3TC:
+	case GL_RGB4_S3TC:
+	case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
+		return &_mesa_texformat_rgb_dxt1;
+
+	case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
+		return &_mesa_texformat_rgba_dxt1;
+
+	case GL_RGBA_S3TC:
+	case GL_RGBA4_S3TC:
+	case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
+		return &_mesa_texformat_rgba_dxt3;
+
+	case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
+		return &_mesa_texformat_rgba_dxt5;
+
+	case GL_ALPHA16F_ARB:
+		return &_mesa_texformat_alpha_float16;
+	case GL_ALPHA32F_ARB:
+		return &_mesa_texformat_alpha_float32;
+	case GL_LUMINANCE16F_ARB:
+		return &_mesa_texformat_luminance_float16;
+	case GL_LUMINANCE32F_ARB:
+		return &_mesa_texformat_luminance_float32;
+	case GL_LUMINANCE_ALPHA16F_ARB:
+		return &_mesa_texformat_luminance_alpha_float16;
+	case GL_LUMINANCE_ALPHA32F_ARB:
+		return &_mesa_texformat_luminance_alpha_float32;
+	case GL_INTENSITY16F_ARB:
+		return &_mesa_texformat_intensity_float16;
+	case GL_INTENSITY32F_ARB:
+		return &_mesa_texformat_intensity_float32;
+	case GL_RGB16F_ARB:	/* NOTE(review): RGB float requests get RGBA float storage - confirm intended */
+		return &_mesa_texformat_rgba_float16;
+	case GL_RGB32F_ARB:
+		return &_mesa_texformat_rgba_float32;
+	case GL_RGBA16F_ARB:
+		return &_mesa_texformat_rgba_float16;
+	case GL_RGBA32F_ARB:
+		return &_mesa_texformat_rgba_float32;
+
+	default:
+		_mesa_problem(ctx,
+			      "unexpected internalFormat 0x%x in r300ChooseTextureFormat",
+			      (int)internalFormat);
+		return NULL;	/* unknown internal formats yield no format */
+	}
+
+	return NULL;		/* never get here */
+}
+
+static GLboolean
+r300ValidateClientStorage(GLcontext * ctx, GLenum target,
+			  GLint internalFormat,
+			  GLint srcWidth, GLint srcHeight,
+			  GLenum format, GLenum type, const void *pixels,
+			  const struct gl_pixelstore_attrib *packing,
+			  struct gl_texture_object *texObj,
+			  struct gl_texture_image *texImage)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	/* Returns 1 after pointing texImage->Data at the client's pixels when they can be used in place, else 0. */
+	if (RADEON_DEBUG & DEBUG_TEXTURE)
+		fprintf(stderr, "intformat %s format %s type %s\n",
+			_mesa_lookup_enum_by_nr(internalFormat),
+			_mesa_lookup_enum_by_nr(format),
+			_mesa_lookup_enum_by_nr(type));
+
+	if (!ctx->Unpack.ClientStorage)
+		return 0;
+
+	if (ctx->_ImageTransferState ||
+	    texImage->IsCompressed || texObj->GenerateMipmap)
+		return 0;
+
+	/* This list is incomplete, may be different on ppc???
+	 */
+	switch (internalFormat) {
+	case GL_RGBA:
+		if (format == GL_BGRA && type == GL_UNSIGNED_INT_8_8_8_8_REV) {
+			texImage->TexFormat = _dri_texformat_argb8888;
+		} else
+			return 0;
+		break;
+
+	case GL_RGB:
+		if (format == GL_RGB && type == GL_UNSIGNED_SHORT_5_6_5) {
+			texImage->TexFormat = _dri_texformat_rgb565;
+		} else
+			return 0;
+		break;
+
+	case GL_YCBCR_MESA:
+		if (format == GL_YCBCR_MESA &&
+		    type == GL_UNSIGNED_SHORT_8_8_REV_APPLE) {
+			texImage->TexFormat = &_mesa_texformat_ycbcr_rev;
+		} else if (format == GL_YCBCR_MESA &&
+			   (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
+			    type == GL_UNSIGNED_BYTE)) {
+			texImage->TexFormat = &_mesa_texformat_ycbcr;
+		} else
+			return 0;
+		break;
+
+	default:
+		return 0;
+	}
+
+	/* Could deal with these packing issues, but currently don't.
+	 * Note texImage->TexFormat was already assigned above on this path. */
+	if (packing->SkipPixels ||
+	    packing->SkipRows || packing->SwapBytes || packing->LsbFirst) {
+		return 0;
+	}
+
+	{
+		GLint srcRowStride = _mesa_image_row_stride(packing, srcWidth,
+							    format, type);
+
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "%s: srcRowStride %d/%x\n",
+				__FUNCTION__, srcRowStride, srcRowStride);
+
+		/* Could check this later in upload, pitch restrictions could be
+		 * relaxed, but would need to store the image pitch somewhere,
+		 * as packing details might change before image is uploaded:
+		 */
+		if (!r300IsGartMemory(rmesa, pixels, srcHeight * srcRowStride)
+		    || (srcRowStride & 63))	/* stride must be a multiple of 64 */
+			return 0;
+
+		/* Have validated that _mesa_transfer_teximage would be a straight
+		 * memcpy at this point.  NOTE: future calls to TexSubImage will
+		 * overwrite the client data.  This is explicitly mentioned in the
+		 * extension spec.
+		 */
+		texImage->Data = (void *)pixels;
+		texImage->IsClientData = GL_TRUE;
+		texImage->RowStride =
+		    srcRowStride / texImage->TexFormat->TexelBytes;
+
+		return 1;
+	}
+}
+
+static void r300TexImage1D(GLcontext * ctx, GLenum target, GLint level,
+			   GLint internalFormat,
+			   GLint width, GLint border,
+			   GLenum format, GLenum type, const GLvoid * pixels,
+			   const struct gl_pixelstore_attrib *packing,
+			   struct gl_texture_object *texObj,
+			   struct gl_texture_image *texImage)
+{
+	driTextureObject *drv = (driTextureObject *) texObj->DriverData;
+
+	if (drv == NULL) {
+		/* No driver data yet for this object: create it now. */
+		drv = (driTextureObject *) r300AllocTexObj(texObj);
+		if (drv == NULL) {
+			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage1D");
+			return;
+		}
+	} else {
+		driSwapOutTextureObject(drv);
+	}
+
+	/* Note, this will call ChooseTextureFormat */
+	_mesa_store_teximage1d(ctx, target, level, internalFormat,
+			       width, border, format, type, pixels,
+			       &ctx->Unpack, texObj, texImage);
+	drv->dirty_images[0] |= (1 << level);	/* flag this level as dirty */
+}
+
+static void r300TexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
+			      GLint xoffset,
+			      GLsizei width,
+			      GLenum format, GLenum type,
+			      const GLvoid * pixels,
+			      const struct gl_pixelstore_attrib *packing,
+			      struct gl_texture_object *texObj,
+			      struct gl_texture_image *texImage)
+{
+	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+
+	assert(t);		/* this _should_ be true */
+	if (!t) {
+		/* Only reachable when assert() is compiled out. */
+		t = (driTextureObject *) r300AllocTexObj(texObj);
+		if (!t) {
+			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage1D");
+			return;
+		}
+	} else {
+		driSwapOutTextureObject(t);
+	}
+
+	_mesa_store_texsubimage1d(ctx, target, level, xoffset, width,
+				  format, type, pixels, packing, texObj,
+				  texImage);
+	t->dirty_images[0] |= (1 << level);	/* flag this level as dirty */
+}
+
+static void r300TexImage2D(GLcontext * ctx, GLenum target, GLint level,
+			   GLint internalFormat,
+			   GLint width, GLint height, GLint border,
+			   GLenum format, GLenum type, const GLvoid * pixels,
+			   const struct gl_pixelstore_attrib *packing,
+			   struct gl_texture_object *texObj,
+			   struct gl_texture_image *texImage)
+{
+	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+	GLuint face;
+	/* Stores one 2D image or cube-map face, using client storage when it validates. */
+	/* which cube face or ordinary 2D image */
+	switch (target) {
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+		face =
+		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+		ASSERT(face < 6);
+		break;
+	default:
+		face = 0;
+	}
+	/* Swap out an existing driver object, or create one on first use. */
+	if (t != NULL) {
+		driSwapOutTextureObject(t);
+	} else {
+		t = (driTextureObject *) r300AllocTexObj(texObj);
+		if (!t) {
+			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage2D");
+			return;
+		}
+	}
+
+	texImage->IsClientData = GL_FALSE;
+
+	if (r300ValidateClientStorage(ctx, target,
+				      internalFormat,
+				      width, height,
+				      format, type, pixels,
+				      packing, texObj, texImage)) {
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "%s: Using client storage\n",
+				__FUNCTION__);
+	} else {
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "%s: Using normal storage\n",
+				__FUNCTION__);
+
+		/* Normal path: copy (to cached memory) and eventually upload
+		 * via another copy to GART memory and then a blit...  Could
+		 * eliminate one copy by going straight to (permanent) GART.
+		 * NOTE(review): passes &ctx->Unpack, not the packing argument.
+		 * Note, this will call r300ChooseTextureFormat.
+		 */
+		_mesa_store_teximage2d(ctx, target, level, internalFormat,
+				       width, height, border, format, type,
+				       pixels, &ctx->Unpack, texObj, texImage);
+
+		t->dirty_images[face] |= (1 << level);	/* only this copy path flags the level dirty */
+	}
+}
+
+static void r300TexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
+			      GLint xoffset, GLint yoffset,
+			      GLsizei width, GLsizei height,
+			      GLenum format, GLenum type,
+			      const GLvoid * pixels,
+			      const struct gl_pixelstore_attrib *packing,
+			      struct gl_texture_object *texObj,
+			      struct gl_texture_image *texImage)
+{
+	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+	GLuint face = 0;	/* non-cube targets use slot 0 */
+
+	/* Cube map targets select the face by their offset from +X. */
+	switch (target) {
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+		face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+		ASSERT(face < 6);
+		break;
+	default:
+		break;
+	}
+
+	assert(t);		/* this _should_ be true */
+	if (!t) {
+		/* Only reachable when assert() is compiled out. */
+		t = (driTextureObject *) r300AllocTexObj(texObj);
+		if (!t) {
+			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage2D");
+			return;
+		}
+	} else {
+		driSwapOutTextureObject(t);
+	}
+
+	_mesa_store_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
+				  height, format, type, pixels, packing, texObj,
+				  texImage);
+	/* Flag the updated level of this face as dirty. */
+	t->dirty_images[face] |= (1 << level);
+}
+
+/**
+ * glCompressedTexImage2D driver hook: swaps out or allocates the driver
+ * object, stores the image via core Mesa and marks its level dirty.
+ */
+static void r300CompressedTexImage2D(GLcontext * ctx, GLenum target,
+				     GLint level, GLint internalFormat,
+				     GLint width, GLint height, GLint border,
+				     GLsizei imageSize, const GLvoid * data,
+				     struct gl_texture_object *texObj,
+				     struct gl_texture_image *texImage)
+{
+	driTextureObject *drvObj = (driTextureObject *) texObj->DriverData;
+	GLuint cubeFace = 0;	/* slot 0 unless target is a cube map face */
+
+	switch (target) {
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+		cubeFace = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+		ASSERT(cubeFace < 6);
+		break;
+	}
+
+	if (drvObj == NULL) {
+		drvObj = (driTextureObject *) r300AllocTexObj(texObj);
+		if (drvObj == NULL) {
+			_mesa_error(ctx, GL_OUT_OF_MEMORY,
+				    "glCompressedTexImage2D");
+			return;
+		}
+	} else {
+		driSwapOutTextureObject(drvObj);
+	}
+
+	texImage->IsClientData = GL_FALSE;
+
+	/* can't call this, different parameters. Would never evaluate to true anyway currently */
+#if 0
+	if (r300ValidateClientStorage(ctx, target,
+				      internalFormat,
+				      width, height,
+				      format, type, pixels,
+				      packing, texObj, texImage)) {
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "%s: Using client storage\n",
+				__FUNCTION__);
+	} else
+#endif
+	{
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "%s: Using normal storage\n",
+				__FUNCTION__);
+
+		/* Normal path: copy (to cached memory) and eventually upload
+		 * via another copy to GART memory and then a blit...  Could
+		 * eliminate one copy by going straight to (permanent) GART.
+		 *
+		 * Note, this will call r300ChooseTextureFormat.
+		 */
+		_mesa_store_compressed_teximage2d(ctx, target, level,
+						  internalFormat, width, height,
+						  border, imageSize, data,
+						  texObj, texImage);
+
+		drvObj->dirty_images[cubeFace] |= (1 << level);
+	}
+}
+
+/**
+ * glCompressedTexSubImage2D driver hook: calls driSwapOutTextureObject(),
+ * lets core Mesa store the sub-image, then marks the face's level dirty.
+ */
+static void r300CompressedTexSubImage2D(GLcontext * ctx, GLenum target,
+					GLint level, GLint xoffset,
+					GLint yoffset, GLsizei width,
+					GLsizei height, GLenum format,
+					GLsizei imageSize, const GLvoid * data,
+					struct gl_texture_object *texObj,
+					struct gl_texture_image *texImage)
+{
+	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+	GLuint face = 0;	/* slot 0 unless target is a cube map face */
+
+	switch (target) {
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+		face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+		ASSERT(face < 6);
+		break;
+	}
+
+	assert(t);		/* this _should_ be true */
+	if (t) {
+		driSwapOutTextureObject(t);
+	} else {
+		t = (driTextureObject *) r300AllocTexObj(texObj);
+		if (!t) {
+			_mesa_error(ctx, GL_OUT_OF_MEMORY,
+				    "glCompressedTexSubImage2D");
+			return;
+		}
+	}
+
+	_mesa_store_compressed_texsubimage2d(ctx, target, level, xoffset,
+					     yoffset, width, height, format,
+					     imageSize, data, texObj, texImage);
+
+	t->dirty_images[face] |= (1 << level);
+}
+
+/* glTexImage3D driver hook: stores the image through core Mesa and marks
+ * mipmap level \a level dirty in slot 0 of the driver texture object. */
+static void r300TexImage3D(GLcontext * ctx, GLenum target, GLint level,
+			   GLint internalFormat,
+			   GLint width, GLint height, GLint depth,
+			   GLint border,
+			   GLenum format, GLenum type, const GLvoid * pixels,
+			   const struct gl_pixelstore_attrib *packing,
+			   struct gl_texture_object *texObj,
+			   struct gl_texture_image *texImage)
+{
+	driTextureObject *drvObj = (driTextureObject *) texObj->DriverData;
+
+	if (drvObj == NULL) {
+		drvObj = (driTextureObject *) r300AllocTexObj(texObj);
+		if (drvObj == NULL) {
+			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage3D");
+			return;
+		}
+	} else {
+		driSwapOutTextureObject(drvObj);
+	}
+
+	texImage->IsClientData = GL_FALSE;
+
+#if 0
+	if (r300ValidateClientStorage(ctx, target,
+				      internalFormat,
+				      width, height,
+				      format, type, pixels,
+				      packing, texObj, texImage)) {
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "%s: Using client storage\n",
+				__FUNCTION__);
+	} else
+#endif
+	{
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "%s: Using normal storage\n",
+				__FUNCTION__);
+
+		/* Normal path: copy (to cached memory) and eventually upload
+		 * via another copy to GART memory and then a blit...  Could
+		 * eliminate one copy by going straight to (permanent) GART.
+		 * Note, this will call r300ChooseTextureFormat. */
+		_mesa_store_teximage3d(ctx, target, level, internalFormat,
+				       width, height, depth, border,
+				       format, type, pixels,
+				       &ctx->Unpack, texObj, texImage);
+
+		drvObj->dirty_images[0] |= (1 << level);
+	}
+}
+
+/* glTexSubImage3D driver hook: stores the sub-volume through core Mesa and
+ * marks mipmap level \a level dirty in slot 0 of the driver object. */
+static void
+r300TexSubImage3D(GLcontext * ctx, GLenum target, GLint level,
+		  GLint xoffset, GLint yoffset, GLint zoffset,
+		  GLsizei width, GLsizei height, GLsizei depth,
+		  GLenum format, GLenum type,
+		  const GLvoid * pixels,
+		  const struct gl_pixelstore_attrib *packing,
+		  struct gl_texture_object *texObj,
+		  struct gl_texture_image *texImage)
+{
+	driTextureObject *drvObj = (driTextureObject *) texObj->DriverData;
+
+	assert(drvObj);		/* this _should_ be true */
+	if (drvObj == NULL) {
+		drvObj = (driTextureObject *) r300AllocTexObj(texObj);
+		if (drvObj == NULL) {
+			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage3D");
+			return;
+		}
+		texObj->DriverData = drvObj;
+	} else {
+		driSwapOutTextureObject(drvObj);
+	}
+
+	_mesa_store_texsubimage3d(ctx, target, level, xoffset, yoffset, zoffset,
+				  width, height, depth,
+				  format, type, pixels, packing, texObj,
+				  texImage);
+
+	drvObj->dirty_images[0] |= (1 << level);
+}
+
+/* glTexEnv driver hook.  Logs \a pname when DEBUG_STATE is set; the
+ * GL_TEXTURE_LOD_BIAS_EXT handling below is compiled out with #if 0. */
+static void r300TexEnv(GLcontext * ctx, GLenum target,
+		       GLenum pname, const GLfloat * param)
+{
+	if (RADEON_DEBUG & DEBUG_STATE) {
+		fprintf(stderr, "%s( %s )\n",
+			__FUNCTION__, _mesa_lookup_enum_by_nr(pname));
+	}
+
+	/* This is incorrect: Need to maintain this data for each of
+	 * GL_TEXTURE_{123}D, GL_TEXTURE_RECTANGLE_NV, etc, and switch
+	 * between them according to _ReallyEnabled. */
+	switch (pname) {
+	case GL_TEXTURE_LOD_BIAS_EXT:{
+#if 0				/* Needs to be relocated in order to make sure we got the right tmu */
+			GLfloat bias, min;
+			GLuint b;
+
+			/* The R300's LOD bias is a signed 2's complement value with a
+			 * range of -16.0 <= bias < 16.0.
+			 * NOTE: Add a small bias to the bias for conform mipsel.c test.
+			 */
+			bias = *param + .01;
+			min =
+			    driQueryOptionb(&rmesa->radeon.optionCache,
+					    "no_neg_lod_bias") ? 0.0 : -16.0;
+			bias = CLAMP(bias, min, 16.0);
+
+			/* 0.0 - 16.0 == 0x0 - 0x1000 */
+			/* 0.0 - -16.0 == 0x1001 - 0x1fff */
+			b = 0x1000 / 16.0 * bias;
+			b &= R300_LOD_BIAS_MASK;
+
+			if (b !=
+			    (rmesa->hw.tex.unknown1.
+			     cmd[R300_TEX_VALUE_0 +
+				 unit] & R300_LOD_BIAS_MASK)) {
+				R300_STATECHANGE(rmesa, tex.unknown1);
+				rmesa->hw.tex.unknown1.cmd[R300_TEX_VALUE_0 +
+							   unit] &=
+				    ~R300_LOD_BIAS_MASK;
+				rmesa->hw.tex.unknown1.cmd[R300_TEX_VALUE_0 +
+							   unit] |= b;
+			}
+#endif
+			break;
+		}
+
+	default:
+		return;
+	}
+}
+
+/**
+ * glTexParameter driver hook.  Forwards the current filter, wrap or border
+ * color values of \a texObj to the matching r300SetTex* helper (or calls
+ * driSwapOutTextureObject() for level/LOD changes) and sets dirty_state to
+ * TEX_ALL so the change is picked up at the next UpdateTextureState.
+ */
+static void r300TexParameter(GLcontext * ctx, GLenum target,
+			     struct gl_texture_object *texObj,
+			     GLenum pname, const GLfloat * params)
+{
+	r300TexObjPtr t = (r300TexObjPtr) texObj->DriverData;
+
+	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE))
+		fprintf(stderr, "%s( %s )\n", __FUNCTION__,
+			_mesa_lookup_enum_by_nr(pname));
+
+	switch (pname) {
+	case GL_TEXTURE_BASE_LEVEL:
+	case GL_TEXTURE_MAX_LEVEL:
+	case GL_TEXTURE_MIN_LOD:
+	case GL_TEXTURE_MAX_LOD:
+		/* This isn't the most efficient solution but there doesn't appear to
+		 * be a nice alternative.  Since there's no LOD clamping,
+		 * we just have to rely on loading the right subset of mipmap levels
+		 * to simulate a clamped LOD.
+		 */
+		driSwapOutTextureObject((driTextureObject *) t);
+		break;
+
+	case GL_TEXTURE_BORDER_COLOR:
+		r300SetTexBorderColor(t, texObj->_BorderChan);
+		break;
+
+	case GL_TEXTURE_WRAP_S:
+	case GL_TEXTURE_WRAP_T:
+	case GL_TEXTURE_WRAP_R:
+		r300SetTexWrap(t, texObj->WrapS, texObj->WrapT, texObj->WrapR);
+		break;
+
+	case GL_TEXTURE_MIN_FILTER:
+	case GL_TEXTURE_MAG_FILTER:
+	case GL_TEXTURE_MAX_ANISOTROPY_EXT:
+		r300SetTexMaxAnisotropy(t, texObj->MaxAnisotropy);
+		r300SetTexFilter(t, texObj->MinFilter, texObj->MagFilter);
+		break;
+
+	default:
+		return;
+	}
+
+	/* Mark this texobj as dirty (one bit per tex unit)
+	 */
+	t->dirty_state = TEX_ALL;
+}
+
+/* glBindTexture driver hook: debug trace plus a DriverData sanity check. */
+static void r300BindTexture(GLcontext * ctx, GLenum target,
+			    struct gl_texture_object *texObj)
+{
+	const GLboolean hwTarget = target == GL_TEXTURE_1D
+	    || target == GL_TEXTURE_2D || target == GL_TEXTURE_3D
+	    || target == GL_TEXTURE_CUBE_MAP
+	    || target == GL_TEXTURE_RECTANGLE_NV;
+
+	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE))
+		fprintf(stderr, "%s( %p ) unit=%d\n", __FUNCTION__,
+			(void *)texObj, ctx->Texture.CurrentUnit);
+
+	if (hwTarget)
+		assert(texObj->DriverData != NULL);
+}
+
+/* ctx->Driver.DeleteTexture hook: runs R300_FIREVERTICES, destroys the driver
+ * texture object (if any) and then deletes the core Mesa texture object. */
+static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
+{
+	driTextureObject *drvObj = (driTextureObject *) texObj->DriverData;
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+
+	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE))
+		fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
+			(void *)texObj,
+			_mesa_lookup_enum_by_nr(texObj->Target));
+
+	if (drvObj != NULL) {
+		if (rmesa != NULL) {
+			R300_FIREVERTICES(rmesa);
+		}
+		driDestroyTextureObject(drvObj);
+	}
+	/* Free mipmap images and the texture object itself */
+	_mesa_delete_texture_object(ctx, texObj);
+}
+
+/**
+ * Allocate a new texture object (ctx->Driver.NewTextureObject hook).
+ * Note: this function will also be called during context creation to
+ * allocate the default texture objects.
+ * The core object's MaxAnisotropy is overridden with the context's
+ * initialMaxAnisotropy before r300AllocTexObj() is called on it.
+ * Note: we could use containment here to 'derive' the driver-specific
+ * texture object from the core mesa gl_texture_object.  Not done at this time.
+ */
+static struct gl_texture_object *r300NewTextureObject(GLcontext * ctx,
+						      GLuint name,
+						      GLenum target)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct gl_texture_object *texObj =
+	    _mesa_new_texture_object(ctx, name, target);
+
+	if (texObj != NULL) {
+		texObj->MaxAnisotropy = rmesa->initialMaxAnisotropy;
+		r300AllocTexObj(texObj);
+	}
+	return texObj;
+}
+
+/* Plug the r300 texture hooks into \a functions.  Only hooks the driver
+ * implements are set, since _mesa_init_driver_functions() already ran. */
+void r300InitTextureFuncs(struct dd_function_table *functions)
+{
+	/* texture object management */
+	functions->NewTextureObject = r300NewTextureObject;
+	functions->BindTexture = r300BindTexture;
+	functions->DeleteTexture = r300DeleteTexture;
+	functions->IsTextureResident = driIsTextureResident;
+	/* image specification */
+	functions->ChooseTextureFormat = r300ChooseTextureFormat;
+	functions->TexImage1D = r300TexImage1D;
+	functions->TexSubImage1D = r300TexSubImage1D;
+	functions->TexImage2D = r300TexImage2D;
+	functions->TexSubImage2D = r300TexSubImage2D;
+	functions->TexImage3D = r300TexImage3D;
+	functions->TexSubImage3D = r300TexSubImage3D;
+	functions->CompressedTexImage2D = r300CompressedTexImage2D;
+	functions->CompressedTexSubImage2D = r300CompressedTexSubImage2D;
+	/* texture environment and parameters */
+	functions->TexEnv = r300TexEnv;
+	functions->TexParameter = r300TexParameter;
+
+	driInitTextureFormats();
+}
diff --git a/r300/r300_tex.h b/r300/r300_tex.h
new file mode 100644
index 0000000..f67a8e6
--- /dev/null
+++ b/r300/r300_tex.h
@@ -0,0 +1,51 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef __r300_TEX_H__
+#define __r300_TEX_H__
+
+extern void r300SetTexOffset(__DRIcontext *pDRICtx, GLint texname,
+			     unsigned long long offset, GLint depth,
+			     GLuint pitch);
+
+extern void r300UpdateTextureState(GLcontext * ctx);
+/* Returns 0, or -1 when no texture heap could be allocated for \a t. */
+extern int r300UploadTexImages(r300ContextPtr rmesa, r300TexObjPtr t,
+			       GLuint face);
+/* Clears every texture-unit binding of \a rmesa that points at \a t. */
+extern void r300DestroyTexObj(r300ContextPtr rmesa, r300TexObjPtr t);
+/* Installs the r300 texture hooks into \a functions. */
+extern void r300InitTextureFuncs(struct dd_function_table *functions);
+
+#endif				/* __r300_TEX_H__ */
diff --git a/r300/r300_texmem.c b/r300/r300_texmem.c
new file mode 100644
index 0000000..e2e8355
--- /dev/null
+++ b/r300/r300_texmem.c
@@ -0,0 +1,584 @@
+/**************************************************************************
+
+Copyright (C) Tungsten Graphics 2002.  All Rights Reserved.
+The Weather Channel, Inc. funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86
+license. This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation on the rights to use, copy, modify, merge, publish,
+distribute, sub license, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NON-INFRINGEMENT. IN NO EVENT SHALL ATI, VA LINUX SYSTEMS AND/OR THEIR
+SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Gareth Hughes <gareth@valinux.com>
+ *
+ * \author Kevin E. Martin <martin@valinux.com>
+ */
+
+#include <errno.h>
+
+#include "glheader.h"
+#include "imports.h"
+#include "context.h"
+#include "colormac.h"
+#include "macros.h"
+#include "simple_list.h"
+#include "radeon_reg.h"		/* gets definition for usleep */
+#include "r300_context.h"
+#include "r300_state.h"
+#include "r300_cmdbuf.h"
+#include "radeon_ioctl.h"
+#include "r300_tex.h"
+#include "r300_ioctl.h"
+#include <unistd.h>		/* for usleep() */
+
+#ifdef USER_BUFFERS
+#include "r300_mem.h"
+#endif
+
+/**
+ * Clear every texture-unit binding (state.texture.unit[i].texobj) that
+ * still points at \a t.  Only the debug trace runs when \a rmesa is NULL.
+ */
+void r300DestroyTexObj(r300ContextPtr rmesa, r300TexObjPtr t)
+{
+	unsigned i;
+
+	if (RADEON_DEBUG & DEBUG_TEXTURE)
+		fprintf(stderr, "%s( %p, %p )\n", __FUNCTION__,
+			(void *)t, (void *)t->base.tObj);
+
+	if (rmesa == NULL)
+		return;
+
+	for (i = 0; i < rmesa->radeon.glCtx->Const.MaxTextureUnits; i++) {
+		if (rmesa->state.texture.unit[i].texobj != t)
+			continue;
+		rmesa->state.texture.unit[i].texobj = NULL;
+		/* This code below is meant to shorten state
+		   pushed to the hardware by not programming
+		   unneeded units.
+
+		   This does not appear to be worthwhile on R300 */
+#if 0
+		remove_from_list(&rmesa->hw.tex[i]);
+		make_empty_list(&rmesa->hw.tex[i]);
+		remove_from_list(&rmesa->hw.cube[i]);
+		make_empty_list(&rmesa->hw.cube[i]);
+#endif
+	}
+}
+
+/* ------------------------------------------------------------
+ * Texture image conversions
+ */
+
+/**
+ * Blit one mipmap level of a client-storage texture from its GART address
+ * into the texture buffer at t->bufAddr.
+ *
+ * The full image is always copied: \a width and \a height are overwritten
+ * with texImage->Width / texImage->Height.  Formats with TexelBytes 8 or 16
+ * are blitted as 8-bit data with the width scaled by TexelBytes; a
+ * TexelBytes value other than 1, 2, 4, 8 or 16 returns without blitting.
+ *
+ * \param rmesa    context the wait and blit commands are emitted on
+ * \param t        driver texture object; image[0][hwlevel].data is updated
+ * \param texImage core Mesa image supplying Data, Width, Height, TexFormat
+ * \param hwlevel  index into t->image[0] of the level being uploaded
+ * \param x,y      passed to r300EmitBlit() as given and added to the
+ *                 level's own x / y
+ * \param width,height  ignored (see above)
+ */
+static void r300UploadGARTClientSubImage(r300ContextPtr rmesa,
+					 r300TexObjPtr t,
+					 struct gl_texture_image *texImage,
+					 GLint hwlevel,
+					 GLint x, GLint y,
+					 GLint width, GLint height)
+{
+	const struct gl_texture_format *texFormat = texImage->TexFormat;
+	GLuint pitch;
+	int blit_format;
+	int srcOffset;
+
+	switch (texFormat->TexelBytes) {
+	case 1:
+	case 8:
+	case 16:
+		blit_format = R300_CP_COLOR_FORMAT_CI8;
+		break;
+	case 2:
+		blit_format = R300_CP_COLOR_FORMAT_RGB565;
+		break;
+	case 4:
+		blit_format = R300_CP_COLOR_FORMAT_ARGB8888;
+		break;
+	default:
+		return;
+	}
+
+	/*
+	 * XXX it appears that we always upload the full image, not a subimage.
+	 * I.e. x==0, y==0, width=texWidth, height=texHeight.  If this is ever
+	 * changed, the src pitch will have to change.
+	 */
+	pitch = t->image[0][0].width * texFormat->TexelBytes;
+
+	t->image[0][hwlevel].data = texImage->Data;
+	srcOffset = r300GartOffsetFromVirtual(rmesa, texImage->Data);
+
+	assert(srcOffset != ~0);
+
+	/* Don't currently need to cope with small pitches? */
+	width = texImage->Width;
+	height = texImage->Height;
+	if (texFormat->TexelBytes > 4)
+		width *= texFormat->TexelBytes;
+
+	r300EmitWait(rmesa, R300_WAIT_3D);
+	r300EmitBlit(rmesa, blit_format,
+		     pitch, srcOffset, pitch, t->bufAddr,
+		     x, y,
+		     t->image[0][hwlevel].x + x, t->image[0][hwlevel].y + y,
+		     width, height);
+	r300EmitWait(rmesa, R300_WAIT_2D);
+}
+
+/**
+ * Upload a rectangle texture image (see r300UploadSubImage).  Client
+ * storage is either addressed directly via its GART offset or blitted in
+ * one go; other data is copied through DMA regions in chunks.  \a x and
+ * \a y are unused; \a width / \a height are reset to the full image size.
+ */
+static void r300UploadRectSubImage(r300ContextPtr rmesa,
+				   r300TexObjPtr t,
+				   struct gl_texture_image *texImage,
+				   GLint x, GLint y, GLint width, GLint height)
+{
+	const struct gl_texture_format *texFormat = texImage->TexFormat;
+	int blit_format, dstPitch, done;
+
+	switch (texFormat->TexelBytes) {
+	case 1:
+		blit_format = R300_CP_COLOR_FORMAT_CI8;
+		break;
+	case 2:
+		blit_format = R300_CP_COLOR_FORMAT_RGB565;
+		break;
+	case 4:
+		blit_format = R300_CP_COLOR_FORMAT_ARGB8888;
+		break;
+	case 8:
+	case 16:
+		blit_format = R300_CP_COLOR_FORMAT_CI8;
+		break;
+	default:
+		return;
+	}
+
+	t->image[0][0].data = texImage->Data;
+
+	/* Currently don't need to cope with small pitches. */
+	width = texImage->Width;
+	height = texImage->Height;
+	dstPitch = t->pitch;
+
+	if (texFormat->TexelBytes > 4) {
+		width *= texFormat->TexelBytes;
+	}
+
+	if (rmesa->prefer_gart_client_texturing && texImage->IsClientData) {
+		/* In this case, could also use GART texturing.  This is
+		 * currently disabled, but has been tested & works. */
+		t->offset = r300GartOffsetFromVirtual(rmesa, texImage->Data);
+		t->pitch = texImage->RowStride * texFormat->TexelBytes - 32;
+
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr,
+				"Using GART texturing for rectangular client texture\n");
+
+		/* Release FB memory allocated for this image: */
+		/* FIXME This may not be correct as driSwapOutTextureObject sets
+		 * FIXME dirty_images.  It may be fine, though.
+		 */
+		if (t->base.memBlock) {
+			driSwapOutTextureObject((driTextureObject *) t);
+		}
+	} else if (texImage->IsClientData) {
+		/* Data already in GART memory, with usable pitch. */
+		GLuint srcPitch;
+		srcPitch = texImage->RowStride * texFormat->TexelBytes;
+		r300EmitBlit(rmesa,
+			     blit_format,
+			     srcPitch,
+			     r300GartOffsetFromVirtual(rmesa, texImage->Data),
+			     dstPitch, t->bufAddr, 0, 0, 0, 0, width, height);
+	} else {
+		/* Data not in GART memory, or bad pitch.
+		 * NOTE(review): never advances if dstPitch > RADEON_BUFFER_SIZE. */
+		for (done = 0; done < height;) {
+			struct r300_dma_region region;
+			int lines =
+			    MIN2(height - done, RADEON_BUFFER_SIZE / dstPitch);
+			int src_pitch;
+			char *tex;
+
+			src_pitch = texImage->RowStride * texFormat->TexelBytes;
+
+			tex = (char *)texImage->Data + done * src_pitch;
+
+			memset(&region, 0, sizeof(region));
+			r300AllocDmaRegion(rmesa, &region, lines * dstPitch,
+					   1024);
+
+			/* Copy texdata to dma: */
+			if (RADEON_DEBUG & DEBUG_TEXTURE)
+				fprintf(stderr,
+					"%s: src_pitch %d dst_pitch %d\n",
+					__FUNCTION__, src_pitch, dstPitch);
+
+			if (src_pitch == dstPitch) {
+				memcpy(region.address + region.start, tex,
+				       lines * src_pitch);
+			} else {
+				char *buf = region.address + region.start;
+				int i;
+				for (i = 0; i < lines; i++) {
+					memcpy(buf, tex, src_pitch);
+					buf += dstPitch;
+					tex += src_pitch;
+				}
+			}
+
+			r300EmitWait(rmesa, R300_WAIT_3D);
+
+			/* Blit to framebuffer */
+			r300EmitBlit(rmesa,
+				     blit_format,
+				     dstPitch, GET_START(&region),
+				     dstPitch | (t->tile_bits >> 16),
+				     t->bufAddr, 0, 0, 0, done, width, lines);
+
+			r300EmitWait(rmesa, R300_WAIT_2D);
+#ifdef USER_BUFFERS
+			r300_mem_use(rmesa, region.buf->id);
+#endif
+
+			r300ReleaseDmaRegion(rmesa, &region, __FUNCTION__);
+			done += lines;
+		}
+	}
+}
+
+/**
+ * Upload mipmap level \a hwlevel (hardware numbering, i.e. GL level minus
+ * t->base.firstLevel) of cube face \a face of texture \a t.
+ */
+static void r300UploadSubImage(r300ContextPtr rmesa, r300TexObjPtr t,
+			       GLint hwlevel,
+			       GLint x, GLint y, GLint width, GLint height,
+			       GLuint face)
+{
+	struct gl_texture_image *texImage = NULL;
+	GLuint offset;
+	GLint imageWidth, imageHeight;
+	GLint ret;
+	drm_radeon_texture_t tex;
+	drm_radeon_tex_image_t tmp;
+	const int level = hwlevel + t->base.firstLevel;
+
+	if (RADEON_DEBUG & DEBUG_TEXTURE) {
+		fprintf(stderr,
+			"%s( %p, %p ) level/width/height/face = %d/%d/%d/%u\n",
+			__FUNCTION__, (void *)t, (void *)t->base.tObj, level,
+			width, height, face);
+	}
+
+	ASSERT(face < 6);
+
+	/* Ensure we have a valid texture to upload */
+	if ((hwlevel < 0) || (hwlevel >= RADEON_MAX_TEXTURE_LEVELS)) {
+		_mesa_problem(NULL, "bad texture level in %s", __FUNCTION__);
+		return;
+	}
+
+	texImage = t->base.tObj->Image[face][level];
+
+	if (!texImage) {
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "%s: texImage %d is NULL!\n",
+				__FUNCTION__, level);
+		return;
+	}
+	if (!texImage->Data) {
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "%s: image data is NULL!\n",
+				__FUNCTION__);
+		return;
+	}
+
+	if (t->base.tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+		assert(level == 0);
+		assert(hwlevel == 0);
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "%s: image data is rectangular\n",
+				__FUNCTION__);
+		r300UploadRectSubImage(rmesa, t, texImage, x, y, width, height);
+		return;
+	} else if (texImage->IsClientData) {
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr,
+				"%s: image data is in GART client storage\n",
+				__FUNCTION__);
+		r300UploadGARTClientSubImage(rmesa, t, texImage, hwlevel, x, y,
+					     width, height);
+		return;
+	} else if (RADEON_DEBUG & DEBUG_TEXTURE)
+		fprintf(stderr, "%s: image data is in normal memory\n",
+			__FUNCTION__);
+
+	imageWidth = texImage->Width;
+	imageHeight = texImage->Height;
+
+	offset = t->bufAddr + t->base.totalSize / 6 * face;
+
+	if (RADEON_DEBUG & (DEBUG_TEXTURE | DEBUG_IOCTL)) {
+		GLint imageX = 0;
+		GLint imageY = 0;
+		GLint blitX = t->image[face][hwlevel].x;
+		GLint blitY = t->image[face][hwlevel].y;
+		GLint blitWidth = t->image[face][hwlevel].width;
+		GLint blitHeight = t->image[face][hwlevel].height;
+		fprintf(stderr, "   upload image: %d,%d at %d,%d\n",
+			imageWidth, imageHeight, imageX, imageY);
+		fprintf(stderr, "   upload  blit: %d,%d at %d,%d\n",
+			blitWidth, blitHeight, blitX, blitY);
+		fprintf(stderr, "       blit ofs: 0x%07x level: %d/%d\n",
+			(GLuint) offset, hwlevel, level);
+	}
+
+	t->image[face][hwlevel].data = texImage->Data;
+
+	/* Init the DRM_RADEON_TEXTURE command / drm_radeon_texture_t struct.
+	 * NOTE: we always use a 1KB-wide blit and I8 texture format.
+	 * We used to use 1, 2 and 4-byte texels and used to use the texture
+	 * width to dictate the blit width - but that won't work for compressed
+	 * textures. (Brian)
+	 * NOTE: can't do that with texture tiling. (sroland)
+	 */
+	tex.offset = offset;
+	tex.image = &tmp;
+	/* copy (x,y,width,height,data) */
+	memcpy(&tmp, &t->image[face][hwlevel], sizeof(tmp));
+
+	if (texImage->TexFormat->TexelBytes > 4) {
+		const int log2TexelBytes =
+		    (3 + (texImage->TexFormat->TexelBytes >> 4));
+		tex.format = RADEON_TXFORMAT_I8;	/* any 1-byte texel format */
+		tex.pitch =
+		    MAX2((texImage->Width * texImage->TexFormat->TexelBytes) /
+			 64, 1);
+		tex.height = imageHeight;
+		tex.width = imageWidth << log2TexelBytes;
+		tex.offset += (tmp.x << log2TexelBytes) & ~1023;
+		tmp.x = tmp.x % (1024 >> log2TexelBytes);
+		tmp.width = tmp.width << log2TexelBytes;
+	} else if (texImage->TexFormat->TexelBytes) {
+		/* use multi-byte upload scheme */
+		tex.height = imageHeight;
+		tex.width = imageWidth;
+		/* NOTE(review): tex.format is unset for other texel sizes */
+		switch (texImage->TexFormat->TexelBytes) {
+		case 1:
+			tex.format = RADEON_TXFORMAT_I8;
+			break;
+		case 2:
+			tex.format = RADEON_TXFORMAT_AI88;
+			break;
+		case 4:
+			tex.format = RADEON_TXFORMAT_ARGB8888;
+			break;
+		}
+		tex.pitch =
+		    MAX2((texImage->Width * texImage->TexFormat->TexelBytes) /
+			 64, 1);
+		tex.offset += tmp.x & ~1023;
+		tmp.x = tmp.x % 1024;
+
+		if (t->tile_bits & R300_TXO_MICRO_TILE) {
+			/* need something like "tiled coordinates" ? */
+			tmp.y = tmp.x / (tex.pitch * 128) * 2;
+			tmp.x =
+			    tmp.x % (tex.pitch * 128) / 2 /
+			    texImage->TexFormat->TexelBytes;
+			tex.pitch |= RADEON_DST_TILE_MICRO >> 22;
+		} else {
+			tmp.x = tmp.x >> (texImage->TexFormat->TexelBytes >> 1);
+		}
+#if 1
+		if ((t->tile_bits & R300_TXO_MACRO_TILE) &&
+		    (texImage->Width * texImage->TexFormat->TexelBytes >= 256)
+		    && ((!(t->tile_bits & R300_TXO_MICRO_TILE)
+			 && (texImage->Height >= 8))
+			|| (texImage->Height >= 16))) {
+			/* weird: R200 disables macro tiling if mip width is smaller than 256 bytes,
+			   OR if height is smaller than 8 automatically, but if micro tiling is active
+			   the limit is height 16 instead ? */
+			tex.pitch |= RADEON_DST_TILE_MACRO >> 22;
+		}
+#endif
+	} else {
+		/* In case of for instance 8x8 texture (2x2 dxt blocks), padding
+		   after the first two blocks is needed (only with dxt1 since 2
+		   dxt3/dxt5 blocks already use 32 Byte).  tex.height is set to
+		   1/4 since 1 "macropixel" (dxt-block) has 4 real pixels; needed
+		   so the kernel module reads the right amount of data. */
+		tex.format = RADEON_TXFORMAT_I8;	/* any 1-byte texel format */
+		tex.pitch = (R300_BLIT_WIDTH_BYTES / 64);
+		tex.height = (imageHeight + 3) / 4;
+		tex.width = (imageWidth + 3) / 4;
+		if ((t->format & R300_TX_FORMAT_DXT1) == R300_TX_FORMAT_DXT1) {
+			tex.width *= 8;
+		} else {
+			tex.width *= 16;
+		}
+	}
+
+	LOCK_HARDWARE(&rmesa->radeon);
+	do {
+		ret =
+		    drmCommandWriteRead(rmesa->radeon.dri.fd,
+					DRM_RADEON_TEXTURE, &tex,
+					sizeof(drm_radeon_texture_t));
+		if (ret) {
+			if (RADEON_DEBUG & DEBUG_IOCTL)
+				fprintf(stderr,
+					"DRM_RADEON_TEXTURE:  again!\n");
+			usleep(1);
+		}
+	} while (ret == -EAGAIN);
+
+	UNLOCK_HARDWARE(&rmesa->radeon);
+
+	if (ret) {
+		fprintf(stderr, "DRM_RADEON_TEXTURE: return = %d\n", ret);
+		fprintf(stderr, "   offset=0x%08x\n", offset);
+		fprintf(stderr, "   image width=%d height=%d\n",
+			imageWidth, imageHeight);
+		fprintf(stderr, "    blit width=%d height=%d data=%p\n",
+			t->image[face][hwlevel].width,
+			t->image[face][hwlevel].height,
+			t->image[face][hwlevel].data);
+		_mesa_exit(-1);
+	}
+}
+
+/**
+ * Upload the texture images associated with texture \a t.  This might
+ * require the allocation of texture memory.
+ *
+ * \param rmesa Context pointer
+ * \param t Texture to be uploaded; NULL is accepted and is a no-op
+ * \param face Cube map face to be uploaded.  Zero for non-cube maps.
+ * \return 0 on success or when there is nothing to do, -1 if
+ *         driAllocateTexture() could not find a heap for the texture.
+ */
+int r300UploadTexImages(r300ContextPtr rmesa, r300TexObjPtr t, GLuint face)
+{
+	int numLevels;
+
+	if (!t || t->image_override)
+		return 0;
+	numLevels = t->base.lastLevel - t->base.firstLevel + 1;
+
+	if (RADEON_DEBUG & (DEBUG_TEXTURE | DEBUG_IOCTL)) {
+		fprintf(stderr, "%s( %p, %p ) sz=%d lvls=%d-%d\n", __FUNCTION__,
+			(void *)rmesa->radeon.glCtx, (void *)t->base.tObj,
+			t->base.totalSize, t->base.firstLevel,
+			t->base.lastLevel);
+	}
+
+	if (t->base.totalSize == 0)
+		return 0;
+
+	if (RADEON_DEBUG & DEBUG_SYNC) {
+		fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
+		radeonFinish(rmesa->radeon.glCtx);
+	}
+
+	LOCK_HARDWARE(&rmesa->radeon);
+
+	if (t->base.memBlock == NULL) {
+		int heap;
+
+		heap = driAllocateTexture(rmesa->texture_heaps, rmesa->nr_heaps,
+					  (driTextureObject *) t);
+		if (heap == -1) {
+			UNLOCK_HARDWARE(&rmesa->radeon);
+			return -1;
+		}
+
+		/* Set the base offset of the texture image */
+		t->bufAddr = rmesa->radeon.radeonScreen->texOffset[heap]
+		    + t->base.memBlock->ofs;
+		t->offset = t->bufAddr;
+
+		if (!(t->base.tObj->Image[0][0]->IsClientData)) {
+			/* hope it's safe to add that here... */
+			t->offset |= t->tile_bits;
+		}
+
+		/* Mark this texobj as dirty on all units: */
+		t->dirty_state = TEX_ALL;
+	}
+
+	/* Let the world know we've used this memory recently. */
+	driUpdateTextureLRU((driTextureObject *) t);
+	UNLOCK_HARDWARE(&rmesa->radeon);
+
+	/* Upload any images that are new */
+	if (t->base.dirty_images[face]) {
+		int i;
+		for (i = 0; i < numLevels; i++) {
+			if ((t->base.
+			     dirty_images[face] & (1 <<
+						   (i + t->base.firstLevel))) !=
+			    0) {
+				r300UploadSubImage(rmesa, t, i, 0, 0,
+						   t->image[face][i].width,
+						   t->image[face][i].height,
+						   face);
+			}
+		}
+		t->base.dirty_images[face] = 0;
+	}
+
+	if (RADEON_DEBUG & DEBUG_SYNC) {
+		fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
+		radeonFinish(rmesa->radeon.glCtx);
+	}
+
+	return 0;
+}
diff --git a/r300/r300_texstate.c b/r300/r300_texstate.c
new file mode 100644
index 0000000..8203189
--- /dev/null
+++ b/r300/r300_texstate.c
@@ -0,0 +1,620 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ *
+ * \todo Enable R300 texture tiling code?
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "context.h"
+#include "macros.h"
+#include "texformat.h"
+#include "teximage.h"
+#include "texobj.h"
+#include "enums.h"
+
+#include "r300_context.h"
+#include "r300_state.h"
+#include "r300_ioctl.h"
+#include "radeon_ioctl.h"
+#include "r300_tex.h"
+#include "r300_reg.h"
+/* VALID_FORMAT(f): f lies in one of the two covered MESA_FORMAT ranges and tx_table_le[f].flag is set. */
+#define VALID_FORMAT(f) ( ((f) <= MESA_FORMAT_RGBA_DXT5			\
+			   || ((f) >= MESA_FORMAT_RGBA_FLOAT32 &&	\
+			       (f) <= MESA_FORMAT_INTENSITY_FLOAT16))	\
+			  && tx_table_le[f].flag )
+/* _ASSIGN(entry, format): designated initializer for slot MESA_FORMAT_<entry> = { format, filter 0, flag 1 }. */
+#define _ASSIGN(entry, format)				\
+	[ MESA_FORMAT_ ## entry ] = { format, 0, 1}
+
+/*
+ * Note that the _REV formats are the same as the non-REV formats.  This is
+ * because the REV and non-REV formats are identical as a byte string, but
+ * differ when accessed as 16-bit or 32-bit words depending on the endianness of
+ * the host.  Since the textures are transferred to the R300 as a byte string
+ * (i.e. without any byte-swapping), the R300 sees the REV and non-REV formats
+ * identically.  -- paulus
+ */
+
+static const struct tx_table {
+	GLuint format, filter, flag;	/* format: copied to t->format; filter: OR-ed into t->filter; flag: nonzero = slot populated */
+} tx_table_be[] = {	/* selected by r300SetTexImages when _mesa_little_endian() is false */
+	/* *INDENT-OFF* */
+	_ASSIGN(RGBA8888, R300_EASY_TX_FORMAT(Z, Y, X, W, W8Z8Y8X8)),
+	_ASSIGN(RGBA8888_REV, R300_EASY_TX_FORMAT(Y, Z, W, X, W8Z8Y8X8)),
+	_ASSIGN(ARGB8888, R300_EASY_TX_FORMAT(W, Z, Y, X, W8Z8Y8X8)),
+	_ASSIGN(ARGB8888_REV, R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8)),
+	_ASSIGN(RGB888, 0xffffffff),	/* NOTE(review): looks like a "no hw format" placeholder, yet _ASSIGN still sets flag = 1 -- confirm */
+	_ASSIGN(RGB565, R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5)),
+	_ASSIGN(RGB565_REV, R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5)),
+	_ASSIGN(ARGB4444, R300_EASY_TX_FORMAT(X, Y, Z, W, W4Z4Y4X4)),
+	_ASSIGN(ARGB4444_REV, R300_EASY_TX_FORMAT(X, Y, Z, W, W4Z4Y4X4)),
+	_ASSIGN(ARGB1555, R300_EASY_TX_FORMAT(X, Y, Z, W, W1Z5Y5X5)),
+	_ASSIGN(ARGB1555_REV, R300_EASY_TX_FORMAT(X, Y, Z, W, W1Z5Y5X5)),
+	_ASSIGN(AL88, R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8)),
+	_ASSIGN(AL88_REV, R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8)),
+	_ASSIGN(RGB332, R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z3Y3X2)),
+	_ASSIGN(A8, R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X8)),
+	_ASSIGN(L8, R300_EASY_TX_FORMAT(X, X, X, ONE, X8)),
+	_ASSIGN(I8, R300_EASY_TX_FORMAT(X, X, X, X, X8)),
+	_ASSIGN(CI8, R300_EASY_TX_FORMAT(X, X, X, X, X8)),
+	_ASSIGN(YCBCR, R300_EASY_TX_FORMAT(X, Y, Z, ONE, G8R8_G8B8)|R300_TX_FORMAT_YUV_MODE ),
+	_ASSIGN(YCBCR_REV, R300_EASY_TX_FORMAT(X, Y, Z, ONE, G8R8_G8B8)|R300_TX_FORMAT_YUV_MODE),
+	_ASSIGN(RGB_DXT1, R300_EASY_TX_FORMAT(X, Y, Z, ONE, DXT1)),
+	_ASSIGN(RGBA_DXT1, R300_EASY_TX_FORMAT(X, Y, Z, W, DXT1)),
+	_ASSIGN(RGBA_DXT3, R300_EASY_TX_FORMAT(X, Y, Z, W, DXT3)),
+	_ASSIGN(RGBA_DXT5, R300_EASY_TX_FORMAT(Y, Z, W, X, DXT5)),
+	_ASSIGN(RGBA_FLOAT32, R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R32G32B32A32)),
+	_ASSIGN(RGBA_FLOAT16, R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16)),
+	_ASSIGN(RGB_FLOAT32, 0xffffffff),
+	_ASSIGN(RGB_FLOAT16, 0xffffffff),
+	_ASSIGN(ALPHA_FLOAT32, R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, FL_I32)),
+	_ASSIGN(ALPHA_FLOAT16, R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, FL_I16)),
+	_ASSIGN(LUMINANCE_FLOAT32, R300_EASY_TX_FORMAT(X, X, X, ONE, FL_I32)),
+	_ASSIGN(LUMINANCE_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, ONE, FL_I16)),
+	_ASSIGN(LUMINANCE_ALPHA_FLOAT32, R300_EASY_TX_FORMAT(X, X, X, Y, FL_I32A32)),
+	_ASSIGN(LUMINANCE_ALPHA_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, Y, FL_I16A16)),
+	_ASSIGN(INTENSITY_FLOAT32, R300_EASY_TX_FORMAT(X, X, X, X, FL_I32)),
+	_ASSIGN(INTENSITY_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, X, FL_I16)),
+	/* *INDENT-ON* */
+};
+
+static const struct tx_table tx_table_le[] = {	/* little-endian hosts; also the table VALID_FORMAT and r300SetTexOffset read */
+	/* *INDENT-OFF* */
+	_ASSIGN(RGBA8888, R300_EASY_TX_FORMAT(Y, Z, W, X, W8Z8Y8X8)),
+	_ASSIGN(RGBA8888_REV, R300_EASY_TX_FORMAT(Z, Y, X, W, W8Z8Y8X8)),
+	_ASSIGN(ARGB8888, R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8)),
+	_ASSIGN(ARGB8888_REV, R300_EASY_TX_FORMAT(W, Z, Y, X, W8Z8Y8X8)),
+	_ASSIGN(RGB888, R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8)),
+	_ASSIGN(RGB565, R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5)),
+	_ASSIGN(RGB565_REV, R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5)),
+	_ASSIGN(ARGB4444, R300_EASY_TX_FORMAT(X, Y, Z, W, W4Z4Y4X4)),
+	_ASSIGN(ARGB4444_REV, R300_EASY_TX_FORMAT(X, Y, Z, W, W4Z4Y4X4)),
+	_ASSIGN(ARGB1555, R300_EASY_TX_FORMAT(X, Y, Z, W, W1Z5Y5X5)),
+	_ASSIGN(ARGB1555_REV, R300_EASY_TX_FORMAT(X, Y, Z, W, W1Z5Y5X5)),
+	_ASSIGN(AL88, R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8)),
+	_ASSIGN(AL88_REV, R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8)),
+	_ASSIGN(RGB332, R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z3Y3X2)),
+	_ASSIGN(A8, R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X8)),
+	_ASSIGN(L8, R300_EASY_TX_FORMAT(X, X, X, ONE, X8)),
+	_ASSIGN(I8, R300_EASY_TX_FORMAT(X, X, X, X, X8)),
+	_ASSIGN(CI8, R300_EASY_TX_FORMAT(X, X, X, X, X8)),
+	_ASSIGN(YCBCR, R300_EASY_TX_FORMAT(X, Y, Z, ONE, G8R8_G8B8)|R300_TX_FORMAT_YUV_MODE ),
+	_ASSIGN(YCBCR_REV, R300_EASY_TX_FORMAT(X, Y, Z, ONE, G8R8_G8B8)|R300_TX_FORMAT_YUV_MODE),
+	_ASSIGN(RGB_DXT1, R300_EASY_TX_FORMAT(X, Y, Z, ONE, DXT1)),
+	_ASSIGN(RGBA_DXT1, R300_EASY_TX_FORMAT(X, Y, Z, W, DXT1)),
+	_ASSIGN(RGBA_DXT3, R300_EASY_TX_FORMAT(X, Y, Z, W, DXT3)),
+	_ASSIGN(RGBA_DXT5, R300_EASY_TX_FORMAT(Y, Z, W, X, DXT5)),
+	_ASSIGN(RGBA_FLOAT32, R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R32G32B32A32)),
+	_ASSIGN(RGBA_FLOAT16, R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16)),
+	_ASSIGN(RGB_FLOAT32, 0xffffffff),	/* NOTE(review): looks like a "no hw format" placeholder, yet flag is 1 so VALID_FORMAT accepts it -- confirm */
+	_ASSIGN(RGB_FLOAT16, 0xffffffff),
+	_ASSIGN(ALPHA_FLOAT32, R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, FL_I32)),
+	_ASSIGN(ALPHA_FLOAT16, R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, FL_I16)),
+	_ASSIGN(LUMINANCE_FLOAT32, R300_EASY_TX_FORMAT(X, X, X, ONE, FL_I32)),
+	_ASSIGN(LUMINANCE_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, ONE, FL_I16)),
+	_ASSIGN(LUMINANCE_ALPHA_FLOAT32, R300_EASY_TX_FORMAT(X, X, X, Y, FL_I32A32)),
+	_ASSIGN(LUMINANCE_ALPHA_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, Y, FL_I16A16)),
+	_ASSIGN(INTENSITY_FLOAT32, R300_EASY_TX_FORMAT(X, X, X, X, FL_I32)),
+	_ASSIGN(INTENSITY_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, X, FL_I16)),
+	/* *INDENT-ON* */
+};
+
+#undef _ASSIGN
+
+/**
+ * Lay out a texture object for the hardware: pick the hardware format,
+ * compute the per-level blit rectangles \c image[face][level].x/y/width/height
+ * and the total byte size (all mipmap levels, all cube faces), and derive
+ * \c size, \c pitch and \c pitch_reg.  No image data is uploaded here.
+ * Returns early (after _mesa_problem) on a format that fails VALID_FORMAT.
+ *
+ * \param rmesa Context pointer
+ * \param tObj GL texture object whose images are to be posted to
+ *                 hardware state; its DriverData is used as the r300TexObj.
+ */
+static void r300SetTexImages(r300ContextPtr rmesa,
+			     struct gl_texture_object *tObj)
+{
+	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+	const struct gl_texture_image *baseImage =
+	    tObj->Image[0][tObj->BaseLevel];
+	GLint curOffset, blitWidth;
+	GLint i, texelBytes;
+	GLint numLevels;
+	GLint log2Width, log2Height, log2Depth;
+
+	/* Set the hardware texture format
+	 */
+	if (!t->image_override && VALID_FORMAT(baseImage->TexFormat->MesaFormat)) {
+		if (_mesa_little_endian()) {
+			t->format =
+			    tx_table_le[baseImage->TexFormat->MesaFormat].
+			    format;
+			t->filter |=
+			    tx_table_le[baseImage->TexFormat->MesaFormat].
+			    filter;
+		} else {
+			t->format =
+			    tx_table_be[baseImage->TexFormat->MesaFormat].
+			    format;
+			t->filter |=
+			    tx_table_be[baseImage->TexFormat->MesaFormat].
+			    filter;
+		}
+	} else if (!t->image_override) {
+		_mesa_problem(NULL, "unexpected texture format in %s",
+			      __FUNCTION__);
+		return;
+	}
+
+	texelBytes = baseImage->TexFormat->TexelBytes;
+
+	/* Compute which mipmap levels we really want to send to the hardware.
+	 */
+	driCalculateTextureFirstLastLevel((driTextureObject *) t);
+	log2Width = tObj->Image[0][t->base.firstLevel]->WidthLog2;
+	log2Height = tObj->Image[0][t->base.firstLevel]->HeightLog2;
+	log2Depth = tObj->Image[0][t->base.firstLevel]->DepthLog2;
+	/* NOTE(review): log2Depth is never read below; log2Width/log2Height only feed the cube-map ASSERT. */
+	numLevels = t->base.lastLevel - t->base.firstLevel + 1;
+
+	assert(numLevels <= RADEON_MAX_TEXTURE_LEVELS);
+
+	/* Calculate mipmap offsets and dimensions for blitting (uploading)
+	 * The idea is that we lay out the mipmap levels within a block of
+	 * memory organized as a rectangle of width BLIT_WIDTH_BYTES.
+	 */
+	curOffset = 0;
+	blitWidth = R300_BLIT_WIDTH_BYTES;
+	t->tile_bits = 0;
+
+	/* figure out if this texture is suitable for tiling. */
+#if 0				/* Disabled for now */
+	if (texelBytes) {
+		if ((tObj->Target != GL_TEXTURE_RECTANGLE_NV) &&
+		    /* texrect might be able to use micro tiling too in theory? */
+		    (baseImage->Height > 1)) {
+
+			/* allow 32 (bytes) x 1 mip (which will use two times the space
+			   the non-tiled version would use) max if base texture is large enough */
+			if ((numLevels == 1) ||
+			    (((baseImage->Width * texelBytes /
+			       baseImage->Height) <= 32)
+			     && (baseImage->Width * texelBytes > 64))
+			    ||
+			    ((baseImage->Width * texelBytes /
+			      baseImage->Height) <= 16)) {
+				t->tile_bits |= R300_TXO_MICRO_TILE;
+			}
+		}
+
+		if (tObj->Target != GL_TEXTURE_RECTANGLE_NV) {
+			/* we can set macro tiling even for small textures, they will be untiled anyway */
+			t->tile_bits |= R300_TXO_MACRO_TILE;
+		}
+	}
+#endif
+
+	for (i = 0; i < numLevels; i++) {
+		const struct gl_texture_image *texImage;
+		GLuint size;
+
+		texImage = tObj->Image[0][i + t->base.firstLevel];
+		if (!texImage)
+			break;	/* NOTE(review): numLevels is not reduced, yet it is still used after the loop */
+
+		/* find image size in bytes */
+		if (texImage->IsCompressed) {
+			if ((t->format & R300_TX_FORMAT_DXT1) ==
+			    R300_TX_FORMAT_DXT1) {
+				// fprintf(stderr,"DXT 1 %d %08X\n", texImage->Width, t->format);
+				if ((texImage->Width + 3) < 8)	/* width one block */
+					size = texImage->CompressedSize * 4;
+				else if ((texImage->Width + 3) < 16)
+					size = texImage->CompressedSize * 2;
+				else
+					size = texImage->CompressedSize;
+			} else {
+				/* DXT3/5, 16 bytes per block */
+				WARN_ONCE
+				    ("DXT 3/5 suffers from multitexturing problems!\n");
+				// fprintf(stderr,"DXT 3/5 %d\n", texImage->Width);
+				if ((texImage->Width + 3) < 8)
+					size = texImage->CompressedSize * 2;
+				else
+					size = texImage->CompressedSize;
+			}
+		} else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+			size =
+			    ((texImage->Width * texelBytes +
+			      63) & ~63) * texImage->Height;
+			blitWidth = 64 / texelBytes;
+		} else if (t->tile_bits & R300_TXO_MICRO_TILE) {
+			/* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
+			   though the actual offset may be different (if texture is less than
+			   32 bytes width) to the untiled case */
+			int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
+			size =
+			    (w * ((texImage->Height + 1) / 2)) *
+			    texImage->Depth;
+			blitWidth = MAX2(texImage->Width, 64 / texelBytes);
+		} else {
+			int w = (texImage->Width * texelBytes + 31) & ~31;
+			size = w * texImage->Height * texImage->Depth;
+			blitWidth = MAX2(texImage->Width, 64 / texelBytes);
+		}
+		assert(size > 0);
+
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "w=%d h=%d d=%d tb=%d intFormat=%d\n",
+				texImage->Width, texImage->Height,
+				texImage->Depth,
+				texImage->TexFormat->TexelBytes,
+				texImage->InternalFormat);
+
+		/* Align to 32-byte offset.  It is faster to do this unconditionally
+		 * (no branch penalty).
+		 */
+
+		curOffset = (curOffset + 0x1f) & ~0x1f;
+
+		if (texelBytes) {
+			/* fix x and y coords up later together with offset */
+			t->image[0][i].x = curOffset;
+			t->image[0][i].y = 0;
+			t->image[0][i].width =
+			    MIN2(size / texelBytes, blitWidth);
+			t->image[0][i].height =
+			    (size / texelBytes) / t->image[0][i].width;
+		} else {
+			t->image[0][i].x = curOffset % R300_BLIT_WIDTH_BYTES;
+			t->image[0][i].y = curOffset / R300_BLIT_WIDTH_BYTES;
+			t->image[0][i].width =
+			    MIN2(size, R300_BLIT_WIDTH_BYTES);
+			t->image[0][i].height = size / t->image[0][i].width;
+		}
+
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr,
+				"level %d: %dx%d x=%d y=%d w=%d h=%d size=%d at %d\n",
+				i, texImage->Width, texImage->Height,
+				t->image[0][i].x, t->image[0][i].y,
+				t->image[0][i].width, t->image[0][i].height,
+				size, curOffset);
+
+		curOffset += size;
+	}
+
+	/* Align the total size of texture memory block.
+	 */
+	t->base.totalSize =
+	    (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
+
+	/* Setup remaining cube face blits, if needed */
+	if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
+		GLuint face;
+		for (face = 1; face < 6; face++) {
+			for (i = 0; i < numLevels; i++) {
+				t->image[face][i].x = t->image[0][i].x;
+				t->image[face][i].y = t->image[0][i].y;
+				t->image[face][i].width = t->image[0][i].width;
+				t->image[face][i].height =
+				    t->image[0][i].height;
+			}
+		}
+		t->base.totalSize *= 6;	/* total texmem needed */
+	}
+
+	if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
+		ASSERT(log2Width == log2Height);
+		t->format |= R300_TX_FORMAT_CUBIC_MAP;
+	}
+
+	t->size =
+	    (((tObj->Image[0][t->base.firstLevel]->Width -
+	       1) << R300_TX_WIDTHMASK_SHIFT)
+	     | ((tObj->Image[0][t->base.firstLevel]->Height - 1) <<
+		R300_TX_HEIGHTMASK_SHIFT))
+	    | ((numLevels - 1) << R300_TX_MAX_MIP_LEVEL_SHIFT);
+
+	/* Only need to round to nearest 32 for textures, but the blitter
+	 * requires 64-byte aligned pitches, and we may/may not need the
+	 * blitter.   NPOT only!
+	 */
+	if (baseImage->IsCompressed) {
+		t->pitch =
+		    (tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
+	} else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+		unsigned int align = blitWidth - 1;	/* blitWidth as left by the last level handled in the loop above */
+		t->pitch = ((tObj->Image[0][t->base.firstLevel]->Width *
+			     texelBytes) + 63) & ~(63);
+		t->size |= R300_TX_SIZE_TXPITCH_EN;
+		if (!t->image_override)
+			t->pitch_reg =
+			    (((tObj->Image[0][t->base.firstLevel]->Width) +
+			      align) & ~align) - 1;
+	} else {
+		t->pitch =
+		    ((tObj->Image[0][t->base.firstLevel]->Width *
+		      texelBytes) + 63) & ~(63);
+	}
+
+	t->dirty_state = TEX_ALL;
+
+	/* FYI: r300UploadTexImages( rmesa, t ) used to be called here */
+}
+
+/* ================================================================
+ * Texture unit state management
+ */
+
+static GLboolean r300EnableTexture2D(GLcontext * ctx, int unit)
+{
+	/* Lay out and upload the 1D/2D texture current on `unit` if it is dirty. */
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
+	r300TexObjPtr tex = (r300TexObjPtr) tObj->DriverData;
+
+	ASSERT(tObj->Target == GL_TEXTURE_2D || tObj->Target == GL_TEXTURE_1D);
+
+	/* Face 0 has no dirty images: nothing to (re)upload. */
+	if (!tex->base.dirty_images[0])
+		return GL_TRUE;
+
+	R300_FIREVERTICES(rmesa);
+	r300SetTexImages(rmesa, tObj);
+	r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
+
+	/* Fail only when there is neither a memory block nor an override. */
+	return (tex->base.memBlock || tex->image_override) ? GL_TRUE : GL_FALSE;
+}
+
+static GLboolean r300EnableTexture3D(GLcontext * ctx, int unit)
+{
+	/* Lay out and upload the 3D texture current on `unit` if it is dirty. */
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
+	r300TexObjPtr tex = (r300TexObjPtr) tObj->DriverData;
+
+	ASSERT(tObj->Target == GL_TEXTURE_3D);
+
+	/* r300 does not support mipmaps for 3D textures. */
+	if (!(tObj->MinFilter == GL_NEAREST || tObj->MinFilter == GL_LINEAR))
+		return GL_FALSE;
+
+	/* Face 0 has no dirty images: nothing to (re)upload. */
+	if (!tex->base.dirty_images[0])
+		return GL_TRUE;
+
+	R300_FIREVERTICES(rmesa);
+	r300SetTexImages(rmesa, tObj);
+	r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
+
+	/* Without a texture memory block the upload did not succeed. */
+	return tex->base.memBlock ? GL_TRUE : GL_FALSE;
+}
+
+static GLboolean r300EnableTextureCube(GLcontext * ctx, int unit)
+{
+	/* Lay out the cube map current on `unit` once, then upload each dirty face. */
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
+	r300TexObjPtr tex = (r300TexObjPtr) tObj->DriverData;
+	GLboolean anyDirty = GL_FALSE;
+	GLuint face;
+
+	ASSERT(tObj->Target == GL_TEXTURE_CUBE_MAP);
+
+	/* Does any of the six faces carry dirty images? */
+	for (face = 0; face < 6; face++) {
+		if (tex->base.dirty_images[face]) {
+			anyDirty = GL_TRUE;
+			break;
+		}
+	}
+
+	if (anyDirty) {
+		/* flush, then lay out memory space once for all faces */
+		R300_FIREVERTICES(rmesa);
+		r300SetTexImages(rmesa, tObj);
+	}
+
+	/* upload (per face) whatever is still marked dirty */
+	for (face = 0; face < 6; face++) {
+		if (!tex->base.dirty_images[face])
+			continue;
+		r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, face);
+	}
+
+	/* no memory block: texmem alloc failed, use s/w fallback */
+	return tex->base.memBlock ? GL_TRUE : GL_FALSE;
+}
+
+static GLboolean r300EnableTextureRect(GLcontext * ctx, int unit)
+{
+	/* Lay out and upload the rectangle texture current on `unit` if dirty. */
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
+	r300TexObjPtr tex = (r300TexObjPtr) tObj->DriverData;
+
+	ASSERT(tObj->Target == GL_TEXTURE_RECTANGLE_NV);
+
+	if (!tex->base.dirty_images[0])
+		return GL_TRUE;
+
+	R300_FIREVERTICES(rmesa);
+	r300SetTexImages(rmesa, tObj);
+	r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
+
+	/* A memory block, an override, or GART client texturing each suffices. */
+	if (tex->base.memBlock || tex->image_override)
+		return GL_TRUE;
+	return rmesa->prefer_gart_client_texturing ? GL_TRUE : GL_FALSE;
+}
+
+static GLboolean r300UpdateTexture(GLcontext * ctx, int unit)
+{
+	/* Track which texture object is bound to `unit`; GL_FALSE means fallback. */
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
+	r300TexObjPtr tex = (r300TexObjPtr) tObj->DriverData;
+
+	/* Fallback if there's a texture border */
+	if (tObj->Image[0][tObj->BaseLevel]->Border > 0)
+		return GL_FALSE;
+
+	/* Same texture object as last time: no binding state to update. */
+	if (rmesa->state.texture.unit[unit].texobj == tex)
+		return !tex->border_fallback;
+
+	/* The previously bound object (if any) no longer lives on this
+	 * unit, so clear this unit's bit in its bound mask.
+	 */
+	if (rmesa->state.texture.unit[unit].texobj != NULL)
+		rmesa->state.texture.unit[unit].texobj->base.bound &=
+		    ~(1UL << unit);
+
+	/* Record the new binding and flag this unit in the dirty mask. */
+	rmesa->state.texture.unit[unit].texobj = tex;
+	tex->base.bound |= (1UL << unit);
+	tex->dirty_state |= 1 << unit;
+
+	/* XXX: should be locked! */
+	driUpdateTextureLRU((driTextureObject *) tex);
+
+	return !tex->border_fallback;
+}
+/* Point texture `texname` at memory described by offset/depth/pitch (depth is presumably bits per pixel -- confirm). */
+void r300SetTexOffset(__DRIcontext *pDRICtx, GLint texname,
+		      unsigned long long offset, GLint depth, GLuint pitch)
+{
+	r300ContextPtr rmesa =
+		(r300ContextPtr)((__DRIcontextPrivate*)pDRICtx->private)->driverPrivate;
+	struct gl_texture_object *tObj =
+		_mesa_lookup_texture(rmesa->radeon.glCtx, texname);
+	r300TexObjPtr t;
+	int idx;
+
+	if (!tObj)
+		return;
+
+	t = (r300TexObjPtr) tObj->DriverData;
+	/* The override is recorded even when offset is 0 and nothing else changes. */
+	t->image_override = GL_TRUE;
+
+	if (!offset)
+		return;
+
+	t->offset = offset;
+	t->pitch_reg = pitch;
+	/* tx_table_le is keyed by MESA_FORMAT_*: index it with the enumerators, not literal slot numbers. */
+	switch (depth) {
+	case 32:
+		idx = MESA_FORMAT_ARGB8888;
+		t->pitch_reg /= 4;
+		break;
+	case 24:
+	default:
+		idx = MESA_FORMAT_RGB888;
+		t->pitch_reg /= 4;
+		break;
+	case 16:
+		idx = MESA_FORMAT_RGB565;
+		t->pitch_reg /= 2;
+		break;
+	}
+	/* pitch was divided by the per-texel byte count above; the stored value is that minus one */
+	t->pitch_reg--;
+
+	t->format = tx_table_le[idx].format;
+	t->filter |= tx_table_le[idx].filter;
+}
+
+static GLboolean r300UpdateTextureUnit(GLcontext * ctx, int unit)
+{
+	/* Validate `unit`: per-target enable first, then the binding update. */
+	const GLuint enabled = ctx->Texture.Unit[unit]._ReallyEnabled;
+	GLboolean ready;
+
+	if (!enabled)
+		return GL_TRUE;	/* unit is off: nothing to validate */
+
+	if (enabled & (TEXTURE_RECT_BIT))
+		ready = r300EnableTextureRect(ctx, unit);
+	else if (enabled & (TEXTURE_1D_BIT | TEXTURE_2D_BIT))
+		ready = r300EnableTexture2D(ctx, unit);
+	else if (enabled & (TEXTURE_3D_BIT))
+		ready = r300EnableTexture3D(ctx, unit);
+	else if (enabled & (TEXTURE_CUBE_BIT))
+		ready = r300EnableTextureCube(ctx, unit);
+	else
+		return GL_FALSE;	/* some other target is enabled */
+
+	return ready && r300UpdateTexture(ctx, unit);
+}
+
+void r300UpdateTextureState(GLcontext * ctx)
+{
+	int unit = 0;
+	/* Validate units 0..7 in order, warning about each one that fails. */
+	while (unit < 8) {
+		if (r300UpdateTextureUnit(ctx, unit) == GL_FALSE)
+			_mesa_warning(ctx,
+				      "failed to update texture state for unit %d.\n",
+				      unit);
+		unit++;
+	}
+}
diff --git a/r300/r300_vertprog.c b/r300/r300_vertprog.c
new file mode 100644
index 0000000..1d90ade
--- /dev/null
+++ b/r300/r300_vertprog.c
@@ -0,0 +1,1305 @@
+/**************************************************************************
+
+Copyright (C) 2005 Aapo Tahkola.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Aapo Tahkola <aet@rasterburn.org>
+ */
+
+#include "glheader.h"
+#include "macros.h"
+#include "enums.h"
+#include "program.h"
+#include "shader/prog_instruction.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_statevars.h"
+#include "tnl/tnl.h"
+
+#include "r300_context.h"
+
+#if SWIZZLE_X != VSF_IN_COMPONENT_X || \
+    SWIZZLE_Y != VSF_IN_COMPONENT_Y || \
+    SWIZZLE_Z != VSF_IN_COMPONENT_Z || \
+    SWIZZLE_W != VSF_IN_COMPONENT_W || \
+    SWIZZLE_ZERO != VSF_IN_COMPONENT_ZERO || \
+    SWIZZLE_ONE != VSF_IN_COMPONENT_ONE || \
+    WRITEMASK_X != VSF_FLAG_X || \
+    WRITEMASK_Y != VSF_FLAG_Y || \
+    WRITEMASK_Z != VSF_FLAG_Z || \
+    WRITEMASK_W != VSF_FLAG_W
+#error Cannot change these!
+#endif
+
+#define SCALAR_FLAG (1U<<31)	/* set in op_names[].ip for scalar ops; unsigned, since 1<<31 on a signed int is undefined */
+#define FLAG_MASK (1U<<31)
+#define OP_MASK	(0xf)		/* we are unlikely to have more than 15 */
+#define OPN(operator, ip) {#operator, OPCODE_##operator, ip}
+
+static struct {
+	char *name;		/* opcode mnemonic, stringified by OPN */
+	int opcode;		/* matching OPCODE_* value */
+	unsigned long ip;	/* number of input operands and flags */
+} op_names[] = {
+	/* *INDENT-OFF* */
+	OPN(ABS, 1),
+	OPN(ADD, 2),
+	OPN(ARL, 1 | SCALAR_FLAG),
+	OPN(DP3, 2),
+	OPN(DP4, 2),
+	OPN(DPH, 2),
+	OPN(DST, 2),
+	OPN(EX2, 1 | SCALAR_FLAG),
+	OPN(EXP, 1 | SCALAR_FLAG),
+	OPN(FLR, 1),
+	OPN(FRC, 1),
+	OPN(LG2, 1 | SCALAR_FLAG),
+	OPN(LIT, 1),
+	OPN(LOG, 1 | SCALAR_FLAG),
+	OPN(MAD, 3),
+	OPN(MAX, 2),
+	OPN(MIN, 2),
+	OPN(MOV, 1),
+	OPN(MUL, 2),
+	OPN(POW, 2 | SCALAR_FLAG),
+	OPN(RCP, 1 | SCALAR_FLAG),
+	OPN(RSQ, 1 | SCALAR_FLAG),
+	OPN(SGE, 2),
+	OPN(SLT, 2),
+	OPN(SUB, 2),
+	OPN(SWZ, 1),
+	OPN(XPD, 2),
+	OPN(RCC, 0),	//extra
+	OPN(PRINT, 0),
+	OPN(END, 0)
+	/* *INDENT-ON* */
+};
+
+#undef OPN
+
+int r300VertexProgUpdateParams(GLcontext * ctx,
+			       struct r300_vertex_program_cont *vp, float *dst)
+{
+	/* Copy the program's parameters, four floats each, into dst and
+	 * return the number of floats written.  NV programs take them from
+	 * the context, other programs from their own parameter list.
+	 */
+	struct gl_vertex_program *mesa_vp = &vp->mesa_program;
+	struct gl_program_parameter_list *list;
+	float *out = dst;
+	int param, comp;
+
+	if (mesa_vp->IsNVProgram) {
+		_mesa_load_tracked_matrices(ctx);
+
+		for (param = 0; param < MAX_NV_VERTEX_PROGRAM_PARAMS; param++)
+			for (comp = 0; comp < 4; comp++)
+				*out++ =
+				    ctx->VertexProgram.Parameters[param][comp];
+
+		return out - dst;
+	}
+
+	assert(mesa_vp->Base.Parameters);
+	_mesa_load_state_parameters(ctx, mesa_vp->Base.Parameters);
+	list = mesa_vp->Base.Parameters;
+
+	/* More floats than VSF_MAX_FRAGMENT_LENGTH is fatal. */
+	if (list->NumParameters * 4 > VSF_MAX_FRAGMENT_LENGTH) {
+		fprintf(stderr, "%s:Params exhausted\n", __FUNCTION__);
+		_mesa_exit(-1);
+	}
+
+	for (param = 0; param < list->NumParameters; param++) {
+		const int type = list->Parameters[param].Type;
+
+		if (type != PROGRAM_STATE_VAR &&
+		    type != PROGRAM_NAMED_PARAM &&
+		    type != PROGRAM_CONSTANT) {
+			/* Unexpected kind: report it and emit nothing. */
+			_mesa_problem(NULL, "Bad param type in %s",
+				      __FUNCTION__);
+			continue;
+		}
+
+		for (comp = 0; comp < 4; comp++)
+			*out++ = list->ParameterValues[param][comp];
+	}
+
+	/* Number of floats written. */
+	return out - dst;
+}
+
+static unsigned long t_dst_mask(GLuint mask)
+{
+	const unsigned long hw_mask = mask & VSF_FLAG_ALL;	/* WRITEMASK_* is equivalent to VSF_FLAG_* */
+	return hw_mask;
+}
+
+static unsigned long t_dst_class(enum register_file file)
+{
+	/* Translate a Mesa destination register file into a
+	 * VSF_OUT_CLASS_* code.  Only three files are accepted as
+	 * destinations here.
+	 */
+	if (file == PROGRAM_TEMPORARY)
+		return VSF_OUT_CLASS_TMP;
+
+	if (file == PROGRAM_OUTPUT)
+		return VSF_OUT_CLASS_RESULT;
+
+	if (file == PROGRAM_ADDRESS)
+		return VSF_OUT_CLASS_ADDR;
+
+	/* Any other file is fatal.  The original author's list of such
+	 * files: PROGRAM_INPUT, PROGRAM_LOCAL_PARAM, PROGRAM_ENV_PARAM,
+	 * PROGRAM_NAMED_PARAM, PROGRAM_STATE_VAR, PROGRAM_WRITE_ONLY.
+	 */
+	fprintf(stderr, "problem in %s", __FUNCTION__);
+	_mesa_exit(-1);
+
+	/* Only reached if _mesa_exit() returns. */
+	return -1;
+}
+
+static unsigned long t_dst_index(struct r300_vertex_program *vp,
+				 struct prog_dst_register *dst)
+{
+	/* Outputs are remapped through vp->outputs[]; other files keep their index. */
+	if (dst->File != PROGRAM_OUTPUT)
+		return dst->Index;
+	return vp->outputs[dst->Index];
+}
+
+static unsigned long t_src_class(enum register_file file)
+{
+	/* Translate a Mesa source register file into a VSF_IN_CLASS_* code.
+	 * Temporaries and inputs have their own class; the local, env, named
+	 * and state-variable files all share the parameter class.
+	 */
+	if (file == PROGRAM_TEMPORARY)
+		return VSF_IN_CLASS_TMP;
+
+	if (file == PROGRAM_INPUT)
+		return VSF_IN_CLASS_ATTR;
+
+	if (file == PROGRAM_LOCAL_PARAM || file == PROGRAM_ENV_PARAM ||
+	    file == PROGRAM_NAMED_PARAM || file == PROGRAM_STATE_VAR)
+		return VSF_IN_CLASS_PARAM;
+
+	/* Any other file is fatal.  The original author's list of such
+	 * files: PROGRAM_OUTPUT, PROGRAM_WRITE_ONLY, PROGRAM_ADDRESS.
+	 */
+	fprintf(stderr, "problem in %s", __FUNCTION__);
+
+	_mesa_exit(-1);
+
+	/* Only reached if _mesa_exit() returns. */
+	return -1;
+}
+
+static __inline unsigned long t_swizzle(GLubyte swizzle)
+{
+	/* Identity mapping: the Mesa SWIZZLE_* values are all identical to VSF_IN_COMPONENT_*. */
+	return (unsigned long) swizzle;
+}
+
+#if 0
+static void vp_dump_inputs(struct r300_vertex_program *vp, char *caller)
+{
+	int i;
+	/* Debug aid: print vp->inputs[0..VERT_ATTRIB_MAX-1] to stderr, tagged with the caller's name. */
+	if (vp == NULL) {
+		fprintf(stderr, "vp null in call to %s from %s\n", __FUNCTION__,
+			caller);
+		return;
+	}
+
+	fprintf(stderr, "%s:<", caller);
+	for (i = 0; i < VERT_ATTRIB_MAX; i++)
+		fprintf(stderr, "%d ", vp->inputs[i]);
+	fprintf(stderr, ">\n");
+
+}
+#endif
+
+static unsigned long t_src_index(struct r300_vertex_program *vp,
+				 struct prog_src_register *src)
+{
+	int attr;
+	int highest = -1;
+
+	/* Non-input files use the Mesa index directly. */
+	if (src->File != PROGRAM_INPUT) {
+		if (src->Index >= 0)
+			return src->Index;
+
+		fprintf(stderr,
+			"negative offsets for indirect addressing do not work.\n");
+		return 0;
+	}
+
+	/* This input already has a slot recorded in vp->inputs[]. */
+	if (vp->inputs[src->Index] != -1)
+		return vp->inputs[src->Index];
+
+	/* First use: take the slot after the highest one recorded so far. */
+	for (attr = 0; attr < VERT_ATTRIB_MAX; attr++)
+		if (vp->inputs[attr] > highest)
+			highest = vp->inputs[attr];
+
+	vp->inputs[src->Index] = highest + 1;
+	return vp->inputs[src->Index];
+}
+
+static unsigned long t_src(struct r300_vertex_program *vp,
+			   struct prog_src_register *src)
+{
+	/* Pack a vector source operand.  src->NegateBase uses the NEGATE_ flags
+	 * from program_instruction.h, which equal our VSF_FLAGS_ values, so it is
+	 * passed through unchanged; RelAddr is OR-ed in shifted left by 4. */
+	return MAKE_VSF_SOURCE(t_src_index(vp, src),
+			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
+			       t_swizzle(GET_SWZ(src->Swizzle, 1)),
+			       t_swizzle(GET_SWZ(src->Swizzle, 2)),
+			       t_swizzle(GET_SWZ(src->Swizzle, 3)),
+			       t_src_class(src->File),
+			       src->NegateBase) | (src->RelAddr << 4);
+}
+
+static unsigned long t_src_scalar(struct r300_vertex_program *vp,
+				  struct prog_src_register *src)
+{
+	/* Scalar variant: swizzle component 0 is used for all four selectors; a nonzero NegateBase negates all of them. */
+	return MAKE_VSF_SOURCE(t_src_index(vp, src),
+			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
+			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
+			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
+			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
+			       t_src_class(src->File),
+			       src->
+			       NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+	    (src->RelAddr << 4);
+}
+
+static unsigned long t_opcode(enum prog_opcode opcode)
+{
+	/* Opcodes with a direct R300_VPI_OUT_OP_* counterpart, alphabetically. */
+	switch (opcode) {
+	/* *INDENT-OFF* */
+	case OPCODE_ARL: return R300_VPI_OUT_OP_ARL;
+	case OPCODE_DP4: return R300_VPI_OUT_OP_DOT;
+	case OPCODE_DST: return R300_VPI_OUT_OP_DST;
+	case OPCODE_EX2: return R300_VPI_OUT_OP_EX2;
+	case OPCODE_EXP: return R300_VPI_OUT_OP_EXP;
+	case OPCODE_FRC: return R300_VPI_OUT_OP_FRC;
+	case OPCODE_LG2: return R300_VPI_OUT_OP_LG2;
+	case OPCODE_LOG: return R300_VPI_OUT_OP_LOG;
+	case OPCODE_MAX: return R300_VPI_OUT_OP_MAX;
+	case OPCODE_MIN: return R300_VPI_OUT_OP_MIN;
+	case OPCODE_MUL: return R300_VPI_OUT_OP_MUL;
+	case OPCODE_RCP: return R300_VPI_OUT_OP_RCP;
+	case OPCODE_RSQ: return R300_VPI_OUT_OP_RSQ;
+	case OPCODE_SGE: return R300_VPI_OUT_OP_SGE;
+	case OPCODE_SLT: return R300_VPI_OUT_OP_SLT;
+	/* *INDENT-ON* */
+	default:	/* no direct counterpart: complain and terminate below */
+		break;
+	}
+	fprintf(stderr, "%s: Should not be called with opcode %d!",
+		__FUNCTION__, opcode);
+	_mesa_exit(-1);
+	return 0;
+}
+
+static unsigned long op_operands(enum prog_opcode opcode)
+{
+	const unsigned int count = sizeof(op_names) / sizeof(op_names[0]);
+	unsigned int n;
+
+	/* Linear search: the table is not assumed to be ordered by opcode. */
+	for (n = 0; n < count; n++)
+		if (op_names[n].opcode == opcode)
+			return op_names[n].ip;
+	fprintf(stderr, "op %d not found in op_names\n", opcode);
+	_mesa_exit(-1);
+	return 0;
+}
+
+static GLboolean valid_dst(struct r300_vertex_program *vp,
+			   struct prog_dst_register *dst)
+{
+	/* An output whose vp->outputs[] entry is -1 is not a valid destination. */
+	if (dst->File == PROGRAM_OUTPUT)
+		return (vp->outputs[dst->Index] == -1) ? GL_FALSE : GL_TRUE;
+
+	if (dst->File == PROGRAM_ADDRESS)
+		assert(dst->Index == 0);
+	return GL_TRUE;
+}
+
+/* CMP_SRCS(a, b): nonzero if RelAddr differs, or Index differs while both are PARAM-class or both ATTR-class.  TODO: Get rid of t_src_class call */
+#define CMP_SRCS(a, b) ((a.RelAddr != b.RelAddr) || (a.Index != b.Index && \
+		       ((t_src_class(a.File) == VSF_IN_CLASS_PARAM && \
+			 t_src_class(b.File) == VSF_IN_CLASS_PARAM) || \
+			(t_src_class(a.File) == VSF_IN_CLASS_ATTR && \
+			 t_src_class(b.File) == VSF_IN_CLASS_ATTR))))
+/* ZERO_SRC_n: operand word for src[n] with every swizzle forced to SWIZZLE_ZERO and no negation (expects vp and src in scope). */
+#define ZERO_SRC_0 (MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), \
+				    SWIZZLE_ZERO, SWIZZLE_ZERO, \
+				    SWIZZLE_ZERO, SWIZZLE_ZERO, \
+				    t_src_class(src[0].File), VSF_FLAG_NONE) | (src[0].RelAddr << 4))
+
+#define ZERO_SRC_1 (MAKE_VSF_SOURCE(t_src_index(vp, &src[1]), \
+				    SWIZZLE_ZERO, SWIZZLE_ZERO, \
+				    SWIZZLE_ZERO, SWIZZLE_ZERO, \
+				    t_src_class(src[1].File), VSF_FLAG_NONE) | (src[1].RelAddr << 4))
+
+#define ZERO_SRC_2 (MAKE_VSF_SOURCE(t_src_index(vp, &src[2]), \
+				    SWIZZLE_ZERO, SWIZZLE_ZERO, \
+				    SWIZZLE_ZERO, SWIZZLE_ZERO, \
+				    t_src_class(src[2].File), VSF_FLAG_NONE) | (src[2].RelAddr << 4))
+/* ONE_SRC_n: same as ZERO_SRC_n but every swizzle is forced to SWIZZLE_ONE. */
+#define ONE_SRC_0 (MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), \
+				    SWIZZLE_ONE, SWIZZLE_ONE, \
+				    SWIZZLE_ONE, SWIZZLE_ONE, \
+				    t_src_class(src[0].File), VSF_FLAG_NONE) | (src[0].RelAddr << 4))
+
+#define ONE_SRC_1 (MAKE_VSF_SOURCE(t_src_index(vp, &src[1]), \
+				    SWIZZLE_ONE, SWIZZLE_ONE, \
+				    SWIZZLE_ONE, SWIZZLE_ONE, \
+				    t_src_class(src[1].File), VSF_FLAG_NONE) | (src[1].RelAddr << 4))
+
+#define ONE_SRC_2 (MAKE_VSF_SOURCE(t_src_index(vp, &src[2]), \
+				    SWIZZLE_ONE, SWIZZLE_ONE, \
+				    SWIZZLE_ONE, SWIZZLE_ONE, \
+				    t_src_class(src[2].File), VSF_FLAG_NONE) | (src[2].RelAddr << 4))
+
+/* DP4 version seems to trigger some hw peculiarity */
+//#define PREFER_DP4
+
+#define FREE_TEMPS() \
+	do { /* resets the scratch-temp cursor; needs vp and u_temp_i in scope */ \
+		if(u_temp_i < vp->num_temporaries) { /* scratch temps count down from the top index */ \
+			WARN_ONCE("Ran out of temps, num temps %d, us %d\n", vp->num_temporaries, u_temp_i); \
+			vp->native = GL_FALSE; /* flag the program as non-native */ \
+		} \
+		u_temp_i=VSF_MAX_FRAGMENT_TEMPS-1; /* release all scratch temps */ \
+	} while (0)
+
+static void r300TranslateVertexShader(struct r300_vertex_program *vp,
+				      struct prog_instruction *vpi)
+{
+	int i, cur_reg = 0;	/* cur_reg: next output register index to hand out */
+	VERTEX_SHADER_INSTRUCTION *o_inst;
+	unsigned long operands;
+	int are_srcs_scalar;
+	unsigned long hw_op;
+	/* Initial value should be last tmp reg that hw supports.
+	   Strangely enough r300 doesnt mind even though these would be out of range.
+	   Smart enough to realize that it doesnt need it? */
+	int u_temp_i = VSF_MAX_FRAGMENT_TEMPS - 1;
+	struct prog_src_register src[3];
+	/* Translate the Mesa instruction list at vpi into hardware words in vp->program. */
+	vp->pos_end = 0;	/* Not supported yet */
+	vp->program.length = 0;
+	/*vp->num_temporaries=mesa_vp->Base.NumTemporaries; */
+
+	for (i = 0; i < VERT_ATTRIB_MAX; i++)
+		vp->inputs[i] = -1;
+
+	for (i = 0; i < VERT_RESULT_MAX; i++)
+		vp->outputs[i] = -1;
+
+	assert(vp->key.OutputsWritten & (1 << VERT_RESULT_HPOS));
+
+	/* Assign consecutive output register indices to the results requested in the key */
+	if (vp->key.OutputsWritten & (1 << VERT_RESULT_HPOS))
+		vp->outputs[VERT_RESULT_HPOS] = cur_reg++;
+
+	if (vp->key.OutputsWritten & (1 << VERT_RESULT_PSIZ))
+		vp->outputs[VERT_RESULT_PSIZ] = cur_reg++;
+
+	if (vp->key.OutputsWritten & (1 << VERT_RESULT_COL0))
+		vp->outputs[VERT_RESULT_COL0] = cur_reg++;
+
+	if (vp->key.OutputsWritten & (1 << VERT_RESULT_COL1))
+		vp->outputs[VERT_RESULT_COL1] = cur_reg++;
+
+#if 0				/* Not supported yet */
+	if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC0))
+		vp->outputs[VERT_RESULT_BFC0] = cur_reg++;
+
+	if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC1))
+		vp->outputs[VERT_RESULT_BFC1] = cur_reg++;
+
+	if (vp->key.OutputsWritten & (1 << VERT_RESULT_FOGC))
+		vp->outputs[VERT_RESULT_FOGC] = cur_reg++;
+#endif
+
+	for (i = VERT_RESULT_TEX0; i <= VERT_RESULT_TEX7; i++)
+		if (vp->key.OutputsWritten & (1 << i))
+			vp->outputs[i] = cur_reg++;
+
+	vp->translated = GL_TRUE;
+	vp->native = GL_TRUE;
+	/* Emit instructions; a single Mesa instruction may expand into several hardware ones. */
+	o_inst = vp->program.body.i;
+	for (; vpi->Opcode != OPCODE_END; vpi++, o_inst++) {
+		FREE_TEMPS();
+
+		if (!valid_dst(vp, &vpi->DstReg)) {
+			/* redirect result to unused temp */
+			vpi->DstReg.File = PROGRAM_TEMPORARY;
+			vpi->DstReg.Index = u_temp_i;
+		}
+
+		operands = op_operands(vpi->Opcode);
+		are_srcs_scalar = operands & SCALAR_FLAG;
+		operands &= OP_MASK;
+		/* Local copies, so that sources can be redirected to scratch temps below. */
+		for (i = 0; i < operands; i++)
+			src[i] = vpi->SrcReg[i];
+		/* When CMP_SRCS flags src[2] against src[0]/src[1], copy it to a scratch temp first (ADD with zero). */
+		if (operands == 3) {	/* TODO: scalars */
+			if (CMP_SRCS(src[1], src[2])
+			    || CMP_SRCS(src[0], src[2])) {
+				o_inst->op =
+				    MAKE_VSF_OP(R300_VPI_OUT_OP_ADD, u_temp_i,
+						VSF_FLAG_ALL,
+						VSF_OUT_CLASS_TMP);
+
+				o_inst->src[0] =
+				    MAKE_VSF_SOURCE(t_src_index(vp, &src[2]),
+						    SWIZZLE_X, SWIZZLE_Y,
+						    SWIZZLE_Z, SWIZZLE_W,
+						    t_src_class(src[2].File),
+						    VSF_FLAG_NONE) | (src[2].
+								      RelAddr <<
+								      4);
+
+				o_inst->src[1] = ZERO_SRC_2;
+				o_inst->src[2] = ZERO_SRC_2;
+				o_inst++;
+
+				src[2].File = PROGRAM_TEMPORARY;
+				src[2].Index = u_temp_i;
+				src[2].RelAddr = 0;
+				u_temp_i--;
+			}
+
+		}
+		/* Likewise, when CMP_SRCS flags src[1] against src[0], copy src[0] to a scratch temp. */
+		if (operands >= 2) {
+			if (CMP_SRCS(src[1], src[0])) {
+				o_inst->op =
+				    MAKE_VSF_OP(R300_VPI_OUT_OP_ADD, u_temp_i,
+						VSF_FLAG_ALL,
+						VSF_OUT_CLASS_TMP);
+
+				o_inst->src[0] =
+				    MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+						    SWIZZLE_X, SWIZZLE_Y,
+						    SWIZZLE_Z, SWIZZLE_W,
+						    t_src_class(src[0].File),
+						    VSF_FLAG_NONE) | (src[0].
+								      RelAddr <<
+								      4);
+
+				o_inst->src[1] = ZERO_SRC_0;
+				o_inst->src[2] = ZERO_SRC_0;
+				o_inst++;
+
+				src[0].File = PROGRAM_TEMPORARY;
+				src[0].Index = u_temp_i;
+				src[0].RelAddr = 0;
+				u_temp_i--;
+			}
+		}
+
+		/* These ops need special handling. */
+		switch (vpi->Opcode) {
+		case OPCODE_POW:
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_POW,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+			o_inst->src[0] = t_src_scalar(vp, &src[0]);
+			o_inst->src[1] = ZERO_SRC_0;
+			o_inst->src[2] = t_src_scalar(vp, &src[1]);
+			goto next;
+
+		case OPCODE_MOV:	//ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{} {ZERO ZERO ZERO ZERO}
+		case OPCODE_SWZ:
+#if 1
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_ADD,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+			o_inst->src[0] = t_src(vp, &src[0]);
+			o_inst->src[1] = ZERO_SRC_0;
+			o_inst->src[2] = ZERO_SRC_0;
+#else
+			hw_op =
+			    (src[0].File ==
+			     PROGRAM_TEMPORARY) ? R300_VPI_OUT_OP_MAD_2 :
+			    R300_VPI_OUT_OP_MAD;
+
+			o_inst->op =
+			    MAKE_VSF_OP(hw_op, t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+			o_inst->src[0] = t_src(vp, &src[0]);
+			o_inst->src[1] = ONE_SRC_0;
+			o_inst->src[2] = ZERO_SRC_0;
+#endif
+
+			goto next;
+
+		case OPCODE_ADD:
+#if 1
+			hw_op = (src[0].File == PROGRAM_TEMPORARY &&
+				 src[1].File ==
+				 PROGRAM_TEMPORARY) ? R300_VPI_OUT_OP_MAD_2 :
+			    R300_VPI_OUT_OP_MAD;
+
+			o_inst->op =
+			    MAKE_VSF_OP(hw_op, t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+			o_inst->src[0] = ONE_SRC_0;
+			o_inst->src[1] = t_src(vp, &src[0]);
+			o_inst->src[2] = t_src(vp, &src[1]);
+#else
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_ADD,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+			o_inst->src[0] = t_src(vp, &src[0]);
+			o_inst->src[1] = t_src(vp, &src[1]);
+			o_inst->src[2] = ZERO_SRC_1;
+
+#endif
+			goto next;
+
+		case OPCODE_MAD:
+			hw_op = (src[0].File == PROGRAM_TEMPORARY &&
+				 src[1].File == PROGRAM_TEMPORARY &&
+				 src[2].File ==
+				 PROGRAM_TEMPORARY) ? R300_VPI_OUT_OP_MAD_2 :
+			    R300_VPI_OUT_OP_MAD;
+
+			o_inst->op =
+			    MAKE_VSF_OP(hw_op, t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+			o_inst->src[0] = t_src(vp, &src[0]);
+			o_inst->src[1] = t_src(vp, &src[1]);
+			o_inst->src[2] = t_src(vp, &src[2]);
+			goto next;
+
+		case OPCODE_MUL:	/* HW mul can take third arg but appears to have some other limitations. */
+			hw_op = (src[0].File == PROGRAM_TEMPORARY &&
+				 src[1].File ==
+				 PROGRAM_TEMPORARY) ? R300_VPI_OUT_OP_MAD_2 :
+			    R300_VPI_OUT_OP_MAD;
+
+			o_inst->op =
+			    MAKE_VSF_OP(hw_op, t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+			o_inst->src[0] = t_src(vp, &src[0]);
+			o_inst->src[1] = t_src(vp, &src[1]);
+
+			o_inst->src[2] = ZERO_SRC_1;
+			goto next;
+
+		case OPCODE_DP3:	//DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ZERO} PARAM 0{} {X Y Z ZERO}
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_DOT,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+
+			o_inst->src[0] =
+			    MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 0)),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 1)),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 2)),
+					    SWIZZLE_ZERO,
+					    t_src_class(src[0].File),
+					    src[0].
+					    NegateBase ? VSF_FLAG_XYZ :
+					    VSF_FLAG_NONE) | (src[0].
+							      RelAddr << 4);
+
+			o_inst->src[1] =
+			    MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 0)),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 1)),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 2)),
+					    SWIZZLE_ZERO,
+					    t_src_class(src[1].File),
+					    src[1].
+					    NegateBase ? VSF_FLAG_XYZ :
+					    VSF_FLAG_NONE) | (src[1].
+							      RelAddr << 4);
+
+			o_inst->src[2] = ZERO_SRC_1;
+			goto next;
+
+		case OPCODE_SUB:	//ADD RESULT 1.X Y Z W TMP 0{} {X Y Z W} PARAM 1{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
+#if 1
+			hw_op = (src[0].File == PROGRAM_TEMPORARY &&
+				 src[1].File ==
+				 PROGRAM_TEMPORARY) ? R300_VPI_OUT_OP_MAD_2 :
+			    R300_VPI_OUT_OP_MAD;
+
+			o_inst->op =
+			    MAKE_VSF_OP(hw_op, t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+			o_inst->src[0] = t_src(vp, &src[0]);
+			o_inst->src[1] = ONE_SRC_0;
+			o_inst->src[2] =
+			    MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 0)),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 1)),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 2)),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 3)),
+					    t_src_class(src[1].File),
+					    (!src[1].
+					     NegateBase) ? VSF_FLAG_ALL :
+					    VSF_FLAG_NONE) | (src[1].
+							      RelAddr << 4);
+#else
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_ADD,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+
+			o_inst->src[0] = t_src(vp, &src[0]);
+			o_inst->src[1] =
+			    MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 0)),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 1)),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 2)),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 3)),
+					    t_src_class(src[1].File),
+					    (!src[1].
+					     NegateBase) ? VSF_FLAG_ALL :
+					    VSF_FLAG_NONE) | (src[1].
+							      RelAddr << 4);
+			o_inst->src[2] = 0;
+#endif
+			goto next;
+
+		case OPCODE_ABS:	//MAX RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_MAX,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+
+			o_inst->src[0] = t_src(vp, &src[0]);
+			o_inst->src[1] =
+			    MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 0)),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 1)),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 2)),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 3)),
+					    t_src_class(src[0].File),
+					    (!src[0].
+					     NegateBase) ? VSF_FLAG_ALL :
+					    VSF_FLAG_NONE) | (src[0].
+							      RelAddr << 4);
+			o_inst->src[2] = 0;
+			goto next;
+
+		case OPCODE_FLR:
+			/* FRC TMP 0.X Y Z W PARAM 0{} {X Y Z W}
+			   ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} TMP 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W */
+			/* NOTE(review): t_src() presumably applies NegateBase before FRC; confirm the conditional negate of the temp below. */
+			o_inst->op = MAKE_VSF_OP(R300_VPI_OUT_OP_FRC, u_temp_i,
+						 t_dst_mask(vpi->DstReg.
+							    WriteMask),
+						 VSF_OUT_CLASS_TMP);
+
+			o_inst->src[0] = t_src(vp, &src[0]);
+			o_inst->src[1] = ZERO_SRC_0;
+			o_inst->src[2] = ZERO_SRC_0;
+			o_inst++;
+
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_ADD,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+
+			o_inst->src[0] = t_src(vp, &src[0]);
+			o_inst->src[1] = MAKE_VSF_SOURCE(u_temp_i,
+							 VSF_IN_COMPONENT_X,
+							 VSF_IN_COMPONENT_Y,
+							 VSF_IN_COMPONENT_Z,
+							 VSF_IN_COMPONENT_W,
+							 VSF_IN_CLASS_TMP,
+							 /* Not 100% sure about this */
+							 (!src[0].
+							  NegateBase) ?
+							 VSF_FLAG_ALL :
+							 VSF_FLAG_NONE
+							 /*VSF_FLAG_ALL */ );
+
+			o_inst->src[2] = ZERO_SRC_0;
+			u_temp_i--;
+			goto next;
+
+		case OPCODE_LG2:	// LG2 RESULT 1.X Y Z W PARAM 0{} {X X X X}
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_LG2,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+
+			o_inst->src[0] =
+			    MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 0)),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 0)),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 0)),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 0)),
+					    t_src_class(src[0].File),
+					    src[0].
+					    NegateBase ? VSF_FLAG_ALL :
+					    VSF_FLAG_NONE) | (src[0].
+							      RelAddr << 4);
+			o_inst->src[1] = ZERO_SRC_0;
+			o_inst->src[2] = ZERO_SRC_0;
+			goto next;
+
+		case OPCODE_LIT:	//LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_LIT,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+			/* NOTE: Users swizzling might not work. */
+			o_inst->src[0] = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 0)),	// x
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 3)),	// w
+							 VSF_IN_COMPONENT_ZERO,	// z
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 1)),	// y
+							 t_src_class(src[0].
+								     File),
+							 src[0].
+							 NegateBase ?
+							 VSF_FLAG_ALL :
+							 VSF_FLAG_NONE) |
+			    (src[0].RelAddr << 4);
+			o_inst->src[1] = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 1)),	// y
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 3)),	// w
+							 VSF_IN_COMPONENT_ZERO,	// z
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),	// x
+							 t_src_class(src[0].
+								     File),
+							 src[0].
+							 NegateBase ?
+							 VSF_FLAG_ALL :
+							 VSF_FLAG_NONE) |
+			    (src[0].RelAddr << 4);
+			o_inst->src[2] = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 1)),	// y
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),	// x
+							 VSF_IN_COMPONENT_ZERO,	// z
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 3)),	// w
+							 t_src_class(src[0].
+								     File),
+							 src[0].
+							 NegateBase ?
+							 VSF_FLAG_ALL :
+							 VSF_FLAG_NONE) |
+			    (src[0].RelAddr << 4);
+			goto next;
+
+		case OPCODE_DPH:	//DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ONE} PARAM 0{} {X Y Z W}
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_DOT,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+
+			o_inst->src[0] =
+			    MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 0)),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 1)),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 2)),
+					    VSF_IN_COMPONENT_ONE,
+					    t_src_class(src[0].File),
+					    src[0].
+					    NegateBase ? VSF_FLAG_XYZ :
+					    VSF_FLAG_NONE) | (src[0].
+							      RelAddr << 4);
+			o_inst->src[1] = t_src(vp, &src[1]);
+			o_inst->src[2] = ZERO_SRC_1;
+			goto next;
+
+		case OPCODE_XPD:
+			/* mul r0, r1.yzxw, r2.zxyw
+			   mad r0, -r2.yzxw, r1.zxyw, r0
+			   NOTE: might need MAD_2
+			 */
+
+			o_inst->op = MAKE_VSF_OP(R300_VPI_OUT_OP_MAD, u_temp_i,
+						 t_dst_mask(vpi->DstReg.
+							    WriteMask),
+						 VSF_OUT_CLASS_TMP);
+
+			o_inst->src[0] = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 1)),	// y
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 2)),	// z
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),	// x
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 3)),	// w
+							 t_src_class(src[0].
+								     File),
+							 src[0].
+							 NegateBase ?
+							 VSF_FLAG_ALL :
+							 VSF_FLAG_NONE) |
+			    (src[0].RelAddr << 4);
+
+			o_inst->src[1] = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]), t_swizzle(GET_SWZ(src[1].Swizzle, 2)),	// z
+							 t_swizzle(GET_SWZ(src[1].Swizzle, 0)),	// x
+							 t_swizzle(GET_SWZ(src[1].Swizzle, 1)),	// y
+							 t_swizzle(GET_SWZ(src[1].Swizzle, 3)),	// w
+							 t_src_class(src[1].
+								     File),
+							 src[1].
+							 NegateBase ?
+							 VSF_FLAG_ALL :
+							 VSF_FLAG_NONE) |
+			    (src[1].RelAddr << 4);
+
+			o_inst->src[2] = ZERO_SRC_1;
+			o_inst++;
+			u_temp_i--;
+
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_MAD,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+
+			o_inst->src[0] = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]), t_swizzle(GET_SWZ(src[1].Swizzle, 1)),	// y
+							 t_swizzle(GET_SWZ(src[1].Swizzle, 2)),	// z
+							 t_swizzle(GET_SWZ(src[1].Swizzle, 0)),	// x
+							 t_swizzle(GET_SWZ(src[1].Swizzle, 3)),	// w
+							 t_src_class(src[1].
+								     File),
+							 (!src[1].
+							  NegateBase) ?
+							 VSF_FLAG_ALL :
+							 VSF_FLAG_NONE) |
+			    (src[1].RelAddr << 4);
+
+			o_inst->src[1] = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 2)),	// z
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),	// x
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 1)),	// y
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 3)),	// w
+							 t_src_class(src[0].
+								     File),
+							 src[0].
+							 NegateBase ?
+							 VSF_FLAG_ALL :
+							 VSF_FLAG_NONE) |
+			    (src[0].RelAddr << 4);
+
+			o_inst->src[2] = MAKE_VSF_SOURCE(u_temp_i + 1,
+							 VSF_IN_COMPONENT_X,
+							 VSF_IN_COMPONENT_Y,
+							 VSF_IN_COMPONENT_Z,
+							 VSF_IN_COMPONENT_W,
+							 VSF_IN_CLASS_TMP,
+							 VSF_FLAG_NONE);
+
+			goto next;
+
+		case OPCODE_RCC:
+			fprintf(stderr, "Dont know how to handle op %d yet\n",
+				vpi->Opcode);
+			_mesa_exit(-1);
+			break;
+		case OPCODE_END:
+			break;
+		default:
+			break;
+		}
+		/* Generic path: the opcode is mapped by t_opcode() and sources are filled in by operand count. */
+		o_inst->op =
+		    MAKE_VSF_OP(t_opcode(vpi->Opcode),
+				t_dst_index(vp, &vpi->DstReg),
+				t_dst_mask(vpi->DstReg.WriteMask),
+				t_dst_class(vpi->DstReg.File));
+
+		if (are_srcs_scalar) {
+			switch (operands) {
+			case 1:
+				o_inst->src[0] = t_src_scalar(vp, &src[0]);
+				o_inst->src[1] = ZERO_SRC_0;
+				o_inst->src[2] = ZERO_SRC_0;
+				break;
+
+			case 2:
+				o_inst->src[0] = t_src_scalar(vp, &src[0]);
+				o_inst->src[1] = t_src_scalar(vp, &src[1]);
+				o_inst->src[2] = ZERO_SRC_1;
+				break;
+
+			case 3:
+				o_inst->src[0] = t_src_scalar(vp, &src[0]);
+				o_inst->src[1] = t_src_scalar(vp, &src[1]);
+				o_inst->src[2] = t_src_scalar(vp, &src[2]);
+				break;
+
+			default:
+				fprintf(stderr,
+					"scalars and op RCC not handled yet");
+				_mesa_exit(-1);
+				break;
+			}
+		} else {
+			switch (operands) {
+			case 1:
+				o_inst->src[0] = t_src(vp, &src[0]);
+				o_inst->src[1] = ZERO_SRC_0;
+				o_inst->src[2] = ZERO_SRC_0;
+				break;
+
+			case 2:
+				o_inst->src[0] = t_src(vp, &src[0]);
+				o_inst->src[1] = t_src(vp, &src[1]);
+				o_inst->src[2] = ZERO_SRC_1;
+				break;
+
+			case 3:
+				o_inst->src[0] = t_src(vp, &src[0]);
+				o_inst->src[1] = t_src(vp, &src[1]);
+				o_inst->src[2] = t_src(vp, &src[2]);
+				break;
+
+			default:
+				fprintf(stderr,
+					"scalars and op RCC not handled yet");
+				_mesa_exit(-1);
+				break;
+			}
+		}
+	      next:;
+	}
+	/* NOTE(review): this bound is only tested after every instruction has already been written. */
+	/* Will most likely segfault before we get here... fix later. */
+	if (o_inst - vp->program.body.i >= VSF_MAX_FRAGMENT_LENGTH / 4) {
+		vp->program.length = 0;
+		vp->native = GL_FALSE;
+		return;
+	}
+	vp->program.length = (o_inst - vp->program.body.i) * 4;
+#if 0
+	fprintf(stderr, "hw program:\n");
+	for (i = 0; i < vp->program.length; i++)
+		fprintf(stderr, "%08x\n", vp->program.body.d[i]);
+#endif
+}
+
+static void position_invariant(struct gl_program *prog)
+{
+	struct prog_instruction *vpi;
+	struct gl_program_parameter_list *paramList;
+	int i;
+	/* Prepends four instructions that combine the MVP matrix state with the vertex position. */
+	gl_state_index tokens[STATE_LENGTH] = { STATE_MVP_MATRIX, 0, 0, 0, 0 };
+
+	/* tokens[4] = matrix modifier */
+#ifdef PREFER_DP4
+	tokens[4] = 0;		/* not transposed or inverted */
+#else
+	tokens[4] = STATE_MATRIX_TRANSPOSE;
+#endif
+	paramList = prog->Parameters;
+
+	vpi = _mesa_alloc_instructions(prog->NumInstructions + 4);
+	_mesa_init_instructions(vpi, prog->NumInstructions + 4);
+
+	for (i = 0; i < 4; i++) {
+		GLint idx;
+		tokens[2] = tokens[3] = i;	/* matrix row[i]..row[i] */
+		idx = _mesa_add_state_reference(paramList, tokens);
+#ifdef PREFER_DP4
+		vpi[i].Opcode = OPCODE_DP4;
+		vpi[i].StringPos = 0;
+		vpi[i].Data = 0;
+
+		vpi[i].DstReg.File = PROGRAM_OUTPUT;
+		vpi[i].DstReg.Index = VERT_RESULT_HPOS;
+		vpi[i].DstReg.WriteMask = 1 << i;
+		vpi[i].DstReg.CondMask = COND_TR;
+
+		vpi[i].SrcReg[0].File = PROGRAM_STATE_VAR;
+		vpi[i].SrcReg[0].Index = idx;
+		vpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
+
+		vpi[i].SrcReg[1].File = PROGRAM_INPUT;
+		vpi[i].SrcReg[1].Index = VERT_ATTRIB_POS;
+		vpi[i].SrcReg[1].Swizzle = SWIZZLE_XYZW;
+#else
+		if (i == 0)
+			vpi[i].Opcode = OPCODE_MUL;
+		else
+			vpi[i].Opcode = OPCODE_MAD;
+
+		vpi[i].StringPos = 0;
+		vpi[i].Data = 0;
+
+		if (i == 3)	/* only the last step writes an output; earlier ones accumulate in temp 0 */
+			vpi[i].DstReg.File = PROGRAM_OUTPUT;
+		else
+			vpi[i].DstReg.File = PROGRAM_TEMPORARY;
+		vpi[i].DstReg.Index = 0;	/* index 0 of whichever file was chosen above */
+		vpi[i].DstReg.WriteMask = 0xf;
+		vpi[i].DstReg.CondMask = COND_TR;
+
+		vpi[i].SrcReg[0].File = PROGRAM_STATE_VAR;
+		vpi[i].SrcReg[0].Index = idx;
+		vpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
+
+		vpi[i].SrcReg[1].File = PROGRAM_INPUT;
+		vpi[i].SrcReg[1].Index = VERT_ATTRIB_POS;
+		vpi[i].SrcReg[1].Swizzle = MAKE_SWIZZLE4(i, i, i, i);
+
+		if (i > 0) {
+			vpi[i].SrcReg[2].File = PROGRAM_TEMPORARY;
+			vpi[i].SrcReg[2].Index = 0;
+			vpi[i].SrcReg[2].Swizzle = SWIZZLE_XYZW;
+		}
+#endif
+	}
+	/* i == 4 here: append the original instructions after the new prefix. */
+	_mesa_copy_instructions(&vpi[i], prog->Instructions,
+				prog->NumInstructions);
+	/* Release the old array through Mesa's allocator wrapper rather than raw free(). */
+	_mesa_free(prog->Instructions);
+
+	prog->Instructions = vpi;
+
+	prog->NumInstructions += 4;
+	vpi = &prog->Instructions[prog->NumInstructions - 1];
+	/* The copied program is expected to still end with OPCODE_END. */
+	assert(vpi->Opcode == OPCODE_END);
+}
+
+static void insert_wpos(struct r300_vertex_program *vp,
+			struct gl_program *prog, GLuint temp_index)
+{
+	struct prog_instruction *vpi;
+	struct prog_instruction *vpi_insert;
+	int i = 0;
+	/* Grow the program by two MOVs placed just before the final END. */
+	vpi = _mesa_alloc_instructions(prog->NumInstructions + 2);
+	_mesa_init_instructions(vpi, prog->NumInstructions + 2);
+	/* all but END */
+	_mesa_copy_instructions(vpi, prog->Instructions,
+				prog->NumInstructions - 1);
+	/* END */
+	_mesa_copy_instructions(&vpi[prog->NumInstructions + 1],
+				&prog->Instructions[prog->NumInstructions - 1],
+				1);
+	vpi_insert = &vpi[prog->NumInstructions - 1];
+	/* First MOV: temporary temp_index -> HPOS output. */
+	vpi_insert[i].Opcode = OPCODE_MOV;
+
+	vpi_insert[i].DstReg.File = PROGRAM_OUTPUT;
+	vpi_insert[i].DstReg.Index = VERT_RESULT_HPOS;
+	vpi_insert[i].DstReg.WriteMask = WRITEMASK_XYZW;
+	vpi_insert[i].DstReg.CondMask = COND_TR;
+
+	vpi_insert[i].SrcReg[0].File = PROGRAM_TEMPORARY;
+	vpi_insert[i].SrcReg[0].Index = temp_index;
+	vpi_insert[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
+	i++;
+	/* Second MOV: temporary temp_index -> texcoord output number vp->wpos_idx. */
+	vpi_insert[i].Opcode = OPCODE_MOV;
+
+	vpi_insert[i].DstReg.File = PROGRAM_OUTPUT;
+	vpi_insert[i].DstReg.Index = VERT_RESULT_TEX0 + vp->wpos_idx;
+	vpi_insert[i].DstReg.WriteMask = WRITEMASK_XYZW;
+	vpi_insert[i].DstReg.CondMask = COND_TR;
+
+	vpi_insert[i].SrcReg[0].File = PROGRAM_TEMPORARY;
+	vpi_insert[i].SrcReg[0].Index = temp_index;
+	vpi_insert[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
+	i++;
+	/* Release the old array through Mesa's allocator wrapper rather than raw free(). */
+	_mesa_free(prog->Instructions);
+
+	prog->Instructions = vpi;
+
+	prog->NumInstructions += i;
+	vpi = &prog->Instructions[prog->NumInstructions - 1];
+
+	assert(vpi->Opcode == OPCODE_END);
+}
+
+static void pos_as_texcoord(struct r300_vertex_program *vp,
+			    struct gl_program *prog)
+{
+	/* Claim one new temporary; nothing checks whether any are left. */
+	const GLuint pos_temp = prog->NumTemporaries++;
+	struct prog_instruction *inst = prog->Instructions;
+	/* Redirect every write to the HPOS output into that temporary. */
+	while (inst->Opcode != OPCODE_END) {
+		if (inst->DstReg.File == PROGRAM_OUTPUT &&
+		    inst->DstReg.Index == VERT_RESULT_HPOS) {
+			inst->DstReg.File = PROGRAM_TEMPORARY;
+			inst->DstReg.Index = pos_temp;
+		}
+		inst++;
+	}
+	insert_wpos(vp, prog, pos_temp);
+}
+
+static struct r300_vertex_program *build_program(struct r300_vertex_program_key
+						 *wanted_key, struct gl_vertex_program
+						 *mesa_vp, GLint wpos_idx)
+{
+	struct gl_program *base = &mesa_vp->Base;
+	struct r300_vertex_program *vp = _mesa_calloc(sizeof(*vp));
+
+	/* Byte copy of the key; cached programs are later matched with _mesa_memcmp. */
+	_mesa_memcpy(&vp->key, wanted_key, sizeof(vp->key));
+	vp->wpos_idx = wpos_idx;
+
+	/* Both rewrites below modify the Mesa program in place. */
+	if (mesa_vp->IsPositionInvariant)
+		position_invariant(base);
+	if (wpos_idx >= 0)
+		pos_as_texcoord(vp, base);
+
+	assert(base->NumInstructions);
+
+	/* Read after the rewrites, since pos_as_texcoord adds a temporary. */
+	vp->num_temporaries = base->NumTemporaries;
+
+	/* Instructions is read here because the rewrites replace the array. */
+	r300TranslateVertexShader(vp, base->Instructions);
+	return vp;
+}
+
+void r300SelectVertexShader(r300ContextPtr r300)
+{
+	GLcontext *ctx = r300->radeon.glCtx;
+	GLuint InputsRead;
+	struct r300_vertex_program_key wanted_key = { 0 };
+	GLint i;
+	struct r300_vertex_program_cont *vpc;
+	struct r300_vertex_program *vp;
+	GLint wpos_idx;
+	/* Select, building and caching it if needed, the variant matching the current fragment program inputs. */
+	vpc = (struct r300_vertex_program_cont *)ctx->VertexProgram._Current;
+	InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
+
+	wanted_key.OutputsWritten |= 1 << VERT_RESULT_HPOS;
+	/* If the fragment program reads WPOS, route it through the first texcoord it does not read. */
+	wpos_idx = -1;
+	if (InputsRead & FRAG_BIT_WPOS) {
+		for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
+			if (!(InputsRead & (FRAG_BIT_TEX0 << i)))
+				break;
+
+		if (i == ctx->Const.MaxTextureUnits) {
+			fprintf(stderr, "\tno free texcoord found\n");
+			_mesa_exit(-1);
+		}
+
+		InputsRead |= (FRAG_BIT_TEX0 << i);
+		wpos_idx = i;
+	}
+
+	if (InputsRead & FRAG_BIT_COL0)
+		wanted_key.OutputsWritten |= 1 << VERT_RESULT_COL0;
+
+	if ((InputsRead & FRAG_BIT_COL1)	/*||
+						   (InputsRead & FRAG_BIT_FOGC) */ )
+		wanted_key.OutputsWritten |= 1 << VERT_RESULT_COL1;
+
+	for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
+		if (InputsRead & (FRAG_BIT_TEX0 << i))
+			wanted_key.OutputsWritten |=
+			    1 << (VERT_RESULT_TEX0 + i);
+	/* The key also records the vertex program's own inputs. */
+	wanted_key.InputsRead = vpc->mesa_program.Base.InputsRead;
+	if (vpc->mesa_program.IsPositionInvariant) {
+		/* we want position, don't we? */
+		wanted_key.InputsRead |= (1 << VERT_ATTRIB_POS);
+	}
+	/* Reuse a previously built variant whose key matches byte for byte. */
+	for (vp = vpc->progs; vp; vp = vp->next)
+		if (_mesa_memcmp(&vp->key, &wanted_key, sizeof(wanted_key)) ==
+		    0) {
+			r300->selected_vp = vp;
+			return;
+		}
+	//_mesa_print_program(&vpc->mesa_program.Base);
+	/* No match: build a new variant and push it on the front of the list. */
+	vp = build_program(&wanted_key, &vpc->mesa_program, wpos_idx);
+	vp->next = vpc->progs;
+	vpc->progs = vp;
+	r300->selected_vp = vp;
+}
diff --git a/r300/r300_vertprog.h b/r300/r300_vertprog.h
new file mode 100644
index 0000000..252d5a9
--- /dev/null
+++ b/r300/r300_vertprog.h
@@ -0,0 +1,89 @@
+#ifndef __R300_VERTPROG_H_
+#define __R300_VERTPROG_H_
+
+#include "r300_reg.h"
+
+typedef struct {
+	GLuint op;	/* first dword; built with MAKE_VSF_OP */
+	GLuint src[3];	/* three source dwords; built with MAKE_VSF_SOURCE */
+} VERTEX_SHADER_INSTRUCTION;
+/* Per-component bit flags; passed as the out_reg_fields and negate macro arguments. */
+#define VSF_FLAG_X	1
+#define VSF_FLAG_Y	2
+#define VSF_FLAG_Z	4
+#define VSF_FLAG_W	8
+#define VSF_FLAG_XYZ	(VSF_FLAG_X | VSF_FLAG_Y | VSF_FLAG_Z)
+#define VSF_FLAG_ALL  0xf
+#define VSF_FLAG_NONE  0
+/* Destination register classes (the class argument of MAKE_VSF_OP). */
+#define VSF_OUT_CLASS_TMP	0
+#define VSF_OUT_CLASS_ADDR	1
+#define VSF_OUT_CLASS_RESULT	2
+
+/* first DWORD of an instruction */
+
+/* possible operations: 
+    DOT, MUL, ADD, MAD, FRC, MAX, MIN, SGE, SLT, EXP, LOG, LIT, POW, RCP, RSQ, EX2,
+    LG2, MAD_2 */
+
+#define MAKE_VSF_OP(op, out_reg_index, out_reg_fields, class) \
+	((op) \
+	 | ((out_reg_index) << R300_VPI_OUT_REG_INDEX_SHIFT) \
+	 | ((out_reg_fields) << 20)	/* VSF_FLAG_* component mask */ \
+	 | ((class) << 8))		/* VSF_OUT_CLASS_* value */
+/* Token-pasting shorthand: op, fields and class are given without their prefixes. */
+#define EASY_VSF_OP(op, out_reg_index, out_reg_fields, class) \
+	MAKE_VSF_OP(R300_VPI_OUT_OP_##op, out_reg_index, VSF_FLAG_##out_reg_fields, VSF_OUT_CLASS_##class)
+
+/* according to Nikolai, the subsequent 3 DWORDs are sources, use same define for each */
+
+#define VSF_IN_CLASS_TMP	0	/* source register classes (class argument of MAKE_VSF_SOURCE) */
+#define VSF_IN_CLASS_ATTR	1
+#define VSF_IN_CLASS_PARAM	2
+#define VSF_IN_CLASS_NONE	9
+/* Source component selectors for the comp_x..comp_w arguments of MAKE_VSF_SOURCE. */
+#define VSF_IN_COMPONENT_X	0
+#define VSF_IN_COMPONENT_Y	1
+#define VSF_IN_COMPONENT_Z	2
+#define VSF_IN_COMPONENT_W	3
+#define VSF_IN_COMPONENT_ZERO	4
+#define VSF_IN_COMPONENT_ONE	5
+/* Packs one source operand dword; negate is a VSF_FLAG_* component mask. */
+#define MAKE_VSF_SOURCE(in_reg_index, comp_x, comp_y, comp_z, comp_w, class, negate) \
+	( ((in_reg_index)<<R300_VPI_IN_REG_INDEX_SHIFT) \
+	   | ((comp_x)<<R300_VPI_IN_X_SHIFT) \
+	   | ((comp_y)<<R300_VPI_IN_Y_SHIFT) \
+	   | ((comp_z)<<R300_VPI_IN_Z_SHIFT) \
+	   | ((comp_w)<<R300_VPI_IN_W_SHIFT) \
+	   | ((negate)<<25) | ((class)))
+/* Same, with the component, class and negate names given without their prefixes. */
+#define EASY_VSF_SOURCE(in_reg_index, comp_x, comp_y, comp_z, comp_w, class, negate) \
+	MAKE_VSF_SOURCE(in_reg_index, \
+		VSF_IN_COMPONENT_##comp_x, \
+		VSF_IN_COMPONENT_##comp_y, \
+		VSF_IN_COMPONENT_##comp_z, \
+		VSF_IN_COMPONENT_##comp_w, \
+		VSF_IN_CLASS_##class, VSF_FLAG_##negate)
+
+/* special sources, all built with EASY_VSF_SOURCE: */
+
+/* (1.0,1.0,1.0,1.0) vector: first with class ATTR, then with class NONE */
+#define VSF_ATTR_UNITY(reg) 	EASY_VSF_SOURCE(reg, ONE, ONE, ONE, ONE, ATTR, NONE)
+#define VSF_UNITY(reg) 	EASY_VSF_SOURCE(reg, ONE, ONE, ONE, ONE, NONE, NONE)
+
+/* contents of unmodified ATTR-class register */
+#define VSF_REG(reg) 	EASY_VSF_SOURCE(reg, X, Y, Z, W, ATTR, NONE)
+
+/* contents of unmodified parameter */
+#define VSF_PARAM(reg) 	EASY_VSF_SOURCE(reg, X, Y, Z, W, PARAM, NONE)
+
+/* contents of unmodified temporary register */
+#define VSF_TMP(reg) 	EASY_VSF_SOURCE(reg, X, Y, Z, W, TMP, NONE)
+
+/* single component of an ATTR register, replicated to all four channels */
+#define VSF_ATTR_X(reg) EASY_VSF_SOURCE(reg, X, X, X, X, ATTR, NONE)
+#define VSF_ATTR_Y(reg) EASY_VSF_SOURCE(reg, Y, Y, Y, Y, ATTR, NONE)
+#define VSF_ATTR_Z(reg) EASY_VSF_SOURCE(reg, Z, Z, Z, Z, ATTR, NONE)
+#define VSF_ATTR_W(reg) EASY_VSF_SOURCE(reg, W, W, W, W, ATTR, NONE)
+
+#endif
diff --git a/r300/radeon_context.c b/r300/radeon_context.c
new file mode 100644
index 0000000..e9634b4
--- /dev/null
+++ b/r300/radeon_context.c
@@ -0,0 +1,327 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file radeon_context.c
+ * Common context initialization.
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include <dlfcn.h>
+
+#include "glheader.h"
+#include "imports.h"
+#include "context.h"
+#include "state.h"
+#include "matrix.h"
+#include "framebuffer.h"
+
+#include "drivers/common/driverfuncs.h"
+#include "swrast/swrast.h"
+
+#include "radeon_screen.h"
+#include "radeon_ioctl.h"
+#include "radeon_macros.h"
+#include "radeon_reg.h"
+
+#include "radeon_state.h"
+#include "r300_state.h"
+
+#include "utils.h"
+#include "vblank.h"
+#include "xmlpool.h"		/* for symbolic values of enum-type options */
+
+#define DRIVER_DATE "20060815"
+
+
+/* glGetString() backend: answers GL_VENDOR and GL_RENDERER, and returns
+ * NULL for every other name.
+ */
+static const GLubyte *radeonGetString(GLcontext * ctx, GLenum name)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	static char buffer[128];
+
+	if (name == GL_VENDOR) {
+		return (GLubyte *) (IS_R300_CLASS(radeon->radeonScreen)
+				    ? "DRI R300 Project"
+				    : "Tungsten Graphics, Inc.");
+	}
+
+	if (name == GL_RENDERER) {
+		const int is_r300 = IS_R300_CLASS(radeon->radeonScreen);
+		const char *chipname = is_r300 ? "R300" : "R200";
+		GLuint agp_mode;
+		unsigned offset;
+		int tcl;
+
+		/* PCI cards are reported with AGP mode 0. */
+		if (radeon->radeonScreen->card_type == RADEON_CARD_PCI)
+			agp_mode = 0;
+		else
+			agp_mode = radeon->radeonScreen->AGPMode;
+
+		offset = driGetRendererString(buffer, chipname, DRIVER_DATE,
+					      agp_mode);
+
+		/* R300 class: taken from the chipset's TCL flag; otherwise
+		 * from the TCL_DISABLE fallback bit.
+		 */
+		if (is_r300)
+			tcl = (radeon->radeonScreen->chip_flags &
+			       RADEON_CHIPSET_TCL) != 0;
+		else
+			tcl = !(radeon->TclFallback & RADEON_TCL_FALLBACK_TCL_DISABLE);
+
+		sprintf(&buffer[offset], " %sTCL", tcl ? "" : "NO-");
+
+		return (GLubyte *) buffer;
+	}
+
+	return NULL;
+}
+
+/* Initialize the driver's misc functions.
+ */
+static void radeonInitDriverFuncs(struct dd_function_table *functions)
+{
+	functions->GetString = radeonGetString;
+}
+
+
+/**
+ * Create and initialize all common fields of the context,
+ * including the Mesa context itself.
+ */
+GLboolean radeonInitContext(radeonContextPtr radeon,
+			    struct dd_function_table* functions,
+			    const __GLcontextModes * glVisual,
+			    __DRIcontextPrivate * driContextPriv,
+			    void *sharedContextPrivate)
+{
+	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
+	GLcontext* ctx;
+	GLcontext* shareCtx;
+	int fthrottle_mode;
+
+	/* Fill in additional standard functions. */
+	radeonInitDriverFuncs(functions);
+
+	/* Allocate and initialize the Mesa context */
+	if (sharedContextPrivate)
+		shareCtx = ((radeonContextPtr)sharedContextPrivate)->glCtx;
+	else
+		shareCtx = NULL;
+	radeon->glCtx = _mesa_create_context(glVisual, shareCtx,
+					    functions, (void *)radeon);
+	if (!radeon->glCtx)
+		return GL_FALSE;
+
+	ctx = radeon->glCtx;
+	driContextPriv->driverPrivate = radeon;
+
+	/* DRI fields */
+	radeon->dri.context = driContextPriv;
+	radeon->dri.screen = sPriv;
+	radeon->dri.drawable = NULL;
+	radeon->dri.readable = NULL;
+	radeon->dri.hwContext = driContextPriv->hHWContext;
+	radeon->dri.hwLock = &sPriv->pSAREA->lock;
+	radeon->dri.fd = sPriv->fd;
+	radeon->dri.drmMinor = sPriv->drmMinor;
+
+	radeon->radeonScreen = screen;
+	radeon->sarea = (drm_radeon_sarea_t *) ((GLubyte *) sPriv->pSAREA +
+					       screen->sarea_priv_offset);
+
+	/* Setup IRQs */
+	fthrottle_mode = driQueryOptioni(&radeon->optionCache, "fthrottle_mode");
+	radeon->iw.irq_seq = -1;
+	radeon->irqsEmitted = 0;
+	radeon->do_irqs = (fthrottle_mode == DRI_CONF_FTHROTTLE_IRQS &&
+			  radeon->radeonScreen->irq);
+
+	radeon->do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
+
+	if (!radeon->do_irqs)
+		fprintf(stderr,
+			"IRQ's not enabled, falling back to %s: %d %d\n",
+			radeon->do_usleeps ? "usleeps" : "busy waits",
+			fthrottle_mode, radeon->radeonScreen->irq);
+
+	radeon->vblank_flags = (radeon->radeonScreen->irq != 0)
+	    ? driGetDefaultVBlankFlags(&radeon->optionCache) : VBLANK_FLAG_NO_IRQ;
+
+	(*dri_interface->getUST) (&radeon->swap_ust);
+
+	return GL_TRUE;
+}
+
+
+/**
+ * Cleanup common context fields.
+ * Called by r200DestroyContext/r300DestroyContext
+ */
+void radeonCleanupContext(radeonContextPtr radeon)
+{
+	/* Tear down the Mesa context first.  DriverCtx is deliberately not
+	 * cleared beforehand:
+	 *
+	 *	radeon->glCtx->DriverCtx = NULL;
+	 *
+	 * because _mesa_destroy_context() may call functions that need it.
+	 */
+	_mesa_destroy_context(radeon->glCtx);
+	/* Release the scissor cliprect array if one is allocated. */
+	if (radeon->state.scissor.pClipRects != NULL) {
+		FREE(radeon->state.scissor.pClipRects);
+		radeon->state.scissor.pClipRects = NULL;
+	}
+}
+
+
+/**
+ * Swap front and back buffer.
+ */
+void radeonSwapBuffers(__DRIdrawablePrivate * dPriv)
+{
+	radeonContextPtr radeon;
+	GLcontext *ctx;
+
+	if (!dPriv->driContextPriv || !dPriv->driContextPriv->driverPrivate) {
+		/* XXX this shouldn't be an error but we can't handle it for now */
+		_mesa_problem(NULL, "%s: drawable has no context!",
+			      __FUNCTION__);
+		return;
+	}
+
+	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+	ctx = radeon->glCtx;
+	if (!ctx->Visual.doubleBufferMode)
+		return;	/* visual is not double-buffered: nothing to swap */
+
+	_mesa_notifySwapBuffers(ctx);	/* flush pending rendering commands */
+	if (radeon->doPageFlip)
+		radeonPageFlip(dPriv);
+	else
+		radeonCopyBuffer(dPriv, NULL);
+}
+
+void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
+			 int x, int y, int w, int h )
+{
+	radeonContextPtr radeon;
+	GLcontext *ctx;
+	drm_clip_rect_t rect;
+
+	if (!dPriv->driContextPriv || !dPriv->driContextPriv->driverPrivate) {
+		/* XXX this shouldn't be an error but we can't handle it for now */
+		_mesa_problem(NULL, "%s: drawable has no context!",
+			      __FUNCTION__);
+		return;
+	}
+	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+	ctx = radeon->glCtx;
+	if (!ctx->Visual.doubleBufferMode)
+		return;
+	/* Flip y within the drawable's height, then offset by its position. */
+	rect.x1 = x + dPriv->x;
+	rect.y1 = (dPriv->h - y - h) + dPriv->y;
+	rect.x2 = rect.x1 + w;
+	rect.y2 = rect.y1 + h;
+	_mesa_notifySwapBuffers(ctx);	/* flush pending rendering commands */
+	radeonCopyBuffer(dPriv, &rect);
+}
+
+/* Force the context `c' to be the current context and associate with it
+ * buffer `b'.
+ */
+GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
+			    __DRIdrawablePrivate * driDrawPriv,
+			    __DRIdrawablePrivate * driReadPriv)
+{
+	if (driContextPriv) {
+		radeonContextPtr radeon =
+			(radeonContextPtr) driContextPriv->driverPrivate;
+
+		if (RADEON_DEBUG & DEBUG_DRI)
+			fprintf(stderr, "%s ctx %p\n", __FUNCTION__,
+				radeon->glCtx);
+
+		if (radeon->dri.drawable != driDrawPriv) {
+			driDrawableInitVBlank(driDrawPriv,
+					      radeon->vblank_flags,
+					      &radeon->vbl_seq);
+		}
+
+		radeon->dri.readable = driReadPriv;
+
+		if (radeon->dri.drawable != driDrawPriv ||
+		    radeon->lastStamp != driDrawPriv->lastStamp) {
+			radeon->dri.drawable = driDrawPriv;
+
+			radeonSetCliprects(radeon);
+			r300UpdateViewportOffset(radeon->glCtx);
+		}
+
+		_mesa_make_current(radeon->glCtx,
+				    (GLframebuffer *) driDrawPriv->
+				    driverPrivate,
+				    (GLframebuffer *) driReadPriv->
+				    driverPrivate);
+
+		_mesa_update_state(radeon->glCtx);		
+
+		radeonUpdatePageFlipping(radeon);
+	} else {
+		if (RADEON_DEBUG & DEBUG_DRI)
+			fprintf(stderr, "%s ctx is null\n", __FUNCTION__);
+		_mesa_make_current(0, 0, 0);
+	}
+
+	if (RADEON_DEBUG & DEBUG_DRI)
+		fprintf(stderr, "End %s\n", __FUNCTION__);
+	return GL_TRUE;
+}
+
+/* Force the context `c' to be unbound from its buffer.
+ */
+GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv)
+{
+	radeonContextPtr radeon = (radeonContextPtr) driContextPriv->driverPrivate;
+
+	if (RADEON_DEBUG & DEBUG_DRI)
+		fprintf(stderr, "%s ctx %p\n", __FUNCTION__,
+			radeon->glCtx);
+
+	return GL_TRUE;
+}
+
diff --git a/r300/radeon_context.h b/r300/radeon_context.h
new file mode 100644
index 0000000..2f23941
--- /dev/null
+++ b/r300/radeon_context.h
@@ -0,0 +1,246 @@
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Gareth Hughes <gareth@valinux.com>
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *   Kevin E. Martin <martin@valinux.com>
+ *   Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __RADEON_CONTEXT_H__
+#define __RADEON_CONTEXT_H__
+
+#include "mtypes.h"
+#include "radeon_screen.h"
+#include "drm.h"
+#include "dri_util.h"
+#include "colormac.h"
+
+struct radeon_context;
+typedef struct radeon_context radeonContextRec;
+typedef struct radeon_context *radeonContextPtr;
+
+#define TEX_0   0x1
+#define TEX_1   0x2
+#define TEX_2	0x4
+#define TEX_3	0x8
+#define TEX_4	0x10
+#define TEX_5	0x20
+#define TEX_6	0x40
+#define TEX_7	0x80
+#define TEX_ALL 0xff
+
+/* Rasterizing fallbacks */
+/* See corresponding strings in r200_swtcl.c */
+#define RADEON_FALLBACK_TEXTURE		0x0001
+#define RADEON_FALLBACK_DRAW_BUFFER	0x0002
+#define RADEON_FALLBACK_STENCIL		0x0004
+#define RADEON_FALLBACK_RENDER_MODE	0x0008
+#define RADEON_FALLBACK_BLEND_EQ	0x0010
+#define RADEON_FALLBACK_BLEND_FUNC	0x0020
+#define RADEON_FALLBACK_DISABLE		0x0040
+#define RADEON_FALLBACK_BORDER_MODE	0x0080
+
+#if R200_MERGED
+extern void radeonFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
+/* Merged build: forward to radeonFallback(); otherwise only log line:file. */
+#define FALLBACK( radeon, bit, mode ) do {			\
+   if ( 0 ) fprintf( stderr, "FALLBACK in %s: #%d=%d\n",	\
+		     __FUNCTION__, bit, mode );			\
+   radeonFallback( (radeon)->glCtx, bit, mode );		\
+} while (0)
+#else
+#define FALLBACK( radeon, bit, mode ) fprintf(stderr, "%d:%s\n", __LINE__, __FILE__);
+#endif
+
+/* TCL fallbacks */
+extern void radeonTclFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
+
+#define RADEON_TCL_FALLBACK_RASTER		0x0001	/* rasterization */
+#define RADEON_TCL_FALLBACK_UNFILLED		0x0002	/* unfilled tris */
+#define RADEON_TCL_FALLBACK_LIGHT_TWOSIDE	0x0004	/* twoside tris */
+#define RADEON_TCL_FALLBACK_MATERIAL		0x0008	/* material in vb */
+#define RADEON_TCL_FALLBACK_TEXGEN_0		0x0010	/* texgen, unit 0 */
+#define RADEON_TCL_FALLBACK_TEXGEN_1		0x0020	/* texgen, unit 1 */
+#define RADEON_TCL_FALLBACK_TEXGEN_2		0x0040	/* texgen, unit 2 */
+#define RADEON_TCL_FALLBACK_TEXGEN_3		0x0080	/* texgen, unit 3 */
+#define RADEON_TCL_FALLBACK_TEXGEN_4		0x0100	/* texgen, unit 4 */
+#define RADEON_TCL_FALLBACK_TEXGEN_5		0x0200	/* texgen, unit 5 */
+#define RADEON_TCL_FALLBACK_TCL_DISABLE		0x0400	/* user disable */
+#define RADEON_TCL_FALLBACK_BITMAP		0x0800	/* draw bitmap with points */
+#define RADEON_TCL_FALLBACK_VERTEX_PROGRAM	0x1000	/* vertex program active */
+
+#if R200_MERGED
+#define TCL_FALLBACK( ctx, bit, mode )	radeonTclFallback( ctx, bit, mode )
+#else
+#define TCL_FALLBACK( ctx, bit, mode )	;
+#endif
+
+struct radeon_dri_mirror {
+	__DRIcontextPrivate *context;	/* DRI context */
+	__DRIscreenPrivate *screen;	/* DRI screen */
+	/**
+	 * DRI drawable bound to this context for drawing.
+	 */
+	__DRIdrawablePrivate *drawable;
+
+	/**
+	 * DRI drawable bound to this context for reading.
+	 */
+	__DRIdrawablePrivate *readable;
+
+	drm_context_t hwContext;
+	drm_hw_lock_t *hwLock;
+	int fd;
+	int drmMinor;
+};
+
+/**
+ * Derived state for internal purposes.
+ */
+struct radeon_scissor_state {
+	drm_clip_rect_t rect;
+	GLboolean enabled;
+
+	GLuint numClipRects;	/* Cliprects active */
+	GLuint numAllocedClipRects;	/* Cliprects available */
+	drm_clip_rect_t *pClipRects;
+};
+
+struct radeon_colorbuffer_state {
+	GLuint clear;
+	GLint drawOffset, drawPitch;
+};
+
+struct radeon_state {
+	struct radeon_colorbuffer_state color;
+	struct radeon_scissor_state scissor;
+};
+
+/**
+ * Common per-context variables shared by R200 and R300.
+ * R200- and R300-specific code "derive" their own context from this
+ * structure.
+ */
+struct radeon_context {
+	GLcontext *glCtx;	/* Mesa context */
+	radeonScreenPtr radeonScreen;	/* Screen private DRI data */
+
+	/* Fallback state */
+	GLuint Fallback;
+	GLuint TclFallback;
+
+	/* Page flipping */
+	GLuint doPageFlip;
+
+	/* Drawable, cliprect and scissor information */
+	GLuint numClipRects;	/* Cliprects for the draw buffer */
+	drm_clip_rect_t *pClipRects;
+	unsigned int lastStamp;
+	GLboolean lost_context;
+	drm_radeon_sarea_t *sarea;	/* Private SAREA data */
+
+	/* Mirrors of some DRI state */
+	struct radeon_dri_mirror dri;
+
+	/* Busy waiting */
+	GLuint do_usleeps;
+	GLuint do_irqs;
+	GLuint irqsEmitted;
+	drm_radeon_irq_wait_t iw;
+
+	/* VBI / buffer swap */
+	GLuint vbl_seq;
+	GLuint vblank_flags;
+
+	int64_t swap_ust;
+	int64_t swap_missed_ust;
+
+	GLuint swap_count;
+	GLuint swap_missed_count;
+
+	/* Derived state */
+	struct radeon_state state;
+
+	/* Configuration cache
+	 */
+	driOptionCache optionCache;
+};
+
+#define RADEON_CONTEXT(glctx) ((radeonContextPtr)((glctx)->DriverCtx))	/* GLcontext * -> its DriverCtx as a radeon context */
+
+extern void radeonSwapBuffers(__DRIdrawablePrivate * dPriv);
+extern void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
+				int x, int y, int w, int h);
+extern GLboolean radeonInitContext(radeonContextPtr radeon,
+				   struct dd_function_table *functions,
+				   const __GLcontextModes * glVisual,
+				   __DRIcontextPrivate * driContextPriv,
+				   void *sharedContextPrivate);
+extern void radeonCleanupContext(radeonContextPtr radeon);
+extern GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
+				   __DRIdrawablePrivate * driDrawPriv,
+				   __DRIdrawablePrivate * driReadPriv);
+extern GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv);
+
+/* ================================================================
+ * Debugging:
+ */
+#define DO_DEBUG		1
+
+#if DO_DEBUG
+extern int RADEON_DEBUG;
+#else
+#define RADEON_DEBUG		0
+#endif
+
+#define DEBUG_TEXTURE	0x0001
+#define DEBUG_STATE	0x0002
+#define DEBUG_IOCTL	0x0004
+#define DEBUG_PRIMS	0x0008
+#define DEBUG_VERTS	0x0010
+#define DEBUG_FALLBACKS	0x0020
+#define DEBUG_VFMT	0x0040
+#define DEBUG_CODEGEN	0x0080
+#define DEBUG_VERBOSE	0x0100
+#define DEBUG_DRI       0x0200
+#define DEBUG_DMA       0x0400
+#define DEBUG_SANITY    0x0800
+#define DEBUG_SYNC      0x1000
+#define DEBUG_PIXEL     0x2000
+#define DEBUG_MEMORY    0x4000
+
+#endif				/* __RADEON_CONTEXT_H__ */
diff --git a/r300/radeon_ioctl.c b/r300/radeon_ioctl.c
new file mode 100644
index 0000000..0b8656b
--- /dev/null
+++ b/r300/radeon_ioctl.c
@@ -0,0 +1,394 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include <sched.h>
+#include <errno.h>
+
+#include "glheader.h"
+#include "imports.h"
+#include "macros.h"
+#include "context.h"
+#include "swrast/swrast.h"
+#include "r300_context.h"
+#include "radeon_ioctl.h"
+#include "r300_ioctl.h"
+#include "r300_state.h"
+#include "radeon_reg.h"
+
+#include "drirenderbuffer.h"
+#include "vblank.h"
+
+static void radeonWaitForIdle(radeonContextPtr radeon);
+
+/* ================================================================
+ * SwapBuffers with client-side throttling
+ */
+
+static uint32_t radeonGetLastFrame(radeonContextPtr radeon)
+{
+	drm_radeon_getparam_t gp;
+	int ret;
+	uint32_t frame;
+
+	gp.param = RADEON_PARAM_LAST_FRAME;
+	gp.value = (int *)&frame;
+	ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_GETPARAM,
+				  &gp, sizeof(gp));
+	if (ret) {
+		fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__,
+			ret);
+		exit(1);
+	}
+
+	return frame;
+}
+
+uint32_t radeonGetAge(radeonContextPtr radeon)
+{
+	drm_radeon_getparam_t gp;
+	int ret;
+	uint32_t age;
+
+	gp.param = RADEON_PARAM_LAST_CLEAR;
+	gp.value = (int *)&age;
+	ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_GETPARAM,
+				  &gp, sizeof(gp));
+	if (ret) {
+		fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__,
+			ret);
+		exit(1);
+	}
+
+	return age;
+}
+
+static void radeonEmitIrqLocked(radeonContextPtr radeon)
+{
+	drm_radeon_irq_emit_t ie;
+	int ret;
+
+	ie.irq_seq = &radeon->iw.irq_seq;
+	ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_IRQ_EMIT,
+				  &ie, sizeof(ie));
+	if (ret) {
+		fprintf(stderr, "%s: drmRadeonIrqEmit: %d\n", __FUNCTION__,
+			ret);
+		exit(1);
+	}
+}
+
+static void radeonWaitIrq(radeonContextPtr radeon)
+{
+	int ret;
+
+	do {
+		ret = drmCommandWrite(radeon->dri.fd, DRM_RADEON_IRQ_WAIT,
+				      &radeon->iw, sizeof(radeon->iw));
+	} while (ret && (errno == EINTR || errno == EBUSY));
+
+	if (ret) {
+		fprintf(stderr, "%s: drmRadeonIrqWait: %d\n", __FUNCTION__,
+			ret);
+		exit(1);
+	}
+}
+
+static void radeonWaitForFrameCompletion(radeonContextPtr radeon)
+{
+	drm_radeon_sarea_t *sarea = radeon->sarea;
+
+	if (radeon->do_irqs) {
+		if (radeonGetLastFrame(radeon) < sarea->last_frame) {
+			if (!radeon->irqsEmitted) {
+				while (radeonGetLastFrame(radeon) <
+				       sarea->last_frame) ;
+			} else {
+				UNLOCK_HARDWARE(radeon);
+				radeonWaitIrq(radeon);
+				LOCK_HARDWARE(radeon);
+			}
+			radeon->irqsEmitted = 10;
+		}
+
+		if (radeon->irqsEmitted) {
+			radeonEmitIrqLocked(radeon);
+			radeon->irqsEmitted--;
+		}
+	} else {
+		while (radeonGetLastFrame(radeon) < sarea->last_frame) {
+			UNLOCK_HARDWARE(radeon);
+			if (radeon->do_usleeps)
+				DO_USLEEP(1);
+			LOCK_HARDWARE(radeon);
+		}
+	}
+}
+
+/* Copy the back color buffer to the front color buffer.
+ *
+ * With rect == NULL every cliprect is copied after waiting for vblank and
+ * the swap statistics are updated; otherwise only the part inside rect.
+ */
+void radeonCopyBuffer(const __DRIdrawablePrivate * dPriv,
+		      const drm_clip_rect_t	 * rect)
+{
+	radeonContextPtr radeon;
+	GLint nbox, i, ret;
+	GLboolean missed_target;
+	int64_t ust;
+
+	assert(dPriv);
+	assert(dPriv->driContextPriv);
+	assert(dPriv->driContextPriv->driverPrivate);
+
+	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+
+	if (RADEON_DEBUG & DEBUG_IOCTL) {
+		fprintf(stderr, "\n%s( %p )\n\n", __FUNCTION__,
+			(void *)radeon->glCtx);
+	}
+
+	r300Flush(radeon->glCtx);
+
+	LOCK_HARDWARE(radeon);
+
+	/* Throttle the frame rate -- only allow one pending swap buffers
+	 * request at a time.
+	 */
+	radeonWaitForFrameCompletion(radeon);
+	if (!rect)
+	{
+	    UNLOCK_HARDWARE(radeon);
+	    driWaitForVBlank(dPriv, &radeon->vbl_seq, radeon->vblank_flags,
+			     &missed_target);
+	    LOCK_HARDWARE(radeon);
+	}
+
+	nbox = dPriv->numClipRects;	/* must be in locked region */
+
+	for (i = 0; i < nbox;) {
+		GLint nr = MIN2(i + RADEON_NR_SAREA_CLIPRECTS, nbox);
+		drm_clip_rect_t *box = dPriv->pClipRects;
+		drm_clip_rect_t *b = radeon->sarea->boxes;
+		GLint n = 0;	/* boxes actually stored in the SAREA */
+		for ( ; i < nr ; i++ ) {
+		    *b = box[i];
+		    if (rect)
+		    {
+			if (rect->x1 > b->x1)
+			    b->x1 = rect->x1;
+			if (rect->y1 > b->y1)
+			    b->y1 = rect->y1;
+			if (rect->x2 < b->x2)
+			    b->x2 = rect->x2;
+			if (rect->y2 < b->y2)
+			    b->y2 = rect->y2;
+
+			/* Empty intersection: reuse this slot, don't count it. */
+			if (b->x1 >= b->x2 || b->y1 >= b->y2)
+			    continue;
+		    }
+
+		    b++;
+		    n++;
+		}
+		radeon->sarea->nbox = n;
+		if (!n)
+			continue;	/* nothing of this batch is visible */
+		ret = drmCommandNone(radeon->dri.fd, DRM_RADEON_SWAP);
+		if (ret) {
+			fprintf(stderr, "DRM_RADEON_SWAP: return = %d\n",
+				ret);
+			UNLOCK_HARDWARE(radeon);
+			exit(1);
+		}
+	}
+
+	UNLOCK_HARDWARE(radeon);
+	if (!rect)
+	{
+	    ((r300ContextPtr)radeon)->hw.all_dirty = GL_TRUE;
+
+	    radeon->swap_count++;
+	    (*dri_interface->getUST) (&ust);
+	    if (missed_target) {
+		radeon->swap_missed_count++;
+		radeon->swap_missed_ust = ust - radeon->swap_ust;
+	    }
+
+	    radeon->swap_ust = ust;
+
+	    sched_yield();
+	}
+}
+
+void radeonPageFlip(const __DRIdrawablePrivate * dPriv)
+{
+	radeonContextPtr radeon;
+	GLint ret;
+	GLboolean missed_target;
+
+	assert(dPriv);
+	assert(dPriv->driContextPriv);
+	assert(dPriv->driContextPriv->driverPrivate);
+
+	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+
+	if (RADEON_DEBUG & DEBUG_IOCTL) {
+		fprintf(stderr, "%s: pfCurrentPage: %d\n", __FUNCTION__,
+			radeon->sarea->pfCurrentPage);
+	}
+
+	r300Flush(radeon->glCtx);
+	LOCK_HARDWARE(radeon);
+
+	if (!dPriv->numClipRects) {
+		UNLOCK_HARDWARE(radeon);
+		usleep(10000);	/* throttle invisible client 10ms */
+		return;
+	}
+
+	/* Need to do this for the perf box placement:
+	 */
+	{
+		drm_clip_rect_t *box = dPriv->pClipRects;
+		drm_clip_rect_t *b = radeon->sarea->boxes;
+		b[0] = box[0];
+		radeon->sarea->nbox = 1;
+	}
+
+	/* Throttle the frame rate -- only allow a few pending swap buffers
+	 * request at a time.
+	 */
+	radeonWaitForFrameCompletion(radeon);
+	UNLOCK_HARDWARE(radeon);
+	driWaitForVBlank(dPriv, &radeon->vbl_seq, radeon->vblank_flags,
+			 &missed_target);
+	if (missed_target) {
+		radeon->swap_missed_count++;
+		(void)(*dri_interface->getUST) (&radeon->swap_missed_ust);
+	}
+	LOCK_HARDWARE(radeon);
+
+	ret = drmCommandNone(radeon->dri.fd, DRM_RADEON_FLIP);
+
+	UNLOCK_HARDWARE(radeon);
+
+	if (ret) {
+		fprintf(stderr, "DRM_RADEON_FLIP: return = %d\n", ret);
+		exit(1);
+	}
+
+	radeon->swap_count++;
+	(void)(*dri_interface->getUST) (&radeon->swap_ust);
+
+        driFlipRenderbuffers(radeon->glCtx->WinSysDrawBuffer, 
+                             radeon->sarea->pfCurrentPage);
+
+	if (radeon->sarea->pfCurrentPage == 1) {
+		radeon->state.color.drawOffset = radeon->radeonScreen->frontOffset;
+		radeon->state.color.drawPitch = radeon->radeonScreen->frontPitch;
+	} else {
+		radeon->state.color.drawOffset = radeon->radeonScreen->backOffset;
+		radeon->state.color.drawPitch = radeon->radeonScreen->backPitch;
+	}
+
+	if (IS_R300_CLASS(radeon->radeonScreen)) {
+		r300ContextPtr r300 = (r300ContextPtr)radeon;
+		R300_STATECHANGE(r300, cb);
+		r300->hw.cb.cmd[R300_CB_OFFSET] = r300->radeon.state.color.drawOffset + 
+						r300->radeon.radeonScreen->fbLocation;
+		r300->hw.cb.cmd[R300_CB_PITCH] = r300->radeon.state.color.drawPitch;
+		
+		if (r300->radeon.radeonScreen->cpp == 4)
+			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_ARGB8888;
+		else
+			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_RGB565;
+	
+		if (r300->radeon.sarea->tiling_enabled)
+			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
+	}
+}
+
+void radeonWaitForIdleLocked(radeonContextPtr radeon)
+{
+	int attempt, ret = 0;
+
+	/* Issue CP_IDLE at most 100 times, sleeping after each failure. */
+	for (attempt = 0; attempt < 100; attempt++) {
+		ret = drmCommandNone(radeon->dri.fd, DRM_RADEON_CP_IDLE);
+		if (!ret)
+			break;
+		DO_USLEEP(1);
+	}
+	if (ret < 0) {
+		UNLOCK_HARDWARE(radeon);
+		fprintf(stderr, "Error: R300 timed out... exiting\n");
+		exit(-1);
+	}
+}
+
+static void radeonWaitForIdle(radeonContextPtr radeon)
+{
+	LOCK_HARDWARE(radeon);
+	radeonWaitForIdleLocked(radeon);
+	UNLOCK_HARDWARE(radeon);
+}
+
+void radeonFlush(GLcontext * ctx)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+
+	if (IS_R300_CLASS(radeon->radeonScreen))
+		r300Flush(ctx);
+}
+
+
+/* Flush everything queued for the hardware, then block until it is done:
+ * via an emitted IRQ when do_irqs is set, otherwise by waiting for idle.
+ */
+void radeonFinish(GLcontext * ctx)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+
+	radeonFlush(ctx);
+	if (!radeon->do_irqs) {
+		radeonWaitForIdle(radeon);
+		return;
+	}
+	LOCK_HARDWARE(radeon);
+	radeonEmitIrqLocked(radeon);
+	UNLOCK_HARDWARE(radeon);
+	radeonWaitIrq(radeon);
+}
diff --git a/r300/radeon_ioctl.h b/r300/radeon_ioctl.h
new file mode 100644
index 0000000..3a80d36
--- /dev/null
+++ b/r300/radeon_ioctl.h
@@ -0,0 +1,57 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef __RADEON_IOCTL_H__
+#define __RADEON_IOCTL_H__
+
+#include "simple_list.h"
+#include "radeon_dri.h"
+#include "radeon_lock.h"
+
+#include "xf86drm.h"
+#include "drm.h"
+#if 0
+#include "r200_context.h"
+#endif
+#include "radeon_drm.h"
+
+extern void radeonCopyBuffer(const __DRIdrawablePrivate * drawable,
+			     const drm_clip_rect_t	* rect);
+extern void radeonPageFlip(const __DRIdrawablePrivate * drawable);
+extern void radeonFlush(GLcontext * ctx);
+extern void radeonFinish(GLcontext * ctx);
+extern void radeonWaitForIdleLocked(radeonContextPtr radeon);
+extern uint32_t radeonGetAge(radeonContextPtr radeon);
+
+#endif				/* __RADEON_IOCTL_H__ */
diff --git a/r300/radeon_lock.c b/r300/radeon_lock.c
new file mode 100644
index 0000000..bc3c2d6
--- /dev/null
+++ b/r300/radeon_lock.c
@@ -0,0 +1,137 @@
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Gareth Hughes <gareth@valinux.com>
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *   Kevin E. Martin <martin@valinux.com>
+ */
+
+#include "radeon_lock.h"
+#include "radeon_ioctl.h"
+#include "radeon_state.h"
+#include "r300_context.h"
+#include "r300_state.h"
+
+#include "framebuffer.h"
+
+#include "drirenderbuffer.h"
+
+#if DEBUG_LOCKING
+char *prevLockFile = NULL;
+int prevLockLine = 0;
+#endif
+
+/* Mirror the sarea page-flip flags into the context and point colour
+ * drawing at whichever of the front/back buffers should receive it. */
+void radeonUpdatePageFlipping(radeonContextPtr rmesa)
+{
+	int use_back = 1;	/* default when no draw buffer is bound */
+
+	rmesa->doPageFlip = rmesa->sarea->pfState;
+	if (rmesa->glCtx->WinSysDrawBuffer) {
+		driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
+				     rmesa->sarea->pfCurrentPage);
+		r300UpdateDrawBuffer(rmesa->glCtx);
+	}
+
+	if (rmesa->glCtx->DrawBuffer)
+		use_back = (rmesa->glCtx->DrawBuffer->_ColorDrawBufferMask[0] ==
+			    BUFFER_BIT_BACK_LEFT);
+	if (rmesa->sarea->pfCurrentPage == 1)
+		use_back = !use_back;	/* page 1 swaps front and back */
+	if (!use_back) {
+		rmesa->state.color.drawOffset =
+		    rmesa->radeonScreen->frontOffset;
+		rmesa->state.color.drawPitch =
+		    rmesa->radeonScreen->frontPitch;
+	} else {
+		rmesa->state.color.drawOffset =
+		    rmesa->radeonScreen->backOffset;
+		rmesa->state.color.drawPitch = rmesa->radeonScreen->backPitch;
+	}
+}
+
+/* Slow path of LOCK_HARDWARE: take the lock with drmGetLock() (flags is
+ * passed straight through), then resynchronise.  This is called if
+ * another context, which includes the X server, has grabbed the hardware
+ * lock.  It also updates the driver's window state after the X server
+ * moves, resizes or restacks a window -- the change is reflected in the
+ * drawable position and clip rects.  Since the X server grabs the lock
+ * when it changes the window state, this runs after any such change.
+ */
+void radeonGetLock(radeonContextPtr rmesa, GLuint flags)
+{
+	__DRIdrawablePrivate *const drawable = rmesa->dri.drawable;
+	__DRIdrawablePrivate *const readable = rmesa->dri.readable;
+	__DRIscreenPrivate *sPriv = rmesa->dri.screen;
+	drm_radeon_sarea_t *sarea = rmesa->sarea;
+	r300ContextPtr r300 = (r300ContextPtr) rmesa;
+
+	assert(drawable != NULL);
+
+	drmGetLock(rmesa->dri.fd, rmesa->dri.hwContext, flags);
+
+	/* The window might have moved, so we might need to get new clip
+	 * rects.
+	 *
+	 * NOTE: This releases and regrabs the hw lock to allow the X server
+	 * to respond to the DRI protocol request for new drawable info.
+	 * Since the hardware state depends on having the latest drawable
+	 * clip rects, all state checking must be done _after_ this call.
+	 */
+	DRI_VALIDATE_DRAWABLE_INFO(sPriv, drawable);
+	if (drawable != readable) {
+		DRI_VALIDATE_DRAWABLE_INFO(sPriv, readable);
+	}
+	/* Stamp mismatch: the drawable changed, refresh dependent state. */
+	if (rmesa->lastStamp != drawable->lastStamp) {
+		radeonUpdatePageFlipping(rmesa);
+		radeonSetCliprects(rmesa);
+		r300UpdateViewportOffset(rmesa->glCtx);
+		driUpdateFramebufferSize(rmesa->glCtx, drawable);
+	}
+	/* sarea names a different owner: claim it and age every texture heap. */
+	if (sarea->ctx_owner != rmesa->dri.hwContext) {
+		int i;
+
+		sarea->ctx_owner = rmesa->dri.hwContext;
+		for (i = 0; i < r300->nr_heaps; i++) {
+			DRI_AGE_TEXTURES(r300->texture_heaps[i]);
+		}
+	}
+
+	rmesa->lost_context = GL_TRUE;
+}
diff --git a/r300/radeon_lock.h b/r300/radeon_lock.h
new file mode 100644
index 0000000..c47adc9
--- /dev/null
+++ b/r300/radeon_lock.h
@@ -0,0 +1,118 @@
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Gareth Hughes <gareth@valinux.com>
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *   Kevin E. Martin <martin@valinux.com>
+ */
+
+#ifndef __RADEON_LOCK_H__
+#define __RADEON_LOCK_H__
+
+#if 0
+#include "r200_ioctl.h"
+#endif
+#include "radeon_context.h"
+
+extern void radeonGetLock(radeonContextPtr rmesa, GLuint flags);
+extern void radeonUpdatePageFlipping(radeonContextPtr rmesa);
+
+/* Turn DEBUG_LOCKING on to find locking conflicts.
+ */
+#define DEBUG_LOCKING	0
+
+#if DEBUG_LOCKING
+extern char *prevLockFile;
+extern int prevLockLine;
+
+#define DEBUG_LOCK()							\
+   do {									\
+      prevLockFile = (__FILE__);					\
+      prevLockLine = (__LINE__);					\
+   } while (0)
+
+#define DEBUG_RESET()							\
+   do {									\
+      prevLockFile = 0;							\
+      prevLockLine = 0;							\
+   } while (0)
+
+#define DEBUG_CHECK_LOCK()						\
+   do {									\
+      if (prevLockFile) {						\
+	 fprintf(stderr,						\
+		  "LOCK SET!\n\tPrevious %s:%d\n\tCurrent: %s:%d\n",	\
+		  prevLockFile, prevLockLine, __FILE__, __LINE__);	\
+	 exit(1);							\
+      }									\
+   } while (0)
+
+#else
+
+#define DEBUG_LOCK()
+#define DEBUG_RESET()
+#define DEBUG_CHECK_LOCK()
+
+#endif
+
+/*
+ * !!! We may want to separate plain locks from locks with validation.
+ * This could be used to improve performance for those commands that
+ * do not do any drawing !!!
+ */
+
+/* Lock the hardware and validate our state: try DRM_CAS on hwLock first
+ * and, if it sets __ret, take the radeonGetLock() slow path instead. */
+#define LOCK_HARDWARE( rmesa )						\
+	do {								\
+		char __ret = 0;						\
+		DEBUG_CHECK_LOCK();					\
+		DRM_CAS((rmesa)->dri.hwLock, (rmesa)->dri.hwContext,	\
+			(DRM_LOCK_HELD | (rmesa)->dri.hwContext), __ret); \
+		if (__ret)						\
+			radeonGetLock((rmesa), 0);			\
+		DEBUG_LOCK();						\
+	} while (0)
+/* Drop the hardware lock via DRM_UNLOCK and clear the debug lock record. */
+#define UNLOCK_HARDWARE( rmesa )					\
+	do {								\
+		DRM_UNLOCK((rmesa)->dri.fd,				\
+			(rmesa)->dri.hwLock,				\
+			(rmesa)->dri.hwContext);			\
+		DEBUG_RESET();						\
+	} while (0)
+
+#endif				/* __RADEON_LOCK_H__ */
diff --git a/r300/radeon_span.c b/r300/radeon_span.c
new file mode 100644
index 0000000..eae09d6
--- /dev/null
+++ b/r300/radeon_span.c
@@ -0,0 +1,321 @@
+/**************************************************************************
+
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Kevin E. Martin <martin@valinux.com>
+ *   Gareth Hughes <gareth@valinux.com>
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *
+ */
+
+#include "glheader.h"
+#include "swrast/swrast.h"
+
+#include "r300_state.h"
+#include "radeon_ioctl.h"
+#include "r300_ioctl.h"
+#include "radeon_span.h"
+
+#include "drirenderbuffer.h"
+
+#define DBG 0
+
+/*
+ * Note that all information needed to access pixels in a renderbuffer
+ * should be obtained through the gl_renderbuffer parameter, not per-context
+ * information.
+ */
+#define LOCAL_VARS						\
+   driRenderbuffer *drb = (driRenderbuffer *) rb;		\
+   const __DRIdrawablePrivate *dPriv = drb->dPriv;		\
+   const GLuint bottom = dPriv->h - 1;				\
+   GLubyte *buf = (GLubyte *) drb->flippedData			\
+      + (dPriv->y * drb->flippedPitch + dPriv->x) * drb->cpp;	\
+   GLuint p;							\
+   (void) p;
+
+#define LOCAL_DEPTH_VARS				\
+   driRenderbuffer *drb = (driRenderbuffer *) rb;	\
+   const __DRIdrawablePrivate *dPriv = drb->dPriv;	\
+   const GLuint bottom = dPriv->h - 1;			\
+   GLuint xo = dPriv->x;				\
+   GLuint yo = dPriv->y;				\
+   GLubyte *buf = (GLubyte *) drb->Base.Data;
+
+#define LOCAL_STENCIL_VARS LOCAL_DEPTH_VARS
+
+#define Y_FLIP(Y) (bottom - (Y))
+
+#define HW_LOCK()
+
+#define HW_UNLOCK()
+
+/* ================================================================
+ * Color buffer
+ */
+
+/* 16 bit, RGB565 color spanline and pixel functions
+ */
+#define SPANTMP_PIXEL_FMT GL_RGB
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
+
+#define TAG(x)    radeon##x##_RGB565
+#define TAG2(x,y) radeon##x##_RGB565##y
+#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 2)
+#include "spantmp2.h"
+
+/* 32 bit, ARGB8888 color spanline and pixel functions
+ */
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
+
+#define TAG(x)    radeon##x##_ARGB8888
+#define TAG2(x,y) radeon##x##_ARGB8888##y
+#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 4)
+#include "spantmp2.h"
+
+/* ================================================================
+ * Depth buffer
+ */
+
+/* The Radeon family has depth tiling on all the time, so we have to convert
+ * the x,y coordinates into the memory bus address (mba) in the same
+ * manner as the engine.  In each case, the linear block address (ba)
+ * is calculated, and then wired with x and y to produce the final
+ * memory address.
+ * The chip will do address translation on its own if the surface registers
+ * are set up correctly. It is not quite enough to get it working with hyperz
+ * too...
+ */
+/* Byte offset (added to buf by the depth/stencil macros) of 32-bit pixel (x, y). */
+static GLuint radeon_mba_z32(const driRenderbuffer * drb, GLint x, GLint y)
+{
+	GLuint pitch = drb->pitch;
+	if (drb->depthHasSurface) {
+		return 4 * (x + y * pitch);
+	} else {
+		GLuint ba, address = 0;	/* a[0..1] = 0           */
+		/* ba: linear block address; 8x8 blocks on R300, else 16x16 */
+#ifdef COMPILE_R300
+		ba = (y / 8) * (pitch / 8) + (x / 8);
+#else
+		ba = (y / 16) * (pitch / 16) + (x / 16);
+#endif
+
+		address |= (x & 0x7) << 2;	/* a[2..4] = x[0..2]     */
+		address |= (y & 0x3) << 5;	/* a[5..6] = y[0..1]     */
+		address |= (((x & 0x10) >> 2) ^ (y & 0x4)) << 5;	/* a[7]    = x[4] ^ y[2] */
+		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
+
+		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
+		address |= (((x & 0x8) << 1) ^ (y & 0x10)) << 7;	/* a[11]   = x[3] ^ y[4] */
+		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
+
+		return address;
+	}
+}
+/* Byte offset (added to buf by the z16 depth macros) of 16-bit pixel (x, y). */
+static INLINE GLuint
+radeon_mba_z16(const driRenderbuffer * drb, GLint x, GLint y)
+{
+	GLuint pitch = drb->pitch;
+	if (drb->depthHasSurface) {
+		return 2 * (x + y * pitch);
+	} else {
+		GLuint ba, address = 0;	/* a[0]    = 0           */
+		/* ba: linear block address of the 32x16 block holding (x, y) */
+		ba = (y / 16) * (pitch / 32) + (x / 32);
+
+		address |= (x & 0x7) << 1;	/* a[1..3] = x[0..2]     */
+		address |= (y & 0x7) << 4;	/* a[4..6] = y[0..2]     */
+		address |= (x & 0x8) << 4;	/* a[7]    = x[3]        */
+		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
+		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
+		address |= ((x & 0x10) ^ (y & 0x10)) << 7;	/* a[11]   = x[4] ^ y[4] */
+		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
+
+		return address;
+	}
+}
+
+/* 16-bit depth buffer functions
+ */
+#define WRITE_DEPTH( _x, _y, d )					\
+   *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo )) = d;
+
+#define READ_DEPTH( d, _x, _y )						\
+   d = *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo ));
+
+#define TAG(x) radeon##x##_z16
+#include "depthtmp.h"
+
+/* 24 bit depth, 8 bit stencil depthbuffer functions.  Writes are
+ * read-modify-write so the stencil byte of the word is preserved.
+ *
+ * Careful: It looks like the R300 uses ZZZS byte order while the R200
+ * uses SZZZ for 24 bit depth, 8 bit stencil mode. */
+#ifdef COMPILE_R300
+#define WRITE_DEPTH( _x, _y, d )					\
+do {									\
+   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   tmp &= 0x000000ff;							\
+   tmp |= (((d) << 8) & 0xffffff00);					\
+   *(GLuint *)(buf + offset) = tmp;					\
+} while (0)
+#else
+#define WRITE_DEPTH( _x, _y, d )					\
+do {									\
+   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   tmp &= 0xff000000;							\
+   tmp |= ((d) & 0x00ffffff);						\
+   *(GLuint *)(buf + offset) = tmp;					\
+} while (0)
+#endif
+
+#ifdef COMPILE_R300
+#define READ_DEPTH( d, _x, _y )						\
+  do { \
+    d = (*(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,		\
+					 _y + yo )) & 0xffffff00) >> 8; \
+  }while(0)
+#else
+#define READ_DEPTH( d, _x, _y )						\
+   d = *(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,			\
+					 _y + yo )) & 0x00ffffff;
+#endif
+
+#define TAG(x) radeon##x##_z24_s8
+#include "depthtmp.h"
+
+/* ================================================================
+ * Stencil buffer
+ */
+
+/* 24 bit depth, 8 bit stencil depthbuffer functions
+ */
+#ifdef COMPILE_R300
+#define WRITE_STENCIL( _x, _y, d )					\
+do {									\
+   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   tmp &= 0xffffff00;							\
+   tmp |= (d) & 0xff;							\
+   *(GLuint *)(buf + offset) = tmp;					\
+} while (0)
+#else
+#define WRITE_STENCIL( _x, _y, d )					\
+do {									\
+   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   tmp &= 0x00ffffff;							\
+   tmp |= (((d) & 0xff) << 24);						\
+   *(GLuint *)(buf + offset) = tmp;					\
+} while (0)
+#endif
+
+#ifdef COMPILE_R300
+#define READ_STENCIL( d, _x, _y )					\
+do {									\
+   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   d = tmp & 0x000000ff;						\
+} while (0)
+#else
+#define READ_STENCIL( d, _x, _y )					\
+do {									\
+   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   d = (tmp & 0xff000000) >> 24;					\
+} while (0)
+#endif
+
+#define TAG(x) radeon##x##_z24_s8
+#include "stenciltmp.h"
+
+/* Locking is done once per span batch here rather than per span in
+ * HW_LOCK (left empty above) to get reasonable span performance -- 10x
+ * better; WaitForIdle() is the main culprit.  Start: fire vertices, take
+ * the hardware lock, then call radeonWaitForIdleLocked(). */
+
+static void radeonSpanRenderStart(GLcontext * ctx)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+#ifdef COMPILE_R300
+	r300ContextPtr r300 = (r300ContextPtr) rmesa;
+	R300_FIREVERTICES(r300);
+#else
+	RADEON_FIREVERTICES(rmesa);
+#endif
+	LOCK_HARDWARE(rmesa);
+	radeonWaitForIdleLocked(rmesa);
+}
+
+static void radeonSpanRenderFinish(GLcontext * ctx)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	_swrast_flush(ctx);	/* runs before the lock is dropped */
+	UNLOCK_HARDWARE(rmesa);	/* pairs with LOCK_HARDWARE in radeonSpanRenderStart */
+}
+
+void radeonInitSpanFuncs(GLcontext * ctx)
+{
+	struct swrast_device_driver *hooks;	/* swrast driver callbacks */
+	hooks = _swrast_GetDeviceDriverReference(ctx);
+	hooks->SpanRenderFinish = radeonSpanRenderFinish;
+	hooks->SpanRenderStart = radeonSpanRenderStart;
+}
+
+/**
+ * Plug in the Get/Put routines matching the driRenderbuffer's format.
+ */
+void radeonSetSpanFunctions(driRenderbuffer * drb, const GLvisual * vis)
+{
+	const GLenum format = drb->Base.InternalFormat;
+
+	/* vis is consulted for colour buffers only (565 vs. 8888). */
+	if (format == GL_RGBA && vis->redBits == 5 &&
+	    vis->greenBits == 6 && vis->blueBits == 5)
+		radeonInitPointers_RGB565(&drb->Base);
+	else if (format == GL_RGBA)
+		radeonInitPointers_ARGB8888(&drb->Base);
+	else if (format == GL_DEPTH_COMPONENT16)
+		radeonInitDepthPointers_z16(&drb->Base);
+	else if (format == GL_DEPTH_COMPONENT24)
+		radeonInitDepthPointers_z24_s8(&drb->Base);
+	else if (format == GL_STENCIL_INDEX8_EXT)
+		radeonInitStencilPointers_z24_s8(&drb->Base);
+}
diff --git a/r300/radeon_state.c b/r300/radeon_state.c
new file mode 100644
index 0000000..82bfd95
--- /dev/null
+++ b/r300/radeon_state.c
@@ -0,0 +1,243 @@
+/**************************************************************************
+
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "api_arrayelt.h"
+#include "enums.h"
+#include "colormac.h"
+#include "light.h"
+
+#include "swrast/swrast.h"
+#include "vbo/vbo.h"
+#include "tnl/tnl.h"
+#include "tnl/t_pipeline.h"
+#include "swrast_setup/swrast_setup.h"
+
+#include "radeon_ioctl.h"
+#include "radeon_state.h"
+#include "r300_ioctl.h"
+#include "framebuffer.h"
+
+/* =============================================================
+ * Scissoring
+ */
+
+/* Intersect a with b into out; returns GL_FALSE when nothing overlaps. */
+static GLboolean intersect_rect(drm_clip_rect_t * out,
+				drm_clip_rect_t * a, drm_clip_rect_t * b)
+{
+	*out = *a;	/* start from a, then tighten each edge towards b */
+
+	if (out->x1 < b->x1)
+		out->x1 = b->x1;
+	if (out->y1 < b->y1)
+		out->y1 = b->y1;
+	if (out->x2 > b->x2)
+		out->x2 = b->x2;
+	if (out->y2 > b->y2)
+		out->y2 = b->y2;
+
+	/* x2/y2 are compared as exclusive bounds: equal edges mean empty. */
+	return (out->x1 < out->x2 && out->y1 < out->y2) ? GL_TRUE : GL_FALSE;
+}
+
+/* Rebuild state.scissor.pClipRects as the intersection of every window
+ * cliprect with state.scissor.rect; numClipRects counts the non-empty ones. */
+void radeonRecalcScissorRects(radeonContextPtr radeon)
+{
+	drm_clip_rect_t *out;
+	int i;
+
+	/* Reset the count first so that an allocation failure below leaves an
+	 * empty list instead of a stale count paired with a NULL store. */
+	radeon->state.scissor.numClipRects = 0;
+
+	/* Grow the cliprect store if needed; old contents are not preserved. */
+	if (radeon->state.scissor.numAllocedClipRects < radeon->numClipRects) {
+		while (radeon->state.scissor.numAllocedClipRects <
+		       radeon->numClipRects) {
+			radeon->state.scissor.numAllocedClipRects += 1;	/* zero case */
+			radeon->state.scissor.numAllocedClipRects *= 2;
+		}
+		if (radeon->state.scissor.pClipRects)
+			FREE(radeon->state.scissor.pClipRects);
+
+		radeon->state.scissor.pClipRects =
+		    MALLOC(radeon->state.scissor.numAllocedClipRects *
+			   sizeof(drm_clip_rect_t));
+		if (radeon->state.scissor.pClipRects == NULL) {
+			radeon->state.scissor.numAllocedClipRects = 0;
+			return;
+		}
+	}
+
+	out = radeon->state.scissor.pClipRects;
+	for (i = 0; i < radeon->numClipRects; i++) {
+		if (intersect_rect(out, &radeon->pClipRects[i],
+				   &radeon->state.scissor.rect)) {
+			radeon->state.scissor.numClipRects++;
+			out++;
+		}
+	}
+}
+
+/* Convert the GL scissor box to a screen-space rect and re-clip against it. */
+void radeonUpdateScissor(GLcontext* ctx)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	if (radeon->dri.drawable) {
+		__DRIdrawablePrivate *dPriv = radeon->dri.drawable;
+		int x1 = dPriv->x + ctx->Scissor.X;
+		int y1 = dPriv->y + dPriv->h - (ctx->Scissor.Y + ctx->Scissor.Height);
+		/* x2/y2 are exclusive bounds (intersect_rect() rejects x1 >= x2),
+		 * so a Width x Height box ends at x1 + Width, y1 + Height. */
+		radeon->state.scissor.rect.x1 = x1;
+		radeon->state.scissor.rect.y1 = y1;
+		radeon->state.scissor.rect.x2 = x1 + ctx->Scissor.Width;
+		radeon->state.scissor.rect.y2 = y1 + ctx->Scissor.Height;
+		radeonRecalcScissorRects(radeon);
+	}
+}
+
+static void radeonScissor(GLcontext* ctx, GLint x, GLint y, GLsizei w, GLsizei h)
+{
+	/* Nothing to recompute while the scissor test is disabled. */
+	if (!ctx->Scissor.Enabled)
+		return;
+	r300Flush(ctx);	/* We don't pipeline cliprect changes */
+	radeonUpdateScissor(ctx);
+}
+
+
+/**
+ * Refresh the cliprect list from the drawable, resize the draw/read
+ * framebuffers to match their drawables, recompute scissor cliprects if
+ * scissoring is enabled, and record the drawable stamp just handled.
+ */
+void radeonSetCliprects(radeonContextPtr radeon)
+{
+	__DRIdrawablePrivate *const drawable = radeon->dri.drawable;
+	__DRIdrawablePrivate *const readable = radeon->dri.readable;
+	GLframebuffer *const draw_fb = (GLframebuffer*)drawable->driverPrivate;
+	GLframebuffer *const read_fb = (GLframebuffer*)readable->driverPrivate;
+
+	/* The back cliprects are used only when drawing to the back buffer
+	 * alone, some exist, page flipping is off and page 1 is not current
+	 * (can't ignore 2d windows if we are page flipping).  Front buffer,
+	 * no buffer, or multiple buffers all get the window cliprects. */
+	if (draw_fb->_ColorDrawBufferMask[0] == BUFFER_BIT_BACK_LEFT &&
+	    drawable->numBackClipRects != 0 && !radeon->doPageFlip &&
+	    radeon->sarea->pfCurrentPage != 1) {
+		radeon->numClipRects = drawable->numBackClipRects;
+		radeon->pClipRects = drawable->pBackClipRects;
+	} else {
+		radeon->numClipRects = drawable->numClipRects;
+		radeon->pClipRects = drawable->pClipRects;
+	}
+
+	/* Keep the draw framebuffer the same size as its drawable. */
+	if (draw_fb->Width != drawable->w || draw_fb->Height != drawable->h) {
+		_mesa_resize_framebuffer(radeon->glCtx, draw_fb,
+					 drawable->w, drawable->h);
+		draw_fb->Initialized = GL_TRUE;
+	}
+
+	/* Same for the read framebuffer, when it is a separate drawable. */
+	if (drawable != readable &&
+	    (read_fb->Width != readable->w || read_fb->Height != readable->h)) {
+		_mesa_resize_framebuffer(radeon->glCtx, read_fb,
+					 readable->w, readable->h);
+		read_fb->Initialized = GL_TRUE;
+	}
+
+	if (radeon->state.scissor.enabled)
+		radeonRecalcScissorRects(radeon);
+
+	/* Remember which drawable stamp these cliprects correspond to. */
+	radeon->lastStamp = drawable->lastStamp;
+}
+
+
+/**
+ * Handle the enable bits shared by the radeon drivers.  Only
+ * GL_SCISSOR_TEST is acted upon; every other cap is ignored.
+ *
+ * Called as a fallback by r200Enable/r300Enable.
+ */
+void radeonEnable(GLcontext* ctx, GLenum cap, GLboolean state)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+
+	if (cap != GL_SCISSOR_TEST)
+		return;
+
+	/* Cliprect and scissor changes are not pipelined, so flush
+	 * before the new state is recorded. */
+	r300Flush(ctx);
+
+	/* Record the new state and rebuild the scissor cliprects. */
+	radeon->state.scissor.enabled = state;
+	radeonUpdateScissor(ctx);
+}
+
+
+/**
+ * Initialize default state (Fallback cleared, initial colour draw target).
+ * Called once at context init time from r200InitState/r300InitState.
+ */
+void radeonInitState(radeonContextPtr radeon)
+{
+	radeon->Fallback = 0;
+
+	/* Front buffer unless double-buffered with page 0 current. */
+	if (!radeon->glCtx->Visual.doubleBufferMode || radeon->sarea->pfCurrentPage != 0) {
+		radeon->state.color.drawOffset = radeon->radeonScreen->frontOffset;
+		radeon->state.color.drawPitch = radeon->radeonScreen->frontPitch;
+	} else {
+		radeon->state.color.drawOffset = radeon->radeonScreen->backOffset;
+		radeon->state.color.drawPitch = radeon->radeonScreen->backPitch;
+	}
+}
+
+
+/**
+ * Initialize common state functions: installs radeonScissor as Scissor.
+ * Called by r200InitStateFuncs/r300InitStateFuncs
+ */
+void radeonInitStateFuncs(struct dd_function_table *functions)
+{
+	functions->Scissor = radeonScissor;
+}
diff --git a/r300/radeon_state.h b/r300/radeon_state.h
new file mode 100644
index 0000000..821cb40
--- /dev/null
+++ b/r300/radeon_state.h
@@ -0,0 +1,43 @@
+/*
+Copyright (C) 2004 Nicolai Haehnle.  All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __RADEON_STATE_H__
+#define __RADEON_STATE_H__
+
+extern void radeonRecalcScissorRects(radeonContextPtr radeon);
+extern void radeonSetCliprects(radeonContextPtr radeon);
+extern void radeonUpdateScissor(GLcontext* ctx);
+
+extern void radeonEnable(GLcontext* ctx, GLenum cap, GLboolean state);
+
+extern void radeonInitState(radeonContextPtr radeon);
+extern void radeonInitStateFuncs(struct dd_function_table* functions);
+
+#endif
diff --git a/radeon/Makefile.am b/radeon/Makefile.am
new file mode 100644
index 0000000..ee1d008
--- /dev/null
+++ b/radeon/Makefile.am
@@ -0,0 +1,24 @@
+AM_CFLAGS = -DIN_DRI_DRIVER -DGLX_DIRECT_RENDERING -DGLX_INDIRECT_RENDERING
+# Driver-specific flags, appended to radeon_dri_la_CFLAGS below.
+RADEON_CFLAGS = -Iserver -DRADEON_COMMON=0
+# radeon_dri.la is installed into radeon_dri_ladir (@libdir@/dri).
+radeon_dri_la_LTLIBRARIES = radeon_dri.la
+radeon_dri_la_CFLAGS = $(AM_CFLAGS) $(DRM_CFLAGS) $(DRI_CFLAGS) $(RADEON_CFLAGS)
+radeon_dri_la_LDFLAGS = -module -noprefix -avoid-version -lm -ldl \
+		$(DRM_LIBS) $(DRI_LIBS)
+radeon_dri_ladir = @libdir@/dri
+radeon_dri_la_SOURCES = \
+	radeon_context.c \
+	radeon_ioctl.c \
+	radeon_lock.c \
+	radeon_screen.c \
+	radeon_state.c \
+	radeon_state_init.c \
+	radeon_tex.c \
+	radeon_texmem.c \
+	radeon_texstate.c \
+	radeon_tcl.c \
+	radeon_swtcl.c \
+	radeon_span.c \
+	radeon_maos.c \
+	radeon_sanity.c 
diff --git a/radeon/radeon_chipset.h b/radeon/radeon_chipset.h
new file mode 100644
index 0000000..5c17e8f
--- /dev/null
+++ b/radeon/radeon_chipset.h
@@ -0,0 +1,182 @@
+#ifndef _RADEON_CHIPSET_H
+#define _RADEON_CHIPSET_H
+/* Including xf86PciInfo.h introduces a bunch of errors...
+ */
+
+/* General chip classes:
+ * r100 includes R100, RV100, RV200, RS100, RS200, RS250.
+ * r200 includes R200, RV250, RV280, RS300.
+ * r300 includes R300, RV350, RV370.
+ * (RS* denotes IGP)
+ */
+#define PCI_CHIP_RV380_3150		0x3150
+#define PCI_CHIP_RV380_3152		0x3152
+#define PCI_CHIP_RV380_3154		0x3154
+#define PCI_CHIP_RV380_3E50		0x3E50
+#define PCI_CHIP_RV380_3E54		0x3E54
+#define PCI_CHIP_RS100_4136		0x4136
+#define PCI_CHIP_RS200_4137		0x4137
+#define PCI_CHIP_R300_AD		0x4144
+#define PCI_CHIP_R300_AE		0x4145
+#define PCI_CHIP_R300_AF		0x4146
+#define PCI_CHIP_R300_AG		0x4147
+#define PCI_CHIP_R350_AH                0x4148
+#define PCI_CHIP_R350_AI                0x4149
+#define PCI_CHIP_R350_AJ                0x414A
+#define PCI_CHIP_R350_AK                0x414B
+#define PCI_CHIP_RV350_AP               0x4150
+#define PCI_CHIP_RV350_AQ               0x4151
+#define PCI_CHIP_RV350_AR               0x4152
+#define PCI_CHIP_RV350_AS               0x4153
+#define PCI_CHIP_RV350_AT               0x4154
+#define PCI_CHIP_RV350_AU		0x4155
+#define PCI_CHIP_RV350_AV               0x4156
+#define PCI_CHIP_RS250_4237		0x4237
+#define PCI_CHIP_R200_BB		0x4242
+#define PCI_CHIP_R200_BC		0x4243
+#define PCI_CHIP_RS100_4336		0x4336
+#define PCI_CHIP_RS200_4337		0x4337
+#define PCI_CHIP_RS250_4437		0x4437
+#define PCI_CHIP_RV250_If		0x4966
+#define PCI_CHIP_RV250_Ig		0x4967
+#define PCI_CHIP_R420_JH		0x4A48
+#define PCI_CHIP_R420_JI		0x4A49
+#define PCI_CHIP_R420_JJ		0x4A4A
+#define PCI_CHIP_R420_JK		0x4A4B
+#define PCI_CHIP_R420_JL		0x4A4C
+#define PCI_CHIP_R420_JM		0x4A4D
+#define PCI_CHIP_R420_JN		0x4A4E
+#define PCI_CHIP_R420_JO		0x4A4F
+#define PCI_CHIP_R420_JP		0x4A50
+#define PCI_CHIP_R420_JT		0x4A54
+#define PCI_CHIP_R481_4B49		0x4B49
+#define PCI_CHIP_R481_4B4A		0x4B4A
+#define PCI_CHIP_R481_4B4B		0x4B4B
+#define PCI_CHIP_R481_4B4C		0x4B4C
+#define PCI_CHIP_RADEON_LW		0x4C57
+#define PCI_CHIP_RADEON_LX		0x4C58
+#define PCI_CHIP_RADEON_LY		0x4C59
+#define PCI_CHIP_RADEON_LZ		0x4C5A
+#define PCI_CHIP_RV250_Ld		0x4C64
+#define PCI_CHIP_RV250_Lf		0x4C66
+#define PCI_CHIP_RV250_Lg		0x4C67
+#define PCI_CHIP_R300_ND		0x4E44
+#define PCI_CHIP_R300_NE		0x4E45
+#define PCI_CHIP_R300_NF		0x4E46
+#define PCI_CHIP_R300_NG		0x4E47
+#define PCI_CHIP_R350_NH                0x4E48
+#define PCI_CHIP_R350_NI                0x4E49  
+#define PCI_CHIP_R360_NJ                0x4E4A  
+#define PCI_CHIP_R350_NK                0x4E4B  
+#define PCI_CHIP_RV350_NP               0x4E50
+#define PCI_CHIP_RV350_NQ               0x4E51
+#define PCI_CHIP_RV350_NR               0x4E52
+#define PCI_CHIP_RV350_NS               0x4E53
+#define PCI_CHIP_RV350_NT               0x4E54
+#define PCI_CHIP_RV350_NV               0x4E56
+#define PCI_CHIP_RADEON_QD		0x5144
+#define PCI_CHIP_RADEON_QE		0x5145
+#define PCI_CHIP_RADEON_QF		0x5146
+#define PCI_CHIP_RADEON_QG		0x5147
+#define PCI_CHIP_R200_QH		0x5148
+#define PCI_CHIP_R200_QL		0x514C
+#define PCI_CHIP_R200_QM		0x514D
+#define PCI_CHIP_RV200_QW		0x5157
+#define PCI_CHIP_RV200_QX		0x5158
+#define PCI_CHIP_RADEON_QY		0x5159
+#define PCI_CHIP_RADEON_QZ		0x515A
+#define PCI_CHIP_RN50_515E		0x515E
+#define PCI_CHIP_RV370_5460		0x5460
+#define PCI_CHIP_RV370_5462		0x5462
+#define PCI_CHIP_RV370_5464		0x5464
+#define PCI_CHIP_R423_UH		0x5548
+#define PCI_CHIP_R423_UI		0x5549
+#define PCI_CHIP_R423_UJ		0x554A
+#define PCI_CHIP_R423_UK		0x554B
+#define PCI_CHIP_R430_554C		0x554C
+#define PCI_CHIP_R430_554D		0x554D
+#define PCI_CHIP_R430_554E		0x554E
+#define PCI_CHIP_R430_554F		0x554F
+#define PCI_CHIP_R423_5550		0x5550
+#define PCI_CHIP_R423_UQ		0x5551
+#define PCI_CHIP_R423_UR		0x5552
+#define PCI_CHIP_R423_UT		0x5554
+#define PCI_CHIP_RV410_564A		0x564A
+#define PCI_CHIP_RV410_564B		0x564B
+#define PCI_CHIP_RV410_564F		0x564F
+#define PCI_CHIP_RV410_5652		0x5652
+#define PCI_CHIP_RV410_5653		0x5653
+#define PCI_CHIP_RS300_5834		0x5834
+#define PCI_CHIP_RS300_5835		0x5835
+#define PCI_CHIP_RS480_5954		0x5954
+#define PCI_CHIP_RS480_5955		0x5955
+#define PCI_CHIP_RV280_5960		0x5960
+#define PCI_CHIP_RV280_5961		0x5961
+#define PCI_CHIP_RV280_5962		0x5962
+#define PCI_CHIP_RV280_5964		0x5964
+#define PCI_CHIP_RV280_5965		0x5965
+#define PCI_CHIP_RN50_5969		0x5969
+#define PCI_CHIP_RS482_5974		0x5974
+#define PCI_CHIP_RS482_5975		0x5975
+#define PCI_CHIP_RS400_5A41		0x5A41
+#define PCI_CHIP_RS400_5A42		0x5A42
+#define PCI_CHIP_RC410_5A61		0x5A61
+#define PCI_CHIP_RC410_5A62		0x5A62
+#define PCI_CHIP_RV370_5B60		0x5B60
+#define PCI_CHIP_RV370_5B62		0x5B62
+#define PCI_CHIP_RV370_5B63		0x5B63
+#define PCI_CHIP_RV370_5B64		0x5B64
+#define PCI_CHIP_RV370_5B65		0x5B65
+#define PCI_CHIP_RV370_5657		0x5657
+#define PCI_CHIP_RV280_5C61		0x5C61
+#define PCI_CHIP_RV280_5C63		0x5C63
+#define PCI_CHIP_R430_5D48		0x5D48
+#define PCI_CHIP_R430_5D49		0x5D49
+#define PCI_CHIP_R430_5D4A		0x5D4A
+#define PCI_CHIP_R480_5D4C		0x5D4C
+#define PCI_CHIP_R480_5D4D		0x5D4D
+#define PCI_CHIP_R480_5D4E		0x5D4E
+#define PCI_CHIP_R480_5D4F		0x5D4F
+#define PCI_CHIP_R480_5D50		0x5D50
+#define PCI_CHIP_R480_5D52		0x5D52
+#define PCI_CHIP_R423_5D57		0x5D57
+#define PCI_CHIP_RV410_5E48		0x5E48
+#define PCI_CHIP_RV410_5E4A		0x5E4A
+#define PCI_CHIP_RV410_5E4B		0x5E4B
+#define PCI_CHIP_RV410_5E4C		0x5E4C
+#define PCI_CHIP_RV410_5E4D		0x5E4D
+#define PCI_CHIP_RV410_5E4F		0x5E4F
+#define PCI_CHIP_RS350_7834		0x7834
+#define PCI_CHIP_RS350_7835		0x7835
+
+enum {
+   CHIP_FAMILY_R100,
+   CHIP_FAMILY_RV100,
+   CHIP_FAMILY_RS100,
+   CHIP_FAMILY_RV200,
+   CHIP_FAMILY_RS200,
+   CHIP_FAMILY_R200,
+   CHIP_FAMILY_RV250,
+   CHIP_FAMILY_RS300,
+   CHIP_FAMILY_RV280,
+   CHIP_FAMILY_R300,
+   CHIP_FAMILY_R350,
+   CHIP_FAMILY_RV350,
+   CHIP_FAMILY_RV380,
+   CHIP_FAMILY_R420,
+   CHIP_FAMILY_RV410,
+   CHIP_FAMILY_RS400,
+   CHIP_FAMILY_LAST
+};
+
+/* General classes of Radeons, as described above the device ID section.  These are values of the two-bit field selected by RADEON_CLASS_MASK, not independent flags. */
+#define RADEON_CLASS_R100		(0 << 0)
+#define RADEON_CLASS_R200		(1 << 0)
+#define RADEON_CLASS_R300		(2 << 0)
+#define RADEON_CLASS_MASK		(3 << 0)
+/* Single-bit chipset flags; they start at bit 2, above the class field. */
+#define RADEON_CHIPSET_TCL		(1 << 2)	/* tcl support - any radeon */
+#define RADEON_CHIPSET_BROKEN_STENCIL	(1 << 3)	/* r100 stencil bug */
+#define R200_CHIPSET_YCBCR_BROKEN	(1 << 4)	/* r200 ycbcr bug */
+
+#endif /* _RADEON_CHIPSET_H */
diff --git a/radeon/radeon_compat.c b/radeon/radeon_compat.c
new file mode 100644
index 0000000..1cbe340
--- /dev/null
+++ b/radeon/radeon_compat.c
@@ -0,0 +1,302 @@
+/* $XFree86$ */
+/**************************************************************************
+
+Copyright 2002 ATI Technologies Inc., Ontario, Canada, and
+               Tungsten Graphics Inc., Austin, Texas.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ATI, TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *
+ */
+
+#include "glheader.h"
+#include "imports.h"
+
+#include "radeon_context.h"
+#include "radeon_state.h"
+#include "radeon_ioctl.h"
+
+
+static struct { 
+	int start; 		/* initialised from a RADEON_* register name; presumably the register the packet starts at */
+	int len; 		/* number of values after the header; matches the buf[i++] reads per case in radeonCompatEmitPacket */
+	const char *name;	/* the same register name as text, printed by the DEBUG_STATE trace */
+} packet[RADEON_MAX_STATE_PACKETS] = {	/* looked up by header->packet.packet_id */
+	{ RADEON_PP_MISC,7,"RADEON_PP_MISC" },
+	{ RADEON_PP_CNTL,3,"RADEON_PP_CNTL" },
+	{ RADEON_RB3D_COLORPITCH,1,"RADEON_RB3D_COLORPITCH" },
+	{ RADEON_RE_LINE_PATTERN,2,"RADEON_RE_LINE_PATTERN" },
+	{ RADEON_SE_LINE_WIDTH,1,"RADEON_SE_LINE_WIDTH" },
+	{ RADEON_PP_LUM_MATRIX,1,"RADEON_PP_LUM_MATRIX" },
+	{ RADEON_PP_ROT_MATRIX_0,2,"RADEON_PP_ROT_MATRIX_0" },
+	{ RADEON_RB3D_STENCILREFMASK,3,"RADEON_RB3D_STENCILREFMASK" },
+	{ RADEON_SE_VPORT_XSCALE,6,"RADEON_SE_VPORT_XSCALE" },
+	{ RADEON_SE_CNTL,2,"RADEON_SE_CNTL" },
+	{ RADEON_SE_CNTL_STATUS,1,"RADEON_SE_CNTL_STATUS" },
+	{ RADEON_RE_MISC,1,"RADEON_RE_MISC" },
+	{ RADEON_PP_TXFILTER_0,6,"RADEON_PP_TXFILTER_0" },
+	{ RADEON_PP_BORDER_COLOR_0,1,"RADEON_PP_BORDER_COLOR_0" },
+	{ RADEON_PP_TXFILTER_1,6,"RADEON_PP_TXFILTER_1" },
+	{ RADEON_PP_BORDER_COLOR_1,1,"RADEON_PP_BORDER_COLOR_1" },
+	{ RADEON_PP_TXFILTER_2,6,"RADEON_PP_TXFILTER_2" },
+	{ RADEON_PP_BORDER_COLOR_2,1,"RADEON_PP_BORDER_COLOR_2" },
+	{ RADEON_SE_ZBIAS_FACTOR,2,"RADEON_SE_ZBIAS_FACTOR" },
+	{ RADEON_SE_TCL_OUTPUT_VTX_FMT,11,"RADEON_SE_TCL_OUTPUT_VTX_FMT" },
+	{ RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED,17,"RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED" },
+};	/* any slots beyond the entries listed above are zero-initialised (name == NULL) */
+
+
+/* Decode one state atom's command stream into the SAREA register copies and
+ * set the matching RADEON_UPLOAD_* dirty bits; decoding stops at the first
+ * packet this path cannot express. */
+static void radeonCompatEmitPacket( radeonContextPtr rmesa,
+				    struct radeon_state_atom *state )
+{
+   drm_radeon_sarea_t *sarea = rmesa->sarea;
+   drm_radeon_context_regs_t *ctx = &sarea->context_state;
+   drm_radeon_texture_regs_t *tex0 = &sarea->tex_state[0];
+   drm_radeon_texture_regs_t *tex1 = &sarea->tex_state[1];
+   int i;
+   int *buf = state->cmd;
+
+   for ( i = 0 ; i < state->cmd_size ; ) {
+      drm_radeon_cmd_header_t *header = (drm_radeon_cmd_header_t *)&buf[i++];
+      if (RADEON_DEBUG & DEBUG_STATE) {
+	 int id = header->packet.packet_id;	/* may be an id with no table entry */
+	 const char *nm = (id < RADEON_MAX_STATE_PACKETS) ? packet[id].name : NULL;
+	 fprintf(stderr, "%s %d: %s\n", __FUNCTION__, id, nm ? nm : "???");
+      }
+      switch (header->packet.packet_id) {
+      case RADEON_EMIT_PP_MISC:
+	 ctx->pp_misc = buf[i++];
+	 ctx->pp_fog_color = buf[i++];
+	 ctx->re_solid_color = buf[i++];
+	 ctx->rb3d_blendcntl = buf[i++];
+	 ctx->rb3d_depthoffset = buf[i++];
+	 ctx->rb3d_depthpitch = buf[i++];
+	 ctx->rb3d_zstencilcntl = buf[i++];
+	 sarea->dirty |= RADEON_UPLOAD_CONTEXT;
+	 break;
+      case RADEON_EMIT_PP_CNTL:
+	 ctx->pp_cntl = buf[i++];
+	 ctx->rb3d_cntl = buf[i++];
+	 ctx->rb3d_coloroffset = buf[i++];
+	 sarea->dirty |= RADEON_UPLOAD_CONTEXT;
+	 break;
+      case RADEON_EMIT_RB3D_COLORPITCH:
+	 ctx->rb3d_colorpitch = buf[i++];
+	 sarea->dirty |= RADEON_UPLOAD_CONTEXT;
+	 break;
+      case RADEON_EMIT_RE_LINE_PATTERN:
+	 ctx->re_line_pattern = buf[i++];
+	 ctx->re_line_state = buf[i++];
+	 sarea->dirty |= RADEON_UPLOAD_LINE;
+	 break;
+      case RADEON_EMIT_SE_LINE_WIDTH:
+	 ctx->se_line_width = buf[i++];
+	 sarea->dirty |= RADEON_UPLOAD_LINE;
+	 break;
+      case RADEON_EMIT_PP_LUM_MATRIX:
+	 ctx->pp_lum_matrix = buf[i++];
+	 sarea->dirty |= RADEON_UPLOAD_BUMPMAP;
+	 break;
+      case RADEON_EMIT_PP_ROT_MATRIX_0:
+	 ctx->pp_rot_matrix_0 = buf[i++];
+	 ctx->pp_rot_matrix_1 = buf[i++];
+	 sarea->dirty |= RADEON_UPLOAD_BUMPMAP;
+	 break;
+      case RADEON_EMIT_RB3D_STENCILREFMASK:
+	 ctx->rb3d_stencilrefmask = buf[i++];
+	 ctx->rb3d_ropcntl = buf[i++];
+	 ctx->rb3d_planemask = buf[i++];
+	 sarea->dirty |= RADEON_UPLOAD_MASKS;
+	 break;
+      case RADEON_EMIT_SE_VPORT_XSCALE:
+	 ctx->se_vport_xscale = buf[i++];
+	 ctx->se_vport_xoffset = buf[i++];
+	 ctx->se_vport_yscale = buf[i++];
+	 ctx->se_vport_yoffset = buf[i++];
+	 ctx->se_vport_zscale = buf[i++];
+	 ctx->se_vport_zoffset = buf[i++];
+	 sarea->dirty |= RADEON_UPLOAD_VIEWPORT;
+	 break;
+      case RADEON_EMIT_SE_CNTL:
+	 ctx->se_cntl = buf[i++];
+	 ctx->se_coord_fmt = buf[i++];
+	 sarea->dirty |= RADEON_UPLOAD_CONTEXT | RADEON_UPLOAD_VERTFMT;
+	 break;
+      case RADEON_EMIT_SE_CNTL_STATUS:
+	 ctx->se_cntl_status = buf[i++];
+	 sarea->dirty |= RADEON_UPLOAD_SETUP;
+	 break;
+      case RADEON_EMIT_RE_MISC:
+	 ctx->re_misc = buf[i++];
+	 sarea->dirty |= RADEON_UPLOAD_MISC;
+	 break;
+      case RADEON_EMIT_PP_TXFILTER_0:
+	 tex0->pp_txfilter = buf[i++];
+	 tex0->pp_txformat = buf[i++];
+	 tex0->pp_txoffset = buf[i++];
+	 tex0->pp_txcblend = buf[i++];
+	 tex0->pp_txablend = buf[i++];
+	 tex0->pp_tfactor = buf[i++];
+	 sarea->dirty |= RADEON_UPLOAD_TEX0;
+	 break;
+      case RADEON_EMIT_PP_BORDER_COLOR_0:
+	 tex0->pp_border_color = buf[i++];
+	 sarea->dirty |= RADEON_UPLOAD_TEX0;
+	 break;
+      case RADEON_EMIT_PP_TXFILTER_1:
+	 tex1->pp_txfilter = buf[i++];
+	 tex1->pp_txformat = buf[i++];
+	 tex1->pp_txoffset = buf[i++];
+	 tex1->pp_txcblend = buf[i++];
+	 tex1->pp_txablend = buf[i++];
+	 tex1->pp_tfactor = buf[i++];
+	 sarea->dirty |= RADEON_UPLOAD_TEX1;
+	 break;
+      case RADEON_EMIT_PP_BORDER_COLOR_1:
+	 tex1->pp_border_color = buf[i++];
+	 sarea->dirty |= RADEON_UPLOAD_TEX1;
+	 break;
+      case RADEON_EMIT_SE_ZBIAS_FACTOR:
+	 i += 2;	/* the two values are skipped: nothing is stored for them */
+	 break;
+      case RADEON_EMIT_PP_TXFILTER_2:
+      case RADEON_EMIT_PP_BORDER_COLOR_2:
+      case RADEON_EMIT_SE_TCL_OUTPUT_VTX_FMT:
+      case RADEON_EMIT_SE_TCL_MATERIAL_EMMISSIVE_RED:
+      default:
+	 /* These states aren't understood by radeon drm 1.1 */
+	 fprintf(stderr, "Tried to emit unsupported state\n");
+	 return;
+      }
+   }
+}
+
+
+
+/* Send every dirty non-TCL state atom through radeonCompatEmitPacket(), then clear the two summary flags. */
+static void radeonCompatEmitStateLocked( radeonContextPtr rmesa )
+{
+   struct radeon_state_atom *cur;
+
+   if (RADEON_DEBUG & (DEBUG_STATE|DEBUG_PRIMS))
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+   if (!(rmesa->hw.is_dirty || rmesa->hw.all_dirty))	/* nothing pending */
+      return;
+   foreach(cur, &rmesa->hw.atomlist) {
+      /* TCL atoms are never emitted here; otherwise all_dirty forces the atom dirty. */
+      if (cur->is_tcl)
+	 cur->dirty = GL_FALSE;
+      else if (rmesa->hw.all_dirty)
+	 cur->dirty = GL_TRUE;
+      if (cur->dirty)
+	 radeonCompatEmitPacket(rmesa, cur);
+   }
+   rmesa->hw.is_dirty = GL_FALSE;
+   rmesa->hw.all_dirty = GL_FALSE;
+}
+
+
+/* Submit one vertex buffer via the DRM_RADEON_VERTEX ioctl, once per batch of
+ * at most RADEON_NR_SAREA_CLIPRECTS cliprects copied into the SAREA; the
+ * batch that covers the last cliprect sets the discard flag. */
+static void radeonCompatEmitPrimitiveLocked( radeonContextPtr rmesa,
+					     GLuint hw_primitive,
+					     GLuint nverts,
+					     drm_clip_rect_t *pbox,
+					     GLuint nbox )
+{
+   GLuint i;	/* unsigned like nbox, so the bounds never mix signedness */
+
+   for ( i = 0 ; i < nbox ; ) {
+      GLuint nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS, nbox );
+      drm_clip_rect_t *b = rmesa->sarea->boxes;
+      drm_radeon_vertex_t vtx;
+      int ret;
+
+      rmesa->sarea->dirty |= RADEON_UPLOAD_CLIPRECTS;
+      rmesa->sarea->nbox = nr - i;
+
+      for ( ; i < nr ; i++)
+	 *b++ = pbox[i];
+
+      if (RADEON_DEBUG & DEBUG_IOCTL)
+	 fprintf(stderr,
+		 "RadeonFlushVertexBuffer: prim %x buf %d verts %d disc %d nbox %d\n",
+		 hw_primitive, rmesa->dma.current.buf->buf->idx, nverts,
+		 nr == nbox, rmesa->sarea->nbox );
+
+      vtx.prim = hw_primitive;
+      vtx.idx = rmesa->dma.current.buf->buf->idx;
+      vtx.count = nverts;
+      vtx.discard = (nr == nbox);
+
+      ret = drmCommandWrite( rmesa->dri.fd, DRM_RADEON_VERTEX, &vtx, sizeof(vtx));
+      if (ret)	/* used to be dropped silently; report it, control flow is unchanged */
+	 fprintf(stderr, "%s: DRM_RADEON_VERTEX failed: %d\n", __FUNCTION__, ret);
+   }
+}
+
+
+
+/* No 'start' for 1.1 vertices ioctl: only one vertex prim/buffer!
+ * Takes the hardware lock, emits pending state, then submits the primitive
+ * with the scissor cliprects when scissoring is on, else rmesa->pClipRects.
+ */
+void radeonCompatEmitPrimitive( radeonContextPtr rmesa,
+				GLuint vertex_format,
+				GLuint hw_primitive,
+				GLuint nrverts )
+{
+   drm_clip_rect_t *rects;
+   GLuint nrects;
+
+   if (RADEON_DEBUG & DEBUG_IOCTL)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+   LOCK_HARDWARE( rmesa );
+
+   radeonCompatEmitStateLocked( rmesa );
+   rmesa->sarea->vc_format = vertex_format;
+
+   /* Choose the cliprect list first so the submit call appears only once. */
+   if (rmesa->state.scissor.enabled) {
+      rects = rmesa->state.scissor.pClipRects;
+      nrects = rmesa->state.scissor.numClipRects;
+   }
+   else {
+      rects = rmesa->pClipRects;
+      nrects = rmesa->numClipRects;
+   }
+   radeonCompatEmitPrimitiveLocked( rmesa, hw_primitive, nrverts, rects, nrects );
+
+   UNLOCK_HARDWARE( rmesa );
+}
+
diff --git a/radeon/radeon_context.c b/radeon/radeon_context.c
new file mode 100644
index 0000000..9451ec4
--- /dev/null
+++ b/radeon/radeon_context.c
@@ -0,0 +1,637 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_context.c,v 1.9 2003/09/24 02:43:12 dawes Exp $ */
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Kevin E. Martin <martin@valinux.com>
+ *   Gareth Hughes <gareth@valinux.com>
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "api_arrayelt.h"
+#include "context.h"
+#include "simple_list.h"
+#include "imports.h"
+#include "matrix.h"
+#include "extensions.h"
+#include "framebuffer.h"
+
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+#include "vbo/vbo.h"
+
+#include "tnl/tnl.h"
+#include "tnl/t_pipeline.h"
+
+#include "drivers/common/driverfuncs.h"
+
+#include "radeon_context.h"
+#include "radeon_ioctl.h"
+#include "radeon_state.h"
+#include "radeon_span.h"
+#include "radeon_tex.h"
+#include "radeon_swtcl.h"
+#include "radeon_tcl.h"
+#include "radeon_maos.h"
+
+#define need_GL_ARB_multisample
+#define need_GL_ARB_texture_compression
+#define need_GL_ARB_vertex_buffer_object
+#define need_GL_EXT_blend_minmax
+#define need_GL_EXT_fog_coord
+#define need_GL_EXT_secondary_color
+#include "extension_helper.h"
+
+#define DRIVER_DATE	"20061018"
+
+#include "vblank.h"
+#include "utils.h"
+#include "xmlpool.h" /* for symbolic values of enum-type options */
+#ifndef RADEON_DEBUG
+int RADEON_DEBUG = (0);
+#endif
+
+
+/* Return strings for glGetString(): GL_VENDOR and GL_RENDERER only, NULL
+ * otherwise.  GL_RENDERER is built in a static buffer rewritten per call. */
+static const GLubyte *radeonGetString( GLcontext *ctx, GLenum name )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   static char buffer[128];
+   unsigned   offset;
+   GLuint agp_mode = (rmesa->radeonScreen->card_type==RADEON_CARD_PCI) ? 0 :
+      rmesa->radeonScreen->AGPMode;
+
+   switch ( name ) {
+   case GL_VENDOR:
+      return (GLubyte *)"Tungsten Graphics, Inc.";
+
+   case GL_RENDERER:
+      offset = driGetRendererString( buffer, "Radeon", DRIVER_DATE,
+				     agp_mode );
+      /* Append the TCL tag without ever writing past the end of buffer. */
+      if ( offset < sizeof(buffer) )
+	 snprintf( & buffer[ offset ], sizeof(buffer) - offset, " %sTCL",
+		   !(rmesa->TclFallback & RADEON_TCL_FALLBACK_TCL_DISABLE)
+		   ? "" : "NO-" );
+      return (GLubyte *)buffer;
+
+   default:
+      return NULL;
+   }
+}
+
+
+/* Extension strings exported by the R100 driver.  The list is terminated by a
+ * { NULL, NULL } entry and is handed to driInitExtensions() in radeonCreateContext(). */
+const struct dri_extension card_extensions[] =
+{
+    { "GL_ARB_multisample",                GL_ARB_multisample_functions },
+    { "GL_ARB_multitexture",               NULL },
+    { "GL_ARB_texture_border_clamp",       NULL },
+    { "GL_ARB_texture_compression",        GL_ARB_texture_compression_functions },
+    { "GL_ARB_texture_env_add",            NULL },
+    { "GL_ARB_texture_env_combine",        NULL },
+    { "GL_ARB_texture_env_crossbar",       NULL },
+    { "GL_ARB_texture_env_dot3",           NULL },
+    { "GL_ARB_texture_mirrored_repeat",    NULL },
+    { "GL_ARB_vertex_buffer_object",       GL_ARB_vertex_buffer_object_functions },
+    { "GL_EXT_blend_logic_op",             NULL },
+    { "GL_EXT_blend_subtract",             GL_EXT_blend_minmax_functions }, /* NOTE(review): paired with the EXT_blend_minmax table - presumably intentional, confirm */
+    { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
+    { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
+    { "GL_EXT_stencil_wrap",               NULL },
+    { "GL_EXT_texture_edge_clamp",         NULL },
+    { "GL_EXT_texture_env_combine",        NULL },
+    { "GL_EXT_texture_env_dot3",           NULL },
+    { "GL_EXT_texture_filter_anisotropic", NULL },
+    { "GL_EXT_texture_lod_bias",           NULL },
+    { "GL_EXT_texture_mirror_clamp",       NULL },
+    { "GL_ATI_texture_env_combine3",       NULL },
+    { "GL_ATI_texture_mirror_once",        NULL },
+    { "GL_MESA_ycbcr_texture",             NULL },
+    { "GL_NV_blend_square",                NULL },
+    { "GL_SGIS_generate_mipmap",           NULL },
+    { NULL,                                NULL }
+};
+
+extern const struct tnl_pipeline_stage _radeon_render_stage;
+extern const struct tnl_pipeline_stage _radeon_tcl_stage;
+
+static const struct tnl_pipeline_stage *radeon_pipeline[] = {
+   /* NULL-terminated stage list installed by _tnl_install_pipeline() in radeonCreateContext(). */
+   /* Try and go straight to t&l
+    */
+   &_radeon_tcl_stage,  
+
+   /* Catch any t&l fallbacks
+    */
+   &_tnl_vertex_transform_stage,
+   &_tnl_normal_transform_stage,
+   &_tnl_lighting_stage,
+   &_tnl_fog_coordinate_stage,
+   &_tnl_texgen_stage,
+   &_tnl_texture_transform_stage,
+
+   &_radeon_render_stage,
+   &_tnl_render_stage,		/* FALLBACK:  */
+   NULL,
+};
+
+
+
+/* Initialize the driver's misc functions: only the GetString hook is set
+ * here, every other entry of the table is left as the caller passed it. */
+static void radeonInitDriverFuncs( struct dd_function_table *functions )
+{
+    functions->GetString	= radeonGetString;
+}
+
+static const struct dri_debug_control debug_control[] =	/* passed to driParseDebugString() together with getenv("RADEON_DEBUG") */
+{
+    { "fall",  DEBUG_FALLBACKS },
+    { "tex",   DEBUG_TEXTURE },
+    { "ioctl", DEBUG_IOCTL },
+    { "prim",  DEBUG_PRIMS },
+    { "vert",  DEBUG_VERTS },
+    { "state", DEBUG_STATE },
+    { "code",  DEBUG_CODEGEN },
+    { "vfmt",  DEBUG_VFMT },
+    { "vtxf",  DEBUG_VFMT },	/* second spelling for the same DEBUG_VFMT bit */
+    { "verb",  DEBUG_VERBOSE },
+    { "dri",   DEBUG_DRI },
+    { "dma",   DEBUG_DMA },
+    { "san",   DEBUG_SANITY },
+    { "sync",  DEBUG_SYNC },
+    { NULL,    0 }	/* end-of-table marker */
+};
+
+
+/* Create the device specific context.
+ * Returns GL_TRUE with driContextPriv->driverPrivate set on success, or
+ * GL_FALSE when the driver or Mesa context cannot be allocated.
+ */
+GLboolean
+radeonCreateContext( const __GLcontextModes *glVisual,
+                     __DRIcontextPrivate *driContextPriv,
+                     void *sharedContextPrivate)
+{
+   __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+   radeonScreenPtr screen = (radeonScreenPtr)(sPriv->private);
+   struct dd_function_table functions;
+   radeonContextPtr rmesa;
+   GLcontext *ctx, *shareCtx;
+   int i;
+   int tcl_mode, fthrottle_mode;
+
+   assert(glVisual);
+   assert(driContextPriv);
+   assert(screen);
+
+   /* Allocate the Radeon context */
+   rmesa = (radeonContextPtr) CALLOC( sizeof(*rmesa) );
+   if ( !rmesa )
+      return GL_FALSE;
+
+   /* init exp fog table data */
+   radeonInitStaticFogData();
+
+   /* Parse configuration files.
+    * Do this here so that initialMaxAnisotropy is set before we create
+    * the default textures.
+    */
+   driParseConfigFiles (&rmesa->optionCache, &screen->optionCache,
+			screen->driScreen->myNum, "radeon");
+   rmesa->initialMaxAnisotropy = driQueryOptionf(&rmesa->optionCache,
+                                                 "def_max_anisotropy");
+
+   if ( driQueryOptionb( &rmesa->optionCache, "hyperz" ) ) {
+      if ( sPriv->drmMinor < 13 )
+	 fprintf( stderr, "DRM version 1.%d too old to support HyperZ, "
+			  "disabling.\n",sPriv->drmMinor );
+      else
+	 rmesa->using_hyperz = GL_TRUE;
+   }
+
+   if ( sPriv->drmMinor >= 15 )
+      rmesa->texmicrotile = GL_TRUE;
+
+   /* Init default driver functions then plug in our Radeon-specific functions
+    * (the texture functions are especially important)
+    */
+   _mesa_init_driver_functions( &functions );
+   radeonInitDriverFuncs( &functions );
+   radeonInitTextureFuncs( &functions );
+
+   /* Allocate the Mesa context */
+   if (sharedContextPrivate)
+      shareCtx = ((radeonContextPtr) sharedContextPrivate)->glCtx;
+   else
+      shareCtx = NULL;
+   rmesa->glCtx = _mesa_create_context(glVisual, shareCtx,
+                                       &functions, (void *) rmesa);
+   if (!rmesa->glCtx) {
+      /* driParseConfigFiles() above filled the option cache: release it too. */
+      driDestroyOptionCache (&rmesa->optionCache);
+      FREE(rmesa);
+      return GL_FALSE;
+   }
+   driContextPriv->driverPrivate = rmesa;
+
+   /* Init radeon context data */
+   rmesa->dri.context = driContextPriv;
+   rmesa->dri.screen = sPriv;
+   rmesa->dri.drawable = NULL;
+   rmesa->dri.readable = NULL;
+   rmesa->dri.hwContext = driContextPriv->hHWContext;
+   rmesa->dri.hwLock = &sPriv->pSAREA->lock;
+   rmesa->dri.fd = sPriv->fd;
+   rmesa->dri.drmMinor = sPriv->drmMinor;
+
+   rmesa->radeonScreen = screen;
+   rmesa->sarea = (drm_radeon_sarea_t *)((GLubyte *)sPriv->pSAREA +
+				       screen->sarea_priv_offset);
+
+   rmesa->dma.buf0_address = rmesa->radeonScreen->buffers->list[0].address;
+
+   (void) memset( rmesa->texture_heaps, 0, sizeof( rmesa->texture_heaps ) );
+   make_empty_list( & rmesa->swapped );
+
+   rmesa->nr_heaps = screen->numTexHeaps;
+   for ( i = 0 ; i < rmesa->nr_heaps ; i++ ) {
+      rmesa->texture_heaps[i] = driCreateTextureHeap( i, rmesa,
+	    screen->texSize[i],
+	    12,
+	    RADEON_NR_TEX_REGIONS,
+	    (drmTextureRegionPtr)rmesa->sarea->tex_list[i],
+	    & rmesa->sarea->tex_age[i],
+	    & rmesa->swapped,
+	    sizeof( radeonTexObj ),
+	    (destroy_texture_object_t *) radeonDestroyTexObj );
+
+      driSetTextureSwapCounterLocation( rmesa->texture_heaps[i],
+					& rmesa->c_textureSwaps );
+   }
+   rmesa->texture_depth = driQueryOptioni (&rmesa->optionCache,
+					   "texture_depth");
+   if (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
+      rmesa->texture_depth = ( screen->cpp == 4 ) ?
+	 DRI_CONF_TEXTURE_DEPTH_32 : DRI_CONF_TEXTURE_DEPTH_16;
+
+   rmesa->swtcl.RenderIndex = ~0;
+   rmesa->hw.all_dirty = GL_TRUE;
+
+   /* Set the maximum texture size small enough that we can guarantee that
+    * all texture units can bind a maximal texture and have all of them in
+    * texturable memory at once. Depending on the allow_large_textures driconf
+    * setting allow larger textures.
+    */
+
+   ctx = rmesa->glCtx;
+   ctx->Const.MaxTextureUnits = driQueryOptioni (&rmesa->optionCache,
+						 "texture_units");
+   ctx->Const.MaxTextureImageUnits = ctx->Const.MaxTextureUnits;
+   ctx->Const.MaxTextureCoordUnits = ctx->Const.MaxTextureUnits;
+
+   i = driQueryOptioni( &rmesa->optionCache, "allow_large_textures");
+
+   driCalculateMaxTextureLevels( rmesa->texture_heaps,
+				 rmesa->nr_heaps,
+				 & ctx->Const,
+				 4,
+				 11, /* max 2D texture size is 2048x2048 */
+				 8,  /* 256^3 */
+				 9,  /* \todo: max cube texture size seems to be 512x512(x6) */
+				 11, /* max rect texture size is 2048x2048. */
+				 12,
+				 GL_FALSE,
+				 i );
+
+   ctx->Const.MaxTextureMaxAnisotropy = 16.0;
+
+   /* No wide points.
+    */
+   ctx->Const.MinPointSize = 1.0;
+   ctx->Const.MinPointSizeAA = 1.0;
+   ctx->Const.MaxPointSize = 1.0;
+   ctx->Const.MaxPointSizeAA = 1.0;
+
+   ctx->Const.MinLineWidth = 1.0;
+   ctx->Const.MinLineWidthAA = 1.0;
+   ctx->Const.MaxLineWidth = 10.0;
+   ctx->Const.MaxLineWidthAA = 10.0;
+   ctx->Const.LineWidthGranularity = 0.0625;
+
+   /* Set maxlocksize (and hence vb size) small enough to avoid
+    * fallbacks in radeon_tcl.c.  ie. guarantee that all vertices can
+    * fit in a single dma buffer for indexed rendering of quad strips,
+    * etc.
+    */
+   ctx->Const.MaxArrayLockSize = 
+      MIN2( ctx->Const.MaxArrayLockSize, 
+ 	    RADEON_BUFFER_SIZE / RADEON_MAX_TCL_VERTSIZE ); 
+
+   rmesa->boxes = 0;
+
+   /* Initialize the software rasterizer and helper modules.
+    */
+   _swrast_CreateContext( ctx );
+   _vbo_CreateContext( ctx );
+   _tnl_CreateContext( ctx );
+   _swsetup_CreateContext( ctx );
+   _ae_create_context( ctx );
+
+   /* Install the customized pipeline:
+    */
+   _tnl_destroy_pipeline( ctx );
+   _tnl_install_pipeline( ctx, radeon_pipeline );
+
+   /* Try and keep materials and vertices separate:
+    */
+/*    _tnl_isolate_materials( ctx, GL_TRUE ); */
+
+   /* Configure swrast and T&L to match hardware characteristics:
+    */
+   _swrast_allow_pixel_fog( ctx, GL_FALSE );
+   _swrast_allow_vertex_fog( ctx, GL_TRUE );
+   _tnl_allow_pixel_fog( ctx, GL_FALSE );
+   _tnl_allow_vertex_fog( ctx, GL_TRUE );
+
+   for ( i = 0 ; i < RADEON_MAX_TEXTURE_UNITS ; i++ ) {
+      _math_matrix_ctr( &rmesa->TexGenMatrix[i] );
+      _math_matrix_ctr( &rmesa->tmpmat[i] );
+      _math_matrix_set_identity( &rmesa->TexGenMatrix[i] );
+      _math_matrix_set_identity( &rmesa->tmpmat[i] );
+   }
+
+   driInitExtensions( ctx, card_extensions, GL_TRUE );
+   if (rmesa->radeonScreen->drmSupportsCubeMapsR100)
+      _mesa_enable_extension( ctx, "GL_ARB_texture_cube_map" );
+   if (rmesa->glCtx->Mesa_DXTn) {
+      _mesa_enable_extension( ctx, "GL_EXT_texture_compression_s3tc" );
+      _mesa_enable_extension( ctx, "GL_S3_s3tc" );
+   }
+   else if (driQueryOptionb (&rmesa->optionCache, "force_s3tc_enable")) {
+      _mesa_enable_extension( ctx, "GL_EXT_texture_compression_s3tc" );
+   }
+
+   if (rmesa->dri.drmMinor >= 9)
+      _mesa_enable_extension( ctx, "GL_NV_texture_rectangle");
+
+   /* XXX these should really go right after _mesa_init_driver_functions() */
+   radeonInitIoctlFuncs( ctx );
+   radeonInitStateFuncs( ctx );
+   radeonInitSpanFuncs( ctx );
+   radeonInitState( rmesa );
+   radeonInitSwtcl( ctx );
+
+   _mesa_vector4f_alloc( &rmesa->tcl.ObjClean, 0, 
+			 ctx->Const.MaxArrayLockSize, 32 );
+
+   fthrottle_mode = driQueryOptioni(&rmesa->optionCache, "fthrottle_mode");
+   rmesa->iw.irq_seq = -1;
+   rmesa->irqsEmitted = 0;
+   rmesa->do_irqs = (rmesa->radeonScreen->irq != 0 &&
+		     fthrottle_mode == DRI_CONF_FTHROTTLE_IRQS);
+
+   rmesa->do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
+
+   rmesa->vblank_flags = (rmesa->radeonScreen->irq != 0)
+       ? driGetDefaultVBlankFlags(&rmesa->optionCache) : VBLANK_FLAG_NO_IRQ;
+
+   (*dri_interface->getUST)( & rmesa->swap_ust );
+
+#if DO_DEBUG
+   RADEON_DEBUG = driParseDebugString( getenv( "RADEON_DEBUG" ),
+				       debug_control );
+#endif
+
+   tcl_mode = driQueryOptioni(&rmesa->optionCache, "tcl_mode");
+   if (driQueryOptionb(&rmesa->optionCache, "no_rast")) {
+      fprintf(stderr, "disabling 3D acceleration\n");
+      FALLBACK(rmesa, RADEON_FALLBACK_DISABLE, 1);
+   } else if (tcl_mode == DRI_CONF_TCL_SW ||
+	      !(rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
+      if (rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+	 rmesa->radeonScreen->chip_flags &= ~RADEON_CHIPSET_TCL;
+	 fprintf(stderr, "Disabling HW TCL support\n");
+      }
+      TCL_FALLBACK(rmesa->glCtx, RADEON_TCL_FALLBACK_TCL_DISABLE, 1);
+   }
+
+   if (rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+/*       _tnl_need_dlist_norm_lengths( ctx, GL_FALSE ); */
+   }
+   return GL_TRUE;
+}
+
+
+/* Destroy the Mesa and driver specific context data.
+ * A NULL driverPrivate trips the assert; in builds without asserts it is
+ * skipped instead of being dereferenced.
+ */
+void radeonDestroyContext( __DRIcontextPrivate *driContextPriv )
+{
+   GET_CURRENT_CONTEXT(ctx);
+   radeonContextPtr rmesa = (radeonContextPtr) driContextPriv->driverPrivate;
+   radeonContextPtr current = ctx ? RADEON_CONTEXT(ctx) : NULL;
+
+   assert(rmesa); /* should never be null */
+
+   /* check if we're deleting the currently bound context */
+   if (rmesa && rmesa == current) {
+      RADEON_FIREVERTICES( rmesa );
+      _mesa_make_current(NULL, NULL, NULL);
+   }
+
+   /* Free radeon context resources */
+   if ( rmesa ) {
+      GLboolean   release_texture_heaps;
+
+      release_texture_heaps = (rmesa->glCtx->Shared->RefCount == 1);
+      _swsetup_DestroyContext( rmesa->glCtx );
+      _tnl_DestroyContext( rmesa->glCtx );
+      _vbo_DestroyContext( rmesa->glCtx );
+      _swrast_DestroyContext( rmesa->glCtx );
+
+      radeonDestroySwtcl( rmesa->glCtx );
+      radeonReleaseArrays( rmesa->glCtx, ~0 );
+      if (rmesa->dma.current.buf) {
+	 radeonReleaseDmaRegion( rmesa, &rmesa->dma.current, __FUNCTION__ );
+	 radeonFlushCmdBuf( rmesa, __FUNCTION__ );
+      }
+
+      _mesa_vector4f_free( &rmesa->tcl.ObjClean );
+
+      if (rmesa->state.scissor.pClipRects) {
+	 FREE(rmesa->state.scissor.pClipRects);
+	 rmesa->state.scissor.pClipRects = NULL;
+      }
+
+      if ( release_texture_heaps ) {
+         /* This share group is about to go away, free our private
+          * texture object data.
+          */
+         int i;
+
+         for ( i = 0 ; i < rmesa->nr_heaps ; i++ ) {
+	    driDestroyTextureHeap( rmesa->texture_heaps[ i ] );
+	    rmesa->texture_heaps[ i ] = NULL;
+         }
+
+	 assert( is_empty_list( & rmesa->swapped ) );
+      }
+
+      /* free the Mesa context */
+      rmesa->glCtx->DriverCtx = NULL;
+      _mesa_destroy_context( rmesa->glCtx );
+
+      /* free the option cache */
+      driDestroyOptionCache (&rmesa->optionCache);
+
+      FREE( rmesa );
+   }
+}
+
+
+
+
+/* Swap buffers for dPriv: radeonPageFlip() if the bound context has
+ * doPageFlip set, else radeonCopyBuffer(); no-op unless double-buffered. */
+void
+radeonSwapBuffers( __DRIdrawablePrivate *dPriv )
+{
+   radeonContextPtr radeon;
+   GLcontext *glctx;
+
+   if (!dPriv->driContextPriv || !dPriv->driContextPriv->driverPrivate) {
+      /* XXX this shouldn't be an error but we can't handle it for now */
+      _mesa_problem(NULL, "%s: drawable has no context!", __FUNCTION__);
+      return;
+   }
+
+   radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+   glctx = radeon->glCtx;
+   if (!glctx->Visual.doubleBufferMode)
+      return;
+
+   _mesa_notifySwapBuffers( glctx );  /* flush pending rendering commands */
+   if ( radeon->doPageFlip )
+      radeonPageFlip( dPriv );
+   else
+      radeonCopyBuffer( dPriv, NULL );
+}
+
+/* Sub-rectangle variant of the buffer swap: builds a clip rectangle from
+ * (x, y, w, h), flipping y against dPriv->h and adding the drawable origin. */
+void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
+			 int x, int y, int w, int h )
+{
+   GLcontext *glctx;
+   drm_clip_rect_t box;
+
+   if (!dPriv->driContextPriv || !dPriv->driContextPriv->driverPrivate) {
+      /* XXX this shouldn't be an error but we can't handle it for now */
+      _mesa_problem(NULL, "%s: drawable has no context!", __FUNCTION__);
+      return;
+   }
+
+   glctx = ((radeonContextPtr) dPriv->driContextPriv->driverPrivate)->glCtx;
+   if (!glctx->Visual.doubleBufferMode)
+      return;
+
+   box.x1 = x + dPriv->x;
+   box.y1 = (dPriv->h - y - h) + dPriv->y;
+   box.x2 = box.x1 + w;
+   box.y2 = box.y1 + h;
+   _mesa_notifySwapBuffers(glctx);	/* flush pending rendering commands */
+   radeonCopyBuffer(dPriv, &box);
+}
+
+/* Make the context behind driContextPriv current and bind it to the given
+ * drawing and reading surfaces; a NULL driContextPriv unbinds instead.
+ */
+GLboolean
+radeonMakeCurrent( __DRIcontextPrivate *driContextPriv,
+                   __DRIdrawablePrivate *driDrawPriv,
+                   __DRIdrawablePrivate *driReadPriv )
+{
+   if ( !driContextPriv ) {
+      if (RADEON_DEBUG & DEBUG_DRI)
+         fprintf(stderr, "%s ctx is null\n", __FUNCTION__);
+      _mesa_make_current( NULL, NULL, NULL );
+   } else {
+      radeonContextPtr radeon =
+         (radeonContextPtr) driContextPriv->driverPrivate;
+      GLframebuffer *drawFb, *readFb;
+
+      if (RADEON_DEBUG & DEBUG_DRI)
+         fprintf(stderr, "%s ctx %p\n", __FUNCTION__, (void *) radeon->glCtx);
+
+      if ( radeon->dri.drawable != driDrawPriv ) {
+         /* XXX we may need to validate the drawable here!!! */
+         driDrawableInitVBlank( driDrawPriv, radeon->vblank_flags,
+                                &radeon->vbl_seq );
+      }
+
+      radeon->dri.readable = driReadPriv;
+
+      /* New drawable or changed stamp: redo cliprects and viewport offset. */
+      if ( radeon->dri.drawable != driDrawPriv ||
+           radeon->lastStamp != driDrawPriv->lastStamp ) {
+         radeon->dri.drawable = driDrawPriv;
+         radeonSetCliprects(radeon);
+         radeonUpdateViewportOffset( radeon->glCtx );
+      }
+
+      drawFb = (GLframebuffer *) driDrawPriv->driverPrivate;
+      readFb = (GLframebuffer *) driReadPriv->driverPrivate;
+      _mesa_make_current( radeon->glCtx, drawFb, readFb );
+      _mesa_update_state( radeon->glCtx );
+   }
+
+   if (RADEON_DEBUG & DEBUG_DRI)
+      fprintf(stderr, "End %s\n", __FUNCTION__);
+   return GL_TRUE;
+}
+
+/* Unbind request for a context.  This implementation only traces the call
+ * (when DEBUG_DRI is set) and returns GL_TRUE; nothing is unbound here. */
+GLboolean
+radeonUnbindContext( __DRIcontextPrivate *driContextPriv )
+{
+   radeonContextPtr rmesa = (radeonContextPtr) driContextPriv->driverPrivate;
+
+   if (RADEON_DEBUG & DEBUG_DRI)
+      fprintf(stderr, "%s ctx %p\n", __FUNCTION__, (void *) rmesa->glCtx);
+
+   return GL_TRUE;
+}
diff --git a/radeon/radeon_context.h b/radeon/radeon_context.h
new file mode 100644
index 0000000..8dedd66
--- /dev/null
+++ b/radeon/radeon_context.h
@@ -0,0 +1,764 @@
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Gareth Hughes <gareth@valinux.com>
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *   Kevin E. Martin <martin@valinux.com>
+ *   Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __RADEON_CONTEXT_H__
+#define __RADEON_CONTEXT_H__
+
+#include "tnl/t_vertex.h"
+#include "dri_util.h"
+#include "drm.h"
+#include "radeon_drm.h"
+#include "texmem.h"
+
+#include "macros.h"
+#include "mtypes.h"
+#include "colormac.h"
+
+struct radeon_context;
+typedef struct radeon_context radeonContextRec;
+typedef struct radeon_context *radeonContextPtr;
+
+/* This union is used to avoid warnings/miscompilation
+   with float to uint32_t casts due to strict-aliasing */
+typedef union {
+	GLfloat f;
+	uint32_t ui32;
+} float_ui32_type;
+
+#include "radeon_lock.h"
+#include "radeon_screen.h"
+#include "mm.h"
+
+#include "math/m_vector.h"
+
+#define TEX_0   0x1
+#define TEX_1   0x2
+#define TEX_2   0x4
+#define TEX_ALL 0x7
+
+/* Rasterizing fallbacks */
+/* See corresponding strings in r200_swtcl.c */
+#define RADEON_FALLBACK_TEXTURE		0x0001
+#define RADEON_FALLBACK_DRAW_BUFFER	0x0002
+#define RADEON_FALLBACK_STENCIL		0x0004
+#define RADEON_FALLBACK_RENDER_MODE	0x0008
+#define RADEON_FALLBACK_BLEND_EQ	0x0010
+#define RADEON_FALLBACK_BLEND_FUNC	0x0020
+#define RADEON_FALLBACK_DISABLE 	0x0040
+#define RADEON_FALLBACK_BORDER_MODE	0x0080
+
+/* The blit width for texture uploads
+ */
+#define BLIT_WIDTH_BYTES 1024
+
+/* Use the templated vertex format:
+ */
+#define COLOR_IS_RGBA
+#define TAG(x) radeon##x
+#include "tnl_dd/t_dd_vertex.h"
+#undef TAG
+
+typedef void (*radeon_tri_func) (radeonContextPtr,
+				 radeonVertex *,
+				 radeonVertex *, radeonVertex *);
+
+typedef void (*radeon_line_func) (radeonContextPtr,
+				  radeonVertex *, radeonVertex *);
+
+typedef void (*radeon_point_func) (radeonContextPtr, radeonVertex *);
+
+struct radeon_colorbuffer_state {
+	GLuint clear;
+	int roundEnable;
+};
+
+struct radeon_depthbuffer_state {
+	GLuint clear;
+	GLfloat scale;
+};
+
+struct radeon_scissor_state {
+	drm_clip_rect_t rect;
+	GLboolean enabled;
+
+	GLuint numClipRects;	/* Cliprects active */
+	GLuint numAllocedClipRects;	/* Cliprects available */
+	drm_clip_rect_t *pClipRects;
+};
+
+struct radeon_stencilbuffer_state {
+	GLboolean hwBuffer;
+	GLuint clear;		/* rb3d_stencilrefmask value */
+};
+
+struct radeon_stipple_state {
+	GLuint mask[32];
+};
+
+/* ST bit for a texture unit; used for both tcl_vtx and vc_frmt tex bits (they are identical) */
+#define RADEON_ST_BIT(unit) \
+((unit) == 0 ? RADEON_CP_VC_FRMT_ST0 : (RADEON_CP_VC_FRMT_ST1 >> 2) << (2 * (unit)))
+/* Q bit for a texture unit.  Both macros evaluate `unit' twice: no side effects in the argument. */
+#define RADEON_Q_BIT(unit) \
+((unit) == 0 ? RADEON_CP_VC_FRMT_Q0 : (RADEON_CP_VC_FRMT_Q1 >> 2) << (2 * (unit)))
+
+typedef struct radeon_tex_obj radeonTexObj, *radeonTexObjPtr;
+
+/* Texture object in locally shared texture space.
+ */
+struct radeon_tex_obj {
+	driTextureObject base;
+
+	GLuint bufAddr;		/* Offset to start of locally
+				   shared texture block */
+
+	GLuint dirty_state;	/* Flags (1 per texunit) for
+				   whether or not this texobj
+				   has dirty hardware state
+				   (pp_*) that needs to be
+				   brought into the
+				   texunit. */
+
+	drm_radeon_tex_image_t image[6][RADEON_MAX_TEXTURE_LEVELS];
+	/* Six, for the cube faces */
+
+	GLuint pp_txfilter;	/* hardware register values */
+	GLuint pp_txformat;
+	GLuint pp_txoffset;	/* Image location in texmem.
+				   All cube faces follow. */
+	GLuint pp_txsize;	/* npot only */
+	GLuint pp_txpitch;	/* npot only */
+	GLuint pp_border_color;
+	GLuint pp_cubic_faces;	/* cube face 1,2,3,4 log2 sizes */
+
+	GLboolean border_fallback;
+
+	GLuint tile_bits;	/* hw texture tile bits used on this texture */
+};
+
+struct radeon_texture_env_state {
+	radeonTexObjPtr texobj;
+	GLenum format;
+	GLenum envMode;
+};
+
+struct radeon_texture_state {
+	struct radeon_texture_env_state unit[RADEON_MAX_TEXTURE_UNITS];
+};
+
+struct radeon_state_atom {	/* one block of hardware state, kept as a list node */
+	struct radeon_state_atom *next, *prev;	/* list links */
+	const char *name;	/* for debug */
+	int cmd_size;		/* size in bytes -- NOTE(review): the *_STATE_SIZE constants count int slots; confirm the unit */
+	GLuint is_tcl;
+	int *cmd;		/* one or more cmd's */
+	int *lastcmd;		/* one or more cmd's */
+	GLboolean dirty;	/* dirty-mark in emit_state_list */
+	 GLboolean(*check) (GLcontext *);	/* is this state active? */
+};
+
+/* Trying to keep these relatively short as the variables are becoming
+ * extravagantly long.  Drop the driver name prefix off the front of
+ * everything - I think we know which driver we're in by now, and keep the
+ * prefix to 3 letters unless absolutely impossible.  
+ */
+
+#define CTX_CMD_0             0
+#define CTX_PP_MISC           1
+#define CTX_PP_FOG_COLOR      2
+#define CTX_RE_SOLID_COLOR    3
+#define CTX_RB3D_BLENDCNTL    4
+#define CTX_RB3D_DEPTHOFFSET  5
+#define CTX_RB3D_DEPTHPITCH   6
+#define CTX_RB3D_ZSTENCILCNTL 7
+#define CTX_CMD_1             8
+#define CTX_PP_CNTL           9
+#define CTX_RB3D_CNTL         10
+#define CTX_RB3D_COLOROFFSET  11
+#define CTX_CMD_2             12
+#define CTX_RB3D_COLORPITCH   13
+#define CTX_STATE_SIZE        14
+
+#define SET_CMD_0               0
+#define SET_SE_CNTL             1
+#define SET_SE_COORDFMT         2
+#define SET_CMD_1               3
+#define SET_SE_CNTL_STATUS      4
+#define SET_STATE_SIZE          5
+
+#define LIN_CMD_0               0
+#define LIN_RE_LINE_PATTERN     1
+#define LIN_RE_LINE_STATE       2
+#define LIN_CMD_1               3
+#define LIN_SE_LINE_WIDTH       4
+#define LIN_STATE_SIZE          5
+
+#define MSK_CMD_0               0
+#define MSK_RB3D_STENCILREFMASK 1
+#define MSK_RB3D_ROPCNTL        2
+#define MSK_RB3D_PLANEMASK      3
+#define MSK_STATE_SIZE          4
+
+#define VPT_CMD_0           0
+#define VPT_SE_VPORT_XSCALE          1
+#define VPT_SE_VPORT_XOFFSET         2
+#define VPT_SE_VPORT_YSCALE          3
+#define VPT_SE_VPORT_YOFFSET         4
+#define VPT_SE_VPORT_ZSCALE          5
+#define VPT_SE_VPORT_ZOFFSET         6
+#define VPT_STATE_SIZE      7
+
+#define MSC_CMD_0               0
+#define MSC_RE_MISC             1
+#define MSC_STATE_SIZE          2
+
+#define TEX_CMD_0                   0
+#define TEX_PP_TXFILTER             1
+#define TEX_PP_TXFORMAT             2
+#define TEX_PP_TXOFFSET             3
+#define TEX_PP_TXCBLEND             4
+#define TEX_PP_TXABLEND             5
+#define TEX_PP_TFACTOR              6
+#define TEX_CMD_1                   7
+#define TEX_PP_BORDER_COLOR         8
+#define TEX_STATE_SIZE              9
+
+#define TXR_CMD_0                   0	/* rectangle textures */
+#define TXR_PP_TEX_SIZE             1	/* 0x1d04, 0x1d0c for NPOT! */
+#define TXR_PP_TEX_PITCH            2	/* 0x1d08, 0x1d10 for NPOT! */
+#define TXR_STATE_SIZE              3
+
+#define CUBE_CMD_0                  0
+#define CUBE_PP_CUBIC_FACES         1
+#define CUBE_CMD_1                  2
+#define CUBE_PP_CUBIC_OFFSET_0      3
+#define CUBE_PP_CUBIC_OFFSET_1      4
+#define CUBE_PP_CUBIC_OFFSET_2      5
+#define CUBE_PP_CUBIC_OFFSET_3      6
+#define CUBE_PP_CUBIC_OFFSET_4      7
+#define CUBE_STATE_SIZE             8
+
+#define ZBS_CMD_0              0
+#define ZBS_SE_ZBIAS_FACTOR             1
+#define ZBS_SE_ZBIAS_CONSTANT           2
+#define ZBS_STATE_SIZE         3
+
+#define TCL_CMD_0                        0
+#define TCL_OUTPUT_VTXFMT         1
+#define TCL_OUTPUT_VTXSEL         2
+#define TCL_MATRIX_SELECT_0       3
+#define TCL_MATRIX_SELECT_1       4
+#define TCL_UCP_VERT_BLEND_CTL    5
+#define TCL_TEXTURE_PROC_CTL      6
+#define TCL_LIGHT_MODEL_CTL       7
+#define TCL_PER_LIGHT_CTL_0       8
+#define TCL_PER_LIGHT_CTL_1       9
+#define TCL_PER_LIGHT_CTL_2       10
+#define TCL_PER_LIGHT_CTL_3       11
+#define TCL_STATE_SIZE                   12
+
+#define MTL_CMD_0            0
+#define MTL_EMMISSIVE_RED    1
+#define MTL_EMMISSIVE_GREEN  2
+#define MTL_EMMISSIVE_BLUE   3
+#define MTL_EMMISSIVE_ALPHA  4
+#define MTL_AMBIENT_RED      5
+#define MTL_AMBIENT_GREEN    6
+#define MTL_AMBIENT_BLUE     7
+#define MTL_AMBIENT_ALPHA    8
+#define MTL_DIFFUSE_RED      9
+#define MTL_DIFFUSE_GREEN    10
+#define MTL_DIFFUSE_BLUE     11
+#define MTL_DIFFUSE_ALPHA    12
+#define MTL_SPECULAR_RED     13
+#define MTL_SPECULAR_GREEN   14
+#define MTL_SPECULAR_BLUE    15
+#define MTL_SPECULAR_ALPHA   16
+#define MTL_SHININESS        17
+#define MTL_STATE_SIZE       18
+
+#define VTX_CMD_0              0
+#define VTX_SE_COORD_FMT       1
+#define VTX_STATE_SIZE         2
+
+#define MAT_CMD_0              0
+#define MAT_ELT_0              1
+#define MAT_STATE_SIZE         17
+
+#define GRD_CMD_0                  0
+#define GRD_VERT_GUARD_CLIP_ADJ    1
+#define GRD_VERT_GUARD_DISCARD_ADJ 2
+#define GRD_HORZ_GUARD_CLIP_ADJ    3
+#define GRD_HORZ_GUARD_DISCARD_ADJ 4
+#define GRD_STATE_SIZE             5
+
+/* position changes frequently when lighting in modelpos - separate
+ * out to new state item?  
+ */
+#define LIT_CMD_0                  0
+#define LIT_AMBIENT_RED            1
+#define LIT_AMBIENT_GREEN          2
+#define LIT_AMBIENT_BLUE           3
+#define LIT_AMBIENT_ALPHA          4
+#define LIT_DIFFUSE_RED            5
+#define LIT_DIFFUSE_GREEN          6
+#define LIT_DIFFUSE_BLUE           7
+#define LIT_DIFFUSE_ALPHA          8
+#define LIT_SPECULAR_RED           9
+#define LIT_SPECULAR_GREEN         10
+#define LIT_SPECULAR_BLUE          11
+#define LIT_SPECULAR_ALPHA         12
+#define LIT_POSITION_X             13
+#define LIT_POSITION_Y             14
+#define LIT_POSITION_Z             15
+#define LIT_POSITION_W             16
+#define LIT_DIRECTION_X            17
+#define LIT_DIRECTION_Y            18
+#define LIT_DIRECTION_Z            19
+#define LIT_DIRECTION_W            20
+#define LIT_ATTEN_QUADRATIC        21
+#define LIT_ATTEN_LINEAR           22
+#define LIT_ATTEN_CONST            23
+#define LIT_ATTEN_XXX              24
+#define LIT_CMD_1                  25
+#define LIT_SPOT_DCD               26
+#define LIT_SPOT_EXPONENT          27
+#define LIT_SPOT_CUTOFF            28
+#define LIT_SPECULAR_THRESH        29
+#define LIT_RANGE_CUTOFF           30	/* ? */
+#define LIT_ATTEN_CONST_INV        31
+#define LIT_STATE_SIZE             32
+
+/* Fog
+ */
+#define FOG_CMD_0      0
+#define FOG_R          1
+#define FOG_C          2
+#define FOG_D          3
+#define FOG_PAD        4
+#define FOG_STATE_SIZE 5
+
+/* UCP
+ */
+#define UCP_CMD_0      0
+#define UCP_X          1
+#define UCP_Y          2
+#define UCP_Z          3
+#define UCP_W          4
+#define UCP_STATE_SIZE 5
+
+/* GLT - Global ambient
+ */
+#define GLT_CMD_0      0
+#define GLT_RED        1
+#define GLT_GREEN      2
+#define GLT_BLUE       3
+#define GLT_ALPHA      4
+#define GLT_STATE_SIZE 5
+
+/* EYE
+ */
+#define EYE_CMD_0          0
+#define EYE_X              1
+#define EYE_Y              2
+#define EYE_Z              3
+#define EYE_RESCALE_FACTOR 4
+#define EYE_STATE_SIZE     5
+
+#define SHN_CMD_0          0
+#define SHN_SHININESS      1
+#define SHN_STATE_SIZE     2
+
+struct radeon_hw_state {
+	/* Head of the linked list of state atoms. */
+	struct radeon_state_atom atomlist;
+
+	/* Hardware state, stored as cmdbuf commands:  
+	 *   -- Need to doublebuffer for
+	 *           - eliding noop statechange loops? (except line stipple count)
+	 */
+	struct radeon_state_atom ctx;
+	struct radeon_state_atom set;
+	struct radeon_state_atom lin;
+	struct radeon_state_atom msk;
+	struct radeon_state_atom vpt;
+	struct radeon_state_atom tcl;
+	struct radeon_state_atom msc;
+	struct radeon_state_atom tex[3];
+	struct radeon_state_atom cube[3];
+	struct radeon_state_atom zbs;
+	struct radeon_state_atom mtl;
+	struct radeon_state_atom mat[6];
+	struct radeon_state_atom lit[8];	/* includes vec, scl commands */
+	struct radeon_state_atom ucp[6];
+	struct radeon_state_atom eye;	/* eye pos */
+	struct radeon_state_atom grd;	/* guard band clipping */
+	struct radeon_state_atom fog;
+	struct radeon_state_atom glt;
+	struct radeon_state_atom txr[3];	/* for NPOT */
+
+	int max_state_size;	/* Number of bytes necessary for a full state emit. */
+	GLboolean is_dirty, all_dirty;
+};
+
+struct radeon_state {
+	/* Derived state for internal purposes:
+	 */
+	struct radeon_colorbuffer_state color;
+	struct radeon_depthbuffer_state depth;
+	struct radeon_scissor_state scissor;
+	struct radeon_stencilbuffer_state stencil;
+	struct radeon_stipple_state stipple;
+	struct radeon_texture_state texture;
+};
+
+/* Need refcounting on dma buffers:
+ */
+struct radeon_dma_buffer {
+	int refcount;		/* the number of retained regions in buf */
+	drmBufPtr buf;
+};
+
+#define GET_START(rvb) (rmesa->radeonScreen->gart_buffer_offset +			\
+			(rvb)->address - rmesa->dma.buf0_address +	\
+			(rvb)->start)
+
+/* A retained region, eg vertices for indexed vertices.
+ */
+struct radeon_dma_region {
+	struct radeon_dma_buffer *buf;	/* refcounted buffer this region lives in */
+	char *address;		/* NOTE(review): radeon_dma_buffer has no `address' member; presumably == buf->buf->address -- confirm */
+	int start, end, ptr;	/* offsets from start of buf */
+	int aos_start;
+	int aos_stride;
+	int aos_size;
+};
+
+struct radeon_dma {
+	/* Active dma region.  Allocations for vertices and retained
+	 * regions come from here.  Also used for emitting random vertices,
+	 * these may be flushed by calling flush_current();
+	 */
+	struct radeon_dma_region current;
+
+	void (*flush) (radeonContextPtr);
+
+	char *buf0_address;	/* start of buf[0], for index calcs */
+	GLuint nr_released_bufs;	/* flush after so many buffers released */
+};
+
+struct radeon_dri_mirror {
+	__DRIcontextPrivate *context;	/* DRI context */
+	__DRIscreenPrivate *screen;	/* DRI screen */
+
+   /**
+    * DRI drawable bound to this context for drawing.
+    */
+	__DRIdrawablePrivate *drawable;
+
+   /**
+    * DRI drawable bound to this context for reading.
+    */
+	__DRIdrawablePrivate *readable;
+
+	drm_context_t hwContext;
+	drm_hw_lock_t *hwLock;
+	int fd;
+	int drmMinor;
+};
+
+#define RADEON_CMD_BUF_SZ  (8*1024)
+
+struct radeon_store {
+	GLuint statenr;
+	GLuint primnr;
+	char cmd_buf[RADEON_CMD_BUF_SZ];
+	int cmd_used;
+	int elts_start;
+};
+
+/* radeon_tcl.c
+ */
+struct radeon_tcl_info {
+	GLuint vertex_format;
+	GLuint hw_primitive;
+
+	/* Temporary for cases where incoming vertex data is incompatible
+	 * with maos code.
+	 */
+	GLvector4f ObjClean;
+
+	struct radeon_dma_region *aos_components[8];
+	GLuint nr_aos_components;
+
+	GLuint *Elts;
+
+	struct radeon_dma_region indexed_verts;
+	struct radeon_dma_region obj;
+	struct radeon_dma_region rgba;
+	struct radeon_dma_region spec;
+	struct radeon_dma_region fog;
+	struct radeon_dma_region tex[RADEON_MAX_TEXTURE_UNITS];
+	struct radeon_dma_region norm;
+};
+
+/* radeon_swtcl.c
+ */
+struct radeon_swtcl_info {
+	GLuint RenderIndex;
+	GLuint vertex_size;
+	GLuint vertex_format;
+
+	struct tnl_attr_map vertex_attrs[VERT_ATTRIB_MAX];
+	GLuint vertex_attr_count;
+
+	GLubyte *verts;
+
+	/* Fallback rasterization functions
+	 */
+	radeon_point_func draw_point;
+	radeon_line_func draw_line;
+	radeon_tri_func draw_tri;
+
+	GLuint hw_primitive;
+	GLenum render_primitive;
+	GLuint numverts;
+
+   /**
+    * Offset of the 4UB color data within a hardware (swtcl) vertex.
+    */
+	GLuint coloroffset;
+
+   /**
+    * Offset of the 3UB specular color data within a hardware (swtcl) vertex.
+    */
+	GLuint specoffset;
+
+	GLboolean needproj;
+
+	struct radeon_dma_region indexed_verts;
+};
+
+struct radeon_ioctl {
+	GLuint vertex_offset;
+	GLuint vertex_size;
+};
+
+#define RADEON_MAX_PRIMS 64
+
+struct radeon_prim {
+	GLuint start;
+	GLuint end;
+	GLuint prim;
+};
+
+/* A maximum total of 20 elements per vertex:  3 floats for position, 3
+ * floats for normal, 4 floats for color, 4 bytes for secondary color,
+ * 3 floats for each texture unit (9 floats total).
+ * 
+ * The position data is never actually stored here, so 3 elements could be
+ * trimmed out of the buffer. This number is only valid for vtxfmt!
+ */
+#define RADEON_MAX_VERTEX_SIZE 20
+
+struct radeon_context {
+	GLcontext *glCtx;	/* Mesa context */
+
+	/* Driver and hardware state management
+	 */
+	struct radeon_hw_state hw;
+	struct radeon_state state;
+
+	/* Texture object bookkeeping
+	 */
+	unsigned nr_heaps;
+	driTexHeap *texture_heaps[RADEON_NR_TEX_HEAPS];
+	driTextureObject swapped;
+	int texture_depth;
+	float initialMaxAnisotropy;
+
+	/* Rasterization and vertex state:
+	 */
+	GLuint TclFallback;
+	GLuint Fallback;
+	GLuint NewGLState;
+	 DECLARE_RENDERINPUTS(tnl_index_bitset);	/* index of bits for last tnl_install_attrs */
+
+	/* Vertex buffers
+	 */
+	struct radeon_ioctl ioctl;
+	struct radeon_dma dma;
+	struct radeon_store store;
+	/* A full state emit as of the first state emit in the main store, in case
+	 * the context is lost.
+	 */
+	struct radeon_store backup_store;
+
+	/* Page flipping
+	 */
+	GLuint doPageFlip;
+
+	/* Busy waiting
+	 */
+	GLuint do_usleeps;
+	GLuint do_irqs;
+	GLuint irqsEmitted;
+	drm_radeon_irq_wait_t iw;
+
+	/* Drawable, cliprect and scissor information
+	 */
+	GLuint numClipRects;	/* Cliprects for the draw buffer */
+	drm_clip_rect_t *pClipRects;
+	unsigned int lastStamp;
+	GLboolean lost_context;
+	GLboolean save_on_next_emit;
+	radeonScreenPtr radeonScreen;	/* Screen private DRI data */
+	drm_radeon_sarea_t *sarea;	/* Private SAREA data */
+
+	/* TCL stuff
+	 */
+	GLmatrix TexGenMatrix[RADEON_MAX_TEXTURE_UNITS];
+	GLboolean recheck_texgen[RADEON_MAX_TEXTURE_UNITS];
+	GLboolean TexGenNeedNormals[RADEON_MAX_TEXTURE_UNITS];
+	GLuint TexGenEnabled;
+	GLuint NeedTexMatrix;
+	GLuint TexMatColSwap;
+	GLmatrix tmpmat[RADEON_MAX_TEXTURE_UNITS];
+	GLuint last_ReallyEnabled;
+
+	/* VBI
+	 */
+	GLuint vbl_seq;
+	GLuint vblank_flags;
+
+	int64_t swap_ust;
+	int64_t swap_missed_ust;
+
+	GLuint swap_count;
+	GLuint swap_missed_count;
+
+	/* radeon_tcl.c
+	 */
+	struct radeon_tcl_info tcl;
+
+	/* radeon_swtcl.c
+	 */
+	struct radeon_swtcl_info swtcl;
+
+	/* Mirrors of some DRI state
+	 */
+	struct radeon_dri_mirror dri;
+
+	/* Configuration cache
+	 */
+	driOptionCache optionCache;
+
+	GLboolean using_hyperz;
+	GLboolean texmicrotile;
+
+	/* Performance counters
+	 */
+	GLuint boxes;		/* Draw performance boxes */
+	GLuint hardwareWentIdle;
+	GLuint c_clears;
+	GLuint c_drawWaits;
+	GLuint c_textureSwaps;
+	GLuint c_textureBytes;
+	GLuint c_vertexBuffers;
+};
+
+#define RADEON_CONTEXT(ctx)		((radeonContextPtr)((ctx)->DriverCtx))	/* GLcontext * -> its DriverCtx as radeonContextPtr */
+
+/* Pack a colour according to cpp: PACK_COLOR_565(r, g, b) for 2,
+ * PACK_COLOR_8888(a, r, g, b) for 4, and 0 for any other value. */
+static __inline GLuint radeonPackColor(GLuint cpp,
+				       GLubyte r, GLubyte g,
+				       GLubyte b, GLubyte a)
+{
+	if (cpp == 2)
+		return PACK_COLOR_565(r, g, b);
+	if (cpp == 4)
+		return PACK_COLOR_8888(a, r, g, b);
+	/* any other cpp value */
+	return 0;
+}
+
+#define RADEON_OLD_PACKETS 1
+
+extern void radeonDestroyContext(__DRIcontextPrivate * driContextPriv);
+extern GLboolean radeonCreateContext(const __GLcontextModes * glVisual,
+				     __DRIcontextPrivate * driContextPriv,
+				     void *sharedContextPrivate);
+extern void radeonSwapBuffers(__DRIdrawablePrivate * dPriv);
+extern void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
+				int x, int y, int w, int h);
+extern GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
+				   __DRIdrawablePrivate * driDrawPriv,
+				   __DRIdrawablePrivate * driReadPriv);
+extern GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv);
+
+/* ================================================================
+ * Debugging:
+ */
+#define DO_DEBUG		1
+
+#if DO_DEBUG
+extern int RADEON_DEBUG;
+#else
+#define RADEON_DEBUG		0
+#endif
+
+#define DEBUG_TEXTURE	0x0001
+#define DEBUG_STATE	0x0002
+#define DEBUG_IOCTL	0x0004
+#define DEBUG_PRIMS	0x0008
+#define DEBUG_VERTS	0x0010
+#define DEBUG_FALLBACKS	0x0020
+#define DEBUG_VFMT	0x0040
+#define DEBUG_CODEGEN	0x0080
+#define DEBUG_VERBOSE	0x0100
+#define DEBUG_DRI       0x0200
+#define DEBUG_DMA       0x0400
+#define DEBUG_SANITY    0x0800
+#define DEBUG_SYNC      0x1000
+
+#endif				/* __RADEON_CONTEXT_H__ */
diff --git a/radeon/radeon_ioctl.c b/radeon/radeon_ioctl.c
new file mode 100644
index 0000000..4c64bc2
--- /dev/null
+++ b/radeon/radeon_ioctl.c
@@ -0,0 +1,1276 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_ioctl.c,v 1.11 2003/01/29 22:04:59 dawes Exp $ */
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Kevin E. Martin <martin@valinux.com>
+ *   Gareth Hughes <gareth@valinux.com>
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include <sched.h>
+#include <errno.h> 
+
+#include "glheader.h"
+#include "imports.h"
+#include "simple_list.h"
+#include "swrast/swrast.h"
+
+#include "radeon_context.h"
+#include "radeon_state.h"
+#include "radeon_ioctl.h"
+#include "radeon_tcl.h"
+#include "radeon_sanity.h"
+
+#define STANDALONE_MMIO
+#include "radeon_macros.h"  /* for INREG() */
+
+#include "drirenderbuffer.h"
+#include "vblank.h"
+
+#define RADEON_TIMEOUT             512
+#define RADEON_IDLE_RETRY           16
+
+
+static void radeonWaitForIdle( radeonContextPtr rmesa );
+static int radeonFlushCmdBufLocked( radeonContextPtr rmesa, 
+				    const char * caller );
+
+static void print_state_atom( struct radeon_state_atom *state )
+{
+   int dword;
+
+   /* Summary line: atom name and size; cmd_size counts cmd[] entries. */
+   fprintf(stderr, "emit %s/%d\n", state->name, state->cmd_size);
+   if (!(RADEON_DEBUG & DEBUG_VERBOSE))
+      return;			/* the full dump is verbose-only */
+   for (dword = 0 ; dword < state->cmd_size ; dword++)
+      fprintf(stderr, "\t%s[%d]: %x\n", state->name, dword, state->cmd[dword]);
+}
+
+static void radeonSaveHwState( radeonContextPtr rmesa )
+{
+   struct radeon_state_atom *atom;
+   char * dest = rmesa->backup_store.cmd_buf;
+   /* Snapshot every atom whose check() passes into backup_store. */
+   if (RADEON_DEBUG & DEBUG_STATE)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+   
+   rmesa->backup_store.cmd_used = 0;
+   /* Atoms are copied back to back, in atom-list order. */
+   foreach( atom, &rmesa->hw.atomlist ) {
+      if ( atom->check( rmesa->glCtx ) ) {
+	 int size = atom->cmd_size * 4;	/* cmd_size is in 4-byte units */
+	 memcpy( dest, atom->cmd, size);
+	 dest += size;
+	 rmesa->backup_store.cmd_used += size;
+	 if (RADEON_DEBUG & DEBUG_STATE)
+	    print_state_atom( atom );
+      }
+   }
+   /* NOTE: the size check below runs only after the copies were made. */
+   assert( rmesa->backup_store.cmd_used <= RADEON_CMD_BUF_SZ );
+   if (RADEON_DEBUG & DEBUG_STATE)
+      fprintf(stderr, "Returning to radeonEmitState\n");
+}
+
+/* Called from radeonFlushCmdBufLocked when the context was lost.  The state
+ * saved in backup_store is swapped in as the active command buffer, flushed,
+ * and the interrupted buffer is then put back, so that commands at the start
+ * of a cmdbuf can rely on the state being kept from the previous one.
+ */
+static void radeonBackUpAndEmitLostStateLocked( radeonContextPtr rmesa )
+{
+   struct radeon_store pending;
+   GLuint pending_released;
+
+   if (rmesa->backup_store.cmd_used != 0) {
+      if (RADEON_DEBUG & DEBUG_STATE)
+	 fprintf(stderr, "Emitting backup state on lost context\n");
+
+      /* Clear the flag first: the flush below tests it again. */
+      rmesa->lost_context = GL_FALSE;
+
+      pending = rmesa->store;
+      pending_released = rmesa->dma.nr_released_bufs;
+      rmesa->store = rmesa->backup_store;
+      rmesa->dma.nr_released_bufs = 0;
+      radeonFlushCmdBufLocked( rmesa, __FUNCTION__ );
+      rmesa->store = pending;
+      rmesa->dma.nr_released_bufs = pending_released;
+   }
+}
+
+/* =============================================================
+ * Kernel command buffer handling
+ */
+
+/* Build rmesa->hw.atomlist.  The state atoms will be emitted in the order
+ * they appear in the atom list, so the insertion order below is important.
+ */
+void radeonSetUpAtomList( radeonContextPtr rmesa )
+{
+   int i, mtu = rmesa->glCtx->Const.MaxTextureUnits;
+
+   make_empty_list(&rmesa->hw.atomlist);
+   rmesa->hw.atomlist.name = "atom-list";
+
+   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.ctx);
+   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.set);
+   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.lin);
+   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.msk);
+   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.vpt);
+   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.tcl);
+   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.msc);
+   for (i = 0; i < mtu; ++i) {	/* tex, txr and cube atom per texture unit */
+       insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.tex[i]);
+       insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.txr[i]);
+       insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.cube[i]);
+   }
+   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.zbs);
+   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.mtl);
+   for (i = 0; i < 3 + mtu; ++i)	/* 3 + MaxTextureUnits mat atoms */
+      insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.mat[i]);
+   for (i = 0; i < 8; ++i)	/* 8 lit atoms */
+      insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.lit[i]);
+   for (i = 0; i < 6; ++i)	/* 6 ucp atoms */
+      insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.ucp[i]);
+   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.eye);
+   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.grd);
+   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.fog);
+   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.glt);
+}
+
+void radeonEmitState( radeonContextPtr rmesa )
+{
+   struct radeon_state_atom *atom;
+   char *dest;
+   /* Append all dirty state atoms to the current command buffer. */
+   if (RADEON_DEBUG & (DEBUG_STATE|DEBUG_PRIMS))
+      fprintf(stderr, "%s\n", __FUNCTION__);
+   /* A flush sets save_on_next_emit; take the state snapshot now. */
+   if (rmesa->save_on_next_emit) {
+      radeonSaveHwState(rmesa);
+      rmesa->save_on_next_emit = GL_FALSE;
+   }
+
+   /* this code used to return here but now it emits zbs */
+
+   /* To avoid going across the entire set of states multiple times, just check
+    * for enough space for the case of emitting all state, and inline the
+    * radeonAllocCmdBuf code here without all the checks.
+    */
+   radeonEnsureCmdBufSpace(rmesa, rmesa->hw.max_state_size);
+   dest = rmesa->store.cmd_buf + rmesa->store.cmd_used;
+
+   /* We always emit zbs, this is due to a bug found by keithw in
+      the hardware and rediscovered after Eric's changes by me.
+      If you ever touch this code make sure you emit zbs, otherwise
+      you get tcl lockups on at least M7/7500 class of chips - airlied */
+   rmesa->hw.zbs.dirty=1;
+
+   if (RADEON_DEBUG & DEBUG_STATE) {	/* debug only: report each dirty atom */
+      foreach(atom, &rmesa->hw.atomlist) {
+	 if (atom->dirty || rmesa->hw.all_dirty) {
+	    if (atom->check(rmesa->glCtx))
+	       print_state_atom(atom);
+	    else
+	       fprintf(stderr, "skip state %s\n", atom->name);
+	 }
+      }
+   }
+   /* Copy each dirty atom that passes check() and mark it clean. */
+   foreach(atom, &rmesa->hw.atomlist) {
+      if (rmesa->hw.all_dirty)
+	 atom->dirty = GL_TRUE;
+      if (!(rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL) &&
+	   atom->is_tcl)
+	 atom->dirty = GL_FALSE;	/* chip lacks TCL: is_tcl atoms are never emitted */
+      if (atom->dirty) {
+	 if (atom->check(rmesa->glCtx)) {
+	    int size = atom->cmd_size * 4;	/* cmd_size is in 4-byte units */
+	    memcpy(dest, atom->cmd, size);
+	    dest += size;
+	    rmesa->store.cmd_used += size;
+	    atom->dirty = GL_FALSE;
+	 }
+      }
+   }
+
+   assert(rmesa->store.cmd_used <= RADEON_CMD_BUF_SZ);
+ 
+   rmesa->hw.is_dirty = GL_FALSE;
+   rmesa->hw.all_dirty = GL_FALSE;
+}
+
+/* Fire a section of the retained (indexed_verts) buffer as a regular
+ * primitive.  Emits pending state first, then one PACKET3_CLIP command.
+ */
+extern void radeonEmitVbufPrim( radeonContextPtr rmesa,
+				GLuint vertex_format,
+				GLuint primitive,
+				GLuint vertex_nr )
+{
+   drm_radeon_cmd_header_t *cmd;
+
+   /* Indexed (PRIM_WALK_IND) primitives use radeonAllocEltsOpenEnded. */
+   assert(!(primitive & RADEON_CP_VC_CNTL_PRIM_WALK_IND));
+   
+   radeonEmitState( rmesa );
+
+   if (RADEON_DEBUG & DEBUG_IOCTL)
+      fprintf(stderr, "%s cmd_used/4: %d\n", __FUNCTION__,
+	      rmesa->store.cmd_used/4);
+   
+   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, VBUF_BUFSZ,
+						       __FUNCTION__ );
+#if RADEON_OLD_PACKETS
+   cmd[0].i = 0;
+   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
+   cmd[1].i = RADEON_CP_PACKET3_3D_RNDR_GEN_INDX_PRIM | (3 << 16);
+   cmd[2].i = rmesa->ioctl.vertex_offset;	/* set by radeonEmitVertexAOS / radeonEmitAOS */
+   cmd[3].i = vertex_nr;
+   cmd[4].i = vertex_format;
+   cmd[5].i = (primitive | 
+	       RADEON_CP_VC_CNTL_PRIM_WALK_LIST |
+	       RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
+	       RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
+	       (vertex_nr << RADEON_CP_VC_CNTL_NUM_SHIFT));
+
+   if (RADEON_DEBUG & DEBUG_PRIMS)
+      fprintf(stderr, "%s: header 0x%x offt 0x%x vfmt 0x%x vfcntl %x \n",
+	      __FUNCTION__,
+	      cmd[1].i, cmd[2].i, cmd[4].i, cmd[5].i);
+#else
+   cmd[0].i = 0;
+   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
+   cmd[1].i = RADEON_CP_PACKET3_3D_DRAW_VBUF | (1 << 16);
+   cmd[2].i = vertex_format;
+   cmd[3].i = (primitive | 
+	       RADEON_CP_VC_CNTL_PRIM_WALK_LIST |
+	       RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
+	       RADEON_CP_VC_CNTL_MAOS_ENABLE |
+	       RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
+	       (vertex_nr << RADEON_CP_VC_CNTL_NUM_SHIFT));
+
+
+   if (RADEON_DEBUG & DEBUG_PRIMS)
+      fprintf(stderr, "%s: header 0x%x vfmt 0x%x vfcntl %x \n",
+	      __FUNCTION__,
+	      cmd[1].i, cmd[2].i, cmd[3].i);
+#endif
+}
+
+/* Close the open-ended elt packet started by radeonAllocEltsOpenEnded. */
+void radeonFlushElts( radeonContextPtr rmesa )
+{
+   int *cmd = (int *)(rmesa->store.cmd_buf + rmesa->store.elts_start);
+   int dwords;
+#if RADEON_OLD_PACKETS
+   int nr = (rmesa->store.cmd_used - (rmesa->store.elts_start + 24)) / 2;	/* 24-byte header, 2-byte elts */
+#else
+   int nr = (rmesa->store.cmd_used - (rmesa->store.elts_start + 16)) / 2;	/* 16-byte header, 2-byte elts */
+#endif
+
+   if (RADEON_DEBUG & DEBUG_IOCTL)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+   assert( rmesa->dma.flush == radeonFlushElts );
+   rmesa->dma.flush = NULL;
+
+   /* Cope with odd number of elts: cmd_used (assumed to be even here) is
+    * rounded up to a multiple of four bytes. */
+   rmesa->store.cmd_used = (rmesa->store.cmd_used + 2) & ~2;
+   dwords = (rmesa->store.cmd_used - rmesa->store.elts_start) / 4;
+
+#if RADEON_OLD_PACKETS
+   cmd[1] |= (dwords - 3) << 16;	/* fill in the fields left open at allocation */
+   cmd[5] |= nr << RADEON_CP_VC_CNTL_NUM_SHIFT;
+#else
+   cmd[1] |= (dwords - 3) << 16;
+   cmd[3] |= nr << RADEON_CP_VC_CNTL_NUM_SHIFT;
+#endif
+
+   if (RADEON_DEBUG & DEBUG_SYNC) {
+      fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
+      radeonFinish( rmesa->glCtx );
+   }
+}
+
+/* Start an indexed-primitive packet; returns where the caller writes elts. */
+GLushort *radeonAllocEltsOpenEnded( radeonContextPtr rmesa,
+				    GLuint vertex_format,
+				    GLuint primitive,
+				    GLuint min_nr )
+{
+   drm_radeon_cmd_header_t *cmd;
+   GLushort *retval;
+
+   if (RADEON_DEBUG & DEBUG_IOCTL)
+      fprintf(stderr, "%s %d\n", __FUNCTION__, min_nr);
+
+   assert((primitive & RADEON_CP_VC_CNTL_PRIM_WALK_IND));
+   
+   radeonEmitState( rmesa );
+   
+   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa,
+						       ELTS_BUFSZ(min_nr),
+						       __FUNCTION__ );
+#if RADEON_OLD_PACKETS
+   cmd[0].i = 0;
+   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
+   cmd[1].i = RADEON_CP_PACKET3_3D_RNDR_GEN_INDX_PRIM;	/* count is OR-ed in by radeonFlushElts */
+   cmd[2].i = rmesa->ioctl.vertex_offset;
+   cmd[3].i = 0xffff;	/* the slot that carries vertex_nr in radeonEmitVbufPrim */
+   cmd[4].i = vertex_format;
+   cmd[5].i = (primitive | 
+	       RADEON_CP_VC_CNTL_PRIM_WALK_IND |
+	       RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
+	       RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE);
+
+   retval = (GLushort *)(cmd+6);	/* elts follow the 6-dword header */
+#else   
+   cmd[0].i = 0;
+   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
+   cmd[1].i = RADEON_CP_PACKET3_3D_DRAW_INDX;
+   cmd[2].i = vertex_format;
+   cmd[3].i = (primitive | 
+	       RADEON_CP_VC_CNTL_PRIM_WALK_IND |
+	       RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
+	       RADEON_CP_VC_CNTL_MAOS_ENABLE |
+	       RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE);
+
+   retval = (GLushort *)(cmd+4);	/* elts follow the 4-dword header */
+#endif
+
+   if (RADEON_DEBUG & DEBUG_PRIMS)
+      fprintf(stderr, "%s: header 0x%x vfmt 0x%x prim %x \n",
+	      __FUNCTION__,
+	      cmd[1].i, vertex_format, primitive);
+
+   assert(!rmesa->dma.flush);
+   rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
+   rmesa->dma.flush = radeonFlushElts;	/* the packet is completed by radeonFlushElts */
+   /* Remember the packet start (a byte offset) for radeonFlushElts. */
+   rmesa->store.elts_start = ((char *)cmd) - rmesa->store.cmd_buf;
+
+   return retval;
+}
+
+
+/* Record the vertex buffer (old packets) or emit a LOAD_VBPNTR packet. */
+void radeonEmitVertexAOS( radeonContextPtr rmesa,
+			  GLuint vertex_size,
+			  GLuint offset )
+{
+#if RADEON_OLD_PACKETS
+   rmesa->ioctl.vertex_size = vertex_size;	/* nothing is emitted in this mode */
+   rmesa->ioctl.vertex_offset = offset;	/* read back when a primitive packet is built */
+#else
+   drm_radeon_cmd_header_t *cmd;
+
+   if (RADEON_DEBUG & (DEBUG_PRIMS|DEBUG_IOCTL))
+      fprintf(stderr, "%s:  vertex_size 0x%x offset 0x%x \n",
+	      __FUNCTION__, vertex_size, offset);
+
+   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, VERT_AOS_BUFSZ,
+						  __FUNCTION__ );
+
+   cmd[0].i = 0;
+   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
+   cmd[1].i = RADEON_CP_PACKET3_3D_LOAD_VBPNTR | (2 << 16);
+   cmd[2].i = 1;	/* one array; compare cmd[2].i = nr in radeonEmitAOS */
+   cmd[3].i = vertex_size | (vertex_size << 8);	/* size in bits 0-7, stride in bits 8-15, as in radeonEmitAOS */
+   cmd[4].i = offset;
+#endif
+}
+		       
+/* Describe nr vertex arrays; offset is scaled by aos_stride * 4 per array. */
+void radeonEmitAOS( radeonContextPtr rmesa,
+		    struct radeon_dma_region **component,
+		    GLuint nr,
+		    GLuint offset )
+{
+#if RADEON_OLD_PACKETS
+   assert( nr == 1 );	/* old packets support a single, tightly packed array */
+   assert( component[0]->aos_size == component[0]->aos_stride );
+   rmesa->ioctl.vertex_size = component[0]->aos_size;
+   rmesa->ioctl.vertex_offset = 
+      (component[0]->aos_start + offset * component[0]->aos_stride * 4);
+#else
+   drm_radeon_cmd_header_t *cmd;
+   int sz = AOS_BUFSZ(nr);	/* packet size in bytes */
+   int i;
+   int *tmp;
+
+   if (RADEON_DEBUG & DEBUG_IOCTL)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+
+   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, sz,
+						  __FUNCTION__ );
+   cmd[0].i = 0;
+   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
+   cmd[1].i = RADEON_CP_PACKET3_3D_LOAD_VBPNTR | (((sz / sizeof(int))-3) << 16);
+   cmd[2].i = nr;
+   tmp = &cmd[0].i;	/* start of the packet, kept for the debug dump */
+   cmd += 3;
+   /* Arrays are packed in pairs: three dwords describe two arrays. */
+   for (i = 0 ; i < nr ; i++) {
+      if (i & 1) {
+	 cmd[0].i |= ((component[i]->aos_stride << 24) | 
+		      (component[i]->aos_size << 16));
+	 cmd[2].i = (component[i]->aos_start + 
+		     offset * component[i]->aos_stride * 4);
+	 cmd += 3;
+      }
+      else {
+	 cmd[0].i = ((component[i]->aos_stride << 8) | 
+		     (component[i]->aos_size << 0));
+	 cmd[1].i = (component[i]->aos_start + 
+		     offset * component[i]->aos_stride * 4);
+      }
+   }
+
+   if (RADEON_DEBUG & DEBUG_VERTS) {
+      fprintf(stderr, "%s:\n", __FUNCTION__);
+      for (i = 0 ; i < sz / (int)sizeof(int) ; i++)	/* sz is bytes, tmp[] is dwords */
+	 fprintf(stderr, "   %d: %x\n", i, tmp[i]);
+   }
+#endif
+}
+
+/* Emit a CNTL_BITBLT_MULTI packet for a w x h rectangle; using already shifted color_fmt! */
+void radeonEmitBlit( radeonContextPtr rmesa, /* FIXME: which drmMinor is required? */
+		   GLuint color_fmt,
+		   GLuint src_pitch,
+		   GLuint src_offset,
+		   GLuint dst_pitch,
+		   GLuint dst_offset,
+		   GLint srcx, GLint srcy,
+		   GLint dstx, GLint dsty,
+		   GLuint w, GLuint h )
+{
+   drm_radeon_cmd_header_t *cmd;
+
+   if (RADEON_DEBUG & DEBUG_IOCTL)
+      fprintf(stderr, "%s src %x/%x %d,%d dst: %x/%x %d,%d sz: %dx%d\n",
+	      __FUNCTION__, 
+	      src_pitch, src_offset, srcx, srcy,
+	      dst_pitch, dst_offset, dstx, dsty,
+	      w, h);
+   /* Pitches are encoded in units of 64 and offsets in units of 1024 below. */
+   assert( (src_pitch & 63) == 0 );
+   assert( (dst_pitch & 63) == 0 );
+   assert( (src_offset & 1023) == 0 ); 
+   assert( (dst_offset & 1023) == 0 ); 
+   assert( w < (1<<16) );	/* w and h share one dword, 16 bits each */
+   assert( h < (1<<16) );
+
+   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, 8 * sizeof(int),
+						  __FUNCTION__ );
+
+
+   cmd[0].i = 0;
+   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
+   cmd[1].i = RADEON_CP_PACKET3_CNTL_BITBLT_MULTI | (5 << 16);
+   cmd[2].i = (RADEON_GMC_SRC_PITCH_OFFSET_CNTL |
+	       RADEON_GMC_DST_PITCH_OFFSET_CNTL |
+	       RADEON_GMC_BRUSH_NONE |
+	       color_fmt |
+	       RADEON_GMC_SRC_DATATYPE_COLOR |
+	       RADEON_ROP3_S |
+	       RADEON_DP_SRC_SOURCE_MEMORY |
+	       RADEON_GMC_CLR_CMP_CNTL_DIS |
+	       RADEON_GMC_WR_MSK_DIS );
+
+   cmd[3].i = ((src_pitch/64)<<22) | (src_offset >> 10);	/* source pitch/offset */
+   cmd[4].i = ((dst_pitch/64)<<22) | (dst_offset >> 10);	/* destination pitch/offset */
+   cmd[5].i = (srcx << 16) | srcy;	/* src */
+   cmd[6].i = (dstx << 16) | dsty; /* dst */
+   cmd[7].i = (w << 16) | h;
+}
+
+
+void radeonEmitWait( radeonContextPtr rmesa, GLuint flags )
+{
+   drm_radeon_cmd_header_t *hdr;
+
+   /* Queue a one-dword wait command; only the 2D/3D bits may be set. */
+   assert( !(flags & ~(RADEON_WAIT_2D|RADEON_WAIT_3D)) );
+   hdr = (drm_radeon_cmd_header_t *)
+      radeonAllocCmdBuf( rmesa, sizeof(int), __FUNCTION__ );
+   hdr->i = 0;
+   hdr->wait.cmd_type = RADEON_CMD_WAIT;
+   hdr->wait.flags = flags;
+}
+
+/* Submit the command store to the DRM, then reset it.  Returns 0 on success. */
+static int radeonFlushCmdBufLocked( radeonContextPtr rmesa, 
+				    const char * caller )
+{
+   int ret, i;
+   drm_radeon_cmd_buffer_t cmd;
+   /* Re-emit the saved state first if the context was lost. */
+   if (rmesa->lost_context)
+      radeonBackUpAndEmitLostStateLocked(rmesa);
+
+   if (RADEON_DEBUG & DEBUG_IOCTL) {
+      fprintf(stderr, "%s from %s\n", __FUNCTION__, caller); 
+
+      if (RADEON_DEBUG & DEBUG_VERBOSE) 
+	 for (i = 0 ; i < rmesa->store.cmd_used ; i += 4 )
+	    fprintf(stderr, "%d: %x\n", i/4, 
+		    *(int *)(&rmesa->store.cmd_buf[i]));
+   }
+
+   if (RADEON_DEBUG & DEBUG_DMA)
+      fprintf(stderr, "%s: Releasing %d buffers\n", __FUNCTION__,
+	      rmesa->dma.nr_released_bufs);
+
+   /* Optional sanity pass; on failure the buffer is dropped, not submitted. */
+   if (RADEON_DEBUG & DEBUG_SANITY) {
+      if (rmesa->state.scissor.enabled) 
+	 ret = radeonSanityCmdBuffer( rmesa, 
+				      rmesa->state.scissor.numClipRects,
+				      rmesa->state.scissor.pClipRects);
+      else
+	 ret = radeonSanityCmdBuffer( rmesa, 
+				      rmesa->numClipRects,
+				      rmesa->pClipRects);
+      if (ret) {
+	 fprintf(stderr, "drmSanityCommandWrite: %d\n", ret);	 
+	 goto out;
+      }
+   }
+
+   /* Scissoring selects which cliprect list accompanies the buffer. */
+   cmd.bufsz = rmesa->store.cmd_used;
+   cmd.buf = rmesa->store.cmd_buf;
+
+   if (rmesa->state.scissor.enabled) {
+      cmd.nbox = rmesa->state.scissor.numClipRects;
+      cmd.boxes = rmesa->state.scissor.pClipRects;
+   } else {
+      cmd.nbox = rmesa->numClipRects;
+      cmd.boxes = rmesa->pClipRects;
+   }
+
+   ret = drmCommandWrite( rmesa->dri.fd,
+			  DRM_RADEON_CMDBUF,
+			  &cmd, sizeof(cmd) );
+
+   if (ret)	/* only reported here; the status is returned to the caller */
+      fprintf(stderr, "drmCommandWrite: %d\n", ret);
+
+   if (RADEON_DEBUG & DEBUG_SYNC) {
+      fprintf(stderr, "\nSyncing in %s\n\n", __FUNCTION__);
+      radeonWaitForIdleLocked( rmesa );
+   }
+
+ out:
+   rmesa->store.primnr = 0;	/* the store is reset on success and on failure alike */
+   rmesa->store.statenr = 0;
+   rmesa->store.cmd_used = 0;
+   rmesa->dma.nr_released_bufs = 0;
+   rmesa->save_on_next_emit = 1;	/* radeonEmitState snapshots state on its next call */
+
+   return ret;
+}
+
+
+/* Lock the hardware, flush the command buffer and unlock again; a failed
+ * flush is fatal.  Note: does not emit any commands itself, to avoid
+ * recursion on radeonAllocCmdBuf.
+ */
+void radeonFlushCmdBuf( radeonContextPtr rmesa, const char *caller )
+{
+   int status;
+
+   LOCK_HARDWARE( rmesa );
+   status = radeonFlushCmdBufLocked( rmesa, caller );
+   UNLOCK_HARDWARE( rmesa );
+
+   if (status == 0)
+      return;
+
+   /* The process exits with the flush status as its exit code. */
+   fprintf(stderr, "drm_radeon_cmd_buffer_t: %d (exiting)\n", status);
+   exit(status);
+}
+
+/* =============================================================
+ * Hardware vertex buffer handling
+ */
+
+/* Replace dma.current with a freshly requested DMA buffer; exits on failure. */
+void radeonRefillCurrentDmaRegion( radeonContextPtr rmesa )
+{
+   struct radeon_dma_buffer *dmabuf;
+   int fd = rmesa->dri.fd;
+   int index = 0;
+   int size = 0;
+   drmDMAReq dma;
+   int ret;
+
+   if (RADEON_DEBUG & (DEBUG_IOCTL|DEBUG_DMA))
+      fprintf(stderr, "%s\n", __FUNCTION__);  
+
+   if (rmesa->dma.flush) {
+      rmesa->dma.flush( rmesa );
+   }
+
+   if (rmesa->dma.current.buf)
+      radeonReleaseDmaRegion( rmesa, &rmesa->dma.current, __FUNCTION__ );
+
+   if (rmesa->dma.nr_released_bufs > 4)
+      radeonFlushCmdBuf( rmesa, __FUNCTION__ );
+
+   dma.context = rmesa->dri.hwContext;
+   dma.send_count = 0;
+   dma.send_list = NULL;
+   dma.send_sizes = NULL;
+   dma.flags = 0;
+   dma.request_count = 1;	/* exactly one buffer is requested */
+   dma.request_size = RADEON_BUFFER_SIZE;
+   dma.request_list = &index;	/* 'index' is used below to look the buffer up */
+   dma.request_sizes = &size;
+   dma.granted_count = 0;
+
+   LOCK_HARDWARE(rmesa);	/* no need to validate */
+   ret = drmDMA( fd, &dma );
+   if (ret != 0) {
+      /* Free some up this way?
+       */
+      if (rmesa->dma.nr_released_bufs) {
+	 radeonFlushCmdBufLocked( rmesa, __FUNCTION__ );
+      }
+      
+      if (RADEON_DEBUG & DEBUG_DMA)
+	 fprintf(stderr, "Waiting for buffers\n");
+
+      radeonWaitForIdleLocked( rmesa );
+      ret = drmDMA( fd, &dma );	/* second and last attempt */
+
+      if ( ret != 0 ) {
+	 UNLOCK_HARDWARE( rmesa );
+	 fprintf( stderr, "Error: Could not get dma buffer... exiting\n" );
+	 exit( -1 );
+      }
+   }
+   UNLOCK_HARDWARE(rmesa);
+
+   if (RADEON_DEBUG & DEBUG_DMA)
+      fprintf(stderr, "Allocated buffer %d\n", index);
+
+   dmabuf = CALLOC_STRUCT( radeon_dma_buffer );
+   if (!dmabuf) {	/* fatal, like a failed drmDMA: dereferencing NULL would crash */
+      fprintf( stderr, "Error: Could not allocate dma buffer record... exiting\n" );
+      exit( -1 );
+   }
+   dmabuf->buf = &rmesa->radeonScreen->buffers->list[index];
+   dmabuf->refcount = 1;	/* the reference held by dma.current */
+
+   rmesa->dma.current.buf = dmabuf;
+   rmesa->dma.current.address = dmabuf->buf->address;
+   rmesa->dma.current.end = dmabuf->buf->total;
+   rmesa->dma.current.start = 0;
+   rmesa->dma.current.ptr = 0;
+   rmesa->c_vertexBuffers++;
+}
+
+void radeonReleaseDmaRegion( radeonContextPtr rmesa,
+			     struct radeon_dma_region *region,
+			     const char *caller )
+{
+   if (RADEON_DEBUG & DEBUG_IOCTL)
+      fprintf(stderr, "%s from %s\n", __FUNCTION__, caller); 
+   /* Drop this region's reference; the last one discards the buffer. */
+   if (!region->buf)
+      return;
+
+   if (rmesa->dma.flush)
+      rmesa->dma.flush( rmesa );
+
+   if (--region->buf->refcount == 0) {
+      drm_radeon_cmd_header_t *cmd;
+
+      if (RADEON_DEBUG & (DEBUG_IOCTL|DEBUG_DMA))
+	 fprintf(stderr, "%s -- DISCARD BUF %d\n", __FUNCTION__,
+		 region->buf->buf->idx);  
+      /* Queue a DMA_DISCARD command for the buffer, then free the record. */
+      cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, sizeof(*cmd), 
+						     __FUNCTION__ );
+      cmd->dma.cmd_type = RADEON_CMD_DMA_DISCARD;
+      cmd->dma.buf_idx = region->buf->buf->idx;
+      FREE(region->buf);
+      rmesa->dma.nr_released_bufs++;	/* reset to 0 when the command buffer is flushed */
+   }
+
+   region->buf = NULL;
+   region->start = 0;
+}
+
+/* Allocates a region from rmesa->dma.current.  If there isn't enough
+ * space in current, grab a new buffer (and discard what was left of current)
+ */
+void radeonAllocDmaRegion( radeonContextPtr rmesa, 
+			   struct radeon_dma_region *region,
+			   int bytes,
+			   int alignment )
+{
+   if (RADEON_DEBUG & DEBUG_IOCTL)
+      fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
+
+   if (rmesa->dma.flush)
+      rmesa->dma.flush( rmesa );
+
+   if (region->buf)
+      radeonReleaseDmaRegion( rmesa, region, __FUNCTION__ );
+
+   alignment--;	/* used as a mask below; NOTE(review): assumes a power of two - confirm */
+   rmesa->dma.current.start = rmesa->dma.current.ptr = 
+      (rmesa->dma.current.ptr + alignment) & ~alignment;
+   /* Not enough room left: switch to a new buffer (start and ptr become 0). */
+   if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end ) 
+      radeonRefillCurrentDmaRegion( rmesa );
+
+   region->start = rmesa->dma.current.start;
+   region->ptr = rmesa->dma.current.start;
+   region->end = rmesa->dma.current.start + bytes;
+   region->address = rmesa->dma.current.address;
+   region->buf = rmesa->dma.current.buf;
+   region->buf->refcount++;	/* the region shares the current buffer */
+
+   rmesa->dma.current.ptr += bytes; /* bug - if alignment > 7 */
+   rmesa->dma.current.start = 
+      rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;  
+}
+
+/* ================================================================
+ * SwapBuffers with client-side throttling
+ */
+
+static u_int32_t radeonGetLastFrame (radeonContextPtr rmesa) 
+{
+   u_int32_t last_frame;
+   drm_radeon_getparam_t query;
+   int err;
+
+   /* The result is delivered through query.value, i.e. into last_frame. */
+   query.value = (int *)&last_frame;
+   query.param = RADEON_PARAM_LAST_FRAME;
+   err = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_GETPARAM,
+			      &query, sizeof(query) );
+   if ( err == 0 )
+      return last_frame;
+
+   /* A failed query is treated as fatal. */
+   fprintf( stderr, "%s: drm_radeon_getparam_t: %d\n", __FUNCTION__, err );
+   exit(1);
+}
+
+static void radeonEmitIrqLocked( radeonContextPtr rmesa )
+{
+   drm_radeon_irq_emit_t emit;
+   int err;
+   /* irq_seq lives in rmesa->iw, which radeonWaitIrq() passes to IRQ_WAIT. */
+   emit.irq_seq = &rmesa->iw.irq_seq;
+   err = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_IRQ_EMIT,
+			      &emit, sizeof(emit) );
+   if ( err == 0 )
+      return;
+   fprintf( stderr, "%s: drm_radeon_irq_emit_t: %d\n", __FUNCTION__, err );
+   exit(1);
+}
+
+
+static void radeonWaitIrq( radeonContextPtr rmesa )
+{
+   int err;
+
+   /* Retry for as long as the wait fails with EINTR or EBUSY. */
+   for (;;) {
+      err = drmCommandWrite( rmesa->dri.fd, DRM_RADEON_IRQ_WAIT,
+			     &rmesa->iw, sizeof(rmesa->iw) );
+      if (!err) return;
+      if (errno != EINTR && errno != EBUSY) break;
+   }
+   fprintf( stderr, "%s: drmRadeonIrqWait: %d\n", __FUNCTION__, err );
+   exit(1);
+}
+
+
+static void radeonWaitForFrameCompletion( radeonContextPtr rmesa )
+{
+   drm_radeon_sarea_t *sarea = rmesa->sarea;
+   /* Called with the hardware lock held; it is dropped while sleeping. */
+   if (rmesa->do_irqs) {
+      if (radeonGetLastFrame(rmesa) < sarea->last_frame) {	/* a frame is still pending */
+	 if (!rmesa->irqsEmitted) {
+	    while (radeonGetLastFrame (rmesa) < sarea->last_frame)	/* no IRQ to wait on: poll, lock held */
+	       ;
+	 }
+	 else {
+	    UNLOCK_HARDWARE( rmesa ); 
+	    radeonWaitIrq( rmesa );	
+	    LOCK_HARDWARE( rmesa ); 
+	 }
+	 rmesa->irqsEmitted = 10;	/* had to wait: emit an IRQ on each of the next 10 calls */
+      }
+
+      if (rmesa->irqsEmitted) {
+	 radeonEmitIrqLocked( rmesa );
+	 rmesa->irqsEmitted--;
+      }
+   } 
+   else {
+      while (radeonGetLastFrame (rmesa) < sarea->last_frame) {
+	 UNLOCK_HARDWARE( rmesa ); 
+	 if (rmesa->do_usleeps) 
+	    DO_USLEEP( 1 );
+	 LOCK_HARDWARE( rmesa ); 
+      }
+   }
+}
+
+/* Copy the back color buffer to the front color buffer.  A non-NULL rect
+ * limits the copy to that rectangle and skips the vblank wait and stats. */
+void radeonCopyBuffer( const __DRIdrawablePrivate *dPriv,
+		       const drm_clip_rect_t	  *rect)
+{
+   radeonContextPtr rmesa;
+   GLint nbox, i, ret;
+   GLboolean   missed_target;	/* only written and read when rect is NULL */
+   int64_t ust;
+
+   assert(dPriv);
+   assert(dPriv->driContextPriv);
+   assert(dPriv->driContextPriv->driverPrivate);
+
+   rmesa = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+
+   if ( RADEON_DEBUG & DEBUG_IOCTL ) {
+      fprintf( stderr, "\n%s( %p )\n\n", __FUNCTION__, (void *) rmesa->glCtx );
+   }
+
+   RADEON_FIREVERTICES( rmesa );
+   LOCK_HARDWARE( rmesa );
+
+   /* Throttle the frame rate -- only allow one pending swap buffers
+    * request at a time.
+    */
+   radeonWaitForFrameCompletion( rmesa );
+   if (!rect)
+   {
+       UNLOCK_HARDWARE( rmesa );	/* the lock is not held across the vblank wait */
+       driWaitForVBlank( dPriv, & rmesa->vbl_seq, rmesa->vblank_flags, & missed_target );
+       LOCK_HARDWARE( rmesa );
+   }
+
+   nbox = dPriv->numClipRects; /* must be in locked region */
+
+   /* Cliprects are handed over in batches of RADEON_NR_SAREA_CLIPRECTS. */
+   for ( i = 0 ; i < nbox ; ) {
+      GLint nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS , nbox );
+      drm_clip_rect_t *box = dPriv->pClipRects;
+      drm_clip_rect_t *b = rmesa->sarea->boxes;
+      GLint n = 0;
+
+      for ( ; i < nr ; i++ ) {
+
+	  *b = box[i];
+
+	  if (rect)
+	  {
+	      if (rect->x1 > b->x1)
+		  b->x1 = rect->x1;
+	      if (rect->y1 > b->y1)
+		  b->y1 = rect->y1;
+	      if (rect->x2 < b->x2)
+		  b->x2 = rect->x2;
+	      if (rect->y2 < b->y2)
+		  b->y2 = rect->y2;
+
+	      if (b->x1 >= b->x2 || b->y1 >= b->y2)
+		  continue;	/* clipped away: the slot is reused, not counted */
+	  }
+
+	  /* Only boxes that were actually stored are counted in nbox. */
+	  b++;
+	  n++;
+      }
+      rmesa->sarea->nbox = n;
+      if ( n == 0 ) continue;	/* nothing left to copy in this batch */
+      ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_SWAP );
+
+      if ( ret ) {
+	 fprintf( stderr, "DRM_RADEON_SWAP_BUFFERS: return = %d\n", ret );
+	 UNLOCK_HARDWARE( rmesa );
+	 exit( 1 );
+      }
+   }
+
+   UNLOCK_HARDWARE( rmesa );
+   if (!rect)
+   {
+       rmesa->swap_count++;
+       (*dri_interface->getUST)( & ust );
+       if ( missed_target ) {
+	   rmesa->swap_missed_count++;
+	   rmesa->swap_missed_ust = ust - rmesa->swap_ust;
+       }
+
+       rmesa->swap_ust = ust;
+       rmesa->hw.all_dirty = GL_TRUE;	/* forces every atom to be re-emitted by radeonEmitState */
+   }
+}
+
+void radeonPageFlip( const __DRIdrawablePrivate *dPriv )
+{
+   radeonContextPtr rmesa;
+   GLint ret;
+   GLboolean   missed_target;
+   /* Issue DRM_RADEON_FLIP after frame throttling and the vblank wait. */
+   assert(dPriv);
+   assert(dPriv->driContextPriv);
+   assert(dPriv->driContextPriv->driverPrivate);
+
+   rmesa = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+
+   if ( RADEON_DEBUG & DEBUG_IOCTL ) {
+      fprintf(stderr, "%s: pfCurrentPage: %d\n", __FUNCTION__,
+	      rmesa->sarea->pfCurrentPage);
+   }
+
+   RADEON_FIREVERTICES( rmesa );
+   LOCK_HARDWARE( rmesa );
+
+   /* Need to do this for the perf box placement:
+    */
+   if (dPriv->numClipRects)
+   {
+      drm_clip_rect_t *box = dPriv->pClipRects;
+      drm_clip_rect_t *b = rmesa->sarea->boxes;
+      b[0] = box[0];	/* only the first cliprect is handed over */
+      rmesa->sarea->nbox = 1;
+   }
+
+   /* Throttle the frame rate -- only allow a few pending swap buffers
+    * request at a time.
+    */
+   radeonWaitForFrameCompletion( rmesa );
+   UNLOCK_HARDWARE( rmesa );	/* the lock is not held across the vblank wait */
+   driWaitForVBlank( dPriv, & rmesa->vbl_seq, rmesa->vblank_flags, & missed_target );
+   if ( missed_target ) {
+      rmesa->swap_missed_count++;
+      (void) (*dri_interface->getUST)( & rmesa->swap_missed_ust );
+   }
+   LOCK_HARDWARE( rmesa );
+
+   ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_FLIP );
+
+   UNLOCK_HARDWARE( rmesa );
+
+   if ( ret ) {	/* a failed flip is fatal */
+      fprintf( stderr, "DRM_RADEON_FLIP: return = %d\n", ret );
+      exit( 1 );
+   }
+
+   rmesa->swap_count++;
+   (void) (*dri_interface->getUST)( & rmesa->swap_ust );
+
+   /* Get ready for drawing next frame.  Update the renderbuffers'
+    * flippedOffset/Pitch fields so we draw into the right place.
+    */
+   driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
+                        rmesa->sarea->pfCurrentPage);
+
+   radeonUpdateDrawBuffer(rmesa->glCtx);
+}
+
+
+/* ================================================================
+ * Buffer clear
+ */
+#define RADEON_MAX_CLEARS	256
+
+static void radeonClear( GLcontext *ctx, GLbitfield mask )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+   drm_radeon_sarea_t *sarea = rmesa->sarea;
+   u_int32_t clear;
+   GLuint flags = 0;
+   GLuint color_mask = 0;
+   GLint ret, i;
+   GLint cx, cy, cw, ch;
+
+   if ( RADEON_DEBUG & DEBUG_IOCTL ) {
+      fprintf( stderr, "radeonClear\n");
+   }
+
+   {
+      LOCK_HARDWARE( rmesa );
+      UNLOCK_HARDWARE( rmesa );
+      if ( dPriv->numClipRects == 0 ) 
+	 return;
+   }
+   
+   radeonFlush( ctx ); 
+
+   if ( mask & BUFFER_BIT_FRONT_LEFT ) {
+      flags |= RADEON_FRONT;
+      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
+      mask &= ~BUFFER_BIT_FRONT_LEFT;
+   }
+
+   if ( mask & BUFFER_BIT_BACK_LEFT ) {
+      flags |= RADEON_BACK;
+      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
+      mask &= ~BUFFER_BIT_BACK_LEFT;
+   }
+
+   if ( mask & BUFFER_BIT_DEPTH ) {
+      flags |= RADEON_DEPTH;
+      mask &= ~BUFFER_BIT_DEPTH;
+   }
+
+   if ( (mask & BUFFER_BIT_STENCIL) && rmesa->state.stencil.hwBuffer ) {
+      flags |= RADEON_STENCIL;
+      mask &= ~BUFFER_BIT_STENCIL;
+   }
+
+   if ( mask ) {
+      if (RADEON_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "%s: swrast clear, mask: %x\n", __FUNCTION__, mask);
+      _swrast_Clear( ctx, mask );
+   }
+
+   if ( !flags ) 
+      return;
+
+   if (rmesa->using_hyperz) {
+      flags |= RADEON_USE_COMP_ZBUF;
+/*      if (rmesa->radeonScreen->chipset & RADEON_CHIPSET_TCL) 
+         flags |= RADEON_USE_HIERZ; */
+      if (!(rmesa->state.stencil.hwBuffer) ||
+	 ((flags & RADEON_DEPTH) && (flags & RADEON_STENCIL) &&
+	    ((rmesa->state.stencil.clear & RADEON_STENCIL_WRITE_MASK) == RADEON_STENCIL_WRITE_MASK))) {
+	  flags |= RADEON_CLEAR_FASTZ;
+      }
+   }
+
+   LOCK_HARDWARE( rmesa );
+
+   /* compute region after locking: */
+   cx = ctx->DrawBuffer->_Xmin;
+   cy = ctx->DrawBuffer->_Ymin;
+   cw = ctx->DrawBuffer->_Xmax - cx;
+   ch = ctx->DrawBuffer->_Ymax - cy;
+
+   /* Flip top to bottom */
+   cx += dPriv->x;
+   cy  = dPriv->y + dPriv->h - cy - ch;
+
+   /* Throttle the number of clear ioctls we do.
+    */
+   while ( 1 ) {
+      int ret;
+      drm_radeon_getparam_t gp;
+
+      gp.param = RADEON_PARAM_LAST_CLEAR;
+      gp.value = (int *)&clear;
+      ret = drmCommandWriteRead( rmesa->dri.fd,
+				 DRM_RADEON_GETPARAM, &gp, sizeof(gp) );
+
+      if ( ret ) {
+	 fprintf( stderr, "%s: drm_radeon_getparam_t: %d\n", __FUNCTION__, ret );
+	 exit(1);
+      }
+
+      if ( sarea->last_clear - clear <= RADEON_MAX_CLEARS ) {
+	 break;
+      }
+
+      if ( rmesa->do_usleeps ) {
+	 UNLOCK_HARDWARE( rmesa );
+	 DO_USLEEP( 1 );
+	 LOCK_HARDWARE( rmesa );
+      }
+   }
+
+   /* Send current state to the hardware */
+   radeonFlushCmdBufLocked( rmesa, __FUNCTION__ );
+
+   for ( i = 0 ; i < dPriv->numClipRects ; ) {
+      GLint nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS, dPriv->numClipRects );
+      drm_clip_rect_t *box = dPriv->pClipRects;
+      drm_clip_rect_t *b = rmesa->sarea->boxes;
+      drm_radeon_clear_t clear;
+      drm_radeon_clear_rect_t depth_boxes[RADEON_NR_SAREA_CLIPRECTS];
+      GLint n = 0;
+
+      if (cw != dPriv->w || ch != dPriv->h) {
+         /* clear subregion */
+	 for ( ; i < nr ; i++ ) {
+	    GLint x = box[i].x1;
+	    GLint y = box[i].y1;
+	    GLint w = box[i].x2 - x;
+	    GLint h = box[i].y2 - y;
+
+	    if ( x < cx ) w -= cx - x, x = cx;
+	    if ( y < cy ) h -= cy - y, y = cy;
+	    if ( x + w > cx + cw ) w = cx + cw - x;
+	    if ( y + h > cy + ch ) h = cy + ch - y;
+	    if ( w <= 0 ) continue;
+	    if ( h <= 0 ) continue;
+
+	    b->x1 = x;
+	    b->y1 = y;
+	    b->x2 = x + w;
+	    b->y2 = y + h;
+	    b++;
+	    n++;
+	 }
+      } else {
+         /* clear whole buffer */
+	 for ( ; i < nr ; i++ ) {
+	    *b++ = box[i];
+	    n++;
+	 }
+      }
+
+      rmesa->sarea->nbox = n;
+
+      clear.flags       = flags;
+      clear.clear_color = rmesa->state.color.clear;
+      clear.clear_depth = rmesa->state.depth.clear;
+      clear.color_mask  = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
+      clear.depth_mask  = rmesa->state.stencil.clear;
+      clear.depth_boxes = depth_boxes;
+
+      n--;
+      b = rmesa->sarea->boxes;
+      for ( ; n >= 0 ; n-- ) {
+	 depth_boxes[n].f[CLEAR_X1] = (float)b[n].x1;
+	 depth_boxes[n].f[CLEAR_Y1] = (float)b[n].y1;
+	 depth_boxes[n].f[CLEAR_X2] = (float)b[n].x2;
+	 depth_boxes[n].f[CLEAR_Y2] = (float)b[n].y2;
+	 depth_boxes[n].f[CLEAR_DEPTH] = 
+	    (float)rmesa->state.depth.clear;
+      }
+
+      ret = drmCommandWrite( rmesa->dri.fd, DRM_RADEON_CLEAR,
+			     &clear, sizeof(drm_radeon_clear_t));
+
+      if ( ret ) {
+	 UNLOCK_HARDWARE( rmesa );
+	 fprintf( stderr, "DRM_RADEON_CLEAR: return = %d\n", ret );
+	 exit( 1 );
+      }
+   }
+
+   UNLOCK_HARDWARE( rmesa );
+   rmesa->hw.all_dirty = GL_TRUE;
+}
+
+/* Retry DRM_RADEON_CP_IDLE until it succeeds; the failure path drops the lock. */
+void radeonWaitForIdleLocked( radeonContextPtr rmesa )
+{
+    int fd = rmesa->dri.fd;
+    int to = 0;
+    int ret, i = 0;
+    /* Bump the context's c_drawWaits counter. */
+    rmesa->c_drawWaits++;
+    /* i is never reset, so the inner retry budget is shared by all outer passes. */
+    do {
+        do {
+            ret = drmCommandNone( fd, DRM_RADEON_CP_IDLE);
+        } while ( ret && errno == EBUSY && i++ < RADEON_IDLE_RETRY );
+    } while ( ( ret == -EBUSY ) && ( to++ < RADEON_TIMEOUT ) );
+    /* Any negative result is fatal: release the lock, report and exit. */
+    if ( ret < 0 ) {
+	UNLOCK_HARDWARE( rmesa );
+	fprintf( stderr, "Error: Radeon timed out... exiting\n" );
+	exit( -1 );
+    }
+}
+
+/* Take the hardware lock around a call to radeonWaitForIdleLocked(). */
+static void radeonWaitForIdle( radeonContextPtr rmesa )
+{
+   LOCK_HARDWARE(rmesa);
+   radeonWaitForIdleLocked( rmesa );
+   UNLOCK_HARDWARE(rmesa);
+}
+
+/* Run any dma.flush hook, call radeonEmitState, then submit queued commands. */
+void radeonFlush( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+
+   if (RADEON_DEBUG & DEBUG_IOCTL)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+   /* The hook is optional; it is only called when one is installed. */
+   if (rmesa->dma.flush)
+      rmesa->dma.flush( rmesa );
+   /* State is emitted after the flush hook has run. */
+   radeonEmitState( rmesa );
+   /* Nothing to submit when the command buffer holds no bytes. */
+   if (rmesa->store.cmd_used)
+      radeonFlushCmdBuf( rmesa, __FUNCTION__ );
+}
+
+/* Flush everything that is pending, then block until the hardware has
+ * completed it - via an emitted IRQ when do_irqs is set, else by idling.
+ */
+void radeonFinish( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+
+   radeonFlush( ctx );
+   if (!rmesa->do_irqs) {
+      radeonWaitForIdle( rmesa );
+      return;
+   }
+   LOCK_HARDWARE( rmesa );
+   radeonEmitIrqLocked( rmesa );
+   UNLOCK_HARDWARE( rmesa );
+   radeonWaitIrq( rmesa );
+}
+
+/* Hook this file's flush, finish and clear entry points into ctx->Driver. */
+void radeonInitIoctlFuncs( GLcontext *ctx )
+{
+    ctx->Driver.Flush = radeonFlush;
+    ctx->Driver.Finish = radeonFinish;
+    ctx->Driver.Clear = radeonClear;
+}
+
diff --git a/radeon/radeon_ioctl.h b/radeon/radeon_ioctl.h
new file mode 100644
index 0000000..11a7d02
--- /dev/null
+++ b/radeon/radeon_ioctl.h
@@ -0,0 +1,203 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_ioctl.h,v 1.6 2002/12/16 16:18:58 dawes Exp $ */
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Kevin E. Martin <martin@valinux.com>
+ *   Gareth Hughes <gareth@valinux.com>
+ */
+
+#ifndef __RADEON_IOCTL_H__
+#define __RADEON_IOCTL_H__
+
+#include "simple_list.h"
+#include "radeon_lock.h"
+
+/* Entry points shared with the rest of the driver (presumably radeon_ioctl.c). */
+extern void radeonEmitState( radeonContextPtr rmesa );
+extern void radeonEmitVertexAOS( radeonContextPtr rmesa,
+				 GLuint vertex_size,
+				 GLuint offset );
+
+extern void radeonEmitVbufPrim( radeonContextPtr rmesa,
+				GLuint vertex_format,
+				GLuint primitive,
+				GLuint vertex_nr );
+
+extern void radeonFlushElts( radeonContextPtr rmesa );
+
+extern GLushort *radeonAllocEltsOpenEnded( radeonContextPtr rmesa,
+					   GLuint vertex_format,
+					   GLuint primitive,
+					   GLuint min_nr );
+
+extern void radeonEmitAOS( radeonContextPtr rmesa,
+			   struct radeon_dma_region **regions,
+			   GLuint n,
+			   GLuint offset );
+
+extern void radeonEmitBlit( radeonContextPtr rmesa,
+			    GLuint color_fmt,
+			    GLuint src_pitch,
+			    GLuint src_offset,
+			    GLuint dst_pitch,
+			    GLuint dst_offset,
+			    GLint srcx, GLint srcy,
+			    GLint dstx, GLint dsty,
+			    GLuint w, GLuint h );
+
+extern void radeonEmitWait( radeonContextPtr rmesa, GLuint flags );
+
+extern void radeonFlushCmdBuf( radeonContextPtr rmesa, const char * );
+extern void radeonRefillCurrentDmaRegion( radeonContextPtr rmesa );
+
+extern void radeonAllocDmaRegion( radeonContextPtr rmesa,
+				  struct radeon_dma_region *region,
+				  int bytes, 
+				  int alignment );
+
+extern void radeonReleaseDmaRegion( radeonContextPtr rmesa,
+				    struct radeon_dma_region *region,
+				    const char *caller );
+
+extern void radeonCopyBuffer( const __DRIdrawablePrivate *drawable,
+			      const drm_clip_rect_t	 *rect);
+extern void radeonPageFlip( const __DRIdrawablePrivate *drawable );
+extern void radeonFlush( GLcontext *ctx );
+extern void radeonFinish( GLcontext *ctx );
+extern void radeonWaitForIdleLocked( radeonContextPtr rmesa );
+extern void radeonWaitForVBlank( radeonContextPtr rmesa );
+extern void radeonInitIoctlFuncs( GLcontext *ctx );
+extern void radeonGetAllParams( radeonContextPtr rmesa );
+extern void radeonSetUpAtomList( radeonContextPtr rmesa );
+
+/* ================================================================
+ * Helper macros:
+ */
+
+/* Close off the last primitive, if it exists, by calling the dma.flush hook
+ * when one is installed.  The argument is parenthesized for safe expansion. */
+#define RADEON_NEWPRIM( rmesa )			\
+do {						\
+   if ( (rmesa)->dma.flush )			\
+      (rmesa)->dma.flush( rmesa );		\
+} while (0)
+
+/* Can accommodate several state changes and primitive changes without
+ * actually firing the buffer: it only marks the atom and hw state dirty.
+ */
+#define RADEON_STATECHANGE( rmesa, ATOM )			\
+do {								\
+   RADEON_NEWPRIM( (rmesa) );					\
+   (rmesa)->hw.ATOM.dirty = GL_TRUE;				\
+   (rmesa)->hw.is_dirty = GL_TRUE;				\
+} while (0)
+/* Copy hw.ATOM.cmd into hw.ATOM.lastcmd via memcpy; uses the caller's `rmesa`. */
+#define RADEON_DB_STATE( ATOM )			        \
+   memcpy( rmesa->hw.ATOM.lastcmd, rmesa->hw.ATOM.cmd,	\
+	   rmesa->hw.ATOM.cmd_size * 4)
+
+static __inline int RADEON_DB_STATECHANGE( 
+   radeonContextPtr rmesa,
+   struct radeon_state_atom *atom )
+{
+   int *staged;
+   /* cmd and lastcmd hold the same bytes: nothing changed, report 0. */
+   if (memcmp(atom->cmd, atom->lastcmd, atom->cmd_size*4) == 0)
+      return 0;
+   /* Close the open primitive, flag dirty, then exchange cmd and lastcmd. */
+   RADEON_NEWPRIM( rmesa );
+   atom->dirty = GL_TRUE;
+   rmesa->hw.is_dirty = GL_TRUE;
+   staged = atom->lastcmd;
+   atom->lastcmd = atom->cmd;
+   atom->cmd = staged;
+   return 1;
+}
+
+
+/* Fire the buffered vertices no matter what: call radeonFlush whenever
+ * commands are queued or a dma.flush hook is installed. */
+#define RADEON_FIREVERTICES( rmesa )			\
+do {							\
+   if ( (rmesa)->store.cmd_used || (rmesa)->dma.flush ) {	\
+      radeonFlush( (rmesa)->glCtx );			\
+   }							\
+} while (0)
+
+/* Command lengths.  Any time you ensure ELTS_BUFSZ or VBUF_BUFSZ is available
+ * you also need rmesa->state.max_state_size, since state is emitted from the
+ * primitive emit paths (radeonEmitVbufPrim/radeonFlushElts - TODO confirm).
+ */
+#if RADEON_OLD_PACKETS
+#define AOS_BUFSZ(nr)	((3 + (((nr) / 2) * 3) + (((nr) & 1) * 2)) * sizeof(int))
+#define VERT_AOS_BUFSZ	(0)
+#define ELTS_BUFSZ(nr)	(24 + (nr) * 2)
+#define VBUF_BUFSZ	(6 * sizeof(int))
+#else
+#define AOS_BUFSZ(nr)	((3 + (((nr) / 2) * 3) + (((nr) & 1) * 2)) * sizeof(int))
+#define VERT_AOS_BUFSZ	(5 * sizeof(int))
+#define ELTS_BUFSZ(nr)	(16 + (nr) * 2)
+#define VBUF_BUFSZ	(4 * sizeof(int))
+#endif
+
+/* Ensure that a minimum amount of space is available in the command buffer,
+ * flushing it first if `bytes` more would not fit.  This is used to ensure
+ * atomicity of state updates with the rendering requests that rely on them.
+ *
+ * An alternative would be to implement a "soft lock" such that when the buffer
+ * wraps at an inopportune time, we grab the lock, flush the current buffer,
+ * and hang on to the lock until the critical section is finished and we flush
+ * the buffer again and unlock.  The assert only checks `bytes` itself.
+ */
+static __inline void radeonEnsureCmdBufSpace( radeonContextPtr rmesa,
+					      int bytes )
+{
+   if (rmesa->store.cmd_used + bytes > RADEON_CMD_BUF_SZ)
+      radeonFlushCmdBuf( rmesa, __FUNCTION__ );
+   assert( bytes <= RADEON_CMD_BUF_SZ );
+}
+
+/* Alloc `bytes` bytes in the command buffer and return a pointer to them,
+ * flushing first (tagged with the caller's `where`) when they do not fit. */
+static __inline char *radeonAllocCmdBuf( radeonContextPtr rmesa,
+					 int bytes, const char *where )
+{
+   char *head;
+   /* Same sanity check as radeonEnsureCmdBufSpace. */
+   assert( bytes <= RADEON_CMD_BUF_SZ );
+   if (rmesa->store.cmd_used + bytes > RADEON_CMD_BUF_SZ)
+      radeonFlushCmdBuf( rmesa, where );
+   head = rmesa->store.cmd_buf + rmesa->store.cmd_used;
+   rmesa->store.cmd_used += bytes;
+   return head;
+}
+
+#endif /* __RADEON_IOCTL_H__ */
diff --git a/radeon/radeon_lighting.c b/radeon/radeon_lighting.c
new file mode 100644
index 0000000..44e00af
--- /dev/null
+++ b/radeon/radeon_lighting.c
@@ -0,0 +1,682 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_state.c,v 1.5 2002/09/16 18:05:20 eich Exp $ */
+/*
+ * Copyright 2000, 2001 VA Linux Systems Inc., Fremont, California.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Gareth Hughes <gareth@valinux.com>
+ *    Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "api_arrayelt.h"
+/* #include "mmath.h" */
+#include "enums.h"
+#include "colormac.h"
+
+
+#include "radeon_context.h"
+#include "radeon_ioctl.h"
+#include "radeon_state.h"
+#include "radeon_tcl.h"
+#include "radeon_tex.h"
+#include "radeon_vtxfmt.h"
+
+
+
+/* =============================================================
+ * Materials
+ */
+
+
+/* Recompute the global ambient term staged in hw.glt.  Call on change to
+ * colormaterial, material emissive/ambient or lightmodel.globalambient.
+ */
+void update_global_ambient( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   float *fcmd = (float *)RADEON_DB_STATE( glt );
+   /* fcmd is the staging copy (memcpy result) handed back by RADEON_DB_STATE.
+    * Emissive and ambient source fields both zero (PREMULT, going by
+    * radeonColorMaterial): fold the front material emission/ambient in. */
+   if ((rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &
+       ((3 << RADEON_EMISSIVE_SOURCE_SHIFT) |
+	(3 << RADEON_AMBIENT_SOURCE_SHIFT))) == 0) 
+   {
+      COPY_3V( &fcmd[GLT_RED], 
+	       ctx->Light.Material[0].Emission);
+      ACC_SCALE_3V( &fcmd[GLT_RED],
+		   ctx->Light.Model.Ambient,
+		   ctx->Light.Material[0].Ambient);
+   } 
+   else
+   {
+      COPY_3V( &fcmd[GLT_RED], ctx->Light.Model.Ambient );
+   }
+   /* Marks the atom dirty only if the staged values differ from the live ones. */
+   RADEON_DB_STATECHANGE(rmesa, &rmesa->hw.glt);
+}
+
+/* Recompute the ambient/diffuse/specular colors of light p in hw.lit[p].
+ * Call on change to:
+ *    - light[p].colors
+ *    - light[p].enabled
+ *    - material,
+ *    - colormaterial enabled / colormaterial bitmask
+ */
+void update_light_colors( GLcontext *ctx, GLuint p )
+{
+   struct gl_light *l = &ctx->Light.Light[p];
+
+/*     fprintf(stderr, "%s\n", __FUNCTION__); */
+
+   if (l->Enabled) {
+      radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+      float *fcmd = (float *)RADEON_DB_STATE( lit[p] );
+      GLuint bitmask = ctx->Light.ColorMaterialBitmask;
+      struct gl_material *mat = &ctx->Light.Material[0];
+      /* Start from the light's own colors. */
+      COPY_4V( &fcmd[LIT_AMBIENT_RED], l->Ambient );	 
+      COPY_4V( &fcmd[LIT_DIFFUSE_RED], l->Diffuse );
+      COPY_4V( &fcmd[LIT_SPECULAR_RED], l->Specular );
+      /* With color material disabled no component is treated as tracked. */
+      if (!ctx->Light.ColorMaterialEnabled)
+	 bitmask = 0;
+      /* Untracked components: SELF_SCALE_3V presumably multiplies in place. */
+      if ((bitmask & FRONT_AMBIENT_BIT) == 0) 
+	 SELF_SCALE_3V( &fcmd[LIT_AMBIENT_RED], mat->Ambient );
+
+      if ((bitmask & FRONT_DIFFUSE_BIT) == 0) 
+	 SELF_SCALE_3V( &fcmd[LIT_DIFFUSE_RED], mat->Diffuse );
+      
+      if ((bitmask & FRONT_SPECULAR_BIT) == 0) 
+	 SELF_SCALE_3V( &fcmd[LIT_SPECULAR_RED], mat->Specular );
+      /* Marks the atom dirty only if the staged values differ. */
+      RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.lit[p] );
+   }
+}
+
+/* Request the two-sided-lighting TCL fallback when front and back materials
+ * differ, or when color material tracks front and back asymmetrically. */
+void check_twoside_fallback( GLcontext *ctx )
+{
+   const GLboolean twoside = ctx->Light.Enabled && ctx->Light.Model.TwoSide;
+   GLboolean need_fallback = GL_FALSE;
+
+   if (twoside) {
+      const struct gl_material *front = &ctx->Light.Material[0];
+      const struct gl_material *back = &ctx->Light.Material[1];
+      const GLuint cm = ctx->Light.ColorMaterialBitmask;
+
+      need_fallback =
+	 memcmp( front, back, sizeof(struct gl_material) ) != 0 ||
+	 (ctx->Light.ColorMaterialEnabled &&
+	  (cm & BACK_MATERIAL_BITS) != ((cm & FRONT_MATERIAL_BITS) << 1));
+   }
+   TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_LIGHT_TWOSIDE, need_fallback );
+}
+/* Rebuild the color-source fields of TCL_LIGHT_MODEL_CTL; face/mode are unused. */
+void radeonColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
+{
+   if (ctx->Light.ColorMaterialEnabled) {
+      radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+      GLuint light_model_ctl = rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL];
+      GLuint mask = ctx->Light.ColorMaterialBitmask;
+
+      /* Default to PREMULT:
+       */
+      light_model_ctl &= ~((3 << RADEON_EMISSIVE_SOURCE_SHIFT) |
+			   (3 << RADEON_AMBIENT_SOURCE_SHIFT) |
+			   (3 << RADEON_DIFFUSE_SOURCE_SHIFT) |
+			   (3 << RADEON_SPECULAR_SOURCE_SHIFT)); 
+      /* Each component selected in the bitmask gets VERTEX_DIFFUSE as source. */
+      if (mask & FRONT_EMISSION_BIT) {
+	 light_model_ctl |= (RADEON_LM_SOURCE_VERTEX_DIFFUSE <<
+			     RADEON_EMISSIVE_SOURCE_SHIFT);
+      }
+
+      if (mask & FRONT_AMBIENT_BIT) {
+	 light_model_ctl |= (RADEON_LM_SOURCE_VERTEX_DIFFUSE <<
+			     RADEON_AMBIENT_SOURCE_SHIFT);
+      }
+	 
+      if (mask & FRONT_DIFFUSE_BIT) {
+	 light_model_ctl |= (RADEON_LM_SOURCE_VERTEX_DIFFUSE <<
+			     RADEON_DIFFUSE_SOURCE_SHIFT);
+      }
+   
+      if (mask & FRONT_SPECULAR_BIT) {
+	 light_model_ctl |= (RADEON_LM_SOURCE_VERTEX_DIFFUSE <<
+			     RADEON_SPECULAR_SOURCE_SHIFT);
+      }
+      /* Only touch hardware state when the control word actually changed. */
+      if (light_model_ctl != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL]) {
+	 GLuint p;
+
+	 RADEON_STATECHANGE( rmesa, tcl );
+	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] = light_model_ctl;      
+	 /* Light colors and global ambient read these source fields/bitmask. */
+	 for (p = 0 ; p < MAX_LIGHTS; p++) 
+	    update_light_colors( ctx, p );
+	 update_global_ambient( ctx );
+      }
+   }
+   /* Runs whether or not color material is enabled. */
+   check_twoside_fallback( ctx );
+}
+/* Stage the front material into hw.mtl; on change, refresh dependent light state. */
+void radeonUpdateMaterial( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLfloat *fcmd = (GLfloat *)RADEON_DB_STATE( mtl );
+   GLuint p;
+   GLuint mask = ~0;
+   /* mask keeps the components that are NOT in the color material bitmask. */
+   if (ctx->Light.ColorMaterialEnabled)
+      mask &= ~ctx->Light.ColorMaterialBitmask;
+
+   if (RADEON_DEBUG & DEBUG_STATE)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+   /* Copy each remaining front material component into the staging copy. */
+   if (mask & FRONT_EMISSION_BIT) {
+      fcmd[MTL_EMMISSIVE_RED]   = ctx->Light.Material[0].Emission[0];
+      fcmd[MTL_EMMISSIVE_GREEN] = ctx->Light.Material[0].Emission[1];
+      fcmd[MTL_EMMISSIVE_BLUE]  = ctx->Light.Material[0].Emission[2];
+      fcmd[MTL_EMMISSIVE_ALPHA] = ctx->Light.Material[0].Emission[3];
+   }
+   if (mask & FRONT_AMBIENT_BIT) {
+      fcmd[MTL_AMBIENT_RED]     = ctx->Light.Material[0].Ambient[0];
+      fcmd[MTL_AMBIENT_GREEN]   = ctx->Light.Material[0].Ambient[1];
+      fcmd[MTL_AMBIENT_BLUE]    = ctx->Light.Material[0].Ambient[2];
+      fcmd[MTL_AMBIENT_ALPHA]   = ctx->Light.Material[0].Ambient[3];
+   }
+   if (mask & FRONT_DIFFUSE_BIT) {
+      fcmd[MTL_DIFFUSE_RED]     = ctx->Light.Material[0].Diffuse[0];
+      fcmd[MTL_DIFFUSE_GREEN]   = ctx->Light.Material[0].Diffuse[1];
+      fcmd[MTL_DIFFUSE_BLUE]    = ctx->Light.Material[0].Diffuse[2];
+      fcmd[MTL_DIFFUSE_ALPHA]   = ctx->Light.Material[0].Diffuse[3];
+   }
+   if (mask & FRONT_SPECULAR_BIT) {
+      fcmd[MTL_SPECULAR_RED]    = ctx->Light.Material[0].Specular[0];
+      fcmd[MTL_SPECULAR_GREEN]  = ctx->Light.Material[0].Specular[1];
+      fcmd[MTL_SPECULAR_BLUE]   = ctx->Light.Material[0].Specular[2];
+      fcmd[MTL_SPECULAR_ALPHA]  = ctx->Light.Material[0].Specular[3];
+   }
+   if (mask & FRONT_SHININESS_BIT) {
+      fcmd[MTL_SHININESS]       = ctx->Light.Material[0].Shininess;
+   }
+   /* Dependent light state is refreshed only when the staged values differ. */
+   if (RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.mtl )) {
+      for (p = 0 ; p < MAX_LIGHTS; p++) 
+	 update_light_colors( ctx, p );
+
+      check_twoside_fallback( ctx );
+      update_global_ambient( ctx );
+   }
+   else if (RADEON_DEBUG & (DEBUG_PRIMS|DEBUG_STATE))
+      fprintf(stderr, "%s: Elided noop material call\n", __FUNCTION__);
+}
+
+/* Update the light-space flag, the eye vector and the per-light
+ * position/direction state.  Depends on _NEW_LIGHT, _NEW_MODELVIEW and
+ * _MESA_NEW_NEED_EYE_COORDS.
+ *
+ * Uses derived state from mesa:
+ *       _VP_inf_norm
+ *       _h_inf_norm
+ *       _Position
+ *       _NormDirection
+ *       _ModelViewInvScale
+ *       _NeedEyeCoords
+ *       _EyeZDir
+ *
+ * which are calculated in light.c and are correct for the current
+ * lighting space (model or eye), hence dependencies on _NEW_MODELVIEW
+ * and _MESA_NEW_NEED_EYE_COORDS.  
+ */
+void radeonUpdateLighting( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+
+   /* Have to check these, or have an automatic shortcircuit mechanism
+    * to remove noop statechanges. (Or just do a better job on the
+    * front end).
+    */
+   {
+      GLuint tmp = rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL];
+
+      if (ctx->_NeedEyeCoords)
+	 tmp &= ~RADEON_LIGHT_IN_MODELSPACE;
+      else
+	 tmp |= RADEON_LIGHT_IN_MODELSPACE;
+      
+
+      /* NOTE(review): an older note said to leave this test disabled
+       * (unexplained q3 lockup, even with new packets), yet the comparison
+       * below is active - confirm which is intended. */
+      if (tmp != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL]) 
+      {
+	 RADEON_STATECHANGE( rmesa, tcl );
+	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] = tmp;
+      }
+   }
+   /* Stage the eye vector (Z component negated) and the rescale factor. */
+   {
+      GLfloat *fcmd = (GLfloat *)RADEON_DB_STATE( eye );
+      fcmd[EYE_X] = ctx->_EyeZDir[0];
+      fcmd[EYE_Y] = ctx->_EyeZDir[1];
+      fcmd[EYE_Z] = - ctx->_EyeZDir[2];
+      fcmd[EYE_RESCALE_FACTOR] = ctx->_ModelViewInvScale;
+      RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.eye );
+   }
+
+
+/*     RADEON_STATECHANGE( rmesa, glt ); */
+
+   if (ctx->Light.Enabled) {
+      GLint p;
+      for (p = 0 ; p < MAX_LIGHTS; p++) {
+	 if (ctx->Light.Light[p].Enabled) {
+	    struct gl_light *l = &ctx->Light.Light[p];
+	    GLfloat *fcmd = (GLfloat *)RADEON_DB_STATE( lit[p] );
+	    /* w == 0 takes the _VP_inf_norm/_h_inf_norm path, else _Position. */
+	    if (l->EyePosition[3] == 0.0) {
+	       COPY_3FV( &fcmd[LIT_POSITION_X], l->_VP_inf_norm ); 
+	       COPY_3FV( &fcmd[LIT_DIRECTION_X], l->_h_inf_norm ); 
+	       fcmd[LIT_POSITION_W] = 0;
+	       fcmd[LIT_DIRECTION_W] = 0;
+	    } else {
+	       COPY_4V( &fcmd[LIT_POSITION_X], l->_Position );
+	       fcmd[LIT_DIRECTION_X] = -l->_NormDirection[0];
+	       fcmd[LIT_DIRECTION_Y] = -l->_NormDirection[1];
+	       fcmd[LIT_DIRECTION_Z] = -l->_NormDirection[2];
+	       fcmd[LIT_DIRECTION_W] = 0;
+	    }
+
+	    RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.lit[p] );
+	 }
+      }
+   }
+}
+
+/* Per-light parameter change; installed as Driver.Lightfv in this file. */
+void radeonLightfv( GLcontext *ctx, GLenum light,
+		    GLenum pname, const GLfloat *params )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLint p = light - GL_LIGHT0;
+   struct gl_light *l = &ctx->Light.Light[p];
+   GLfloat *fcmd = (GLfloat *)rmesa->hw.lit[p].cmd;
+   /* fcmd aliases the live lit[p] command words (RADEON_DB_STATE is not used). */
+
+   switch (pname) {
+   case GL_AMBIENT:		
+   case GL_DIFFUSE:
+   case GL_SPECULAR:
+      update_light_colors( ctx, p );
+      break;
+
+   case GL_SPOT_DIRECTION: 
+      /* picked up in update_light */	
+      break;
+
+   case GL_POSITION: {
+      /* positions picked up in update_light, but can do flag here */	
+      GLuint flag = (p&1)? RADEON_LIGHT_1_IS_LOCAL : RADEON_LIGHT_0_IS_LOCAL;
+      GLuint idx = TCL_PER_LIGHT_CTL_0 + p/2;
+      /* Two lights share each TCL_PER_LIGHT_CTL word; p&1 picks the flag. */
+      RADEON_STATECHANGE(rmesa, tcl);
+      if (l->EyePosition[3] != 0.0F)
+	 rmesa->hw.tcl.cmd[idx] |= flag;
+      else
+	 rmesa->hw.tcl.cmd[idx] &= ~flag;
+      break;
+   }
+
+   case GL_SPOT_EXPONENT:
+      RADEON_STATECHANGE(rmesa, lit[p]);
+      fcmd[LIT_SPOT_EXPONENT] = params[0];
+      break;
+
+   case GL_SPOT_CUTOFF: {
+      GLuint flag = (p&1) ? RADEON_LIGHT_1_IS_SPOT : RADEON_LIGHT_0_IS_SPOT;
+      GLuint idx = TCL_PER_LIGHT_CTL_0 + p/2;
+      /* The stored value is l->_CosCutoff rather than params[0]. */
+      RADEON_STATECHANGE(rmesa, lit[p]);
+      fcmd[LIT_SPOT_CUTOFF] = l->_CosCutoff;
+
+      RADEON_STATECHANGE(rmesa, tcl);
+      if (l->SpotCutoff != 180.0F)
+	 rmesa->hw.tcl.cmd[idx] |= flag;
+      else
+	 rmesa->hw.tcl.cmd[idx] &= ~flag;
+      break;
+   }
+
+   case GL_CONSTANT_ATTENUATION:
+      RADEON_STATECHANGE(rmesa, lit[p]);
+      fcmd[LIT_ATTEN_CONST] = params[0];
+      break;
+   case GL_LINEAR_ATTENUATION:
+      RADEON_STATECHANGE(rmesa, lit[p]);
+      fcmd[LIT_ATTEN_LINEAR] = params[0];
+      break;
+   case GL_QUADRATIC_ATTENUATION:
+      RADEON_STATECHANGE(rmesa, lit[p]);
+      fcmd[LIT_ATTEN_QUADRATIC] = params[0];
+      break;
+   default:
+      return;
+   }
+
+}
+
+		  
+
+/* Light-model change; values are read from ctx->Light.Model, param is unused. */
+void radeonLightModelfv( GLcontext *ctx, GLenum pname,
+			 const GLfloat *param )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+
+   switch (pname) {
+      case GL_LIGHT_MODEL_AMBIENT: 
+	 update_global_ambient( ctx );
+	 break;
+
+      case GL_LIGHT_MODEL_LOCAL_VIEWER:
+	 RADEON_STATECHANGE( rmesa, tcl );
+	 if (ctx->Light.Model.LocalViewer)
+	    rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] |= RADEON_LOCAL_VIEWER;
+	 else
+	    rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &= ~RADEON_LOCAL_VIEWER;
+         break;
+
+      case GL_LIGHT_MODEL_TWO_SIDE:
+	 RADEON_STATECHANGE( rmesa, tcl );
+	 if (ctx->Light.Model.TwoSide)
+	    rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= RADEON_LIGHT_TWOSIDE;
+	 else
+	    rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~RADEON_LIGHT_TWOSIDE;
+	 /* Two-sided lighting may require the TCL fallback. */
+	 check_twoside_fallback( ctx );
+
+#if _HAVE_SWTNL
+	 if (rmesa->TclFallback) {
+	    radeonChooseRenderState( ctx );
+	    radeonChooseVertexState( ctx );
+	 }
+#endif
+         break;
+
+      case GL_LIGHT_MODEL_COLOR_CONTROL:
+	 radeonUpdateSpecular(ctx);
+	 /* The combine bit is cleared only for GL_SEPARATE_SPECULAR_COLOR. */
+	 RADEON_STATECHANGE( rmesa, tcl );
+	 if (ctx->Light.Model.ColorControl == GL_SEPARATE_SPECULAR_COLOR) 
+	    rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &= 
+	       ~RADEON_DIFFUSE_SPECULAR_COMBINE;
+	 else
+	    rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] |= 
+	       RADEON_DIFFUSE_SPECULAR_COMBINE;
+         break;
+
+      default:
+         break;
+   }
+}
+
+
+/* =============================================================
+ * Fog
+ */
+
+/* Fog parameter change: recompute FOG_C/FOG_D and the fog color from ctx->Fog. */
+static void radeonFogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   union { int i; float f; } c, d;
+   GLchan col[4];
+   /* Start from the current words so values left untouched compare equal. */
+   c.i = rmesa->hw.fog.cmd[FOG_C];
+   d.i = rmesa->hw.fog.cmd[FOG_D];
+
+   switch (pname) {
+   case GL_FOG_MODE:
+      if (!ctx->Fog.Enabled)
+	 return;
+      RADEON_STATECHANGE(rmesa, tcl);
+      rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~RADEON_TCL_FOG_MASK;
+      switch (ctx->Fog.Mode) {
+      case GL_LINEAR:
+	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= RADEON_TCL_FOG_LINEAR;
+	 if (ctx->Fog.Start == ctx->Fog.End) {
+	    c.f = 1.0F;
+	    d.f = 1.0F;
+	 }
+	 else {
+	    c.f = ctx->Fog.End/(ctx->Fog.End-ctx->Fog.Start);
+	    d.f = 1.0/(ctx->Fog.End-ctx->Fog.Start);
+	 }
+	 break;
+      case GL_EXP:
+	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= RADEON_TCL_FOG_EXP;
+	 c.f = 0.0;
+	 d.f = ctx->Fog.Density;	/* NOTE(review): EXP2 negates, EXP does not - confirm */
+	 break;
+      case GL_EXP2:
+	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= RADEON_TCL_FOG_EXP2;
+	 c.f = 0.0;
+	 d.f = -(ctx->Fog.Density * ctx->Fog.Density);
+	 break;
+      default:
+	 return;
+      }
+      break;
+   case GL_FOG_DENSITY:
+      switch (ctx->Fog.Mode) {
+      case GL_EXP:
+	 c.f = 0.0;
+	 d.f = ctx->Fog.Density;
+	 break;
+      case GL_EXP2:
+	 c.f = 0.0;
+	 d.f = -(ctx->Fog.Density * ctx->Fog.Density);
+	 break;
+      default:
+	 break;
+      }
+      break;
+   case GL_FOG_START:
+   case GL_FOG_END:
+      if (ctx->Fog.Mode == GL_LINEAR) {
+	 if (ctx->Fog.Start == ctx->Fog.End) {
+	    c.f = 1.0F;
+	    d.f = 1.0F;
+	 } else {
+	    c.f = ctx->Fog.End/(ctx->Fog.End-ctx->Fog.Start);
+	    d.f = 1.0/(ctx->Fog.End-ctx->Fog.Start);
+	 }
+      }
+      break;
+   case GL_FOG_COLOR: 
+      RADEON_STATECHANGE( rmesa, ctx );
+      UNCLAMPED_FLOAT_TO_RGB_CHAN( col, ctx->Fog.Color );
+      rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] =
+	 radeonPackColor( 4, col[0], col[1], col[2], 0 );
+      break;
+   case GL_FOG_COORDINATE_SOURCE_EXT: 
+      /* What to do?
+       */
+      break;
+   default:
+      return;
+   }
+   /* Dirty the fog atom only when either word changed (compared as raw bits). */
+   if (c.i != rmesa->hw.fog.cmd[FOG_C] || d.i != rmesa->hw.fog.cmd[FOG_D]) {
+      RADEON_STATECHANGE( rmesa, fog );
+      rmesa->hw.fog.cmd[FOG_C] = c.i;
+      rmesa->hw.fog.cmd[FOG_D] = d.i;
+   }
+}
+
+/* Examine lighting and texture state to determine if separate specular
+ * should be enabled, then set the TCL diffuse/specular output bits to match.
+ */
+void radeonUpdateSpecular( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLuint p = rmesa->hw.ctx.cmd[CTX_PP_CNTL];
+
+   if (NEED_SECONDARY_COLOR(ctx)) {
+      p |=  RADEON_SPECULAR_ENABLE;
+   } else {
+      p &= ~RADEON_SPECULAR_ENABLE;
+   }
+
+   if ( rmesa->hw.ctx.cmd[CTX_PP_CNTL] != p ) {
+      RADEON_STATECHANGE( rmesa, ctx );
+      rmesa->hw.ctx.cmd[CTX_PP_CNTL] = p;
+   }
+
+   /* Bizarre: have to leave lighting enabled to get fog.
+    */
+   RADEON_STATECHANGE( rmesa, tcl );
+   if ((ctx->Light.Enabled &&
+	ctx->Light.Model.ColorControl == GL_SEPARATE_SPECULAR_COLOR)) {
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] |= RADEON_TCL_COMPUTE_SPECULAR;
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] |= RADEON_TCL_COMPUTE_DIFFUSE;
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_PK_SPEC;
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_PK_DIFFUSE;
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] |= RADEON_LIGHTING_ENABLE;
+   }
+   else if (ctx->Fog.Enabled) {
+      if (ctx->Light.Enabled) {
+	 rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] |= RADEON_TCL_COMPUTE_SPECULAR;
+	 rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] |= RADEON_TCL_COMPUTE_DIFFUSE;
+	 rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_PK_SPEC;
+	 rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_PK_DIFFUSE;
+	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] |= RADEON_LIGHTING_ENABLE;
+      } else {
+	 rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] |= RADEON_TCL_COMPUTE_SPECULAR;
+	 rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] &= ~RADEON_TCL_COMPUTE_DIFFUSE;
+	 rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_PK_SPEC;
+	 rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_PK_DIFFUSE;
+	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] |= RADEON_LIGHTING_ENABLE;
+      }
+   }
+   else if (ctx->Light.Enabled) {
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] &= ~RADEON_TCL_COMPUTE_SPECULAR;
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] |= RADEON_TCL_COMPUTE_DIFFUSE;
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] &= ~RADEON_TCL_VTX_PK_SPEC;
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_PK_DIFFUSE;
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] |= RADEON_LIGHTING_ENABLE;
+   } else if (ctx->Fog.ColorSumEnabled ) {
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] &= ~RADEON_TCL_COMPUTE_SPECULAR;
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] &= ~RADEON_TCL_COMPUTE_DIFFUSE;
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_PK_SPEC;
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_PK_DIFFUSE;
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &= ~RADEON_LIGHTING_ENABLE;
+   } else {
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] &= ~RADEON_TCL_COMPUTE_SPECULAR;
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] &= ~RADEON_TCL_COMPUTE_DIFFUSE;
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] &= ~RADEON_TCL_VTX_PK_SPEC;
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_PK_DIFFUSE;
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &= ~RADEON_LIGHTING_ENABLE;
+   }
+
+#if _HAVE_SWTNL
+   /* Update vertex/render formats
+    */
+   if (rmesa->TclFallback) { 
+      radeonChooseRenderState( ctx );
+      radeonChooseVertexState( ctx );
+   }
+#endif
+}
+
+
+/* Lighting-space change: recompute the RADEON_RESCALE_NORMALS bit. */
+static void radeonLightingSpaceChange( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLboolean rescale;
+
+   RADEON_STATECHANGE( rmesa, tcl );
+
+   if (RADEON_DEBUG & DEBUG_STATE)
+      fprintf(stderr, "%s %d\n", __FUNCTION__, ctx->_NeedEyeCoords);
+
+   /* Follow Transform.RescaleNormals when _NeedEyeCoords is set, and its
+    * negation otherwise. */
+   rescale = ctx->_NeedEyeCoords ? ctx->Transform.RescaleNormals
+				 : !ctx->Transform.RescaleNormals;
+
+   if ( !rescale )
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &= ~RADEON_RESCALE_NORMALS;
+   else
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] |=  RADEON_RESCALE_NORMALS;
+}
+/* Install the lighting/fog driver hooks, then push ctx's current values through them. */
+void radeonInitLightStateFuncs( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   int i;
+
+   ctx->Driver.LightModelfv		= radeonLightModelfv; 
+   ctx->Driver.Lightfv			= radeonLightfv; 
+   ctx->Driver.Fogfv			= radeonFogfv;
+   ctx->Driver.LightingSpaceChange      = radeonLightingSpaceChange;
+   /* Cover every light slot, as the other per-light loops in this file do. */
+   for (i = 0 ; i < MAX_LIGHTS; i++) {
+      struct gl_light *l = &ctx->Light.Light[i];
+      GLenum p = GL_LIGHT0 + i;
+      *(float *)&(rmesa->hw.lit[i].cmd[LIT_RANGE_CUTOFF]) = FLT_MAX;
+      /* params is 0 where radeonLightfv reads ctx instead of params. */
+      ctx->Driver.Lightfv( ctx, p, GL_AMBIENT, l->Ambient );
+      ctx->Driver.Lightfv( ctx, p, GL_DIFFUSE, l->Diffuse );
+      ctx->Driver.Lightfv( ctx, p, GL_SPECULAR, l->Specular );
+      ctx->Driver.Lightfv( ctx, p, GL_POSITION, 0 );
+      ctx->Driver.Lightfv( ctx, p, GL_SPOT_DIRECTION, 0 );
+      ctx->Driver.Lightfv( ctx, p, GL_SPOT_EXPONENT, &l->SpotExponent );
+      ctx->Driver.Lightfv( ctx, p, GL_SPOT_CUTOFF, &l->SpotCutoff );
+      ctx->Driver.Lightfv( ctx, p, GL_CONSTANT_ATTENUATION,
+			   &l->ConstantAttenuation );
+      ctx->Driver.Lightfv( ctx, p, GL_LINEAR_ATTENUATION, 
+			   &l->LinearAttenuation );
+      ctx->Driver.Lightfv( ctx, p, GL_QUADRATIC_ATTENUATION, 
+		     &l->QuadraticAttenuation );
+   }
+
+   ctx->Driver.LightModelfv( ctx, GL_LIGHT_MODEL_AMBIENT, 
+			     ctx->Light.Model.Ambient );
+
+   ctx->Driver.Fogfv( ctx, GL_FOG_MODE, 0 );
+   ctx->Driver.Fogfv( ctx, GL_FOG_DENSITY, &ctx->Fog.Density );
+   ctx->Driver.Fogfv( ctx, GL_FOG_START, &ctx->Fog.Start );
+   ctx->Driver.Fogfv( ctx, GL_FOG_END, &ctx->Fog.End );
+   ctx->Driver.Fogfv( ctx, GL_FOG_COLOR, ctx->Fog.Color );
+   ctx->Driver.Fogfv( ctx, GL_FOG_COORDINATE_SOURCE_EXT, 0 );
+}
diff --git a/radeon/radeon_lock.c b/radeon/radeon_lock.c
new file mode 100644
index 0000000..30a0c38
--- /dev/null
+++ b/radeon/radeon_lock.c
@@ -0,0 +1,124 @@
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Gareth Hughes <gareth@valinux.com>
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *   Kevin E. Martin <martin@valinux.com>
+ */
+
+#include "glheader.h"
+#include "mtypes.h"
+#include "radeon_context.h"
+#include "radeon_lock.h"
+#include "radeon_tex.h"
+#include "radeon_state.h"
+#include "radeon_ioctl.h"
+
+#include "drirenderbuffer.h"
+
+#if DEBUG_LOCKING
+char *prevLockFile = NULL;
+int prevLockLine = 0;
+#endif
+
+/* Copy the SAREA page-flip state into the context and, if the context has
+ * a window-system draw buffer, flip its renderbuffers to the current page. */
+static void radeonUpdatePageFlipping(radeonContextPtr rmesa)
+{
+	rmesa->doPageFlip = rmesa->sarea->pfState;
+	if (!rmesa->glCtx->WinSysDrawBuffer)
+		return;
+	driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
+			     rmesa->sarea->pfCurrentPage);
+}
+
+/* Take the hardware lock via the DRM and resynchronise driver state.
+ * This is called (from LOCK_HARDWARE) if another context, which includes
+ * the X server, has grabbed the hardware lock.  The X server grabs the
+ * lock when it moves, resizes or restacks a window, so this routine is
+ * also where the new drawable position and clip rects are picked up.
+ *   rmesa - context acquiring the lock
+ *   flags - forwarded unchanged to drmGetLock()
+ */
+void radeonGetLock(radeonContextPtr rmesa, GLuint flags)
+{
+	__DRIdrawablePrivate *const drawable = rmesa->dri.drawable;
+	__DRIdrawablePrivate *const readable = rmesa->dri.readable;
+	__DRIscreenPrivate *sPriv = rmesa->dri.screen;
+	drm_radeon_sarea_t *sarea = rmesa->sarea;
+	/* Take the lock through the DRM; drmGetLock()'s result is not checked. */
+	drmGetLock(rmesa->dri.fd, rmesa->dri.hwContext, flags);
+
+	/* The window might have moved, so we might need to get new clip
+	 * rects.
+	 *
+	 * NOTE: This releases and regrabs the hw lock to allow the X server
+	 * to respond to the DRI protocol request for new drawable info.
+	 * Since the hardware state depends on having the latest drawable
+	 * clip rects, all state checking must be done _after_ this call.
+	 */
+	DRI_VALIDATE_DRAWABLE_INFO(sPriv, drawable);
+	if (drawable != readable) {
+		DRI_VALIDATE_DRAWABLE_INFO(sPriv, readable);
+	}
+	/* Cached stamp differs from the drawable's: refresh window state. */
+	if (rmesa->lastStamp != drawable->lastStamp) {
+		radeonUpdatePageFlipping(rmesa);
+		radeonSetCliprects(rmesa);
+		radeonUpdateViewportOffset(rmesa->glCtx);
+		driUpdateFramebufferSize(rmesa->glCtx, drawable);
+	}
+	/* Make the colour-pitch tile bit follow the SAREA tiling_enabled flag. */
+	RADEON_STATECHANGE(rmesa, ctx);
+	if (rmesa->sarea->tiling_enabled) {
+		rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |=
+		    RADEON_COLOR_TILE_ENABLE;
+	} else {
+		rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] &=
+		    ~RADEON_COLOR_TILE_ENABLE;
+	}
+	/* SAREA names another owner: claim it and age every texture heap. */
+	if (sarea->ctx_owner != rmesa->dri.hwContext) {
+		int i;
+		sarea->ctx_owner = rmesa->dri.hwContext;
+
+		for (i = 0; i < rmesa->nr_heaps; i++) {
+			DRI_AGE_TEXTURES(rmesa->texture_heaps[i]);
+		}
+	}
+	/* NOTE(review): presumably forces a later full state re-emit; confirm. */
+	rmesa->lost_context = GL_TRUE;
+}
diff --git a/radeon/radeon_lock.h b/radeon/radeon_lock.h
new file mode 100644
index 0000000..86e96aa
--- /dev/null
+++ b/radeon/radeon_lock.h
@@ -0,0 +1,112 @@
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Gareth Hughes <gareth@valinux.com>
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *   Kevin E. Martin <martin@valinux.com>
+ */
+
+#ifndef __RADEON_LOCK_H__
+#define __RADEON_LOCK_H__
+
+extern void radeonGetLock(radeonContextPtr rmesa, GLuint flags);
+
+/* Turn DEBUG_LOCKING on to find locking conflicts.
+ */
+#define DEBUG_LOCKING	0
+
+#if DEBUG_LOCKING
+extern char *prevLockFile;
+extern int prevLockLine;
+
+#define DEBUG_LOCK()							\
+   do {									\
+      prevLockFile = (__FILE__);					\
+      prevLockLine = (__LINE__);					\
+   } while (0)
+
+#define DEBUG_RESET()							\
+   do {									\
+      prevLockFile = 0;							\
+      prevLockLine = 0;							\
+   } while (0)
+
+#define DEBUG_CHECK_LOCK()						\
+   do {									\
+      if ( prevLockFile ) {						\
+	 fprintf( stderr,						\
+		  "LOCK SET!\n\tPrevious %s:%d\n\tCurrent: %s:%d\n",	\
+		  prevLockFile, prevLockLine, __FILE__, __LINE__ );	\
+	 exit( 1 );							\
+      }									\
+   } while (0)
+
+#else
+
+#define DEBUG_LOCK()
+#define DEBUG_RESET()
+#define DEBUG_CHECK_LOCK()
+
+#endif
+
+/*
+ * !!! We may want to separate locks from locks with validation.  This
+ * could be used to improve performance for those commands that
+ * do not do any drawing !!!
+ */
+
+/* Lock the hardware: try DRM_CAS on the lock word first; if __ret comes
+ * back nonzero, call radeonGetLock(), which also revalidates our state. */
+#define LOCK_HARDWARE( rmesa )					\
+   do {								\
+      char __ret = 0;						\
+      DEBUG_CHECK_LOCK();					\
+      DRM_CAS( (rmesa)->dri.hwLock, (rmesa)->dri.hwContext,		\
+	       (DRM_LOCK_HELD | (rmesa)->dri.hwContext), __ret );	\
+      if ( __ret )						\
+	 radeonGetLock( (rmesa), 0 );				\
+      DEBUG_LOCK();						\
+   } while (0)
+/* Release the lock via DRM_UNLOCK, then clear the DEBUG_LOCKING record. */
+#define UNLOCK_HARDWARE( rmesa )					\
+   do {									\
+      DRM_UNLOCK( (rmesa)->dri.fd,					\
+		  (rmesa)->dri.hwLock,					\
+		  (rmesa)->dri.hwContext );				\
+      DEBUG_RESET();							\
+   } while (0)
+
+#endif				/* __RADEON_LOCK_H__ */
diff --git a/radeon/radeon_maos.c b/radeon/radeon_maos.c
new file mode 100644
index 0000000..ea1e893
--- /dev/null
+++ b/radeon/radeon_maos.c
@@ -0,0 +1,12 @@
+
+
+/* If using new packets, can choose either verts or arrays.
+ * Otherwise, must use verts.
+ */
+#include "radeon_context.h"
+#define RADEON_MAOS_VERTS 0
+#if (RADEON_MAOS_VERTS) || (RADEON_OLD_PACKETS)
+#include "radeon_maos_verts.c"
+#else
+#include "radeon_maos_arrays.c"
+#endif
diff --git a/radeon/radeon_maos.h b/radeon/radeon_maos.h
new file mode 100644
index 0000000..09039d6
--- /dev/null
+++ b/radeon/radeon_maos.h
@@ -0,0 +1,44 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_maos.h,v 1.1 2002/10/30 12:51:55 alanh Exp $ */
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     Tungsten Graphics Inc., Austin, Texas.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef __RADEON_MAOS_H__
+#define __RADEON_MAOS_H__
+
+#include "radeon_context.h"
+
+extern void radeonEmitArrays( GLcontext *ctx, GLuint inputs );
+extern void radeonReleaseArrays( GLcontext *ctx, GLuint newinputs );
+
+#endif
diff --git a/radeon/radeon_maos_arrays.c b/radeon/radeon_maos_arrays.c
new file mode 100644
index 0000000..49118b5
--- /dev/null
+++ b/radeon/radeon_maos_arrays.c
@@ -0,0 +1,657 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_maos_arrays.c,v 1.1 2002/10/30 12:51:55 alanh Exp $ */
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     Tungsten Graphics Inc., Cedar Park, Texas.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "mtypes.h"
+#include "macros.h"
+
+#include "swrast_setup/swrast_setup.h"
+#include "math/m_translate.h"
+#include "tnl/tnl.h"
+#include "tnl/t_context.h"
+
+#include "radeon_context.h"
+#include "radeon_ioctl.h"
+#include "radeon_state.h"
+#include "radeon_swtcl.h"
+#include "radeon_maos.h"
+#include "radeon_tcl.h"
+
+#if 0
+/* Usage:
+ *   - from radeon_tcl_render
+ *   - call radeonEmitArrays to ensure uptodate arrays in dma
+ *   - emit primitives (new type?) which reference the data
+ *       -- need to use elts for lineloop, quads, quadstrip/flat
+ *       -- other primitives are all well-formed (need tristrip-1,fake-poly)
+ *
+ */
+static void emit_ubyte_rgba3( GLcontext *ctx,
+		       struct radeon_dma_region *rvb,
+		       char *data,
+		       int stride,
+		       int count )
+{
+   int i;
+   radeon_color_t *out = (radeon_color_t *)(rvb->start + rvb->address);
+
+   if (RADEON_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s count %d stride %d out %p\n",
+	      __FUNCTION__, count, stride, (void *)out);
+
+   for (i = 0; i < count; i++) {
+      out->red   = *data;
+      out->green = *(data+1);
+      out->blue  = *(data+2);
+      out->alpha = 0xFF;
+      out++;
+      data += stride;
+   }
+}
+
+static void emit_ubyte_rgba4( GLcontext *ctx,
+			      struct radeon_dma_region *rvb,
+			      char *data,
+			      int stride,
+			      int count )
+{
+   int i;
+   int *out = (int *)(rvb->address + rvb->start);
+
+   if (RADEON_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s count %d stride %d\n",
+	      __FUNCTION__, count, stride);
+
+   if (stride == 4)
+       COPY_DWORDS( out, data, count );
+   else
+      for (i = 0; i < count; i++) {
+	 *out++ = LE32_TO_CPU(*(int *)data);
+	 data += stride;
+      }
+}
+
+
+static void emit_ubyte_rgba( GLcontext *ctx,
+			     struct radeon_dma_region *rvb,
+			     char *data,
+			     int size,
+			     int stride,
+			     int count )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+
+   if (RADEON_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s %d/%d\n", __FUNCTION__, count, size);
+
+   assert (!rvb->buf);
+
+   if (stride == 0) {
+      radeonAllocDmaRegion( rmesa, rvb, 4, 4 );
+      count = 1;
+      rvb->aos_start = GET_START(rvb);
+      rvb->aos_stride = 0;
+      rvb->aos_size = 1;
+   }
+   else {
+      radeonAllocDmaRegion( rmesa, rvb, 4 * count, 4 );	/* alignment? */
+      rvb->aos_start = GET_START(rvb);
+      rvb->aos_stride = 1;
+      rvb->aos_size = 1;
+   }
+
+   /* Emit the data
+    */
+   switch (size) {
+   case 3:
+      emit_ubyte_rgba3( ctx, rvb, data, stride, count );
+      break;
+   case 4:
+      emit_ubyte_rgba4( ctx, rvb, data, stride, count );
+      break;
+   default:
+      assert(0);
+      exit(1);
+      break;
+   }
+}
+#endif
+
+#if defined(USE_X86_ASM)
+#define COPY_DWORDS( dst, src, nr )					\
+do {									\
+	int __tmp;							\
+	__asm__ __volatile__( "rep ; movsl"				\
+			      : "=%c" (__tmp), "=D" (dst), "=S" (__tmp)	\
+			      : "0" (nr),				\
+			        "D" ((long)dst),			\
+			        "S" ((long)src) );			\
+} while (0)
+#else
+#define COPY_DWORDS( dst, src, nr )	/* copy nr dwords, advance dst */ \
+do {						\
+   int j;	/* note: nr is evaluated more than once */ \
+   for ( j = 0 ; j < (nr) ; j++ )		\
+      (dst)[j] = ((int *)(src))[j];		\
+   (dst) += (nr);	/* args parenthesized so expressions are safe */ \
+} while (0)
+#endif
+
+/* Allocate a DMA region for rvb and fill it with one fog blend factor
+ * (from radeonComputeFogBlendFactor) per input fog coordinate.
+ *
+ * data points at the first coordinate and stride is the step in bytes
+ * between coordinates.  A stride of 0 marks a constant attribute: a
+ * single factor is emitted and aos_stride is set to 0.
+ */
+static void emit_vecfog( GLcontext *ctx,
+			 struct radeon_dma_region *rvb,
+			 char *data,
+			 int stride,
+			 int count )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   const int is_constant = (stride == 0);
+   GLfloat *dest;
+
+   if (RADEON_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s count %d stride %d\n",
+	      __FUNCTION__, count, stride);
+
+   assert (!rvb->buf);
+
+   /* A constant attribute occupies exactly one element. */
+   if (is_constant)
+      count = 1;
+
+   radeonAllocDmaRegion( rmesa, rvb, count * 4, 4 );	/* alignment? */
+   rvb->aos_start = GET_START(rvb);
+   rvb->aos_stride = is_constant ? 0 : 1;
+   rvb->aos_size = 1;
+
+   /* Convert each coordinate and store it; the source pointer advances
+    * by the caller's byte stride after every element. */
+   dest = (GLfloat *)(rvb->address + rvb->start);
+   for (; count > 0; count--) {
+      *dest++ = radeonComputeFogBlendFactor( ctx, *(GLfloat *)data );
+      data += stride;
+   }
+}
+
+/* Copy count one-dword elements from data into rvb's DMA memory. */
+static void emit_vec4( GLcontext *ctx,
+		       struct radeon_dma_region *rvb,
+		       char *data,
+		       int stride,
+		       int count )
+{
+   int *dest = (int *)(rvb->address + rvb->start);
+
+   if (RADEON_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s count %d stride %d\n",
+	      __FUNCTION__, count, stride);
+
+   if (stride == 4) {
+      COPY_DWORDS( dest, data, count );
+      return;
+   }
+   /* Strided source: gather one dword per element. */
+   for (; count > 0; count--, data += stride, dest++) {
+      dest[0] = *(int *)data;
+   }
+}
+
+
+/* Copy count two-dword elements from data into rvb's DMA memory. */
+static void emit_vec8( GLcontext *ctx,
+		       struct radeon_dma_region *rvb,
+		       char *data,
+		       int stride,
+		       int count )
+{
+   int *dest = (int *)(rvb->address + rvb->start);
+
+   if (RADEON_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s count %d stride %d\n",
+	      __FUNCTION__, count, stride);
+
+   if (stride == 8) {
+      COPY_DWORDS( dest, data, count*2 );
+      return;
+   }
+   /* Strided source: gather two dwords per element. */
+   for (; count > 0; count--, data += stride, dest += 2) {
+      dest[0] = *(int *)data;
+      dest[1] = *(int *)(data+4);
+   }
+}
+
+/* Copy count three-dword elements from data into rvb's DMA memory. */
+static void emit_vec12( GLcontext *ctx,
+		       struct radeon_dma_region *rvb,
+		       char *data,
+		       int stride,
+		       int count )
+{
+   int *dest = (int *)(rvb->address + rvb->start);
+
+   if (RADEON_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s count %d stride %d out %p data %p\n",
+	      __FUNCTION__, count, stride, (void *)dest, (void *)data);
+
+   if (stride == 12) {
+      COPY_DWORDS( dest, data, count*3 );
+      return;
+   }
+   /* Strided source: gather three dwords per element. */
+   for (; count > 0; count--, data += stride, dest += 3) {
+      dest[0] = *(int *)data;
+      dest[1] = *(int *)(data+4);
+      dest[2] = *(int *)(data+8);
+   }
+}
+
+/* Copy count four-dword elements from data into rvb's DMA memory. */
+static void emit_vec16( GLcontext *ctx,
+			struct radeon_dma_region *rvb,
+			char *data,
+			int stride,
+			int count )
+{
+   int *dest = (int *)(rvb->address + rvb->start);
+
+   if (RADEON_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s count %d stride %d\n",
+	      __FUNCTION__, count, stride);
+
+   if (stride == 16) {
+      COPY_DWORDS( dest, data, count*4 );
+      return;
+   }
+   /* Strided source: gather four dwords per element. */
+   for (; count > 0; count--, data += stride, dest += 4) {
+      dest[0] = *(int *)data;
+      dest[1] = *(int *)(data+4);
+      dest[2] = *(int *)(data+8);
+      dest[3] = *(int *)(data+12);
+   }
+}
+
+
+/* Upload count elements of size dwords each (size 1-4) from data into a
+ * new DMA region for rvb.  stride is the input step in bytes; 0 marks a
+ * constant attribute: one element is emitted and aos_stride is set to 0.
+ */
+static void emit_vector( GLcontext *ctx,
+			 struct radeon_dma_region *rvb,
+			 char *data,
+			 int size,
+			 int stride,
+			 int count )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   const int is_constant = (stride == 0);
+
+   if (RADEON_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s count %d size %d stride %d\n",
+	      __FUNCTION__, count, size, stride);
+
+   assert (!rvb->buf);
+
+   /* A constant attribute occupies exactly one element. */
+   if (is_constant)
+      count = 1;
+
+   radeonAllocDmaRegion( rmesa, rvb, size * count * 4, 4 );	/* alignment? */
+   rvb->aos_start = GET_START(rvb);
+   rvb->aos_stride = is_constant ? 0 : size;
+   rvb->aos_size = size;
+
+   /* Hand off to the copier for this element width. */
+   switch (size) {
+   case 1:
+      emit_vec4( ctx, rvb, data, stride, count );
+      return;
+   case 2:
+      emit_vec8( ctx, rvb, data, stride, count );
+      return;
+   case 3:
+      emit_vec12( ctx, rvb, data, stride, count );
+      return;
+   case 4:
+      emit_vec16( ctx, rvb, data, stride, count );
+      return;
+   default:
+      break;
+   }
+
+   /* Unsupported width: assert, then exit(1). */
+   assert(0);
+   exit(1);
+}
+
+
+
+/* Expand count one-dword inputs into two-dword elements in rvb's DMA
+ * memory: the input dword followed by a zero dword.
+ */
+static void emit_s0_vec( GLcontext *ctx,
+			 struct radeon_dma_region *rvb,
+			 char *data,
+			 int stride,
+			 int count )
+{
+   int *dest = (int *)(rvb->address + rvb->start);
+
+   if (RADEON_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s count %d stride %d\n",
+	      __FUNCTION__, count, stride);
+
+   for (; count > 0; count--, data += stride, dest += 2) {
+      dest[0] = *(int *)data;
+      dest[1] = 0;
+   }
+}
+
+/* Pack count four-dword inputs into three-dword elements in rvb's DMA
+ * memory: input dwords 0, 1 and 3 are kept, dword 2 is skipped.
+ */
+static void emit_stq_vec( GLcontext *ctx,
+			 struct radeon_dma_region *rvb,
+			 char *data,
+			 int stride,
+			 int count )
+{
+   int *dest = (int *)(rvb->address + rvb->start);
+
+   if (RADEON_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s count %d stride %d\n",
+	      __FUNCTION__, count, stride);
+
+   for (; count > 0; count--, data += stride, dest += 3) {
+      dest[0] = *(int *)data;
+      dest[1] = *(int *)(data+4);
+      dest[2] = *(int *)(data+12);
+   }
+}
+
+
+
+
+/* Upload a texture coordinate array into a new DMA region for rvb.
+ *
+ * size is the number of dwords per input coordinate; the emitted
+ * element is 3 dwords for size 3 or 4 and 2 dwords otherwise:
+ *   size 1 -> input dword 0 followed by a zero dword
+ *   size 2 -> input dwords 0-1      size 3 -> input dwords 0-2
+ *   size 4 -> input dwords 0, 1 and 3 (dword 2 is skipped)
+ * data points at the first input coordinate; count is the number of
+ * coordinates to upload.  stride is the input step in bytes; 0 marks a
+ * constant attribute: one element is emitted and aos_stride is set to 0.
+ */
+static void emit_tex_vector( GLcontext *ctx,
+			     struct radeon_dma_region *rvb,
+			     char *data,
+			     int size,
+			     int stride,
+			     int count )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   const int is_constant = (stride == 0);
+   const int emitsize = (size == 3 || size == 4) ? 3 : 2;
+
+   if (RADEON_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s %d/%d\n", __FUNCTION__, count, size);
+
+   assert (!rvb->buf);
+
+   /* A constant attribute occupies exactly one element. */
+   if (is_constant)
+      count = 1;
+
+   radeonAllocDmaRegion( rmesa, rvb, 4 * emitsize * count, 4 );
+   rvb->aos_start = GET_START(rvb);
+   rvb->aos_stride = is_constant ? 0 : emitsize;
+   rvb->aos_size = emitsize;
+
+   /* Hand off to the copier for this input width. */
+   switch (size) {
+   case 1:
+      emit_s0_vec( ctx, rvb, data, stride, count ); 
+      return;
+   case 2:
+      emit_vec8( ctx, rvb, data, stride, count );
+      return;
+   case 3:
+      emit_vec12( ctx, rvb, data, stride, count );
+      return;
+   case 4:
+      emit_stq_vec( ctx, rvb, data, stride, count );
+      return;
+   default:
+      break;
+   }
+
+   /* Unsupported width: assert, then exit(1). */
+   assert(0);
+   exit(1);
+}
+
+
+
+
+/* Upload every array in inputs that has no DMA buffer yet (the position
+ * array is always handled), then store the component list and vertex
+ * format in rmesa->tcl and update the Q bits of TCL_OUTPUT_VTXFMT. */
+void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   struct vertex_buffer *VB = &TNL_CONTEXT( ctx )->vb;
+   struct radeon_dma_region **component = rmesa->tcl.aos_components;
+   GLuint nr = 0;
+   GLuint vfmt = 0;
+   GLuint count = VB->Count;
+   GLuint vtx, unit;
+   
+#if 0
+   if (RADEON_DEBUG & DEBUG_VERTS) 
+      _tnl_print_vert_flags( __FUNCTION__, inputs );
+#endif
+   /* Position: handled unconditionally; format bits follow its size. */
+   if (1) {
+      if (!rmesa->tcl.obj.buf) 
+	 emit_vector( ctx, 
+		      &rmesa->tcl.obj, 
+		      (char *)VB->ObjPtr->data,
+		      VB->ObjPtr->size,
+		      VB->ObjPtr->stride,
+		      count);
+
+      switch( VB->ObjPtr->size ) {
+      case 4: vfmt |= RADEON_CP_VC_FRMT_W0;	/* fall through */
+      case 3: vfmt |= RADEON_CP_VC_FRMT_Z;	/* fall through */
+      case 2: vfmt |= RADEON_CP_VC_FRMT_XY;
+      default:
+         break;
+      }
+      component[nr++] = &rmesa->tcl.obj;
+   }
+   
+   /* Normals are always uploaded with 3 components. */
+   if (inputs & VERT_BIT_NORMAL) {
+      if (!rmesa->tcl.norm.buf)
+	 emit_vector( ctx, 
+		      &(rmesa->tcl.norm), 
+		      (char *)VB->NormalPtr->data,
+		      3,
+		      VB->NormalPtr->stride,
+		      count);
+
+      vfmt |= RADEON_CP_VC_FRMT_N0;
+      component[nr++] = &rmesa->tcl.norm;
+   }
+   /* Primary colour: 4 components only when alpha may differ from 1.0. */
+   if (inputs & VERT_BIT_COLOR0) {
+      int emitsize;
+      if (VB->ColorPtr[0]->size == 4 &&
+	  (VB->ColorPtr[0]->stride != 0 ||
+	   VB->ColorPtr[0]->data[0][3] != 1.0)) {
+	 vfmt |= RADEON_CP_VC_FRMT_FPCOLOR | RADEON_CP_VC_FRMT_FPALPHA;
+	 emitsize = 4;
+      }
+
+      else {
+	 vfmt |= RADEON_CP_VC_FRMT_FPCOLOR;
+	 emitsize = 3;
+      }
+
+      if (!rmesa->tcl.rgba.buf)
+	 emit_vector( ctx,
+		      &(rmesa->tcl.rgba),
+		      (char *)VB->ColorPtr[0]->data,
+		      emitsize,
+		      VB->ColorPtr[0]->stride,
+		      count);
+
+
+      component[nr++] = &rmesa->tcl.rgba;
+   }
+
+   /* Secondary colour: always uploaded with 3 components. */
+   if (inputs & VERT_BIT_COLOR1) {
+      if (!rmesa->tcl.spec.buf) {
+
+	 emit_vector( ctx,
+		      &rmesa->tcl.spec,
+		      (char *)VB->SecondaryColorPtr[0]->data,
+		      3,
+		      VB->SecondaryColorPtr[0]->stride,
+		      count);
+      }
+
+      vfmt |= RADEON_CP_VC_FRMT_FPSPEC;
+      component[nr++] = &rmesa->tcl.spec;
+   }
+
+/* FIXME: not sure if this is correct. May need to stitch this together with
+   secondary color. It seems odd that for primary color color and alpha values
+   are emitted together but for secondary color not. */
+   if (inputs & VERT_BIT_FOG) {
+      if (!rmesa->tcl.fog.buf)
+	 emit_vecfog( ctx,
+		      &(rmesa->tcl.fog),
+		      (char *)VB->FogCoordPtr->data,
+		      VB->FogCoordPtr->stride,
+		      count);
+
+      vfmt |= RADEON_CP_VC_FRMT_FPFOG;
+      component[nr++] = &rmesa->tcl.fog;
+   }
+
+   /* Start from the current TCL output format with every Q bit cleared. */
+   vtx = (rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] &
+	  ~(RADEON_TCL_VTX_Q0|RADEON_TCL_VTX_Q1|RADEON_TCL_VTX_Q2));
+      
+   for (unit = 0; unit < ctx->Const.MaxTextureUnits; unit++) {
+      if (inputs & VERT_BIT_TEX(unit)) {
+	 if (!rmesa->tcl.tex[unit].buf)
+	    emit_tex_vector( ctx,
+			     &(rmesa->tcl.tex[unit]),
+			     (char *)VB->TexCoordPtr[unit]->data,
+			     VB->TexCoordPtr[unit]->size,
+			     VB->TexCoordPtr[unit]->stride,
+			     count );
+
+	 vfmt |= RADEON_ST_BIT(unit);
+         /* assume we need the 3rd coord if texgen is active for r/q OR at least
+	    3 coords are submitted. This may not be 100% correct */
+         if (VB->TexCoordPtr[unit]->size >= 3) {
+	    vtx |= RADEON_Q_BIT(unit);
+	    vfmt |= RADEON_Q_BIT(unit);
+	 }
+	 if ( (ctx->Texture.Unit[unit].TexGenEnabled & (R_BIT | Q_BIT)) )
+	    vtx |= RADEON_Q_BIT(unit);
+	 else if ((VB->TexCoordPtr[unit]->size >= 3) &&
+	          ((ctx->Texture.Unit[unit]._ReallyEnabled & (TEXTURE_CUBE_BIT)) == 0)) {
+	    GLuint swaptexmatcol = (VB->TexCoordPtr[unit]->size - 3);
+	    if (((rmesa->NeedTexMatrix >> unit) & 1) &&
+		 (swaptexmatcol != ((rmesa->TexMatColSwap >> unit) & 1)))
+	       radeonUploadTexMatrix( rmesa, unit, swaptexmatcol ) ;
+	 }
+	 component[nr++] = &rmesa->tcl.tex[unit];
+      }
+   }
+   /* Touch the tcl state atom only when the output format changed. */
+   if (vtx != rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT]) {
+      RADEON_STATECHANGE( rmesa, tcl );
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] = vtx;
+   }
+   /* Record how many components were queued and their vertex format. */
+   rmesa->tcl.nr_aos_components = nr;
+   rmesa->tcl.vertex_format = vfmt;
+}
+
+
+/* Release the DMA region of every vertex array named in newinputs
+ * (radeonEmitArrays uploads any array whose region has no buffer). */
+void radeonReleaseArrays( GLcontext *ctx, GLuint newinputs )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   GLuint u = 0;
+
+#if 0
+   if (RADEON_DEBUG & DEBUG_VERTS) 
+      _tnl_print_vert_flags( __FUNCTION__, newinputs );
+#endif
+
+   /* Position, normal, both colours and fog, then the texture units. */
+   if (newinputs & VERT_BIT_POS)
+      radeonReleaseDmaRegion( rmesa, &rmesa->tcl.obj, __FUNCTION__ );
+   if (newinputs & VERT_BIT_NORMAL)
+      radeonReleaseDmaRegion( rmesa, &rmesa->tcl.norm, __FUNCTION__ );
+   if (newinputs & VERT_BIT_COLOR0)
+      radeonReleaseDmaRegion( rmesa, &rmesa->tcl.rgba, __FUNCTION__ );
+   if (newinputs & VERT_BIT_COLOR1)
+      radeonReleaseDmaRegion( rmesa, &rmesa->tcl.spec, __FUNCTION__ );
+   if (newinputs & VERT_BIT_FOG)
+      radeonReleaseDmaRegion( rmesa, &rmesa->tcl.fog, __FUNCTION__ );
+
+   while (u < ctx->Const.MaxTextureUnits) {
+      if (newinputs & VERT_BIT_TEX(u))
+         radeonReleaseDmaRegion( rmesa, &rmesa->tcl.tex[u], __FUNCTION__ );
+      u++;
+   }
+}
diff --git a/radeon/radeon_maos_vbtmp.h b/radeon/radeon_maos_vbtmp.h
new file mode 100644
index 0000000..034cda8
--- /dev/null
+++ b/radeon/radeon_maos_vbtmp.h
@@ -0,0 +1,301 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  4.1
+ *
+ * Copyright (C) 1999-2002  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef LOCALVARS
+#define LOCALVARS
+#endif
+/* The #undef forces the default: TCL_DEBUG is always 0 after this point. */
+#undef TCL_DEBUG
+#ifndef TCL_DEBUG
+#define TCL_DEBUG 0
+#endif
+/* Template: emit VB vertices [start, end) into dest; DO_* select the fields. */
+static void TAG(emit)( GLcontext *ctx,
+		       GLuint start, GLuint end,
+		       void *dest )
+{
+   LOCALVARS
+      struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
+   GLuint (*tc0)[4], (*tc1)[4], (*tc2)[4];
+   GLfloat (*col)[4], (*spec)[4];
+   GLfloat (*fog)[4];
+   GLuint (*norm)[4];
+   GLuint tc0_stride, tc1_stride, col_stride, spec_stride, fog_stride;
+   GLuint tc2_stride, norm_stride;
+   GLuint fill_tex = 0;       /* bit n: unit n has < 3 coords, 3rd slot = 1.0 */
+   GLuint rqcoordsnoswap = 0; /* bit n: unit n has 3 coords, 3rd slot = [2] */
+   GLuint (*coord)[4];
+   GLuint coord_stride; /* object coordinates */
+   int i;
+   /* Per vertex: x,y,z[,w], normal, color, spec/fog, then s,t[,q] per unit. */
+   union emit_union *v = (union emit_union *)dest;
+
+   if (RADEON_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s\n", __FUNCTION__); 
+
+   coord = (GLuint (*)[4])VB->ObjPtr->data;
+   coord_stride = VB->ObjPtr->stride;
+   /* Per unit: VB array if present, else the current attribute (stride 0). */
+   if (DO_TEX2) {
+      if (VB->TexCoordPtr[2]) {
+	 const GLuint t2 = GET_TEXSOURCE(2);
+	 tc2 = (GLuint (*)[4])VB->TexCoordPtr[t2]->data;
+	 tc2_stride = VB->TexCoordPtr[t2]->stride;
+	 if (DO_PTEX && VB->TexCoordPtr[t2]->size < 3) {
+	    fill_tex |= (1<<2);
+	 }
+	 else if (DO_PTEX && VB->TexCoordPtr[t2]->size < 4) {
+	    rqcoordsnoswap |= (1<<2);
+	 }
+      } else {
+	 tc2 = (GLuint (*)[4])&ctx->Current.Attrib[VERT_ATTRIB_TEX2];
+	 tc2_stride = 0;
+      }
+   }
+
+   if (DO_TEX1) {
+      if (VB->TexCoordPtr[1]) {
+	 const GLuint t1 = GET_TEXSOURCE(1);
+	 tc1 = (GLuint (*)[4])VB->TexCoordPtr[t1]->data;
+	 tc1_stride = VB->TexCoordPtr[t1]->stride;
+	 if (DO_PTEX && VB->TexCoordPtr[t1]->size < 3) {
+	    fill_tex |= (1<<1);
+	 }
+	 else if (DO_PTEX && VB->TexCoordPtr[t1]->size < 4) {
+	    rqcoordsnoswap |= (1<<1);
+	 }
+      } else {
+	 tc1 = (GLuint (*)[4])&ctx->Current.Attrib[VERT_ATTRIB_TEX1];
+	 tc1_stride = 0;
+      }
+   }
+
+   if (DO_TEX0) {
+      if (VB->TexCoordPtr[0]) {
+	 const GLuint t0 = GET_TEXSOURCE(0);
+	 tc0_stride = VB->TexCoordPtr[t0]->stride;
+	 tc0 = (GLuint (*)[4])VB->TexCoordPtr[t0]->data;
+	 if (DO_PTEX && VB->TexCoordPtr[t0]->size < 3) {
+	    fill_tex |= (1<<0);
+	 }
+	 else if (DO_PTEX && VB->TexCoordPtr[t0]->size < 4) {
+	    rqcoordsnoswap |= (1<<0);
+	 }
+      } else {
+	 tc0 = (GLuint (*)[4])&ctx->Current.Attrib[VERT_ATTRIB_TEX0];
+	 tc0_stride = 0;
+      }
+	 
+   }
+
+   if (DO_NORM) {
+      if (VB->NormalPtr) {
+	 norm_stride = VB->NormalPtr->stride;
+	 norm = (GLuint (*)[4])VB->NormalPtr->data;
+      } else {
+	 norm_stride = 0;
+	 norm = (GLuint (*)[4])&ctx->Current.Attrib[VERT_ATTRIB_NORMAL];
+      }
+   }
+
+   if (DO_RGBA) {
+      if (VB->ColorPtr[0]) {
+	 col = VB->ColorPtr[0]->data;
+	 col_stride = VB->ColorPtr[0]->stride;
+      } else {
+	 col = (GLfloat (*)[4])ctx->Current.Attrib[VERT_ATTRIB_COLOR0];
+	 col_stride = 0;
+      }
+   }
+
+   if (DO_SPEC_OR_FOG) {
+      if (VB->SecondaryColorPtr[0]) {
+	 spec = VB->SecondaryColorPtr[0]->data;
+	 spec_stride = VB->SecondaryColorPtr[0]->stride;
+      } else {
+	 spec = (GLfloat (*)[4])ctx->Current.Attrib[VERT_ATTRIB_COLOR1];
+	 spec_stride = 0;
+      }
+   }
+   /* Fog shares the PKSPEC slot with specular, hence the same guard. */
+   if (DO_SPEC_OR_FOG) {
+      if (VB->FogCoordPtr) {
+	 fog = VB->FogCoordPtr->data;
+	 fog_stride = VB->FogCoordPtr->stride;
+      } else {
+	 fog = (GLfloat (*)[4])ctx->Current.Attrib[VERT_ATTRIB_FOG];
+	 fog_stride = 0;
+      }
+   }
+   /* Advance every source pointer past the first 'start' elements. */
+   
+   if (start) {
+      coord =  (GLuint (*)[4])((GLubyte *)coord + start * coord_stride);
+      if (DO_TEX0)
+	 tc0 =  (GLuint (*)[4])((GLubyte *)tc0 + start * tc0_stride);
+      if (DO_TEX1) 
+	 tc1 =  (GLuint (*)[4])((GLubyte *)tc1 + start * tc1_stride);
+      if (DO_TEX2) 
+	 tc2 =  (GLuint (*)[4])((GLubyte *)tc2 + start * tc2_stride);
+      if (DO_NORM) 
+	 norm =  (GLuint (*)[4])((GLubyte *)norm + start * norm_stride);
+      if (DO_RGBA) 
+	 STRIDE_4F(col, start * col_stride);
+      if (DO_SPEC)
+	 STRIDE_4F(spec, start * spec_stride);
+      if (DO_FOG)
+	 STRIDE_4F(fog, start * fog_stride);
+   }
+   /* One interleaved vertex per iteration; v walks through dest. */
+
+   {
+      for (i=start; i < end; i++) {
+	 /* position; w only when the layout has W0 */
+	 v[0].ui = coord[0][0];
+	 v[1].ui = coord[0][1];
+	 v[2].ui = coord[0][2];
+	 if (DO_W) {
+	    v[3].ui = coord[0][3];
+	    v += 4;
+	 } 
+	 else
+	    v += 3;
+	 coord =  (GLuint (*)[4])((GLubyte *)coord +  coord_stride);
+
+	 if (DO_NORM) {
+	    v[0].ui = norm[0][0];
+	    v[1].ui = norm[0][1];
+	    v[2].ui = norm[0][2];
+	    v += 3;
+	    norm =  (GLuint (*)[4])((GLubyte *)norm +  norm_stride);
+	 }
+	 if (DO_RGBA) {
+	    UNCLAMPED_FLOAT_TO_UBYTE(v[0].rgba.red, col[0][0]);
+	    UNCLAMPED_FLOAT_TO_UBYTE(v[0].rgba.green, col[0][1]);
+	    UNCLAMPED_FLOAT_TO_UBYTE(v[0].rgba.blue, col[0][2]);
+	    UNCLAMPED_FLOAT_TO_UBYTE(v[0].rgba.alpha, col[0][3]);
+	    STRIDE_4F(col, col_stride);
+	    v++;
+	 }
+	 if (DO_SPEC_OR_FOG) { /* NOTE(review): skipped channels stay unset */
+	    if (DO_SPEC) {
+	       UNCLAMPED_FLOAT_TO_UBYTE(v[0].rgba.red, spec[0][0]);
+	       UNCLAMPED_FLOAT_TO_UBYTE(v[0].rgba.green, spec[0][1]);
+	       UNCLAMPED_FLOAT_TO_UBYTE(v[0].rgba.blue, spec[0][2]);
+	       STRIDE_4F(spec, spec_stride);
+	    }
+	    if (DO_FOG) {
+	       UNCLAMPED_FLOAT_TO_UBYTE(v[0].rgba.alpha, radeonComputeFogBlendFactor(ctx, fog[0][0]));
+	       STRIDE_4F(fog, fog_stride);
+	    }
+	    if (TCL_DEBUG) fprintf(stderr, "%x ", v[0].ui);
+	    v++;
+	 }
+	 if (DO_TEX0) {
+	    v[0].ui = tc0[0][0];
+	    v[1].ui = tc0[0][1];
+	    if (TCL_DEBUG) fprintf(stderr, "t0: %.2f %.2f ", v[0].f, v[1].f);
+	    if (DO_PTEX) {
+	       if (fill_tex & (1<<0))
+		  v[2].f = 1.0;
+	       else if (rqcoordsnoswap & (1<<0))
+		  v[2].ui = tc0[0][2];
+	       else
+		  v[2].ui = tc0[0][3];
+	       if (TCL_DEBUG) fprintf(stderr, "%.2f ", v[2].f);
+	       v += 3;
+	    } 
+	    else
+	       v += 2;
+	    tc0 =  (GLuint (*)[4])((GLubyte *)tc0 +  tc0_stride);
+	 }
+	 if (DO_TEX1) {
+	    v[0].ui = tc1[0][0];
+	    v[1].ui = tc1[0][1];
+	    if (TCL_DEBUG) fprintf(stderr, "t1: %.2f %.2f ", v[0].f, v[1].f);
+	    if (DO_PTEX) {
+	       if (fill_tex & (1<<1))
+		  v[2].f = 1.0;
+	       else if (rqcoordsnoswap & (1<<1))
+		  v[2].ui = tc1[0][2];
+	       else
+		  v[2].ui = tc1[0][3];
+	       if (TCL_DEBUG) fprintf(stderr, "%.2f ", v[2].f);
+	       v += 3;
+	    } 
+	    else
+	       v += 2;
+	    tc1 =  (GLuint (*)[4])((GLubyte *)tc1 +  tc1_stride);
+	 } 
+	 if (DO_TEX2) {
+	    v[0].ui = tc2[0][0];
+	    v[1].ui = tc2[0][1];
+	    if (TCL_DEBUG) fprintf(stderr, "t2: %.2f %.2f ", v[0].f, v[1].f);
+	    if (DO_PTEX) {
+	       if (fill_tex & (1<<2))
+		  v[2].f = 1.0;
+	       else if (rqcoordsnoswap & (1<<2))
+		  v[2].ui = tc2[0][2];
+	       else
+		  v[2].ui = tc2[0][3];
+	       if (TCL_DEBUG) fprintf(stderr, "%.2f ", v[2].f);
+	       v += 3;
+	    } 
+	    else
+	       v += 2;
+	    tc2 =  (GLuint (*)[4])((GLubyte *)tc2 +  tc2_stride);
+	 } 
+	 if (TCL_DEBUG) fprintf(stderr, "\n");
+      }
+   }
+}
+
+
+
+static void TAG(init)( void )
+{
+   /* s,t per unit, plus a third coord when the format has Q0 (DO_PTEX). */
+   const int tex_sz = DO_PTEX ? 3 : 2;
+   int dwords = 3;                      /* x, y, z */
+
+   if (DO_W)           dwords += 1;
+   if (DO_NORM)        dwords += 3;
+   if (DO_RGBA)        dwords += 1;
+   if (DO_SPEC_OR_FOG) dwords += 1;
+   if (DO_TEX0)        dwords += tex_sz;
+   if (DO_TEX1)        dwords += tex_sz;
+   if (DO_TEX2)        dwords += tex_sz;
+
+   setup_tab[IDX].vertex_size = dwords;
+   setup_tab[IDX].vertex_format = IND;
+   setup_tab[IDX].emit = TAG(emit);
+}
+
+
+#undef IND
+#undef TAG
+#undef IDX
diff --git a/radeon/radeon_maos_verts.c b/radeon/radeon_maos_verts.c
new file mode 100644
index 0000000..65dbecf
--- /dev/null
+++ b/radeon/radeon_maos_verts.c
@@ -0,0 +1,450 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_maos_verts.c,v 1.1 2002/10/30 12:51:55 alanh Exp $ */
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     Tungsten Graphics Inc., Austin, Texas.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "mtypes.h"
+
+#include "vbo/vbo.h"
+#include "math/m_translate.h"
+#include "tnl/tnl.h"
+#include "tnl/t_pipeline.h"
+#include "math/m_translate.h"
+#include "radeon_context.h"
+#include "radeon_state.h"
+#include "radeon_ioctl.h"
+#include "radeon_tex.h"
+#include "radeon_tcl.h"
+#include "radeon_swtcl.h"
+#include "radeon_maos.h"
+
+
+#define RADEON_TCL_MAX_SETUP 19 /* number of layouts (IDX 0..18) below */
+/* One output slot of an emitted vertex: float, GLuint or radeon_color_t. */
+union emit_union { float f; GLuint ui; radeon_color_t rgba; };
+/* One entry per vertex layout instantiated from radeon_maos_vbtmp.h below. */
+static struct {
+   void   (*emit)( GLcontext *, GLuint, GLuint, void * );
+   GLuint vertex_size;
+   GLuint vertex_format;
+} setup_tab[RADEON_TCL_MAX_SETUP];
+/* Field selectors for the template; DO_SPEC and DO_FOG also read ctx state. */
+#define DO_W    (IND & RADEON_CP_VC_FRMT_W0)
+#define DO_RGBA (IND & RADEON_CP_VC_FRMT_PKCOLOR)
+#define DO_SPEC_OR_FOG (IND & RADEON_CP_VC_FRMT_PKSPEC)
+#define DO_SPEC ((IND & RADEON_CP_VC_FRMT_PKSPEC) && \
+		 (ctx->_TriangleCaps & DD_SEPARATE_SPECULAR))
+#define DO_FOG  ((IND & RADEON_CP_VC_FRMT_PKSPEC) && ctx->Fog.Enabled && \
+		 (ctx->Fog.FogCoordinateSource == GL_FOG_COORD))
+#define DO_TEX0 (IND & RADEON_CP_VC_FRMT_ST0)
+#define DO_TEX1 (IND & RADEON_CP_VC_FRMT_ST1)
+#define DO_TEX2 (IND & RADEON_CP_VC_FRMT_ST2)
+#define DO_PTEX (IND & RADEON_CP_VC_FRMT_Q0)
+#define DO_NORM (IND & RADEON_CP_VC_FRMT_N0)
+/* DO_TEX3 is not referenced by radeon_maos_vbtmp.h as included here. */
+#define DO_TEX3 0
+
+#define GET_TEXSOURCE(n)  n
+
+/***********************************************************************
+ *             Generate vertex emit functions               *
+ ***********************************************************************/
+
+
+/* Defined roughly in order of increasing vertex size; radeonEmitArrays()
+ * takes the first entry whose format covers the requested bits. */
+#define IDX 0
+#define IND (RADEON_CP_VC_FRMT_XY|		\
+	     RADEON_CP_VC_FRMT_Z|		\
+	     RADEON_CP_VC_FRMT_PKCOLOR)
+#define TAG(x) x##_rgba
+#include "radeon_maos_vbtmp.h"
+
+#define IDX 1
+#define IND (RADEON_CP_VC_FRMT_XY|		\
+	     RADEON_CP_VC_FRMT_Z|		\
+	     RADEON_CP_VC_FRMT_N0)
+#define TAG(x) x##_n
+#include "radeon_maos_vbtmp.h"
+
+#define IDX 2
+#define IND (RADEON_CP_VC_FRMT_XY|		\
+	     RADEON_CP_VC_FRMT_Z|		\
+	     RADEON_CP_VC_FRMT_PKCOLOR|		\
+	     RADEON_CP_VC_FRMT_ST0)
+#define TAG(x) x##_rgba_st
+#include "radeon_maos_vbtmp.h"
+
+#define IDX 3
+#define IND (RADEON_CP_VC_FRMT_XY|		\
+	     RADEON_CP_VC_FRMT_Z|		\
+	     RADEON_CP_VC_FRMT_PKCOLOR|		\
+	     RADEON_CP_VC_FRMT_N0)
+#define TAG(x) x##_rgba_n
+#include "radeon_maos_vbtmp.h"
+
+#define IDX 4
+#define IND (RADEON_CP_VC_FRMT_XY|		\
+	     RADEON_CP_VC_FRMT_Z|		\
+	     RADEON_CP_VC_FRMT_ST0|		\
+	     RADEON_CP_VC_FRMT_N0)
+#define TAG(x) x##_st_n
+#include "radeon_maos_vbtmp.h"
+
+#define IDX 5
+#define IND (RADEON_CP_VC_FRMT_XY|		\
+	     RADEON_CP_VC_FRMT_Z|		\
+	     RADEON_CP_VC_FRMT_PKCOLOR|		\
+	     RADEON_CP_VC_FRMT_ST0|		\
+	     RADEON_CP_VC_FRMT_ST1)
+#define TAG(x) x##_rgba_st_st
+#include "radeon_maos_vbtmp.h"
+
+#define IDX 6
+#define IND (RADEON_CP_VC_FRMT_XY|		\
+	     RADEON_CP_VC_FRMT_Z|		\
+	     RADEON_CP_VC_FRMT_PKCOLOR|		\
+	     RADEON_CP_VC_FRMT_ST0|		\
+	     RADEON_CP_VC_FRMT_N0)
+#define TAG(x) x##_rgba_st_n
+#include "radeon_maos_vbtmp.h"
+
+#define IDX 7
+#define IND (RADEON_CP_VC_FRMT_XY|		\
+	     RADEON_CP_VC_FRMT_Z|		\
+	     RADEON_CP_VC_FRMT_PKCOLOR|		\
+	     RADEON_CP_VC_FRMT_PKSPEC|		\
+	     RADEON_CP_VC_FRMT_ST0|		\
+	     RADEON_CP_VC_FRMT_ST1)
+#define TAG(x) x##_rgba_spec_st_st
+#include "radeon_maos_vbtmp.h"
+
+#define IDX 8
+#define IND (RADEON_CP_VC_FRMT_XY|		\
+	     RADEON_CP_VC_FRMT_Z|		\
+	     RADEON_CP_VC_FRMT_ST0|		\
+	     RADEON_CP_VC_FRMT_ST1|		\
+	     RADEON_CP_VC_FRMT_N0)
+#define TAG(x) x##_st_st_n
+#include "radeon_maos_vbtmp.h"
+
+#define IDX 9
+#define IND (RADEON_CP_VC_FRMT_XY|		\
+	     RADEON_CP_VC_FRMT_Z|		\
+	     RADEON_CP_VC_FRMT_PKCOLOR|		\
+	     RADEON_CP_VC_FRMT_PKSPEC|		\
+	     RADEON_CP_VC_FRMT_ST0|		\
+	     RADEON_CP_VC_FRMT_ST1|		\
+	     RADEON_CP_VC_FRMT_N0)
+#define TAG(x) x##_rgba_spec_st_st_n
+#include "radeon_maos_vbtmp.h"
+
+#define IDX 10
+#define IND (RADEON_CP_VC_FRMT_XY|		\
+	     RADEON_CP_VC_FRMT_Z|		\
+	     RADEON_CP_VC_FRMT_PKCOLOR|		\
+	     RADEON_CP_VC_FRMT_ST0|		\
+	     RADEON_CP_VC_FRMT_Q0)
+#define TAG(x) x##_rgba_stq
+#include "radeon_maos_vbtmp.h"
+
+#define IDX 11
+#define IND (RADEON_CP_VC_FRMT_XY|		\
+	     RADEON_CP_VC_FRMT_Z|		\
+	     RADEON_CP_VC_FRMT_PKCOLOR|		\
+	     RADEON_CP_VC_FRMT_ST1|		\
+	     RADEON_CP_VC_FRMT_Q1|		\
+	     RADEON_CP_VC_FRMT_ST0|		\
+	     RADEON_CP_VC_FRMT_Q0)
+#define TAG(x) x##_rgba_stq_stq
+#include "radeon_maos_vbtmp.h"
+
+#define IDX 12
+#define IND (RADEON_CP_VC_FRMT_XY|		\
+	     RADEON_CP_VC_FRMT_Z|		\
+	     RADEON_CP_VC_FRMT_W0|		\
+	     RADEON_CP_VC_FRMT_PKCOLOR|		\
+	     RADEON_CP_VC_FRMT_PKSPEC|		\
+	     RADEON_CP_VC_FRMT_ST0|		\
+	     RADEON_CP_VC_FRMT_Q0|		\
+	     RADEON_CP_VC_FRMT_ST1|		\
+	     RADEON_CP_VC_FRMT_Q1|		\
+	     RADEON_CP_VC_FRMT_N0)
+#define TAG(x) x##_w_rgba_spec_stq_stq_n
+#include "radeon_maos_vbtmp.h"
+
+#define IDX 13
+#define IND (RADEON_CP_VC_FRMT_XY|		\
+	     RADEON_CP_VC_FRMT_Z|		\
+	     RADEON_CP_VC_FRMT_PKCOLOR|		\
+	     RADEON_CP_VC_FRMT_ST0|		\
+	     RADEON_CP_VC_FRMT_ST1|		\
+	     RADEON_CP_VC_FRMT_ST2)
+#define TAG(x) x##_rgba_st_st_st
+#include "radeon_maos_vbtmp.h"
+
+#define IDX 14
+#define IND (RADEON_CP_VC_FRMT_XY|		\
+	     RADEON_CP_VC_FRMT_Z|		\
+	     RADEON_CP_VC_FRMT_PKCOLOR|		\
+	     RADEON_CP_VC_FRMT_PKSPEC|		\
+	     RADEON_CP_VC_FRMT_ST0|		\
+	     RADEON_CP_VC_FRMT_ST1|		\
+	     RADEON_CP_VC_FRMT_ST2)
+#define TAG(x) x##_rgba_spec_st_st_st
+#include "radeon_maos_vbtmp.h"
+
+#define IDX 15
+#define IND (RADEON_CP_VC_FRMT_XY|		\
+	     RADEON_CP_VC_FRMT_Z|		\
+	     RADEON_CP_VC_FRMT_ST0|		\
+	     RADEON_CP_VC_FRMT_ST1|		\
+	     RADEON_CP_VC_FRMT_ST2|		\
+	     RADEON_CP_VC_FRMT_N0)
+#define TAG(x) x##_st_st_st_n
+#include "radeon_maos_vbtmp.h"
+
+#define IDX 16
+#define IND (RADEON_CP_VC_FRMT_XY|		\
+	     RADEON_CP_VC_FRMT_Z|		\
+	     RADEON_CP_VC_FRMT_PKCOLOR|		\
+	     RADEON_CP_VC_FRMT_PKSPEC|		\
+	     RADEON_CP_VC_FRMT_ST0|		\
+	     RADEON_CP_VC_FRMT_ST1|		\
+	     RADEON_CP_VC_FRMT_ST2|		\
+	     RADEON_CP_VC_FRMT_N0)
+#define TAG(x) x##_rgba_spec_st_st_st_n
+#include "radeon_maos_vbtmp.h"
+
+#define IDX 17
+#define IND (RADEON_CP_VC_FRMT_XY|		\
+	     RADEON_CP_VC_FRMT_Z|		\
+	     RADEON_CP_VC_FRMT_PKCOLOR|		\
+	     RADEON_CP_VC_FRMT_ST0|		\
+	     RADEON_CP_VC_FRMT_Q0|		\
+	     RADEON_CP_VC_FRMT_ST1|		\
+	     RADEON_CP_VC_FRMT_Q1|		\
+	     RADEON_CP_VC_FRMT_ST2|		\
+	     RADEON_CP_VC_FRMT_Q2)
+#define TAG(x) x##_rgba_stq_stq_stq
+#include "radeon_maos_vbtmp.h"
+
+#define IDX 18
+#define IND (RADEON_CP_VC_FRMT_XY|		\
+	     RADEON_CP_VC_FRMT_Z|		\
+	     RADEON_CP_VC_FRMT_W0|		\
+	     RADEON_CP_VC_FRMT_PKCOLOR|		\
+	     RADEON_CP_VC_FRMT_PKSPEC|		\
+	     RADEON_CP_VC_FRMT_ST0|		\
+	     RADEON_CP_VC_FRMT_Q0|		\
+	     RADEON_CP_VC_FRMT_ST1|		\
+	     RADEON_CP_VC_FRMT_Q1|		\
+	     RADEON_CP_VC_FRMT_ST2|		\
+	     RADEON_CP_VC_FRMT_Q2|		\
+	     RADEON_CP_VC_FRMT_N0)
+#define TAG(x) x##_w_rgba_spec_stq_stq_stq_n
+#include "radeon_maos_vbtmp.h"
+
+
+
+
+/***********************************************************************
+ *                         Initialization 
+ ***********************************************************************/
+
+/* Fill setup_tab[]; every init_*() writes only its own setup_tab[IDX]. */
+static void init_tcl_verts( void )
+{
+   init_rgba();                         /* IDX  0 */
+   init_n();                            /* IDX  1 */
+   init_rgba_st();                      /* IDX  2 */
+   init_rgba_n();                       /* IDX  3 */
+   init_st_n();                         /* IDX  4 */
+   init_rgba_st_st();                   /* IDX  5 */
+   init_rgba_st_n();                    /* IDX  6 */
+   init_rgba_spec_st_st();              /* IDX  7 */
+   init_st_st_n();                      /* IDX  8 */
+   init_rgba_spec_st_st_n();            /* IDX  9 */
+   init_rgba_stq();                     /* IDX 10 */
+   init_rgba_stq_stq();                 /* IDX 11 */
+   init_w_rgba_spec_stq_stq_n();        /* IDX 12 */
+   init_rgba_st_st_st();                /* IDX 13 */
+   init_rgba_spec_st_st_st();           /* IDX 14 */
+   init_st_st_st_n();                   /* IDX 15 */
+   init_rgba_spec_st_st_st_n();         /* IDX 16 */
+   init_rgba_stq_stq_stq();             /* IDX 17 */
+   init_w_rgba_spec_stq_stq_stq_n();    /* IDX 18 */
+}
+
+/* Emit the arrays in inputs as one interleaved buffer (first fitting layout). */
+void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
+   GLuint req = 0;
+   GLuint unit;
+   GLuint vtx = (rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] &
+		 ~(RADEON_TCL_VTX_Q0|RADEON_TCL_VTX_Q1|RADEON_TCL_VTX_Q2));
+   int i;
+   static int firsttime = 1;
+
+   if (firsttime) {
+      init_tcl_verts();
+      firsttime = 0;
+   }
+   /* Z is always requested; W0 only when positions have 4 components. */
+   if (1) {
+      req |= RADEON_CP_VC_FRMT_Z;
+      if (VB->ObjPtr->size == 4) {
+	 req |= RADEON_CP_VC_FRMT_W0;
+      }
+   }
+
+   if (inputs & VERT_BIT_NORMAL) {
+      req |= RADEON_CP_VC_FRMT_N0;
+   }
+
+   if (inputs & VERT_BIT_COLOR0) {
+      req |= RADEON_CP_VC_FRMT_PKCOLOR;
+   }
+
+   if (inputs & (VERT_BIT_COLOR1|VERT_BIT_FOG)) {
+      req |= RADEON_CP_VC_FRMT_PKSPEC;
+   }
+
+   for (unit = 0; unit < ctx->Const.MaxTextureUnits; unit++) {
+      if (inputs & VERT_BIT_TEX(unit)) {
+	 req |= RADEON_ST_BIT(unit);
+	 /* assume we need the 3rd coord if texgen is active for r/q OR at least
+	    3 coords are submitted. This may not be 100% correct */
+	 if (VB->TexCoordPtr[unit]->size >= 3) {
+	    req |= RADEON_Q_BIT(unit);
+	    vtx |= RADEON_Q_BIT(unit);
+	 }
+	 if ( (ctx->Texture.Unit[unit].TexGenEnabled & (R_BIT | Q_BIT)) )
+	    vtx |= RADEON_Q_BIT(unit);
+	 else if ((VB->TexCoordPtr[unit]->size >= 3) &&
+	          ((ctx->Texture.Unit[unit]._ReallyEnabled & (TEXTURE_CUBE_BIT)) == 0)) {
+	    GLuint swaptexmatcol = (VB->TexCoordPtr[unit]->size - 3);
+	    if (((rmesa->NeedTexMatrix >> unit) & 1) &&
+		 (swaptexmatcol != ((rmesa->TexMatColSwap >> unit) & 1)))
+	       radeonUploadTexMatrix( rmesa, unit, swaptexmatcol ) ;
+	 }
+      }
+   }
+
+   if (vtx != rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT]) {
+      RADEON_STATECHANGE( rmesa, tcl );
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] = vtx;
+   }
+   /* First match wins.  NOTE(review): a miss (i == MAX) is not handled. */
+   for (i = 0 ; i < RADEON_TCL_MAX_SETUP ; i++) 
+      if ((setup_tab[i].vertex_format & req) == req) 
+	 break;
+   /* Layout unchanged and a buffer is still held: keep it. */
+   if (rmesa->tcl.vertex_format == setup_tab[i].vertex_format &&
+       rmesa->tcl.indexed_verts.buf)
+      return;
+
+   if (rmesa->tcl.indexed_verts.buf)
+      radeonReleaseArrays( ctx, ~0 );
+   /* vertex_size counts emit_union slots; "* 4" presumably gives bytes. */
+   radeonAllocDmaRegion( rmesa,
+			 &rmesa->tcl.indexed_verts, 
+			 VB->Count * setup_tab[i].vertex_size * 4, 
+			 4);
+
+   /* The vertex code expects Obj to be clean to element 3.  To fix
+    * this, add more vertex code (for obj-2, obj-3) or preferably move
+    * to maos.  
+    */
+   if (VB->ObjPtr->size < 3 || 
+       (VB->ObjPtr->size == 3 && 
+	(setup_tab[i].vertex_format & RADEON_CP_VC_FRMT_W0))) {
+
+      _math_trans_4f( rmesa->tcl.ObjClean.data,
+		      VB->ObjPtr->data,
+		      VB->ObjPtr->stride,
+		      GL_FLOAT,
+		      VB->ObjPtr->size,
+		      0,
+		      VB->Count );
+      /* No breaks: each case falls through to clean the later elements. */
+      switch (VB->ObjPtr->size) {
+      case 1:
+	    _mesa_vector4f_clean_elem(&rmesa->tcl.ObjClean, VB->Count, 1);
+      case 2:
+	    _mesa_vector4f_clean_elem(&rmesa->tcl.ObjClean, VB->Count, 2);
+      case 3:
+	 if (setup_tab[i].vertex_format & RADEON_CP_VC_FRMT_W0) {
+	    _mesa_vector4f_clean_elem(&rmesa->tcl.ObjClean, VB->Count, 3);
+	 }
+      case 4:
+      default:
+	 break;
+      }
+
+      VB->ObjPtr = &rmesa->tcl.ObjClean;
+   }
+
+   /* Write every vertex [0, VB->Count) into the new region. */
+
+   setup_tab[i].emit( ctx, 0, VB->Count, 
+		      rmesa->tcl.indexed_verts.address + 
+		      rmesa->tcl.indexed_verts.start );
+
+   rmesa->tcl.vertex_format = setup_tab[i].vertex_format;
+   rmesa->tcl.indexed_verts.aos_start = GET_START( &rmesa->tcl.indexed_verts );
+   rmesa->tcl.indexed_verts.aos_size = setup_tab[i].vertex_size;
+   rmesa->tcl.indexed_verts.aos_stride = setup_tab[i].vertex_size;
+
+   rmesa->tcl.aos_components[0] = &rmesa->tcl.indexed_verts;
+   rmesa->tcl.nr_aos_components = 1;
+}
+
+
+
+void radeonReleaseArrays( GLcontext *ctx, GLuint newinputs )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+#if 0
+   if (RADEON_DEBUG & DEBUG_VERTS)
+      _tnl_print_vert_flags( __FUNCTION__, newinputs );
+#endif
+   /* All arrays share indexed_verts, so any set bit releases it. */
+   if (newinputs == 0)
+      return;
+   radeonReleaseDmaRegion( rmesa, &rmesa->tcl.indexed_verts, __FUNCTION__ );
+}
diff --git a/radeon/radeon_sanity.c b/radeon/radeon_sanity.c
new file mode 100644
index 0000000..5570577
--- /dev/null
+++ b/radeon/radeon_sanity.c
@@ -0,0 +1,1082 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_sanity.c,v 1.1 2002/10/30 12:51:55 alanh Exp $ */
+/**************************************************************************
+
+Copyright 2002 ATI Technologies Inc., Ontario, Canada, and
+                     Tungsten Graphics Inc, Cedar Park, TX.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ATI, TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *
+ */
+#include <errno.h> 
+
+#include "glheader.h"
+
+#include "radeon_context.h"
+#include "radeon_ioctl.h"
+#include "radeon_sanity.h"
+
+/* Set this to '1' to get more verbiage.
+ */
+#define MORE_VERBOSE 1
+
+#if MORE_VERBOSE
+#define VERBOSE (RADEON_DEBUG & DEBUG_VERBOSE)
+#define NORMAL  (1)
+#else
+#define VERBOSE 0
+#define NORMAL  (RADEON_DEBUG & DEBUG_VERBOSE)
+#endif
+
+
+/* New (1.3) state mechanism.  3 commands (packet, scalar, vector) in
+ * 1.3 cmdbuffers allow all previous state to be updated as well as
+ * the tcl scalar and vector areas.  One row per state packet below;
+ * NOTE(review): the R200_* rows have start 0 - presumably unused here. */
+static struct { 
+   int start; 		/* presumably the first register of the packet */
+   int len; 		/* presumably the register count - confirm */
+   const char *name;	/* printable packet name */
+} packet[RADEON_MAX_STATE_PACKETS] = {
+   { RADEON_PP_MISC,7,"RADEON_PP_MISC" },
+   { RADEON_PP_CNTL,3,"RADEON_PP_CNTL" },
+   { RADEON_RB3D_COLORPITCH,1,"RADEON_RB3D_COLORPITCH" },
+   { RADEON_RE_LINE_PATTERN,2,"RADEON_RE_LINE_PATTERN" },
+   { RADEON_SE_LINE_WIDTH,1,"RADEON_SE_LINE_WIDTH" },
+   { RADEON_PP_LUM_MATRIX,1,"RADEON_PP_LUM_MATRIX" },
+   { RADEON_PP_ROT_MATRIX_0,2,"RADEON_PP_ROT_MATRIX_0" },
+   { RADEON_RB3D_STENCILREFMASK,3,"RADEON_RB3D_STENCILREFMASK" },
+   { RADEON_SE_VPORT_XSCALE,6,"RADEON_SE_VPORT_XSCALE" },
+   { RADEON_SE_CNTL,2,"RADEON_SE_CNTL" },
+   { RADEON_SE_CNTL_STATUS,1,"RADEON_SE_CNTL_STATUS" },
+   { RADEON_RE_MISC,1,"RADEON_RE_MISC" },
+   { RADEON_PP_TXFILTER_0,6,"RADEON_PP_TXFILTER_0" },
+   { RADEON_PP_BORDER_COLOR_0,1,"RADEON_PP_BORDER_COLOR_0" },
+   { RADEON_PP_TXFILTER_1,6,"RADEON_PP_TXFILTER_1" },
+   { RADEON_PP_BORDER_COLOR_1,1,"RADEON_PP_BORDER_COLOR_1" },
+   { RADEON_PP_TXFILTER_2,6,"RADEON_PP_TXFILTER_2" },
+   { RADEON_PP_BORDER_COLOR_2,1,"RADEON_PP_BORDER_COLOR_2" },
+   { RADEON_SE_ZBIAS_FACTOR,2,"RADEON_SE_ZBIAS_FACTOR" },
+   { RADEON_SE_TCL_OUTPUT_VTX_FMT,11,"RADEON_SE_TCL_OUTPUT_VTX_FMT" },
+   { RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED,17,"RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED" },
+	{ 0, 4, "R200_PP_TXCBLEND_0" },
+	{ 0, 4, "R200_PP_TXCBLEND_1" },
+	{ 0, 4, "R200_PP_TXCBLEND_2" },
+	{ 0, 4, "R200_PP_TXCBLEND_3" },
+	{ 0, 4, "R200_PP_TXCBLEND_4" },
+	{ 0, 4, "R200_PP_TXCBLEND_5" },
+	{ 0, 4, "R200_PP_TXCBLEND_6" },
+	{ 0, 4, "R200_PP_TXCBLEND_7" },
+	{ 0, 6, "R200_SE_TCL_LIGHT_MODEL_CTL_0" },
+	{ 0, 6, "R200_PP_TFACTOR_0" },
+	{ 0, 4, "R200_SE_VTX_FMT_0" },
+	{ 0, 1, "R200_SE_VAP_CNTL" },
+	{ 0, 5, "R200_SE_TCL_MATRIX_SEL_0" },
+	{ 0, 5, "R200_SE_TCL_TEX_PROC_CTL_2" },
+	{ 0, 1, "R200_SE_TCL_UCP_VERT_BLEND_CTL" },
+	{ 0, 6, "R200_PP_TXFILTER_0" },
+	{ 0, 6, "R200_PP_TXFILTER_1" },
+	{ 0, 6, "R200_PP_TXFILTER_2" },
+	{ 0, 6, "R200_PP_TXFILTER_3" },
+	{ 0, 6, "R200_PP_TXFILTER_4" },
+	{ 0, 6, "R200_PP_TXFILTER_5" },
+	{ 0, 1, "R200_PP_TXOFFSET_0" },
+	{ 0, 1, "R200_PP_TXOFFSET_1" },
+	{ 0, 1, "R200_PP_TXOFFSET_2" },
+	{ 0, 1, "R200_PP_TXOFFSET_3" },
+	{ 0, 1, "R200_PP_TXOFFSET_4" },
+	{ 0, 1, "R200_PP_TXOFFSET_5" },
+	{ 0, 1, "R200_SE_VTE_CNTL" },
+	{ 0, 1, "R200_SE_TCL_OUTPUT_VTX_COMP_SEL" },
+	{ 0, 1, "R200_PP_TAM_DEBUG3" },
+	{ 0, 1, "R200_PP_CNTL_X" }, 
+	{ 0, 1, "R200_RB3D_DEPTHXY_OFFSET" }, 
+	{ 0, 1, "R200_RE_AUX_SCISSOR_CNTL" }, 
+	{ 0, 2, "R200_RE_SCISSOR_TL_0" }, 
+	{ 0, 2, "R200_RE_SCISSOR_TL_1" }, 
+	{ 0, 2, "R200_RE_SCISSOR_TL_2" }, 
+	{ 0, 1, "R200_SE_VAP_CNTL_STATUS" }, 
+	{ 0, 1, "R200_SE_VTX_STATE_CNTL" }, 
+	{ 0, 1, "R200_RE_POINTSIZE" }, 
+	{ 0, 4, "R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0" },
+	{ 0, 1, "R200_PP_CUBIC_FACES_0" }, /* 61 */
+	{ 0, 5, "R200_PP_CUBIC_OFFSET_F1_0" }, /* 62 */
+	{ 0, 1, "R200_PP_CUBIC_FACES_1" },
+	{ 0, 5, "R200_PP_CUBIC_OFFSET_F1_1" },
+	{ 0, 1, "R200_PP_CUBIC_FACES_2" },
+	{ 0, 5, "R200_PP_CUBIC_OFFSET_F1_2" },
+	{ 0, 1, "R200_PP_CUBIC_FACES_3" },
+	{ 0, 5, "R200_PP_CUBIC_OFFSET_F1_3" },
+	{ 0, 1, "R200_PP_CUBIC_FACES_4" },
+	{ 0, 5, "R200_PP_CUBIC_OFFSET_F1_4" },
+	{ 0, 1, "R200_PP_CUBIC_FACES_5" },
+	{ 0, 5, "R200_PP_CUBIC_OFFSET_F1_5" },
+   { RADEON_PP_TEX_SIZE_0, 2, "RADEON_PP_TEX_SIZE_0" },
+   { RADEON_PP_TEX_SIZE_1, 2, "RADEON_PP_TEX_SIZE_1" },
+   { RADEON_PP_TEX_SIZE_2, 2, "RADEON_PP_TEX_SIZE_2" },
+	{ 0, 3, "R200_RB3D_BLENDCOLOR" },
+	{ 0, 1, "R200_SE_TCL_POINT_SPRITE_CNTL" },
+   { RADEON_PP_CUBIC_FACES_0, 1, "RADEON_PP_CUBIC_FACES_0" },
+   { RADEON_PP_CUBIC_OFFSET_T0_0, 5, "RADEON_PP_CUBIC_OFFSET_T0_0" },
+   { RADEON_PP_CUBIC_FACES_1, 1, "RADEON_PP_CUBIC_FACES_1" },
+   { RADEON_PP_CUBIC_OFFSET_T1_0, 5, "RADEON_PP_CUBIC_OFFSET_T1_0" },
+   { RADEON_PP_CUBIC_FACES_2, 1, "RADEON_PP_CUBIC_FACES_2" },
+   { RADEON_PP_CUBIC_OFFSET_T2_0, 5, "RADEON_PP_CUBIC_OFFSET_T2_0" },
+   { 0, 2, "R200_PP_TRI_PERF" },
+   { 0, 32, "R200_PP_AFS_0"},   /* 85 */
+   { 0, 32, "R200_PP_AFS_1"},
+   { 0, 8, "R200_ATF_TFACTOR"},
+   { 0, 8, "R200_PP_TXCTLALL_0"},
+   { 0, 8, "R200_PP_TXCTLALL_1"},
+   { 0, 8, "R200_PP_TXCTLALL_2"},
+   { 0, 8, "R200_PP_TXCTLALL_3"},
+   { 0, 8, "R200_PP_TXCTLALL_4"},
+   { 0, 8, "R200_PP_TXCTLALL_5"},
+   { 0, 2, "R200_VAP_PVS_CNTL"},
+};
+
+struct reg_names {
+   int idx;		/* register value, e.g. RADEON_PP_MISC */
+   const char *name;	/* printable name for that register */
+};
+
+static struct reg_names reg_names[] = {
+   { RADEON_PP_MISC, "RADEON_PP_MISC" },
+   { RADEON_PP_FOG_COLOR, "RADEON_PP_FOG_COLOR" },
+   { RADEON_RE_SOLID_COLOR, "RADEON_RE_SOLID_COLOR" },
+   { RADEON_RB3D_BLENDCNTL, "RADEON_RB3D_BLENDCNTL" },
+   { RADEON_RB3D_DEPTHOFFSET, "RADEON_RB3D_DEPTHOFFSET" },
+   { RADEON_RB3D_DEPTHPITCH, "RADEON_RB3D_DEPTHPITCH" },
+   { RADEON_RB3D_ZSTENCILCNTL, "RADEON_RB3D_ZSTENCILCNTL" },
+   { RADEON_PP_CNTL, "RADEON_PP_CNTL" },
+   { RADEON_RB3D_CNTL, "RADEON_RB3D_CNTL" },
+   { RADEON_RB3D_COLOROFFSET, "RADEON_RB3D_COLOROFFSET" },
+   { RADEON_RB3D_COLORPITCH, "RADEON_RB3D_COLORPITCH" },
+   { RADEON_SE_CNTL, "RADEON_SE_CNTL" },
+   { RADEON_SE_COORD_FMT, "RADEON_SE_COORDFMT" },
+   { RADEON_SE_CNTL_STATUS, "RADEON_SE_CNTL_STATUS" },
+   { RADEON_RE_LINE_PATTERN, "RADEON_RE_LINE_PATTERN" },
+   { RADEON_RE_LINE_STATE, "RADEON_RE_LINE_STATE" },
+   { RADEON_SE_LINE_WIDTH, "RADEON_SE_LINE_WIDTH" },
+   { RADEON_RB3D_STENCILREFMASK, "RADEON_RB3D_STENCILREFMASK" },
+   { RADEON_RB3D_ROPCNTL, "RADEON_RB3D_ROPCNTL" },
+   { RADEON_RB3D_PLANEMASK, "RADEON_RB3D_PLANEMASK" },
+   { RADEON_SE_VPORT_XSCALE, "RADEON_SE_VPORT_XSCALE" },
+   { RADEON_SE_VPORT_XOFFSET, "RADEON_SE_VPORT_XOFFSET" },
+   { RADEON_SE_VPORT_YSCALE, "RADEON_SE_VPORT_YSCALE" },
+   { RADEON_SE_VPORT_YOFFSET, "RADEON_SE_VPORT_YOFFSET" },
+   { RADEON_SE_VPORT_ZSCALE, "RADEON_SE_VPORT_ZSCALE" },
+   { RADEON_SE_VPORT_ZOFFSET, "RADEON_SE_VPORT_ZOFFSET" },
+   { RADEON_RE_MISC, "RADEON_RE_MISC" },
+   { RADEON_PP_TXFILTER_0, "RADEON_PP_TXFILTER_0" },
+   { RADEON_PP_TXFILTER_1, "RADEON_PP_TXFILTER_1" },
+   { RADEON_PP_TXFILTER_2, "RADEON_PP_TXFILTER_2" },
+   { RADEON_PP_TXFORMAT_0, "RADEON_PP_TXFORMAT_0" },
+   { RADEON_PP_TXFORMAT_1, "RADEON_PP_TXFORMAT_1" },
+   { RADEON_PP_TXFORMAT_2, "RADEON_PP_TXFORMAT_2" },
+   { RADEON_PP_TXOFFSET_0, "RADEON_PP_TXOFFSET_0" },
+   { RADEON_PP_TXOFFSET_1, "RADEON_PP_TXOFFSET_1" },
+   { RADEON_PP_TXOFFSET_2, "RADEON_PP_TXOFFSET_2" },
+   { RADEON_PP_TXCBLEND_0, "RADEON_PP_TXCBLEND_0" },
+   { RADEON_PP_TXCBLEND_1, "RADEON_PP_TXCBLEND_1" },
+   { RADEON_PP_TXCBLEND_2, "RADEON_PP_TXCBLEND_2" },
+   { RADEON_PP_TXABLEND_0, "RADEON_PP_TXABLEND_0" },
+   { RADEON_PP_TXABLEND_1, "RADEON_PP_TXABLEND_1" },
+   { RADEON_PP_TXABLEND_2, "RADEON_PP_TXABLEND_2" },
+   { RADEON_PP_TFACTOR_0, "RADEON_PP_TFACTOR_0" },
+   { RADEON_PP_TFACTOR_1, "RADEON_PP_TFACTOR_1" },
+   { RADEON_PP_TFACTOR_2, "RADEON_PP_TFACTOR_2" },
+   { RADEON_PP_BORDER_COLOR_0, "RADEON_PP_BORDER_COLOR_0" },
+   { RADEON_PP_BORDER_COLOR_1, "RADEON_PP_BORDER_COLOR_1" },
+   { RADEON_PP_BORDER_COLOR_2, "RADEON_PP_BORDER_COLOR_2" },
+   { RADEON_SE_ZBIAS_FACTOR, "RADEON_SE_ZBIAS_FACTOR" },
+   { RADEON_SE_ZBIAS_CONSTANT, "RADEON_SE_ZBIAS_CONSTANT" },
+   { RADEON_SE_TCL_OUTPUT_VTX_FMT, "RADEON_SE_TCL_OUTPUT_VTXFMT" },
+   { RADEON_SE_TCL_OUTPUT_VTX_SEL, "RADEON_SE_TCL_OUTPUT_VTXSEL" },
+   { RADEON_SE_TCL_MATRIX_SELECT_0, "RADEON_SE_TCL_MATRIX_SELECT_0" },
+   { RADEON_SE_TCL_MATRIX_SELECT_1, "RADEON_SE_TCL_MATRIX_SELECT_1" },
+   { RADEON_SE_TCL_UCP_VERT_BLEND_CTL, "RADEON_SE_TCL_UCP_VERT_BLEND_CTL" },
+   { RADEON_SE_TCL_TEXTURE_PROC_CTL, "RADEON_SE_TCL_TEXTURE_PROC_CTL" },
+   { RADEON_SE_TCL_LIGHT_MODEL_CTL, "RADEON_SE_TCL_LIGHT_MODEL_CTL" },
+   { RADEON_SE_TCL_PER_LIGHT_CTL_0, "RADEON_SE_TCL_PER_LIGHT_CTL_0" },
+   { RADEON_SE_TCL_PER_LIGHT_CTL_1, "RADEON_SE_TCL_PER_LIGHT_CTL_1" },
+   { RADEON_SE_TCL_PER_LIGHT_CTL_2, "RADEON_SE_TCL_PER_LIGHT_CTL_2" },
+   { RADEON_SE_TCL_PER_LIGHT_CTL_3, "RADEON_SE_TCL_PER_LIGHT_CTL_3" },
+   { RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED, "RADEON_SE_TCL_EMMISSIVE_RED" },
+   { RADEON_SE_TCL_MATERIAL_EMMISSIVE_GREEN, "RADEON_SE_TCL_EMMISSIVE_GREEN" },
+   { RADEON_SE_TCL_MATERIAL_EMMISSIVE_BLUE, "RADEON_SE_TCL_EMMISSIVE_BLUE" },
+   { RADEON_SE_TCL_MATERIAL_EMMISSIVE_ALPHA, "RADEON_SE_TCL_EMMISSIVE_ALPHA" },
+   { RADEON_SE_TCL_MATERIAL_AMBIENT_RED, "RADEON_SE_TCL_AMBIENT_RED" },
+   { RADEON_SE_TCL_MATERIAL_AMBIENT_GREEN, "RADEON_SE_TCL_AMBIENT_GREEN" },
+   { RADEON_SE_TCL_MATERIAL_AMBIENT_BLUE, "RADEON_SE_TCL_AMBIENT_BLUE" },
+   { RADEON_SE_TCL_MATERIAL_AMBIENT_ALPHA, "RADEON_SE_TCL_AMBIENT_ALPHA" },
+   { RADEON_SE_TCL_MATERIAL_DIFFUSE_RED, "RADEON_SE_TCL_DIFFUSE_RED" },
+   { RADEON_SE_TCL_MATERIAL_DIFFUSE_GREEN, "RADEON_SE_TCL_DIFFUSE_GREEN" },
+   { RADEON_SE_TCL_MATERIAL_DIFFUSE_BLUE, "RADEON_SE_TCL_DIFFUSE_BLUE" },
+   { RADEON_SE_TCL_MATERIAL_DIFFUSE_ALPHA, "RADEON_SE_TCL_DIFFUSE_ALPHA" },
+   { RADEON_SE_TCL_MATERIAL_SPECULAR_RED, "RADEON_SE_TCL_SPECULAR_RED" },
+   { RADEON_SE_TCL_MATERIAL_SPECULAR_GREEN, "RADEON_SE_TCL_SPECULAR_GREEN" },
+   { RADEON_SE_TCL_MATERIAL_SPECULAR_BLUE, "RADEON_SE_TCL_SPECULAR_BLUE" },
+   { RADEON_SE_TCL_MATERIAL_SPECULAR_ALPHA, "RADEON_SE_TCL_SPECULAR_ALPHA" },
+   { RADEON_SE_TCL_SHININESS, "RADEON_SE_TCL_SHININESS" },
+   { RADEON_SE_COORD_FMT, "RADEON_SE_COORD_FMT" },
+   { RADEON_PP_TEX_SIZE_0, "RADEON_PP_TEX_SIZE_0" },
+   { RADEON_PP_TEX_SIZE_1, "RADEON_PP_TEX_SIZE_1" },
+   { RADEON_PP_TEX_SIZE_2, "RADEON_PP_TEX_SIZE_2" },
+   { RADEON_PP_TEX_SIZE_0+4, "RADEON_PP_TEX_PITCH_0" },
+   { RADEON_PP_TEX_SIZE_1+4, "RADEON_PP_TEX_PITCH_1" },
+   { RADEON_PP_TEX_SIZE_2+4, "RADEON_PP_TEX_PITCH_2" },
+   { RADEON_PP_CUBIC_FACES_0, "RADEON_PP_CUBIC_FACES_0" },
+   { RADEON_PP_CUBIC_FACES_1, "RADEON_PP_CUBIC_FACES_1" },
+   { RADEON_PP_CUBIC_FACES_2, "RADEON_PP_CUBIC_FACES_2" },
+   { RADEON_PP_CUBIC_OFFSET_T0_0, "RADEON_PP_CUBIC_OFFSET_T0_0" },
+   { RADEON_PP_CUBIC_OFFSET_T0_1, "RADEON_PP_CUBIC_OFFSET_T0_1" },
+   { RADEON_PP_CUBIC_OFFSET_T0_2, "RADEON_PP_CUBIC_OFFSET_T0_2" },
+   { RADEON_PP_CUBIC_OFFSET_T0_3, "RADEON_PP_CUBIC_OFFSET_T0_3" },
+   { RADEON_PP_CUBIC_OFFSET_T0_4, "RADEON_PP_CUBIC_OFFSET_T0_4" },
+   { RADEON_PP_CUBIC_OFFSET_T1_0, "RADEON_PP_CUBIC_OFFSET_T1_0" },
+   { RADEON_PP_CUBIC_OFFSET_T1_1, "RADEON_PP_CUBIC_OFFSET_T1_1" },
+   { RADEON_PP_CUBIC_OFFSET_T1_2, "RADEON_PP_CUBIC_OFFSET_T1_2" },
+   { RADEON_PP_CUBIC_OFFSET_T1_3, "RADEON_PP_CUBIC_OFFSET_T1_3" },
+   { RADEON_PP_CUBIC_OFFSET_T1_4, "RADEON_PP_CUBIC_OFFSET_T1_4" },
+   { RADEON_PP_CUBIC_OFFSET_T2_0, "RADEON_PP_CUBIC_OFFSET_T2_0" },
+   { RADEON_PP_CUBIC_OFFSET_T2_1, "RADEON_PP_CUBIC_OFFSET_T2_1" },
+   { RADEON_PP_CUBIC_OFFSET_T2_2, "RADEON_PP_CUBIC_OFFSET_T2_2" },
+   { RADEON_PP_CUBIC_OFFSET_T2_3, "RADEON_PP_CUBIC_OFFSET_T2_3" },
+   { RADEON_PP_CUBIC_OFFSET_T2_4, "RADEON_PP_CUBIC_OFFSET_T2_4" },
+};
+
+static struct reg_names scalar_names[] = {   /* base names for scalar slots; walked by init_regs() */
+   { RADEON_SS_LIGHT_DCD_ADDR, "LIGHT_DCD" },
+   { RADEON_SS_LIGHT_SPOT_EXPONENT_ADDR, "LIGHT_SPOT_EXPONENT" },
+   { RADEON_SS_LIGHT_SPOT_CUTOFF_ADDR, "LIGHT_SPOT_CUTOFF" },
+   { RADEON_SS_LIGHT_SPECULAR_THRESH_ADDR, "LIGHT_SPECULAR_THRESH" },
+   { RADEON_SS_LIGHT_RANGE_CUTOFF_ADDR, "LIGHT_RANGE_CUTOFF" },
+   { RADEON_SS_VERT_GUARD_CLIP_ADJ_ADDR, "VERT_GUARD_CLIP" },
+   { RADEON_SS_VERT_GUARD_DISCARD_ADJ_ADDR, "VERT_GUARD_DISCARD" },
+   { RADEON_SS_HORZ_GUARD_CLIP_ADJ_ADDR, "HORZ_GUARD_CLIP" },
+   { RADEON_SS_HORZ_GUARD_DISCARD_ADJ_ADDR, "HORZ_GUARD_DISCARD" },
+   { RADEON_SS_SHININESS, "SHININESS" },
+   { 1000, "" },   /* end marker: init_regs() peeks one entry ahead, and 1000 is past every scalars[] slot */
+};
+
+/* Base names for vector state.  Indices are multiplied by 4 (one slot per
+ * dword component) so that they look like normal (dword) registers. */
+static struct reg_names vector_names[] = {
+   { RADEON_VS_MATRIX_0_ADDR * 4, "MATRIX_0" },
+   { RADEON_VS_MATRIX_1_ADDR * 4, "MATRIX_1" },
+   { RADEON_VS_MATRIX_2_ADDR * 4, "MATRIX_2" },
+   { RADEON_VS_MATRIX_3_ADDR * 4, "MATRIX_3" },
+   { RADEON_VS_MATRIX_4_ADDR * 4, "MATRIX_4" },
+   { RADEON_VS_MATRIX_5_ADDR * 4, "MATRIX_5" },
+   { RADEON_VS_MATRIX_6_ADDR * 4, "MATRIX_6" },
+   { RADEON_VS_MATRIX_7_ADDR * 4, "MATRIX_7" },
+   { RADEON_VS_MATRIX_8_ADDR * 4, "MATRIX_8" },
+   { RADEON_VS_MATRIX_9_ADDR * 4, "MATRIX_9" },
+   { RADEON_VS_MATRIX_10_ADDR * 4, "MATRIX_10" },
+   { RADEON_VS_MATRIX_11_ADDR * 4, "MATRIX_11" },
+   { RADEON_VS_MATRIX_12_ADDR * 4, "MATRIX_12" },
+   { RADEON_VS_MATRIX_13_ADDR * 4, "MATRIX_13" },
+   { RADEON_VS_MATRIX_14_ADDR * 4, "MATRIX_14" },
+   { RADEON_VS_MATRIX_15_ADDR * 4, "MATRIX_15" },
+   { RADEON_VS_LIGHT_AMBIENT_ADDR * 4, "LIGHT_AMBIENT" },
+   { RADEON_VS_LIGHT_DIFFUSE_ADDR * 4, "LIGHT_DIFFUSE" },
+   { RADEON_VS_LIGHT_SPECULAR_ADDR * 4, "LIGHT_SPECULAR" },
+   { RADEON_VS_LIGHT_DIRPOS_ADDR * 4, "LIGHT_DIRPOS" },
+   { RADEON_VS_LIGHT_HWVSPOT_ADDR * 4, "LIGHT_HWVSPOT" },
+   { RADEON_VS_LIGHT_ATTENUATION_ADDR * 4, "LIGHT_ATTENUATION" },
+   { RADEON_VS_MATRIX_EYE2CLIP_ADDR * 4, "MATRIX_EYE2CLIP" },
+   { RADEON_VS_UCP_ADDR * 4, "UCP" },
+   { RADEON_VS_GLOBAL_AMBIENT_ADDR * 4, "GLOBAL_AMBIENT" },
+   { RADEON_VS_FOG_PARAM_ADDR * 4, "FOG_PARAM" },
+   { RADEON_VS_EYE_VECTOR_ADDR * 4, "EYE_VECTOR" },
+   { 1000, "" },   /* end marker: init_regs() peeks one entry ahead while walking this table */
+};
+
+union fi { float f; int i; };   /* one register value, viewed as float or as int */
+
+#define ISVEC   1   /* entry is one component of a vector (affects get_reg_name) */
+#define ISFLOAT 2   /* value is logged as a float instead of hex */
+#define TOUCHED 4   /* set by print_reg_assignment() once the entry has been written */
+
+struct reg {
+   int idx;                      /* register offset or table slot; -1 marks the table terminator */
+   struct reg_names *closest;    /* name-table entry this register is printed relative to */
+   int flags;                    /* ISVEC | ISFLOAT | TOUCHED */
+   union fi current;             /* last value assigned */
+   union fi *values;             /* distinct int values seen so far (find_or_add_value) */
+   int nvalues;                  /* entries used in values */
+   int nalloc;                   /* entries allocated in values */
+   float vmin, vmax;             /* float range seen so far (print_float_reg_assignment) */
+};
+
+
+static struct reg regs[Elements(reg_names)+1];   /* one per reg_names[] entry, plus terminator */
+static struct reg scalars[512+1];                /* one per scalar slot, plus terminator */
+static struct reg vectors[512*4+1];              /* one per vector component (vector*4 + j), plus terminator */
+
+static int total, total_changed, bufs;   /* assignments seen / assignments that changed a value / DMA discards */
+
+static void init_regs( void )
+{
+   struct reg_names *tmp;
+   int i;
+   /* One-time setup: give every shadow entry its index, base name and flags. */
+   for (i = 0 ; i < Elements(regs)-1 ; i++) {
+      regs[i].idx = reg_names[i].idx;
+      regs[i].closest = &reg_names[i];
+      regs[i].flags = 0;
+   }
+   /* tmp steps to the next scalar_names[] entry when slot i reaches its index. */
+   for (i = 0, tmp = scalar_names ; i < Elements(scalars) ; i++) {
+      if (tmp[1].idx == i) tmp++;
+      scalars[i].idx = i;
+      scalars[i].closest = tmp;
+      scalars[i].flags = ISFLOAT;
+   }
+   /* vector_names[] indices are already in dword-component units (address * 4). */
+   for (i = 0, tmp = vector_names ; i < Elements(vectors) ; i++) {
+      if (tmp[1].idx == i && tmp[1].name[0]) tmp++;   /* never step onto the "" end marker */
+      vectors[i].idx = i;
+      vectors[i].closest = tmp;
+      vectors[i].flags = ISFLOAT|ISVEC;
+   }
+   /* idx == -1 is the terminator lookup_reg() stops at. */
+   regs[Elements(regs)-1].idx = -1;
+   scalars[Elements(scalars)-1].idx = -1;
+   vectors[Elements(vectors)-1].idx = -1;
+}
+
+static int find_or_add_value( struct reg *reg, int val )
+{
+   int j;
+   /* Returns 1 if val was recorded before; otherwise records it and returns 0. */
+   for ( j = 0 ; j < reg->nvalues ; j++)
+      if ( val == reg->values[j].i )
+	 return 1;
+
+   if (j == reg->nalloc) {   /* list is full: grow it before appending */
+      int nalloc = (reg->nalloc + 5) * 2;
+      union fi *grown = (union fi *) realloc( reg->values, nalloc * sizeof(union fi) );
+      if (!grown) return 0;   /* out of memory: keep the old list, report val as new */
+      reg->values = grown;
+      reg->nalloc = nalloc;
+   }
+   reg->values[reg->nvalues++].i = val;
+   return 0;
+}
+
+static struct reg *lookup_reg( struct reg *tab, int reg )
+{
+   struct reg *r;
+   /* Linear scan of a table whose last entry has idx == -1. */
+   for (r = tab ; r->idx != -1 ; r++)
+      if (r->idx == reg)
+	 return r;
+
+   /* Not found: say so on stderr and hand NULL back to the caller. */
+   fprintf(stderr, "*** unknown reg 0x%x\n", reg);
+   return NULL;
+}
+
+
+static const char *get_reg_name( struct reg *reg )
+{
+   /* Returns a printable name: the table name on an exact match, otherwise
+    * "base+offset" (with "[component]" for vectors).  Non-exact names live
+    * in a static buffer that the next call overwrites. */
+   static char tmp[80];
+   int delta = reg->idx - reg->closest->idx;
+
+   if (delta == 0)
+      return reg->closest->name;
+
+   if (reg->flags & ISVEC) {
+      /* vector_names[] indices are in dword units (address * 4), so the
+       * distance in whole vectors is delta / 4 and the component is idx % 4.
+       */
+      if (delta / 4 != 0)
+	 snprintf(tmp, sizeof(tmp), "%s+%d[%d]",
+		  reg->closest->name, delta / 4, reg->idx % 4);
+      else
+	 snprintf(tmp, sizeof(tmp), "%s[%d]", reg->closest->name, reg->idx % 4);
+   }
+   else
+      snprintf(tmp, sizeof(tmp), "%s+%d", reg->closest->name, delta);
+
+   return tmp;
+}
+
+static int print_int_reg_assignment( struct reg *reg, int data )
+{
+   /* Logs one integer register write; returns nonzero if the value changed. */
+   const int differs = (reg->current.i != data);
+   const int known = find_or_add_value( reg, data );   /* also records data */
+   const int show = VERBOSE || (NORMAL && (differs || !known));
+
+   if (show)
+      fprintf(stderr, "   %s <-- 0x%x", get_reg_name(reg), data);
+
+   if (NORMAL && !known)
+      fprintf(stderr, " *** BRAND NEW VALUE");
+   else if (NORMAL && differs)
+      fprintf(stderr, " *** CHANGED");
+
+   reg->current.i = data;
+
+   if (show)
+      fprintf(stderr, "\n");
+
+   return differs;
+}
+
+
+static int print_float_reg_assignment( struct reg *reg, float data )
+{
+   /* Logs one float register write and, in NORMAL mode, widens the recorded
+    * min/max range; returns nonzero if the value changed. */
+   const int differs = (reg->current.f != data);
+   const int below = (data < reg->vmin);
+   const int above = (data > reg->vmax);
+   const int show = VERBOSE || (NORMAL && (below || above || differs));
+
+   if (show)
+      fprintf(stderr, "   %s <-- %.3f", get_reg_name(reg), data);
+
+   if (NORMAL && below) {
+      fprintf(stderr, " *** NEW MIN (prev %.3f)", reg->vmin);
+      reg->vmin = data;
+   }
+   else if (NORMAL && above) {
+      fprintf(stderr, " *** NEW MAX (prev %.3f)", reg->vmax);
+      reg->vmax = data;
+   }
+   else if (NORMAL && differs)
+      fprintf(stderr, " *** CHANGED");
+
+   reg->current.f = data;
+
+   if (show)
+      fprintf(stderr, "\n");
+
+   return differs;
+}
+
+static int print_reg_assignment( struct reg *reg, int data )
+{
+   /* Marks reg as written and logs the write as float or int per reg->flags. */
+   float_ui32_type datau;
+   if (!reg) return 0;   /* lookup_reg() found nothing and already reported it */
+   datau.ui32 = data;    /* same bits, read back as a float through datau.f */
+   reg->flags |= TOUCHED;
+   return (reg->flags & ISFLOAT) ? print_float_reg_assignment( reg, datau.f )
+				 : print_int_reg_assignment( reg, data );
+}
+
+static void print_reg( struct reg *reg )
+{
+   /* Prints reg's current value, but only if it has ever been written. */
+   if (!(reg->flags & TOUCHED))
+      return;
+   if (reg->flags & ISFLOAT)
+      fprintf(stderr, "   %s == %f\n", get_reg_name(reg), reg->current.f);
+   else
+      fprintf(stderr, "   %s == 0x%x\n", get_reg_name(reg), reg->current.i);
+}
+
+
+static void dump_state( void )
+{
+   struct reg *r;
+   /* Print all three shadow tables; print_reg() skips untouched entries. */
+   for (r = regs ; r < regs + Elements(regs) ; r++)
+      print_reg( r );
+
+   for (r = scalars ; r < scalars + Elements(scalars) ; r++)
+      print_reg( r );
+
+   for (r = vectors ; r < vectors + Elements(vectors) ; r++)
+      print_reg( r );
+}
+
+
+
+static int radeon_emit_packets( 
+   drm_radeon_cmd_header_t header,
+   drm_radeon_cmd_buffer_t *cmdbuf )
+{
+   /* Decode one state packet: take its register range from packet[] and log
+    * every payload dword as a write to the matching register.
+    * NOTE(review): packet_id indexes packet[] without a range check here.
+    */
+   const int id = (int)header.packet.packet_id;
+   const int ndwords = packet[id].len;
+   int *words = (int *)cmdbuf->buf;
+   int n;
+
+   if (ndwords * sizeof(int) > cmdbuf->bufsz) {
+      fprintf(stderr, "Packet overflows cmdbuf\n");
+      return -EINVAL;
+   }
+   if (!packet[id].name) {
+      fprintf(stderr, "*** Unknown packet 0 nr %d\n", id );
+      return -EINVAL;
+   }
+   if (VERBOSE)
+      fprintf(stderr, "Packet 0 reg %s nr %d\n", packet[id].name, ndwords );
+
+   for ( n = 0 ; n < ndwords ; n++) {
+      struct reg *reg = lookup_reg( regs, packet[id].start + n*4 );
+      total_changed += print_reg_assignment( reg, words[n] ) ? 1 : 0;
+      total++;
+   }
+
+   cmdbuf->buf += ndwords * sizeof(int);
+   cmdbuf->bufsz -= ndwords * sizeof(int);
+   return 0;
+}
+
+
+static int radeon_emit_scalars( 
+   drm_radeon_cmd_header_t header,
+   drm_radeon_cmd_buffer_t *cmdbuf )
+{
+   int sz = header.scalars.count, i;
+   int *data = (int *)cmdbuf->buf;
+   int start = header.scalars.offset, stride = header.scalars.stride;
+
+   /* Logs sz scalar writes from slot start; returns 0, or -EINVAL if the buffer is too short. */
+   if (sz * (int)sizeof(int) > cmdbuf->bufsz) {
+      fprintf(stderr, "Scalars overflow cmdbuf\n");
+      return -EINVAL;
+   }
+   if (VERBOSE)
+      fprintf(stderr, "emit scalars, start %d stride %d nr %d (end %d)\n",
+	      start, stride, sz, start + stride * sz);
+   for (i = 0 ; i < sz ; i++, start += stride) {
+      struct reg *reg = lookup_reg( scalars, start );
+      if (print_reg_assignment( reg, data[i] ))
+	 total_changed++;
+      total++;
+   }
+   cmdbuf->buf += sz * sizeof(int);
+   cmdbuf->bufsz -= sz * sizeof(int);
+   return 0;
+}
+
+
+static int radeon_emit_scalars2( 
+   drm_radeon_cmd_header_t header,
+   drm_radeon_cmd_buffer_t *cmdbuf )
+{
+   int sz = header.scalars.count, i;
+   int *data = (int *)cmdbuf->buf;
+   int start = header.scalars.offset + 0x100, stride = header.scalars.stride;
+
+   /* Like radeon_emit_scalars() with slots offset by 0x100; returns 0, -1 or -EINVAL. */
+   if (sz * (int)sizeof(int) > cmdbuf->bufsz) {
+      fprintf(stderr, "Scalars2 overflow cmdbuf\n");
+      return -EINVAL;
+   }
+   if (VERBOSE)
+      fprintf(stderr, "emit scalars2, start %d stride %d nr %d (end %d)\n",
+	      start, stride, sz, start + stride * sz);
+   if (start + stride * sz > 257) {   /* NOTE(review): start >= 0x100, so nearly every emit trips this; confirm bound */
+      fprintf(stderr, "emit scalars OVERFLOW %d/%d/%d\n", start, stride, sz);
+      return -1;
+   }
+   for (i = 0 ; i < sz ; i++, start += stride) {
+      struct reg *reg = lookup_reg( scalars, start );
+      if (print_reg_assignment( reg, data[i] ))
+	 total_changed++;
+      total++;
+   }
+   cmdbuf->buf += sz * sizeof(int);
+   cmdbuf->bufsz -= sz * sizeof(int);
+   return 0;
+}
+
+/* Log a vector upload: four dwords per vector, stepping start by stride.
+ * Still unchecked: inf/nan/extreme values; table start, end, nr, etc.
+ */
+static int radeon_emit_vectors( 
+   drm_radeon_cmd_header_t header,
+   drm_radeon_cmd_buffer_t *cmdbuf )
+{
+   int sz = header.vectors.count;
+   int *data = (int *)cmdbuf->buf;
+   int start = header.vectors.offset, stride = header.vectors.stride;
+   int i,j;
+
+   if (((sz + 3) & ~3) * (int)sizeof(int) > cmdbuf->bufsz) {   /* loop below reads whole groups of 4 */
+      fprintf(stderr, "Vectors overflow cmdbuf\n");
+      return -EINVAL;
+   }
+   if (VERBOSE)
+      fprintf(stderr, "emit vectors, start %d stride %d nr %d (end %d) (0x%x)\n",
+	      start, stride, sz, start + stride * sz, header.i);
+/*    if (start + stride * (sz/4) > 128) { */
+/*       fprintf(stderr, "emit vectors OVERFLOW %d/%d/%d\n", start, stride, sz); */
+/*       return -1; */
+/*    } */
+
+   for (i = 0 ; i < sz ;  start += stride) {
+      int changed = 0;
+      for (j = 0 ; j < 4 ; i++,j++) {
+	 struct reg *reg = lookup_reg( vectors, start*4+j );
+	 if (print_reg_assignment( reg, data[i] ))
+	    changed = 1;
+      }
+      if (changed)
+	 total_changed += 4;
+      total += 4;
+   }
+   cmdbuf->buf += sz * sizeof(int);
+   cmdbuf->bufsz -= sz * sizeof(int);
+   return 0;
+}
+
+/* Logs the components selected by the RADEON_CP_VC_FRMT_* bits of vfmt (NORMAL mode only); always returns 0. */
+static int print_vertex_format( int vfmt )
+{
+   if (NORMAL) {
+      fprintf(stderr, "   %s(%x): %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+	      "vertex format",
+	      vfmt,
+	      "xy,",
+	      (vfmt & RADEON_CP_VC_FRMT_Z) ? "z," : "",
+	      (vfmt & RADEON_CP_VC_FRMT_W0) ? "w0," : "",
+	      (vfmt & RADEON_CP_VC_FRMT_FPCOLOR) ? "fpcolor," : "",
+	      (vfmt & RADEON_CP_VC_FRMT_FPALPHA) ? "fpalpha," : "",
+	      (vfmt & RADEON_CP_VC_FRMT_PKCOLOR) ? "pkcolor," : "",
+	      (vfmt & RADEON_CP_VC_FRMT_FPSPEC) ? "fpspec," : "",
+	      (vfmt & RADEON_CP_VC_FRMT_FPFOG) ? "fpfog," : "",
+	      (vfmt & RADEON_CP_VC_FRMT_PKSPEC) ? "pkspec," : "",
+	      (vfmt & RADEON_CP_VC_FRMT_ST0) ? "st0," : "",
+	      (vfmt & RADEON_CP_VC_FRMT_ST1) ? "st1," : "",
+	      (vfmt & RADEON_CP_VC_FRMT_Q1) ? "q1," : "",
+	      (vfmt & RADEON_CP_VC_FRMT_ST2) ? "st2," : "",
+	      (vfmt & RADEON_CP_VC_FRMT_Q2) ? "q2," : "",
+	      (vfmt & RADEON_CP_VC_FRMT_ST3) ? "st3," : "",
+	      (vfmt & RADEON_CP_VC_FRMT_Q3) ? "q3," : "",
+	      (vfmt & RADEON_CP_VC_FRMT_Q0) ? "q0," : "",
+	      (vfmt & RADEON_CP_VC_FRMT_N0) ? "n0," : "",
+	      (vfmt & RADEON_CP_VC_FRMT_XY1) ? "xy1," : "",
+	      (vfmt & RADEON_CP_VC_FRMT_Z1) ? "z1," : "",
+	      (vfmt & RADEON_CP_VC_FRMT_W1) ? "w1," : "",
+	      (vfmt & RADEON_CP_VC_FRMT_N1) ? "n1," : "");
+
+   
+/*       if (!find_or_add_value( &others[V_VTXFMT], vfmt )) */
+/* 	 fprintf(stderr, " *** NEW VALUE"); */
+
+      fprintf(stderr, "\n");
+   }
+
+   return 0;
+}
+
+static char *primname[0xf] = {   /* indexed by prim & 0xf in print_prim_and_flags(); only the first 11 entries are named */
+   "NONE",
+   "POINTS",
+   "LINES",
+   "LINE_STRIP",
+   "TRIANGLES",
+   "TRIANGLE_FAN",
+   "TRIANGLE_STRIP",
+   "TRI_TYPE_2",
+   "RECT_LIST",
+   "3VRT_POINTS",
+   "3VRT_LINES",
+};
+
+static int print_prim_and_flags( int prim )
+{
+   int numverts;
+   /* Decodes and logs a VC_CNTL dword; returns -1 if the type or vertex count is bad, else 0. */
+   if (NORMAL)
+      fprintf(stderr, "   %s(%x): %s%s%s%s%s%s%s\n",
+	      "prim flags",
+	      prim,
+	      ((prim & 0x30) == RADEON_CP_VC_CNTL_PRIM_WALK_IND) ? "IND," : "",
+	      ((prim & 0x30) == RADEON_CP_VC_CNTL_PRIM_WALK_LIST) ? "LIST," : "",
+	      ((prim & 0x30) == RADEON_CP_VC_CNTL_PRIM_WALK_RING) ? "RING," : "",
+	      (prim & RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA) ? "RGBA," : "BGRA, ",
+	      (prim & RADEON_CP_VC_CNTL_MAOS_ENABLE) ? "MAOS," : "",
+	      (prim & RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE) ? "RADEON," : "",
+	      (prim & RADEON_CP_VC_CNTL_TCL_ENABLE) ? "TCL," : "");
+
+   if ((prim & 0xf) > RADEON_CP_VC_CNTL_PRIM_TYPE_3VRT_LINE_LIST) {
+      fprintf(stderr, "   *** Bad primitive: %x\n", prim & 0xf);
+      return -1;
+   }
+
+   numverts = (prim >> 16) & 0xffff;   /* upper 16 bits; the mask undoes sign extension of a negative int */
+   
+   if (NORMAL)
+      fprintf(stderr, "   prim: %s numverts %d\n", primname[prim&0xf], numverts);
+
+   switch (prim & 0xf) {
+   case RADEON_CP_VC_CNTL_PRIM_TYPE_NONE:
+   case RADEON_CP_VC_CNTL_PRIM_TYPE_POINT:
+      if (numverts < 1) {
+	 fprintf(stderr, "Bad nr verts for point %d\n", numverts);
+	 return -1;
+      }
+      break;
+   case RADEON_CP_VC_CNTL_PRIM_TYPE_LINE:
+      if ((numverts & 1) || numverts == 0) {
+	 fprintf(stderr, "Bad nr verts for line %d\n", numverts);
+	 return -1;
+      }
+      break;
+   case RADEON_CP_VC_CNTL_PRIM_TYPE_LINE_STRIP:
+      if (numverts < 2) {
+	 fprintf(stderr, "Bad nr verts for line_strip %d\n", numverts);
+	 return -1;
+      }
+      break;
+   case RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST:
+   case RADEON_CP_VC_CNTL_PRIM_TYPE_3VRT_POINT_LIST:
+   case RADEON_CP_VC_CNTL_PRIM_TYPE_3VRT_LINE_LIST:
+   case RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST:
+      if (numverts % 3 || numverts == 0) {
+	 fprintf(stderr, "Bad nr verts for tri %d\n", numverts);
+	 return -1;
+      }
+      break;
+   case RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_FAN:
+   case RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_STRIP:
+      if (numverts < 3) {
+	 fprintf(stderr, "Bad nr verts for strip/fan %d\n", numverts);
+	 return -1;
+      }
+      break;
+   default:
+      fprintf(stderr, "Bad primitive\n");
+      return -1;
+   }
+   return 0;
+}
+
+/* Decode one type-3 packet at the head of cmdbuf, log what is known about it
+ * and step past it.  Returns 0, or -EINVAL for a malformed packet. */
+static int radeon_emit_packet3( drm_radeon_cmd_buffer_t *cmdbuf )
+{
+   int cmdsz;
+   int *cmd = (int *)cmdbuf->buf;
+   int *tmp;
+   int i, stride, size, start;
+   cmdsz = (cmdbuf->bufsz < 4) ? 0 :
+      2 + ((cmd[0] & RADEON_CP_PACKET_COUNT_MASK) >> 16);
+   if (cmdsz == 0 ||   /* no header dword left: tested before cmd[0] is examined */
+       (cmd[0] & RADEON_CP_PACKET_MASK) != RADEON_CP_PACKET3 ||
+       cmdsz * 4 > cmdbuf->bufsz ||
+       cmdsz > RADEON_CP_PACKET_MAX_DWORDS) {
+      fprintf(stderr, "Bad packet\n");
+      return -EINVAL;
+   }
+
+   switch( cmd[0] & ~RADEON_CP_PACKET_COUNT_MASK ) {
+   case RADEON_CP_PACKET3_NOP:
+      if (NORMAL)
+	 fprintf(stderr, "PACKET3_NOP, %d dwords\n", cmdsz);
+      break;
+   case RADEON_CP_PACKET3_NEXT_CHAR:
+      if (NORMAL)
+	 fprintf(stderr, "PACKET3_NEXT_CHAR, %d dwords\n", cmdsz);
+      break;
+   case RADEON_CP_PACKET3_PLY_NEXTSCAN:
+      if (NORMAL)
+	 fprintf(stderr, "PACKET3_PLY_NEXTSCAN, %d dwords\n", cmdsz);
+      break;
+   case RADEON_CP_PACKET3_SET_SCISSORS:
+      if (NORMAL)
+	 fprintf(stderr, "PACKET3_SET_SCISSORS, %d dwords\n", cmdsz);
+      break;
+   case RADEON_CP_PACKET3_3D_RNDR_GEN_INDX_PRIM:
+      if (NORMAL)
+	 fprintf(stderr, "PACKET3_3D_RNDR_GEN_INDX_PRIM, %d dwords\n",
+	      cmdsz);
+      break;
+   case RADEON_CP_PACKET3_LOAD_MICROCODE:
+      if (NORMAL)
+	 fprintf(stderr, "PACKET3_LOAD_MICROCODE, %d dwords\n", cmdsz);
+      break;
+   case RADEON_CP_PACKET3_WAIT_FOR_IDLE:
+      if (NORMAL)
+	 fprintf(stderr, "PACKET3_WAIT_FOR_IDLE, %d dwords\n", cmdsz);
+      break;
+
+   case RADEON_CP_PACKET3_3D_DRAW_VBUF:
+      if (NORMAL)
+	 fprintf(stderr, "PACKET3_3D_DRAW_VBUF, %d dwords\n", cmdsz);
+      print_vertex_format(cmd[1]);
+      print_prim_and_flags(cmd[2]);
+      break;
+
+   case RADEON_CP_PACKET3_3D_DRAW_IMMD:
+      if (NORMAL)
+	 fprintf(stderr, "PACKET3_3D_DRAW_IMMD, %d dwords\n", cmdsz);
+      break;
+   case RADEON_CP_PACKET3_3D_DRAW_INDX: {
+      int neltdwords;
+      if (NORMAL)
+	 fprintf(stderr, "PACKET3_3D_DRAW_INDX, %d dwords\n", cmdsz);
+      print_vertex_format(cmd[1]);
+      print_prim_and_flags(cmd[2]);
+      neltdwords = (cmd[2] >> 16) & 0xffff;   /* element count; mask undoes sign extension */
+      neltdwords += neltdwords & 1;
+      neltdwords /= 2;
+      if (neltdwords + 3 != cmdsz)
+	 fprintf(stderr, "Mismatch in DRAW_INDX, %d vs cmdsz %d\n",
+		 neltdwords, cmdsz);
+      break;
+   }
+   case RADEON_CP_PACKET3_LOAD_PALETTE:
+      if (NORMAL)
+	 fprintf(stderr, "PACKET3_LOAD_PALETTE, %d dwords\n", cmdsz);
+      break;
+   case RADEON_CP_PACKET3_3D_LOAD_VBPNTR:
+      if (NORMAL) {
+	 fprintf(stderr, "PACKET3_3D_LOAD_VBPNTR, %d dwords\n", cmdsz);
+	 fprintf(stderr, "   nr arrays: %d\n", cmd[1]);
+      }
+
+      if (cmd[1]/2 + cmd[1]%2 != cmdsz - 3) {
+	 fprintf(stderr, "  ****** MISMATCH %d/%d *******\n",
+		 cmd[1]/2 + cmd[1]%2 + 3, cmdsz);
+	 return -EINVAL;
+      }
+
+      if (NORMAL) {
+	 tmp = cmd+2;
+	 for (i = 0 ; i < cmd[1] ; i++) {
+	    if (i & 1) {
+	       stride = (tmp[0]>>24) & 0xff;
+	       size = (tmp[0]>>16) & 0xff;
+	       start = tmp[2];
+	       tmp += 3;
+	    }
+	    else {
+	       stride = (tmp[0]>>8) & 0xff;
+	       size = (tmp[0]) & 0xff;
+	       start = tmp[1];
+	    }
+	    fprintf(stderr, "   array %d: start 0x%x vsize %d vstride %d\n",
+		    i, start, size, stride );
+	 }
+      }
+      break;
+   case RADEON_CP_PACKET3_CNTL_PAINT:
+      if (NORMAL)
+	 fprintf(stderr, "PACKET3_CNTL_PAINT, %d dwords\n", cmdsz);
+      break;
+   case RADEON_CP_PACKET3_CNTL_BITBLT:
+      if (NORMAL)
+	 fprintf(stderr, "PACKET3_CNTL_BITBLT, %d dwords\n", cmdsz);
+      break;
+   case RADEON_CP_PACKET3_CNTL_SMALLTEXT:
+      if (NORMAL)
+	 fprintf(stderr, "PACKET3_CNTL_SMALLTEXT, %d dwords\n", cmdsz);
+      break;
+   case RADEON_CP_PACKET3_CNTL_HOSTDATA_BLT:
+      if (NORMAL)
+	 fprintf(stderr, "PACKET3_CNTL_HOSTDATA_BLT, %d dwords\n", 
+	      cmdsz);
+      break;
+   case RADEON_CP_PACKET3_CNTL_POLYLINE:
+      if (NORMAL)
+	 fprintf(stderr, "PACKET3_CNTL_POLYLINE, %d dwords\n", cmdsz);
+      break;
+   case RADEON_CP_PACKET3_CNTL_POLYSCANLINES:
+      if (NORMAL)
+	 fprintf(stderr, "PACKET3_CNTL_POLYSCANLINES, %d dwords\n", 
+	      cmdsz);
+      break;
+   case RADEON_CP_PACKET3_CNTL_PAINT_MULTI:
+      if (NORMAL)
+	 fprintf(stderr, "PACKET3_CNTL_PAINT_MULTI, %d dwords\n", 
+	      cmdsz);
+      break;
+   case RADEON_CP_PACKET3_CNTL_BITBLT_MULTI:
+      if (NORMAL)
+	 fprintf(stderr, "PACKET3_CNTL_BITBLT_MULTI, %d dwords\n", 
+	      cmdsz);
+      break;
+   case RADEON_CP_PACKET3_CNTL_TRANS_BITBLT:
+      if (NORMAL)
+	 fprintf(stderr, "PACKET3_CNTL_TRANS_BITBLT, %d dwords\n", 
+	      cmdsz);
+      break;
+   default:
+      fprintf(stderr, "UNKNOWN PACKET, %d dwords\n", cmdsz);
+      break;
+   }
+      
+   cmdbuf->buf += cmdsz * 4;
+   cmdbuf->bufsz -= cmdsz * 4;
+   return 0;
+}
+
+
+/* Log the cliprects attached to the buffer (and, in VERBOSE mode, the state
+ * touched so far), then decode the packet via radeon_emit_packet3(). */
+static int radeon_emit_packet3_cliprect( drm_radeon_cmd_buffer_t *cmdbuf )
+{   
+   drm_clip_rect_t *boxes = cmdbuf->boxes;
+   int i;
+
+   if (VERBOSE) {
+      if (total_changed) {
+	 dump_state();
+	 total_changed = 0;
+      }
+      else
+	 fprintf(stderr, "total_changed zero\n");
+   }
+
+   if (NORMAL) {
+      for (i = 0 ; i < cmdbuf->nbox ; i++)
+	 fprintf(stderr, "Emit box %d/%d %d,%d %d,%d\n",
+		 i, cmdbuf->nbox,
+		 boxes[i].x1, boxes[i].y1, boxes[i].x2, boxes[i].y2);
+   }
+
+   /* NOTE(review): a lone cliprect is collapsed to nbox = 0; the reason is not visible here. */
+   if (cmdbuf->nbox == 1)
+      cmdbuf->nbox = 0;
+   return radeon_emit_packet3( cmdbuf );
+}
+
+/* Walk rmesa's pending command buffer, logging each command; returns 0, or -EINVAL on the first bad one. */
+int radeonSanityCmdBuffer( radeonContextPtr rmesa,
+			   int nbox,
+			   drm_clip_rect_t *boxes )
+{
+   int idx;
+   drm_radeon_cmd_buffer_t cmdbuf;
+   drm_radeon_cmd_header_t header;
+   static int inited = 0;
+
+   if (!inited) {
+      init_regs();
+      inited = 1;
+   }
+
+   cmdbuf.buf = rmesa->store.cmd_buf;
+   cmdbuf.bufsz = rmesa->store.cmd_used;
+   cmdbuf.boxes = boxes;
+   cmdbuf.nbox = nbox;
+
+   /* Signed compare: a negative remainder must end the walk, not wrap to a huge unsigned value. */
+   while ( cmdbuf.bufsz >= (int)sizeof(header) ) {
+      header.i = *(int *)cmdbuf.buf;
+      cmdbuf.buf += sizeof(header);
+      cmdbuf.bufsz -= sizeof(header);
+
+      switch (header.header.cmd_type) {
+      case RADEON_CMD_PACKET: 
+	 if (radeon_emit_packets( header, &cmdbuf )) {
+	    fprintf(stderr,"radeon_emit_packets failed\n");
+	    return -EINVAL;
+	 }
+	 break;
+
+      case RADEON_CMD_SCALARS:
+	 if (radeon_emit_scalars( header, &cmdbuf )) {
+	    fprintf(stderr,"radeon_emit_scalars failed\n");
+	    return -EINVAL;
+	 }
+	 break;
+
+      case RADEON_CMD_SCALARS2:
+	 if (radeon_emit_scalars2( header, &cmdbuf )) {
+	    fprintf(stderr,"radeon_emit_scalars2 failed\n");
+	    return -EINVAL;
+	 }
+	 break;
+
+      case RADEON_CMD_VECTORS:
+	 if (radeon_emit_vectors( header, &cmdbuf )) {
+	    fprintf(stderr,"radeon_emit_vectors failed\n");
+	    return -EINVAL;
+	 }
+	 break;
+
+      case RADEON_CMD_DMA_DISCARD:
+	 idx = header.dma.buf_idx;
+	 if (NORMAL)
+	    fprintf(stderr, "RADEON_CMD_DMA_DISCARD buf %d\n", idx);
+	 bufs++;
+	 break;
+
+      case RADEON_CMD_PACKET3:
+	 if (radeon_emit_packet3( &cmdbuf )) {
+	    fprintf(stderr,"radeon_emit_packet3 failed\n");
+	    return -EINVAL;
+	 }
+	 break;
+
+      case RADEON_CMD_PACKET3_CLIP:
+	 if (radeon_emit_packet3_cliprect( &cmdbuf )) {
+	    fprintf(stderr,"radeon_emit_packet3_clip failed\n");
+	    return -EINVAL;
+	 }
+	 break;
+
+      case RADEON_CMD_WAIT:
+	 break;
+
+      default:
+	 fprintf(stderr,"bad cmd_type %d at %p\n", 
+		   header.header.cmd_type,
+		   (void *)(cmdbuf.buf - sizeof(header)));
+	 return -EINVAL;
+      }
+   }
+
+   if (0)
+   {
+      static int n = 0;
+      n++;
+      if (n == 10) {
+	 fprintf(stderr, "Bufs %d Total emitted %d real changes %d (%.2f%%)\n",
+		 bufs,
+		 total, total_changed, 
+		 ((float)total_changed/(float)total*100.0));
+	 fprintf(stderr, "Total emitted per buf: %.2f\n",
+		 (float)total/(float)bufs);
+	 fprintf(stderr, "Real changes per buf: %.2f\n",
+		 (float)total_changed/(float)bufs);
+
+	 bufs = n = total = total_changed = 0;
+      }
+   }
+
+   return 0;
+}
diff --git a/radeon/radeon_sanity.h b/radeon/radeon_sanity.h
new file mode 100644
index 0000000..1ec06bc
--- /dev/null
+++ b/radeon/radeon_sanity.h
@@ -0,0 +1,8 @@
+#ifndef RADEON_SANITY_H
+#define RADEON_SANITY_H
+
+extern int radeonSanityCmdBuffer( radeonContextPtr rmesa,
+				  int nbox,
+				  drm_clip_rect_t *boxes );
+
+#endif
diff --git a/radeon/radeon_screen.c b/radeon/radeon_screen.c
new file mode 100644
index 0000000..907a987
--- /dev/null
+++ b/radeon/radeon_screen.c
@@ -0,0 +1,1113 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_screen.c,v 1.7 2003/03/26 20:43:51 tsi Exp $ */
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file radeon_screen.c
+ * Screen initialization functions for the Radeon driver.
+ *
+ * \author Kevin E. Martin <martin@valinux.com>
+ * \author  Gareth Hughes <gareth@valinux.com>
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "mtypes.h"
+#include "framebuffer.h"
+#include "renderbuffer.h"
+
+#define STANDALONE_MMIO
+#include "radeon_chipset.h"
+#include "radeon_macros.h"
+#include "radeon_screen.h"
+#if !RADEON_COMMON
+#include "radeon_context.h"
+#include "radeon_span.h"
+#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+#include "r200_context.h"
+#include "r200_ioctl.h"
+#include "r200_span.h"
+#include "r200_tex.h"
+#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+#include "r300_context.h"
+#include "r300_fragprog.h"
+#include "r300_tex.h"
+#include "radeon_span.h"
+#endif
+
+#include "utils.h"
+#include "context.h"
+#include "vblank.h"
+#include "drirenderbuffer.h"
+
+#include "GL/internal/dri_interface.h"
+
+/* Radeon configuration
+ */
+#include "xmlpool.h"
+
+#if !RADEON_COMMON	/* R100 */
+PUBLIC const char __driConfigOptions[] =	/* R100 option description, built from the DRI_CONF_* macros below */
+DRI_CONF_BEGIN
+    DRI_CONF_SECTION_PERFORMANCE	/* 5 options */
+        DRI_CONF_TCL_MODE(DRI_CONF_TCL_CODEGEN)
+        DRI_CONF_FTHROTTLE_MODE(DRI_CONF_FTHROTTLE_IRQS)
+        DRI_CONF_VBLANK_MODE(DRI_CONF_VBLANK_DEF_INTERVAL_0)
+        DRI_CONF_MAX_TEXTURE_UNITS(3,2,3)
+        DRI_CONF_HYPERZ(false)
+    DRI_CONF_SECTION_END
+    DRI_CONF_SECTION_QUALITY	/* 8 options */
+        DRI_CONF_TEXTURE_DEPTH(DRI_CONF_TEXTURE_DEPTH_FB)
+        DRI_CONF_DEF_MAX_ANISOTROPY(1.0,"1.0,2.0,4.0,8.0,16.0")
+        DRI_CONF_NO_NEG_LOD_BIAS(false)
+        DRI_CONF_FORCE_S3TC_ENABLE(false)
+        DRI_CONF_COLOR_REDUCTION(DRI_CONF_COLOR_REDUCTION_DITHER)
+        DRI_CONF_ROUND_MODE(DRI_CONF_ROUND_TRUNC)
+        DRI_CONF_DITHER_MODE(DRI_CONF_DITHER_XERRORDIFF)
+        DRI_CONF_ALLOW_LARGE_TEXTURES(1)
+    DRI_CONF_SECTION_END
+    DRI_CONF_SECTION_DEBUG	/* 1 option */
+        DRI_CONF_NO_RAST(false)
+    DRI_CONF_SECTION_END
+DRI_CONF_END;
+static const GLuint __driNConfigOptions = 14;	/* 5 + 8 + 1 entries above; handed to driParseOptionInfo() together with the string */
+
+#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+
+PUBLIC const char __driConfigOptions[] =	/* R200 option description, built from the DRI_CONF_* macros below */
+DRI_CONF_BEGIN
+    DRI_CONF_SECTION_PERFORMANCE	/* 5 options */
+        DRI_CONF_TCL_MODE(DRI_CONF_TCL_CODEGEN)
+        DRI_CONF_FTHROTTLE_MODE(DRI_CONF_FTHROTTLE_IRQS)
+        DRI_CONF_VBLANK_MODE(DRI_CONF_VBLANK_DEF_INTERVAL_0)
+        DRI_CONF_MAX_TEXTURE_UNITS(6,2,6)
+        DRI_CONF_HYPERZ(false)
+    DRI_CONF_SECTION_END
+    DRI_CONF_SECTION_QUALITY	/* 9 options: the R100 set plus texture blend quality */
+        DRI_CONF_TEXTURE_DEPTH(DRI_CONF_TEXTURE_DEPTH_FB)
+        DRI_CONF_DEF_MAX_ANISOTROPY(1.0,"1.0,2.0,4.0,8.0,16.0")
+        DRI_CONF_NO_NEG_LOD_BIAS(false)
+        DRI_CONF_FORCE_S3TC_ENABLE(false)
+        DRI_CONF_COLOR_REDUCTION(DRI_CONF_COLOR_REDUCTION_DITHER)
+        DRI_CONF_ROUND_MODE(DRI_CONF_ROUND_TRUNC)
+        DRI_CONF_DITHER_MODE(DRI_CONF_DITHER_XERRORDIFF)
+        DRI_CONF_ALLOW_LARGE_TEXTURES(1)
+        DRI_CONF_TEXTURE_BLEND_QUALITY(1.0,"0.0:1.0")
+    DRI_CONF_SECTION_END
+    DRI_CONF_SECTION_DEBUG	/* 1 option */
+        DRI_CONF_NO_RAST(false)
+    DRI_CONF_SECTION_END
+    DRI_CONF_SECTION_SOFTWARE	/* 1 option */
+        DRI_CONF_NV_VERTEX_PROGRAM(false)
+    DRI_CONF_SECTION_END
+DRI_CONF_END;
+static const GLuint __driNConfigOptions = 16;	/* 5 + 9 + 1 + 1 entries above; handed to driParseOptionInfo() together with the string */
+/* Extension tables that are only declared here; their definitions are not in this part of the file. */
+extern const struct dri_extension blend_extensions[];
+extern const struct dri_extension ARB_vp_extension[];
+extern const struct dri_extension NV_vp_extension[];
+extern const struct dri_extension ATI_fs_extension[];
+extern const struct dri_extension point_extensions[];
+
+#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+
+/* R300-only option macros.  TODO: integrate these into xmlpool.h!  First: integer option "texture_image_units", range min:max. */
+#define DRI_CONF_MAX_TEXTURE_IMAGE_UNITS(def,min,max) \
+DRI_CONF_OPT_BEGIN_V(texture_image_units,int,def, # min ":" # max ) \
+        DRI_CONF_DESC(en,"Number of texture image units") \
+        DRI_CONF_DESC(de,"Anzahl der Textureinheiten") \
+DRI_CONF_OPT_END
+/* Integer option "texture_coord_units", range min:max. */
+#define DRI_CONF_MAX_TEXTURE_COORD_UNITS(def,min,max) \
+DRI_CONF_OPT_BEGIN_V(texture_coord_units,int,def, # min ":" # max ) \
+        DRI_CONF_DESC(en,"Number of texture coordinate units") \
+        DRI_CONF_DESC(de,"Anzahl der Texturkoordinateneinheiten") \
+DRI_CONF_OPT_END
+/* Integer option "command_buffer_size" (described as KB), range min:max. */
+#define DRI_CONF_COMMAND_BUFFER_SIZE(def,min,max) \
+DRI_CONF_OPT_BEGIN_V(command_buffer_size,int,def, # min ":" # max ) \
+        DRI_CONF_DESC(en,"Size of command buffer (in KB)") \
+        DRI_CONF_DESC(de,"Grösse des Befehlspuffers (in KB)") \
+DRI_CONF_OPT_END
+/* Boolean option "disable_s3tc" (English description only). */
+#define DRI_CONF_DISABLE_S3TC(def) \
+DRI_CONF_OPT_BEGIN(disable_s3tc,bool,def) \
+        DRI_CONF_DESC(en,"Disable S3TC compression") \
+DRI_CONF_OPT_END
+/* Boolean option "disable_lowimpact_fallback" (English description only). */
+#define DRI_CONF_DISABLE_FALLBACK(def) \
+DRI_CONF_OPT_BEGIN(disable_lowimpact_fallback,bool,def) \
+        DRI_CONF_DESC(en,"Disable Low-impact fallback") \
+DRI_CONF_OPT_END
+/* Boolean option "disable_stencil_two_side" (English description only). */
+#define DRI_CONF_DISABLE_DOUBLE_SIDE_STENCIL(def) \
+DRI_CONF_OPT_BEGIN(disable_stencil_two_side,bool,def) \
+        DRI_CONF_DESC(en,"Disable GL_EXT_stencil_two_side") \
+DRI_CONF_OPT_END
+/* Enum option "fp_optimization" with the two values 0 (speed) and 1 (quality). */
+#define DRI_CONF_FP_OPTIMIZATION(def) \
+DRI_CONF_OPT_BEGIN_V(fp_optimization,enum,def,"0:1") \
+	DRI_CONF_DESC_BEGIN(en,"Fragment Program optimization") \
+                DRI_CONF_ENUM(0,"Optimize for Speed") \
+                DRI_CONF_ENUM(1,"Optimize for Quality") \
+        DRI_CONF_DESC_END \
+DRI_CONF_OPT_END
+
+PUBLIC const char __driConfigOptions[] =	/* R300 option description; PUBLIC like the R100/R200 definitions of this symbol */
+DRI_CONF_BEGIN
+	DRI_CONF_SECTION_PERFORMANCE	/* 8 options */
+		DRI_CONF_TCL_MODE(DRI_CONF_TCL_CODEGEN)
+		DRI_CONF_FTHROTTLE_MODE(DRI_CONF_FTHROTTLE_IRQS)
+		DRI_CONF_VBLANK_MODE(DRI_CONF_VBLANK_DEF_INTERVAL_0)
+		DRI_CONF_MAX_TEXTURE_IMAGE_UNITS(8, 2, 8)
+		DRI_CONF_MAX_TEXTURE_COORD_UNITS(8, 2, 8)
+		DRI_CONF_COMMAND_BUFFER_SIZE(8, 8, 32)
+		DRI_CONF_DISABLE_FALLBACK(false)
+		DRI_CONF_DISABLE_DOUBLE_SIDE_STENCIL(false)
+	DRI_CONF_SECTION_END
+	DRI_CONF_SECTION_QUALITY	/* 9 options */
+		DRI_CONF_TEXTURE_DEPTH(DRI_CONF_TEXTURE_DEPTH_FB)
+		DRI_CONF_DEF_MAX_ANISOTROPY(1.0, "1.0,2.0,4.0,8.0,16.0")
+		DRI_CONF_NO_NEG_LOD_BIAS(false)
+		DRI_CONF_FORCE_S3TC_ENABLE(false)
+		DRI_CONF_DISABLE_S3TC(false)
+		DRI_CONF_COLOR_REDUCTION(DRI_CONF_COLOR_REDUCTION_DITHER)
+		DRI_CONF_ROUND_MODE(DRI_CONF_ROUND_TRUNC)
+		DRI_CONF_DITHER_MODE(DRI_CONF_DITHER_XERRORDIFF)
+		DRI_CONF_FP_OPTIMIZATION(DRI_CONF_FP_OPTIMIZATION_SPEED)
+	DRI_CONF_SECTION_END
+	DRI_CONF_SECTION_DEBUG	/* 1 option */
+		DRI_CONF_NO_RAST(false)
+	DRI_CONF_SECTION_END
+DRI_CONF_END;
+static const GLuint __driNConfigOptions = 18;	/* 8 + 9 + 1 entries above; handed to driParseOptionInfo() together with the string */
+
+#ifndef RADEON_DEBUG
+int RADEON_DEBUG = 0;
+/* Keyword-to-flag table passed to driParseDebugString() for the RADEON_DEBUG environment variable. */
+static const struct dri_debug_control debug_control[] = {
+	{"fall", DEBUG_FALLBACKS},
+	{"tex", DEBUG_TEXTURE},
+	{"ioctl", DEBUG_IOCTL},
+	{"prim", DEBUG_PRIMS},
+	{"vert", DEBUG_VERTS},
+	{"state", DEBUG_STATE},
+	{"code", DEBUG_CODEGEN},
+	{"vfmt", DEBUG_VFMT},
+	{"vtxf", DEBUG_VFMT},	/* second keyword for the same flag as "vfmt" */
+	{"verb", DEBUG_VERBOSE},
+	{"dri", DEBUG_DRI},
+	{"dma", DEBUG_DMA},
+	{"san", DEBUG_SANITY},
+	{"sync", DEBUG_SYNC},
+	{"pix", DEBUG_PIXEL},
+	{"mem", DEBUG_MEMORY},
+	{"allmsg", ~DEBUG_SYNC}, /* every flag except DEBUG_SYNC; avoid the term "sync" because the parser uses strstr */
+	{NULL, 0}	/* NULL entry closes the table */
+};
+#endif /* RADEON_DEBUG */
+
+#endif /* RADEON_COMMON && defined(RADEON_COMMON_FOR_R300) */
+
+extern const struct dri_extension card_extensions[];
+
+static int getSwapInfo( __DRIdrawablePrivate *dPriv, __DRIswapInfo * sInfo );
+
+/* Issue one DRM_RADEON_GETPARAM request for `param`, handing `value`
+ * to the kernel as the place to store the answer. */
+static int radeonGetParam(int fd, int param, void *value)
+{
+   drm_radeon_getparam_t request;
+
+   request.param = param;
+   request.value = value;
+
+   /* The drmCommandWriteRead() status goes back unchanged; callers treat non-zero as failure. */
+   return drmCommandWriteRead( fd, DRM_RADEON_GETPARAM, &request, sizeof(request));
+}
+
+/* Build the list of __GLcontextModes offered for one framebuffer layout,
+ * covering GLX_TRUE_COLOR and GLX_DIRECT_COLOR.  Returns NULL on failure.
+ */
+static __GLcontextModes *
+radeonFillInModes( unsigned pixel_bits, unsigned depth_bits,
+		 unsigned stencil_bits, GLboolean have_back_buffer )
+{
+    __GLcontextModes * modes;
+    __GLcontextModes * m;
+    unsigned num_modes;
+    unsigned depth_buffer_factor;
+    unsigned back_buffer_factor;
+    unsigned i;
+    GLenum fb_format;
+    GLenum fb_type;
+
+    /* Right now GLX_SWAP_COPY_OML isn't supported, but it would be easy
+     * enough to add support.  Basically, if a context is created with an
+     * fbconfig where the swap method is GLX_SWAP_COPY_OML, pageflipping
+     * will never be used.
+     */
+    static const GLenum back_buffer_modes[] = {
+	GLX_NONE, GLX_SWAP_UNDEFINED_OML /*, GLX_SWAP_COPY_OML */
+    };
+    static const int visual_types[] = { GLX_TRUE_COLOR, GLX_DIRECT_COLOR };
+
+    u_int8_t depth_bits_array[2];
+    u_int8_t stencil_bits_array[2];
+
+    depth_bits_array[0] = depth_bits;
+    depth_bits_array[1] = depth_bits;
+
+    /* Just like with the accumulation buffer, always provide some modes
+     * with a stencil buffer.  It will be a sw fallback, but some apps won't
+     * care about that.
+     */
+    stencil_bits_array[0] = 0;
+    stencil_bits_array[1] = (stencil_bits == 0) ? 8 : stencil_bits;
+
+    depth_buffer_factor = ((depth_bits != 0) || (stencil_bits != 0)) ? 2 : 1;
+    back_buffer_factor  = (have_back_buffer) ? 2 : 1;
+
+    num_modes = depth_buffer_factor * back_buffer_factor * 4;
+
+    if ( pixel_bits == 16 ) {
+        fb_format = GL_RGB;
+        fb_type = GL_UNSIGNED_SHORT_5_6_5;
+    }
+    else {
+        fb_format = GL_BGRA;
+        fb_type = GL_UNSIGNED_INT_8_8_8_8_REV;
+    }
+
+    modes = (*dri_interface->createContextModes)( num_modes, sizeof( __GLcontextModes ) );
+    if ( modes == NULL ) {
+	fprintf( stderr, "[%s:%u] Error allocating FBConfigs!\n", __func__, __LINE__ );
+	return NULL;
+    }
+    m = modes;
+    for ( i = 0 ; i < sizeof(visual_types) / sizeof(visual_types[0]) ; i++ ) {
+	if ( ! driFillInModes( & m, fb_format, fb_type,
+			       depth_bits_array, stencil_bits_array, depth_buffer_factor,
+			       back_buffer_modes, back_buffer_factor,
+			       visual_types[i] ) ) {
+	    fprintf( stderr, "[%s:%u] Error creating FBConfig!\n", __func__, __LINE__ );
+	    return NULL;
+	}
+    }
+
+    /* Mark the visual as slow if there are "fake" stencil bits.
+     */
+    for ( m = modes ; m != NULL ; m = m->next ) {
+	if ( (m->stencilBits != 0) && (m->stencilBits != stencil_bits) ) {
+	    m->visualRating = GLX_SLOW_CONFIG;
+	}
+    }
+
+    return modes;
+}
+
+
+/* Create the device specific screen private data struct.
+ *
+ * Validates the device private record, queries the DRM for the GART and
+ * IRQ parameters, maps the register, status, buffer and (optional) GART
+ * texture areas and classifies the chip by its PCI device id.  Returns
+ * NULL, with everything acquired so far released, if any step fails.
+ */
+static radeonScreenPtr
+radeonCreateScreen( __DRIscreenPrivate *sPriv )
+{
+   radeonScreenPtr screen;
+   RADEONDRIPtr dri_priv = (RADEONDRIPtr)sPriv->pDevPriv;
+   unsigned char *RADEONMMIO;	/* not referenced by name below; presumably picked up by the INREG() macro - verify */
+   PFNGLXSCRENABLEEXTENSIONPROC glx_enable_extension =
+     (PFNGLXSCRENABLEEXTENSIONPROC) (*dri_interface->getProcAddress("glxEnableExtension"));
+   void * const psc = sPriv->psc->screenConfigs;
+
+   if (sPriv->devPrivSize != sizeof(RADEONDRIRec)) {
+      fprintf(stderr,"\nERROR!  sizeof(RADEONDRIRec) does not match passed size from device driver\n");
+      return NULL;
+   }
+
+   /* Allocate the private area */
+   screen = (radeonScreenPtr) CALLOC( sizeof(*screen) );
+   if ( !screen ) {
+      __driUtilMessage("%s: Could not allocate memory for screen structure",
+		       __FUNCTION__);
+      return NULL;
+   }
+
+#if DO_DEBUG && RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+	RADEON_DEBUG = driParseDebugString(getenv("RADEON_DEBUG"), debug_control);
+#endif
+
+   /* parse information in __driConfigOptions */
+   driParseOptionInfo (&screen->optionCache,
+		       __driConfigOptions, __driNConfigOptions);
+
+   /* This is first since which regions we map depends on whether or
+    * not we are using a PCI card.
+    */
+   screen->card_type = (dri_priv->IsPCI ? RADEON_CARD_PCI : RADEON_CARD_AGP);
+   {
+      int ret;
+      ret = radeonGetParam( sPriv->fd, RADEON_PARAM_GART_BUFFER_OFFSET,
+			    &screen->gart_buffer_offset);
+      if (ret) {
+	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_GART_BUFFER_OFFSET): %d\n", ret);
+	 goto fail_free;
+      }
+
+      ret = radeonGetParam( sPriv->fd, RADEON_PARAM_GART_BASE,
+			    &screen->gart_base);
+      if (ret) {
+	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_GART_BASE): %d\n", ret);
+	 goto fail_free;
+      }
+
+      ret = radeonGetParam( sPriv->fd, RADEON_PARAM_IRQ_NR,
+			    &screen->irq);
+      if (ret) {
+	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_IRQ_NR): %d\n", ret);
+	 goto fail_free;
+      }
+      screen->drmSupportsCubeMapsR200 = (sPriv->drmMinor >= 7);
+      screen->drmSupportsBlendColor = (sPriv->drmMinor >= 11);
+      screen->drmSupportsTriPerf = (sPriv->drmMinor >= 16);
+      screen->drmSupportsFragShader = (sPriv->drmMinor >= 18);
+      screen->drmSupportsPointSprites = (sPriv->drmMinor >= 13);
+      screen->drmSupportsCubeMapsR100 = (sPriv->drmMinor >= 15);
+      screen->drmSupportsVertexProgram = (sPriv->drmMinor >= 25);
+   }
+
+   screen->mmio.handle = dri_priv->registerHandle;
+   screen->mmio.size   = dri_priv->registerSize;
+   if ( drmMap( sPriv->fd, screen->mmio.handle, screen->mmio.size,
+		&screen->mmio.map ) ) {
+      __driUtilMessage("%s: drmMap failed\n", __FUNCTION__ );
+      goto fail_free;
+   }
+
+   RADEONMMIO = screen->mmio.map;
+
+   screen->status.handle = dri_priv->statusHandle;
+   screen->status.size   = dri_priv->statusSize;
+   if ( drmMap( sPriv->fd, screen->status.handle, screen->status.size,
+		&screen->status.map ) ) {
+      __driUtilMessage("%s: drmMap (2) failed\n", __FUNCTION__ );
+      goto fail_unmap_mmio;
+   }
+   screen->scratch = (__volatile__ u_int32_t *)
+      ((GLubyte *)screen->status.map + RADEON_SCRATCH_REG_OFFSET);
+
+   screen->buffers = drmMapBufs( sPriv->fd );
+   if ( !screen->buffers ) {
+      __driUtilMessage("%s: drmMapBufs failed\n", __FUNCTION__ );
+      goto fail_unmap_status;
+   }
+
+   if ( dri_priv->gartTexHandle && dri_priv->gartTexMapSize ) {
+      screen->gartTextures.handle = dri_priv->gartTexHandle;
+      screen->gartTextures.size   = dri_priv->gartTexMapSize;
+      if ( drmMap( sPriv->fd, screen->gartTextures.handle, screen->gartTextures.size,
+		   (drmAddressPtr)&screen->gartTextures.map ) ) {
+	 __driUtilMessage("%s: drmMap failed for GART texture area\n", __FUNCTION__);
+	 goto fail_unmap_bufs;
+      }
+
+      screen->gart_texture_offset = dri_priv->gartTexOffset + screen->gart_base;
+   }
+
+   screen->chip_flags = 0;
+   /* XXX: add more chipsets */
+   switch ( dri_priv->deviceID ) {
+   case PCI_CHIP_RADEON_LY:
+   case PCI_CHIP_RADEON_LZ:
+   case PCI_CHIP_RADEON_QY:
+   case PCI_CHIP_RADEON_QZ:
+   case PCI_CHIP_RN50_515E:
+   case PCI_CHIP_RN50_5969:
+      screen->chip_family = CHIP_FAMILY_RV100;
+      break;
+
+   case PCI_CHIP_RS100_4136:
+   case PCI_CHIP_RS100_4336:
+      screen->chip_family = CHIP_FAMILY_RS100;
+      break;
+
+   case PCI_CHIP_RS200_4137:
+   case PCI_CHIP_RS200_4337:
+   case PCI_CHIP_RS250_4237:
+   case PCI_CHIP_RS250_4437:
+      screen->chip_family = CHIP_FAMILY_RS200;
+      break;
+
+   case PCI_CHIP_RADEON_QD:
+   case PCI_CHIP_RADEON_QE:
+   case PCI_CHIP_RADEON_QF:
+   case PCI_CHIP_RADEON_QG:
+      /* all original radeons (7200) presumably have a stencil op bug */
+      screen->chip_family = CHIP_FAMILY_R100;
+      screen->chip_flags = RADEON_CHIPSET_TCL | RADEON_CHIPSET_BROKEN_STENCIL;
+      break;
+
+   case PCI_CHIP_RV200_QW:
+   case PCI_CHIP_RV200_QX:
+   case PCI_CHIP_RADEON_LW:
+   case PCI_CHIP_RADEON_LX:
+      screen->chip_family = CHIP_FAMILY_RV200;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_R200_BB:
+   case PCI_CHIP_R200_BC:
+   case PCI_CHIP_R200_QH:
+   case PCI_CHIP_R200_QL:
+   case PCI_CHIP_R200_QM:
+      screen->chip_family = CHIP_FAMILY_R200;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV250_If:
+   case PCI_CHIP_RV250_Ig:
+   case PCI_CHIP_RV250_Ld:
+   case PCI_CHIP_RV250_Lf:
+   case PCI_CHIP_RV250_Lg:
+      screen->chip_family = CHIP_FAMILY_RV250;
+      screen->chip_flags = R200_CHIPSET_YCBCR_BROKEN | RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV280_5960:
+   case PCI_CHIP_RV280_5961:
+   case PCI_CHIP_RV280_5962:
+   case PCI_CHIP_RV280_5964:
+   case PCI_CHIP_RV280_5965:
+   case PCI_CHIP_RV280_5C61:
+   case PCI_CHIP_RV280_5C63:
+      screen->chip_family = CHIP_FAMILY_RV280;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RS300_5834:
+   case PCI_CHIP_RS300_5835:
+   case PCI_CHIP_RS350_7834:
+   case PCI_CHIP_RS350_7835:
+      screen->chip_family = CHIP_FAMILY_RS300;
+      break;
+
+   case PCI_CHIP_R300_AD:
+   case PCI_CHIP_R300_AE:
+   case PCI_CHIP_R300_AF:
+   case PCI_CHIP_R300_AG:
+   case PCI_CHIP_R300_ND:
+   case PCI_CHIP_R300_NE:
+   case PCI_CHIP_R300_NF:
+   case PCI_CHIP_R300_NG:
+      screen->chip_family = CHIP_FAMILY_R300;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV350_AP:
+   case PCI_CHIP_RV350_AQ:
+   case PCI_CHIP_RV350_AR:
+   case PCI_CHIP_RV350_AS:
+   case PCI_CHIP_RV350_AT:
+   case PCI_CHIP_RV350_AV:
+   case PCI_CHIP_RV350_AU:
+   case PCI_CHIP_RV350_NP:
+   case PCI_CHIP_RV350_NQ:
+   case PCI_CHIP_RV350_NR:
+   case PCI_CHIP_RV350_NS:
+   case PCI_CHIP_RV350_NT:
+   case PCI_CHIP_RV350_NV:
+      screen->chip_family = CHIP_FAMILY_RV350;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_R350_AH:
+   case PCI_CHIP_R350_AI:
+   case PCI_CHIP_R350_AJ:
+   case PCI_CHIP_R350_AK:
+   case PCI_CHIP_R350_NH:
+   case PCI_CHIP_R350_NI:
+   case PCI_CHIP_R360_NJ:
+   case PCI_CHIP_R350_NK:
+      screen->chip_family = CHIP_FAMILY_R350;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV370_5460:
+   case PCI_CHIP_RV370_5462:
+   case PCI_CHIP_RV370_5464:
+   case PCI_CHIP_RV370_5B60:
+   case PCI_CHIP_RV370_5B62:
+   case PCI_CHIP_RV370_5B63:
+   case PCI_CHIP_RV370_5B64:
+   case PCI_CHIP_RV370_5B65:
+   case PCI_CHIP_RV370_5657:
+   case PCI_CHIP_RV380_3150:
+   case PCI_CHIP_RV380_3152:
+   case PCI_CHIP_RV380_3154:
+   case PCI_CHIP_RV380_3E50:
+   case PCI_CHIP_RV380_3E54:
+      screen->chip_family = CHIP_FAMILY_RV380;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_R420_JN:
+   case PCI_CHIP_R420_JH:
+   case PCI_CHIP_R420_JI:
+   case PCI_CHIP_R420_JJ:
+   case PCI_CHIP_R420_JK:
+   case PCI_CHIP_R420_JL:
+   case PCI_CHIP_R420_JM:
+   case PCI_CHIP_R420_JO:
+   case PCI_CHIP_R420_JP:
+   case PCI_CHIP_R420_JT:
+   case PCI_CHIP_R481_4B49:
+   case PCI_CHIP_R481_4B4A:
+   case PCI_CHIP_R481_4B4B:
+   case PCI_CHIP_R481_4B4C:
+   case PCI_CHIP_R423_UH:
+   case PCI_CHIP_R423_UI:
+   case PCI_CHIP_R423_UJ:
+   case PCI_CHIP_R423_UK:
+   case PCI_CHIP_R430_554C:
+   case PCI_CHIP_R430_554D:
+   case PCI_CHIP_R430_554E:
+   case PCI_CHIP_R430_554F:
+   case PCI_CHIP_R423_5550:
+   case PCI_CHIP_R423_UQ:
+   case PCI_CHIP_R423_UR:
+   case PCI_CHIP_R423_UT:
+   case PCI_CHIP_R430_5D48:
+   case PCI_CHIP_R430_5D49:
+   case PCI_CHIP_R430_5D4A:
+   case PCI_CHIP_R480_5D4C:
+   case PCI_CHIP_R480_5D4D:
+   case PCI_CHIP_R480_5D4E:
+   case PCI_CHIP_R480_5D4F:
+   case PCI_CHIP_R480_5D50:
+   case PCI_CHIP_R480_5D52:
+   case PCI_CHIP_R423_5D57:
+      screen->chip_family = CHIP_FAMILY_R420;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   /* RV410 SE chips have half the pipes of regular RV410 */
+   case PCI_CHIP_RV410_5E4C:
+   case PCI_CHIP_RV410_5E4F:
+      screen->chip_family = CHIP_FAMILY_RV380;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV410_564A:
+   case PCI_CHIP_RV410_564B:
+   case PCI_CHIP_RV410_564F:
+   case PCI_CHIP_RV410_5652:
+   case PCI_CHIP_RV410_5653:
+   case PCI_CHIP_RV410_5E48:
+   case PCI_CHIP_RV410_5E4A:
+   case PCI_CHIP_RV410_5E4B:
+   case PCI_CHIP_RV410_5E4D:
+      screen->chip_family = CHIP_FAMILY_RV410;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RS480_5954:
+   case PCI_CHIP_RS480_5955:
+   case PCI_CHIP_RS482_5974:
+   case PCI_CHIP_RS482_5975:
+   case PCI_CHIP_RS400_5A41:
+   case PCI_CHIP_RS400_5A42:
+   case PCI_CHIP_RC410_5A61:
+   case PCI_CHIP_RC410_5A62:
+      screen->chip_family = CHIP_FAMILY_RS400;
+      fprintf(stderr, "Warning, xpress200 detected.\n");
+      break;
+
+   default:
+      fprintf(stderr, "unknown chip id 0x%x, can't guess.\n",
+	      dri_priv->deviceID);
+      goto fail_unmap_gart;
+   }
+   if ((screen->chip_family == CHIP_FAMILY_R350 || screen->chip_family == CHIP_FAMILY_R300) &&
+       sPriv->ddxMinor < 2) {
+      fprintf(stderr, "xf86-video-ati-6.6.2 or newer needed for Radeon 9500/9700/9800 cards.\n");
+      goto fail_unmap_gart;
+   }
+
+   if (screen->chip_family <= CHIP_FAMILY_RS200)
+      screen->chip_flags |= RADEON_CLASS_R100;
+   else if (screen->chip_family <= CHIP_FAMILY_RV280)
+      screen->chip_flags |= RADEON_CLASS_R200;
+   else
+      screen->chip_flags |= RADEON_CLASS_R300;
+
+   screen->cpp = dri_priv->bpp / 8;
+   screen->AGPMode = dri_priv->AGPMode;
+
+   screen->fbLocation	= ( INREG( RADEON_MC_FB_LOCATION ) & 0xffff ) << 16;
+
+   if ( sPriv->drmMinor >= 10 ) {
+      drm_radeon_setparam_t sp;
+
+      sp.param = RADEON_SETPARAM_FB_LOCATION;
+      sp.value = screen->fbLocation;
+
+      drmCommandWrite( sPriv->fd, DRM_RADEON_SETPARAM,
+		       &sp, sizeof( sp ) );
+   }
+
+   screen->frontOffset	= dri_priv->frontOffset;
+   screen->frontPitch	= dri_priv->frontPitch;
+   screen->backOffset	= dri_priv->backOffset;
+   screen->backPitch	= dri_priv->backPitch;
+   screen->depthOffset	= dri_priv->depthOffset;
+   screen->depthPitch	= dri_priv->depthPitch;
+
+   /* Check if ddx has set up a surface reg to cover depth buffer */
+   screen->depthHasSurface = (sPriv->ddxMajor > 4) ||
+      /* these chips don't use tiled z without hyperz. So always pretend
+         we have set up a surface which will cause linear reads/writes */
+      (IS_R100_CLASS(screen) &&
+      !(screen->chip_flags & RADEON_CHIPSET_TCL));
+
+   if ( dri_priv->textureSize == 0 ) {
+      screen->texOffset[RADEON_LOCAL_TEX_HEAP] = screen->gart_texture_offset;
+      screen->texSize[RADEON_LOCAL_TEX_HEAP] = dri_priv->gartTexMapSize;
+      screen->logTexGranularity[RADEON_LOCAL_TEX_HEAP] =
+	 dri_priv->log2GARTTexGran;
+   } else {
+      screen->texOffset[RADEON_LOCAL_TEX_HEAP] = dri_priv->textureOffset
+				               + screen->fbLocation;
+      screen->texSize[RADEON_LOCAL_TEX_HEAP] = dri_priv->textureSize;
+      screen->logTexGranularity[RADEON_LOCAL_TEX_HEAP] =
+	 dri_priv->log2TexGran;
+   }
+
+   if ( !screen->gartTextures.map || dri_priv->textureSize == 0
+	|| getenv( "RADEON_GARTTEXTURING_FORCE_DISABLE" ) ) {
+      screen->numTexHeaps = RADEON_NR_TEX_HEAPS - 1;
+      screen->texOffset[RADEON_GART_TEX_HEAP] = 0;
+      screen->texSize[RADEON_GART_TEX_HEAP] = 0;
+      screen->logTexGranularity[RADEON_GART_TEX_HEAP] = 0;
+   } else {
+      screen->numTexHeaps = RADEON_NR_TEX_HEAPS;
+      screen->texOffset[RADEON_GART_TEX_HEAP] = screen->gart_texture_offset;
+      screen->texSize[RADEON_GART_TEX_HEAP] = dri_priv->gartTexMapSize;
+      screen->logTexGranularity[RADEON_GART_TEX_HEAP] =
+	 dri_priv->log2GARTTexGran;
+   }
+
+   if ( glx_enable_extension != NULL ) {
+      if ( screen->irq != 0 ) {
+	 (*glx_enable_extension)( psc, "GLX_SGI_swap_control" );
+	 (*glx_enable_extension)( psc, "GLX_SGI_video_sync" );
+	 (*glx_enable_extension)( psc, "GLX_MESA_swap_control" );
+      }
+
+      (*glx_enable_extension)( psc, "GLX_MESA_swap_frame_usage" );
+      if (IS_R200_CLASS(screen))
+	 (*glx_enable_extension)( psc, "GLX_MESA_allocate_memory" );
+
+      (*glx_enable_extension)( psc, "GLX_MESA_copy_sub_buffer" );
+      (*glx_enable_extension)( psc, "GLX_SGI_make_current_read" );
+   }
+
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+   if (IS_R200_CLASS(screen)) {
+      sPriv->psc->allocateMemory = (void *) r200AllocateMemoryMESA;
+      sPriv->psc->freeMemory     = (void *) r200FreeMemoryMESA;
+      sPriv->psc->memoryOffset   = (void *) r200GetMemoryOffsetMESA;
+   }
+#endif
+
+   screen->driScreen = sPriv;
+   screen->sarea_priv_offset = dri_priv->sarea_priv_offset;
+   return screen;
+
+   /* Failure exits: each label falls through to undo the earlier steps. */
+fail_unmap_gart:
+   if ( screen->gartTextures.map )
+      drmUnmap( screen->gartTextures.map, screen->gartTextures.size );
+fail_unmap_bufs:
+   drmUnmapBufs( screen->buffers );
+fail_unmap_status:
+   drmUnmap( screen->status.map, screen->status.size );
+fail_unmap_mmio:
+   drmUnmap( screen->mmio.map, screen->mmio.size );
+fail_free:
+   driDestroyOptionInfo( &screen->optionCache );
+   FREE( screen );
+   return NULL;
+}
+
+/* Tear down the device specific screen private data struct: undo the
+ * GART texture, buffer, status and MMIO mappings, free the option
+ * information and the struct itself.  A NULL sPriv->private is ignored.
+ */
+static void radeonDestroyScreen( __DRIscreenPrivate *sPriv )
+{
+   radeonScreenPtr rscreen = (radeonScreenPtr) sPriv->private;
+
+   if ( rscreen != NULL ) {
+      /* The GART texture area is only unmapped when a mapping is recorded. */
+      if ( rscreen->gartTextures.map != NULL )
+	 drmUnmap( rscreen->gartTextures.map, rscreen->gartTextures.size );
+      drmUnmapBufs( rscreen->buffers );
+      drmUnmap( rscreen->status.map, rscreen->status.size );
+      drmUnmap( rscreen->mmio.map, rscreen->mmio.size );
+
+      /* free all option information */
+      driDestroyOptionInfo( &rscreen->optionCache );
+
+      FREE( rscreen );
+      sPriv->private = NULL;
+   }
+}
+
+
+/* Initialize the driver specific screen private data: the new screen is
+ * stored in sPriv->private and the result says whether one was created.
+ */
+static GLboolean radeonInitDriver( __DRIscreenPrivate *sPriv )
+{
+   sPriv->private = (void *) radeonCreateScreen( sPriv );
+
+   if ( sPriv->private != NULL )
+      return GL_TRUE;
+   /* No screen was created; the destroy routine is still invoked on this path. */
+   radeonDestroyScreen( sPriv );
+   return GL_FALSE;
+}
+
+
+/**
+ * Wrap one buffer of the framebuffer aperture in a driRenderbuffer and
+ * attach it to \p fb under \p bufferName; \p isDepthStencil additionally
+ * copies the screen's depthHasSurface flag onto the new renderbuffer.
+ *
+ * \return GL_FALSE if driNewRenderbuffer() returned NULL, else GL_TRUE.
+ */
+static GLboolean
+radeonAttachRenderbuffer( struct gl_framebuffer *fb, GLuint bufferName,
+                          GLenum format, GLint offset, GLint pitch,
+                          GLboolean isDepthStencil,
+                          __DRIscreenPrivate *driScrnPriv,
+                          __DRIdrawablePrivate *driDrawPriv,
+                          const __GLcontextModes *mesaVis )
+{
+   radeonScreenPtr screen = (radeonScreenPtr) driScrnPriv->private;
+   driRenderbuffer *rb = driNewRenderbuffer(format, driScrnPriv->pFB + offset,
+                                            screen->cpp, offset, pitch,
+                                            driDrawPriv);
+
+   if (!rb)
+      return GL_FALSE;
+
+   radeonSetSpanFunctions(rb, mesaVis);
+   _mesa_add_renderbuffer(fb, bufferName, &rb->Base);
+   if (isDepthStencil)
+      rb->depthHasSurface = screen->depthHasSurface;
+   return GL_TRUE;
+}
+
+/**
+ * Create the Mesa framebuffer and renderbuffers for a given window/drawable.
+ *
+ * \return GL_TRUE on success; GL_FALSE for pixmaps (not implemented) or when
+ * the framebuffer or one of its renderbuffers cannot be allocated.
+ *
+ * \todo This function (and its interface) will need to be updated to support
+ * pbuffers.
+ */
+static GLboolean
+radeonCreateBuffer( __DRIscreenPrivate *driScrnPriv,
+                    __DRIdrawablePrivate *driDrawPriv,
+                    const __GLcontextModes *mesaVis,
+                    GLboolean isPixmap )
+{
+   radeonScreenPtr screen = (radeonScreenPtr) driScrnPriv->private;
+   const GLboolean swDepth = GL_FALSE;
+   const GLboolean swAlpha = GL_FALSE;
+   GLboolean swAccum, swStencil, ok;
+   struct gl_framebuffer *fb;
+
+   if (isPixmap)
+      return GL_FALSE; /* not implemented */
+
+   swAccum = mesaVis->accumRedBits > 0;
+   /* Stencil only gets a driRenderbuffer next to a 24 bit depth buffer. */
+   swStencil = mesaVis->stencilBits > 0 && mesaVis->depthBits != 24;
+
+   fb = _mesa_create_framebuffer(mesaVis);
+   if (!fb)
+      return GL_FALSE;
+
+   /* front color renderbuffer */
+   ok = radeonAttachRenderbuffer(fb, BUFFER_FRONT_LEFT, GL_RGBA,
+                                 screen->frontOffset, screen->frontPitch,
+                                 GL_FALSE, driScrnPriv, driDrawPriv, mesaVis);
+
+   /* back color renderbuffer */
+   if (ok && mesaVis->doubleBufferMode)
+      ok = radeonAttachRenderbuffer(fb, BUFFER_BACK_LEFT, GL_RGBA,
+                                    screen->backOffset, screen->backPitch,
+                                    GL_FALSE, driScrnPriv, driDrawPriv, mesaVis);
+
+   /* depth renderbuffer: only 16 and 24 bit depths get one */
+   if (ok && (mesaVis->depthBits == 16 || mesaVis->depthBits == 24))
+      ok = radeonAttachRenderbuffer(fb, BUFFER_DEPTH,
+                                    (mesaVis->depthBits == 16)
+                                       ? GL_DEPTH_COMPONENT16 : GL_DEPTH_COMPONENT24,
+                                    screen->depthOffset, screen->depthPitch,
+                                    GL_TRUE, driScrnPriv, driDrawPriv, mesaVis);
+
+   /* stencil renderbuffer, located at the depth buffer's offset */
+   if (ok && mesaVis->stencilBits > 0 && !swStencil)
+      ok = radeonAttachRenderbuffer(fb, BUFFER_STENCIL, GL_STENCIL_INDEX8_EXT,
+                                    screen->depthOffset, screen->depthPitch,
+                                    GL_TRUE, driScrnPriv, driDrawPriv, mesaVis);
+
+   if (!ok) {
+      /* Release the partially populated framebuffer instead of leaking it. */
+      _mesa_unreference_framebuffer(&fb);
+      return GL_FALSE;
+   }
+
+   _mesa_add_soft_renderbuffers(fb, GL_FALSE /* color */, swDepth, swStencil,
+                                swAccum, swAlpha, GL_FALSE /* aux */);
+   driDrawPriv->driverPrivate = (void *) fb;
+   return GL_TRUE;
+}
+
+
+static void radeonDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
+{
+   GLframebuffer **fbSlot = (GLframebuffer **) &driDrawPriv->driverPrivate;
+   _mesa_unreference_framebuffer(fbSlot);	/* operates on the drawable's own driverPrivate slot */
+}
+
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+/**
+ * Create a rendering context with the routine matching the chip class.
+ * Only R300-class screens are handled here; any other screen is refused.
+ */
+static GLboolean
+radeonCreateContext(const __GLcontextModes * glVisual,
+		    __DRIcontextPrivate * driContextPriv,
+		    void *sharedContextPriv)
+{
+	radeonScreenPtr rscreen = (radeonScreenPtr) driContextPriv->driScreenPriv->private;
+
+	if (!IS_R300_CLASS(rscreen))
+		return GL_FALSE;
+	return r300CreateContext(glVisual, driContextPriv, sharedContextPriv);
+}
+
+/**
+ * Choose the appropriate DestroyContext function based on the chipset.
+ * Contexts on screens that are not R300 class are left untouched.
+ */
+static void radeonDestroyContext(__DRIcontextPrivate * driContextPriv)
+{
+	radeonContextPtr radeon = (radeonContextPtr) driContextPriv->driverPrivate;
+	/* Plain call: a void function must not "return" an expression. */
+	if (IS_R300_CLASS(radeon->radeonScreen))
+		r300DestroyContext(driContextPriv);
+}
+
+
+#endif
+
+#if !RADEON_COMMON || (RADEON_COMMON && defined(RADEON_COMMON_FOR_R300))
+static const struct __DriverAPIRec radeonAPI = {	/* const, matching r200API in the #else branch */
+   .InitDriver      = radeonInitDriver,
+   .DestroyScreen   = radeonDestroyScreen,
+   .CreateContext   = radeonCreateContext,
+   .DestroyContext  = radeonDestroyContext,
+   .CreateBuffer    = radeonCreateBuffer,
+   .DestroyBuffer   = radeonDestroyBuffer,
+   .SwapBuffers     = radeonSwapBuffers,
+   .MakeCurrent     = radeonMakeCurrent,
+   .UnbindContext   = radeonUnbindContext,
+   .GetSwapInfo     = getSwapInfo,
+   .GetMSC          = driGetMSC32,
+   .WaitForMSC      = driWaitForMSC32,
+   .WaitForSBC      = NULL,
+   .SwapBuffersMSC  = NULL,
+   .CopySubBuffer   = radeonCopySubBuffer,
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+   .setTexOffset    = r300SetTexOffset,	/* only set for the R300 build */
+#endif
+};
+#else
+static const struct __DriverAPIRec r200API = {
+   .InitDriver      = radeonInitDriver,
+   .DestroyScreen   = radeonDestroyScreen,
+   .CreateContext   = r200CreateContext,
+   .DestroyContext  = r200DestroyContext,
+   .CreateBuffer    = radeonCreateBuffer,
+   .DestroyBuffer   = radeonDestroyBuffer,
+   .SwapBuffers     = r200SwapBuffers,
+   .MakeCurrent     = r200MakeCurrent,
+   .UnbindContext   = r200UnbindContext,
+   .GetSwapInfo     = getSwapInfo,
+   .GetMSC          = driGetMSC32,
+   .WaitForMSC      = driWaitForMSC32,
+   .WaitForSBC      = NULL,
+   .SwapBuffersMSC  = NULL,
+   .CopySubBuffer   = r200CopySubBuffer,
+   .setTexOffset    = r200SetTexOffset
+};
+#endif
+
+/**
+ * This is the bootstrap function for the driver.  libGL supplies all of the
+ * requisite information about the system, and the driver initializes itself.
+ * This routine also fills in the linked list pointed to by \c driver_modes
+ * with the \c __GLcontextModes that the driver can support for windows or
+ * pbuffers.
+ *
+ * \return A pointer to a \c __DRIscreenPrivate on success, or \c NULL on 
+ *         failure.
+ */
+PUBLIC void *
+__driCreateNewScreen_20050727( __DRInativeDisplay *dpy,
+                             int scrn, __DRIscreen *psc,
+			     const __GLcontextModes * modes,
+			     const __DRIversion * ddx_version,
+			     const __DRIversion * dri_version,
+			     const __DRIversion * drm_version,
+			     const __DRIframebuffer * frame_buffer,
+			     drmAddress pSAREA, int fd,
+			     int internal_api_version,
+			     const __DRIinterfaceMethods * interface,
+			     __GLcontextModes ** driver_modes )
+{
+   __DRIscreenPrivate *psp;
+#if !RADEON_COMMON
+   static const char *driver_name = "Radeon";
+   static const __DRIutilversion2 ddx_expected = { 4, 5, 0, 0 };
+   static const __DRIversion dri_expected = { 4, 0, 0 };
+   static const __DRIversion drm_expected = { 1, 6, 0 };
+#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+   static const char *driver_name = "R200";
+   static const __DRIutilversion2 ddx_expected = { 4, 5, 0, 0 };
+   static const __DRIversion dri_expected = { 4, 0, 0 };
+   static const __DRIversion drm_expected = { 1, 6, 0 };
+#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+   static const char *driver_name = "R300";
+   static const __DRIutilversion2 ddx_expected = { 4, 5, 0, 0 };
+   static const __DRIversion dri_expected = { 4, 0, 0 };
+   static const __DRIversion drm_expected = { 1, 24, 0 };	/* the R300 build expects a newer DRM than the other two */
+#endif
+
+   dri_interface = interface;
+
+   if ( ! driCheckDriDdxDrmVersions3( driver_name,
+				      dri_version, & dri_expected,
+				      ddx_version, & ddx_expected,
+				      drm_version, & drm_expected ) ) {
+      return NULL;	/* DRI/DDX/DRM version check failed */
+   }
+#if !RADEON_COMMON || (RADEON_COMMON && defined(RADEON_COMMON_FOR_R300))
+   psp = __driUtilCreateNewScreen(dpy, scrn, psc, NULL,
+				  ddx_version, dri_version, drm_version,
+				  frame_buffer, pSAREA, fd,
+				  internal_api_version, &radeonAPI);
+#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+   psp = __driUtilCreateNewScreen(dpy, scrn, psc, NULL,
+				  ddx_version, dri_version, drm_version,
+				  frame_buffer, pSAREA, fd,
+				  internal_api_version, &r200API);
+#endif
+
+   if ( psp != NULL ) {
+      RADEONDRIPtr dri_priv = (RADEONDRIPtr) psp->pDevPriv;
+      if (driver_modes) {	/* the mode list is only built when the caller asked for it */
+         *driver_modes = radeonFillInModes( dri_priv->bpp,
+                                            (dri_priv->bpp == 16) ? 16 : 24,
+                                            (dri_priv->bpp == 16) ? 0  : 8,
+                                            (dri_priv->backOffset != dri_priv->depthOffset) );
+      }
+
+      /* Calling driInitExtensions here, with a NULL context pointer,
+       * does not actually enable the extensions.  It just makes sure
+       * that all the dispatch offsets for all the extensions that
+       * *might* be enabled are known.  This is needed because the
+       * dispatch offsets need to be known when _mesa_context_create
+       * is called, but we can't enable the extensions until we have a
+       * context pointer.
+       *
+       * Hello chicken.  Hello egg.  How are you two today?
+       */
+      driInitExtensions( NULL, card_extensions, GL_FALSE );
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+      driInitExtensions( NULL, blend_extensions, GL_FALSE );
+      driInitSingleExtension( NULL, ARB_vp_extension );
+      driInitSingleExtension( NULL, NV_vp_extension );
+      driInitSingleExtension( NULL, ATI_fs_extension );
+      driInitExtensions( NULL, point_extensions, GL_FALSE );
+#endif
+   }
+
+   return (void *) psp;	/* NULL when __driUtilCreateNewScreen() failed */
+}
+
+
+/**
+ * Get information about previous buffer swaps: copies the current context's
+ * swap counters into \p sInfo.  Returns 0, or -1 if a needed pointer is NULL. */
+static int
+getSwapInfo( __DRIdrawablePrivate *dPriv, __DRIswapInfo * sInfo )
+{
+#if !RADEON_COMMON || (RADEON_COMMON && defined(RADEON_COMMON_FOR_R300))
+   radeonContextPtr  rmesa;
+#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+   r200ContextPtr  rmesa;
+#endif
+
+   if ( (dPriv == NULL) || (dPriv->driContextPriv == NULL)
+	|| (dPriv->driContextPriv->driverPrivate == NULL)
+	|| (sInfo == NULL) ) {
+      return -1;	/* no drawable, no bound context, no driver data, or no output struct */
+   }
+
+   rmesa = dPriv->driContextPriv->driverPrivate;
+   sInfo->swap_count = rmesa->swap_count;
+   sInfo->swap_ust = rmesa->swap_ust;
+   sInfo->swap_missed_count = rmesa->swap_missed_count;
+
+   sInfo->swap_missed_usage = (sInfo->swap_missed_count != 0)	/* usage is only computed once a swap was missed */
+       ? driCalculateSwapUsage( dPriv, 0, rmesa->swap_missed_ust )
+       : 0.0;
+
+   return 0;
+}
diff --git a/radeon/radeon_screen.h b/radeon/radeon_screen.h
new file mode 100644
index 0000000..25e6fcf
--- /dev/null
+++ b/radeon/radeon_screen.h
@@ -0,0 +1,115 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_screen.h,v 1.5 2002/12/16 16:18:58 dawes Exp $ */
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Kevin E. Martin <martin@valinux.com>
+ *   Gareth Hughes <gareth@valinux.com>
+ */
+
+#ifndef __RADEON_SCREEN_H__
+#define __RADEON_SCREEN_H__
+
+/*
+ * IMPORTS: these headers contain all the DRI, X and kernel-related
+ * definitions that we need.
+ */
+#include "dri_util.h"
+#include "radeon_dri.h"
+#include "radeon_chipset.h"
+#include "radeon_reg.h"
+#include "drm_sarea.h"
+#include "xmlconfig.h"
+
+/* One DRM region: its handle, its size and the address it is mapped at. */
+typedef struct {
+   drm_handle_t handle;			/* Handle to the DRM region */
+   drmSize size;			/* Size of the DRM region */
+   drmAddress map;			/* Mapping of the DRM region */
+} radeonRegionRec, *radeonRegionPtr;
+
+typedef struct {	/* per-screen driver state */
+   int chip_family;
+   int chip_flags;			/* class bits are tested by IS_R*_CLASS() via RADEON_CLASS_MASK */
+   int cpp;				/* presumably bytes per pixel -- TODO confirm */
+   int card_type;
+   int AGPMode;
+   unsigned int irq;			/* IRQ number (0 means none) */
+
+   unsigned int fbLocation;
+   unsigned int frontOffset;
+   unsigned int frontPitch;
+   unsigned int backOffset;
+   unsigned int backPitch;
+
+   unsigned int depthOffset;
+   unsigned int depthPitch;
+
+    /* Shared texture data */
+   int numTexHeaps;
+   int texOffset[RADEON_NR_TEX_HEAPS];
+   int texSize[RADEON_NR_TEX_HEAPS];
+   int logTexGranularity[RADEON_NR_TEX_HEAPS];
+
+   radeonRegionRec mmio;
+   radeonRegionRec status;
+   radeonRegionRec gartTextures;
+
+   drmBufMapPtr buffers;
+
+   __volatile__ u_int32_t *scratch;
+
+   __DRIscreenPrivate *driScreen;
+   unsigned int sarea_priv_offset;
+   unsigned int gart_buffer_offset;	/* offset in card memory space */
+   unsigned int gart_texture_offset;	/* offset in card memory space */
+   unsigned int gart_base;
+
+   GLboolean drmSupportsCubeMapsR200;   /* need radeon kernel module >= 1.7 */
+   GLboolean drmSupportsBlendColor;     /* need radeon kernel module >= 1.11 */
+   GLboolean drmSupportsTriPerf;        /* need radeon kernel module >= 1.16 */
+   GLboolean drmSupportsFragShader;     /* need radeon kernel module >= 1.18 */
+   GLboolean drmSupportsPointSprites;   /* need radeon kernel module >= 1.13 */
+   GLboolean drmSupportsCubeMapsR100;   /* need radeon kernel module >= 1.15 */
+   GLboolean drmSupportsVertexProgram;  /* need radeon kernel module >= 1.25 */
+   GLboolean depthHasSurface;           /* copied into the stencil renderbuffer when it is created */
+
+   /* Configuration cache with default values for all contexts */
+   driOptionCache optionCache;
+} radeonScreenRec, *radeonScreenPtr;
+/* Chip-class tests on a radeonScreenPtr; the argument is parenthesized so any pointer expression works. */
+#define IS_R100_CLASS(screen) \
+	(((screen)->chip_flags & RADEON_CLASS_MASK) == RADEON_CLASS_R100)
+#define IS_R200_CLASS(screen) \
+	(((screen)->chip_flags & RADEON_CLASS_MASK) == RADEON_CLASS_R200)
+#define IS_R300_CLASS(screen) \
+	(((screen)->chip_flags & RADEON_CLASS_MASK) == RADEON_CLASS_R300)
+
+#endif /* __RADEON_SCREEN_H__ */
diff --git a/radeon/radeon_span.c b/radeon/radeon_span.c
new file mode 100644
index 0000000..732a85e
--- /dev/null
+++ b/radeon/radeon_span.c
@@ -0,0 +1,322 @@
+/**************************************************************************
+
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Kevin E. Martin <martin@valinux.com>
+ *   Gareth Hughes <gareth@valinux.com>
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *
+ */
+
+#include "glheader.h"
+#include "swrast/swrast.h"
+
+#include "radeon_context.h"
+#include "radeon_ioctl.h"
+#include "radeon_state.h"
+#include "radeon_span.h"
+#include "radeon_tex.h"
+
+#include "drirenderbuffer.h"
+
+#define DBG 0
+
+/*
+ * Note that all information needed to access pixels in a renderbuffer
+ * should be obtained through the gl_renderbuffer parameter, not per-context
+ * information.
+ */
+#define LOCAL_VARS						\
+   driRenderbuffer *drb = (driRenderbuffer *) rb;		\
+   const __DRIdrawablePrivate *dPriv = drb->dPriv;		\
+   const GLuint bottom = dPriv->h - 1;				\
+   GLubyte *buf = (GLubyte *) drb->flippedData			\
+      + (dPriv->y * drb->flippedPitch + dPriv->x) * drb->cpp;	\
+   GLuint p;							\
+   (void) p;
+/* Depth/stencil variant: buf is the raw buffer base and xo/yo hold the drawable origin. */
+#define LOCAL_DEPTH_VARS				\
+   driRenderbuffer *drb = (driRenderbuffer *) rb;	\
+   const __DRIdrawablePrivate *dPriv = drb->dPriv;	\
+   const GLuint bottom = dPriv->h - 1;			\
+   GLuint xo = dPriv->x;				\
+   GLuint yo = dPriv->y;				\
+   GLubyte *buf = (GLubyte *) drb->Base.Data;
+
+#define LOCAL_STENCIL_VARS LOCAL_DEPTH_VARS
+/* Mirror a row index within the drawable; bottom is dPriv->h - 1 from the *_VARS macros. */
+#define Y_FLIP(Y) (bottom - (Y))
+/* Empty on purpose: locking is done once in radeonSpanRenderStart/Finish instead. */
+#define HW_LOCK()
+
+#define HW_UNLOCK()
+
+/* ================================================================
+ * Color buffer
+ */
+
+/* 16 bit, RGB565 color spanline and pixel functions
+ * (spantmp2.h is expanded using the TAG/GET_PTR macros defined here). */
+#define SPANTMP_PIXEL_FMT GL_RGB
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
+
+#define TAG(x)    radeon##x##_RGB565
+#define TAG2(x,y) radeon##x##_RGB565##y
+#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 2)	/* 2 bytes per pixel */
+#include "spantmp2.h"
+
+/* 32 bit, ARGB8888 color spanline and pixel functions
+ * (second expansion of spantmp2.h, with the _ARGB8888 name suffix). */
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
+
+#define TAG(x)    radeon##x##_ARGB8888
+#define TAG2(x,y) radeon##x##_ARGB8888##y
+#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 4)	/* 4 bytes per pixel */
+#include "spantmp2.h"
+
+/* ================================================================
+ * Depth buffer
+ */
+
+/* The Radeon family has depth tiling on all the time, so we have to convert
+ * the x,y coordinates into the memory bus address (mba) in the same
+ * manner as the engine.  In each case, the linear block address (ba)
+ * is calculated, and then wired with x and y to produce the final
+ * memory address.
+ * The chip will do address translation on its own if the surface registers
+ * are set up correctly. It is not quite enough to get it working with hyperz
+ * too...
+ */
+
+static GLuint radeon_mba_z32(const driRenderbuffer * drb, GLint x, GLint y)
+{
+	GLuint pitch = drb->pitch;
+	if (drb->depthHasSurface) {
+		return 4 * (x + y * pitch);	/* plain linear byte offset, 4 bytes per pixel */
+	} else {
+		GLuint ba, address = 0;	/* a[0..1] = 0           */
+
+#ifdef COMPILE_R300	/* R300 build: block address counted in 8x8 units */
+		ba = (y / 8) * (pitch / 8) + (x / 8);
+#else	/* otherwise: block address counted in 16x16 units */
+		ba = (y / 16) * (pitch / 16) + (x / 16);
+#endif
+
+		address |= (x & 0x7) << 2;	/* a[2..4] = x[0..2]     */
+		address |= (y & 0x3) << 5;	/* a[5..6] = y[0..1]     */
+		address |= (((x & 0x10) >> 2) ^ (y & 0x4)) << 5;	/* a[7]    = x[4] ^ y[2] */
+		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
+
+		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
+		address |= (((x & 0x8) << 1) ^ (y & 0x10)) << 7;	/* a[11]   = x[3] ^ y[4] */
+		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
+
+		return address;	/* byte offset from the start of the depth buffer */
+	}
+}
+/* 16-bit counterpart of radeon_mba_z32(): byte offset of depth pixel (x, y). */
+static INLINE GLuint
+radeon_mba_z16(const driRenderbuffer * drb, GLint x, GLint y)
+{
+	GLuint pitch = drb->pitch;
+	if (drb->depthHasSurface) {
+		return 2 * (x + y * pitch);	/* plain linear byte offset, 2 bytes per pixel */
+	} else {
+		GLuint ba, address = 0;	/* a[0]    = 0           */
+
+		ba = (y / 16) * (pitch / 32) + (x / 32);	/* block address counted in 32x16 units */
+
+		address |= (x & 0x7) << 1;	/* a[1..3] = x[0..2]     */
+		address |= (y & 0x7) << 4;	/* a[4..6] = y[0..2]     */
+		address |= (x & 0x8) << 4;	/* a[7]    = x[3]        */
+		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
+		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
+		address |= ((x & 0x10) ^ (y & 0x10)) << 7;	/* a[11]   = x[4] ^ y[4] */
+		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
+
+		return address;
+	}
+}
+
+/* 16-bit depth buffer functions: every access is addressed through
+ * radeon_mba_z16() with the drawable origin (xo, yo) added. */
+#define WRITE_DEPTH( _x, _y, d )					\
+   *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo )) = d;
+
+#define READ_DEPTH( d, _x, _y )						\
+   d = *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo ));
+
+#define TAG(x) radeon##x##_z16
+#include "depthtmp.h"	/* expanded with the _z16 name suffix and the macros above */
+
+/* 24 bit depth, 8 bit stencil depthbuffer functions
+ * Each write is a read-modify-write that leaves the stencil byte untouched.
+ * Careful: It looks like the R300 uses ZZZS byte order while the R200
+ * uses SZZZ for 24 bit depth, 8 bit stencil mode.
+ */
+#ifdef COMPILE_R300
+#define WRITE_DEPTH( _x, _y, d )					\
+do {									\
+   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   tmp &= 0x000000ff;							\
+   tmp |= (((d) << 8) & 0xffffff00);					\
+   *(GLuint *)(buf + offset) = tmp;					\
+} while (0)
+#else
+#define WRITE_DEPTH( _x, _y, d )					\
+do {									\
+   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   tmp &= 0xff000000;							\
+   tmp |= ((d) & 0x00ffffff);						\
+   *(GLuint *)(buf + offset) = tmp;					\
+} while (0)
+#endif
+
+#ifdef COMPILE_R300
+#define READ_DEPTH( d, _x, _y )						\
+  do { \
+    d = (*(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,		\
+					 _y + yo )) & 0xffffff00) >> 8; \
+  }while(0)
+#else
+#define READ_DEPTH( d, _x, _y )						\
+   d = *(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,			\
+					 _y + yo )) & 0x00ffffff;
+#endif
+
+#define TAG(x) radeon##x##_z24_s8
+#include "depthtmp.h"	/* expanded with the _z24_s8 name suffix and the macros above */
+
+/* ================================================================
+ * Stencil buffer
+ */
+
+/* 24 bit depth, 8 bit stencil depthbuffer functions: stencil is bits 0..7
+ * of the 32-bit word when COMPILE_R300 is defined, bits 24..31 otherwise. */
+#ifdef COMPILE_R300
+#define WRITE_STENCIL( _x, _y, d )					\
+do {									\
+   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   tmp &= 0xffffff00;							\
+   tmp |= (d) & 0xff;							\
+   *(GLuint *)(buf + offset) = tmp;					\
+} while (0)
+#else
+#define WRITE_STENCIL( _x, _y, d )					\
+do {									\
+   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   tmp &= 0x00ffffff;							\
+   tmp |= (((d) & 0xff) << 24);						\
+   *(GLuint *)(buf + offset) = tmp;					\
+} while (0)
+#endif
+
+#ifdef COMPILE_R300
+#define READ_STENCIL( d, _x, _y )					\
+do {									\
+   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   d = tmp & 0x000000ff;						\
+} while (0)
+#else
+#define READ_STENCIL( d, _x, _y )					\
+do {									\
+   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   d = (tmp & 0xff000000) >> 24;					\
+} while (0)
+#endif
+
+#define TAG(x) radeon##x##_z24_s8
+#include "stenciltmp.h"	/* expanded with the _z24_s8 name suffix and the macros above */
+
+/* Move locking out to get reasonable span performance (10x better
+ * than doing this in HW_LOCK above).  WaitForIdle() is the main
+ * culprit.
+ */
+
+static void radeonSpanRenderStart(GLcontext * ctx)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+#ifdef COMPILE_R300
+	r300ContextPtr r300 = (r300ContextPtr) rmesa;
+	R300_FIREVERTICES(r300);
+#else
+	RADEON_FIREVERTICES(rmesa);
+#endif
+	LOCK_HARDWARE(rmesa);	/* released again in radeonSpanRenderFinish() */
+	radeonWaitForIdleLocked(rmesa);	/* called with the lock held, before spans are touched */
+}
+/* Counterpart of radeonSpanRenderStart(): flush swrast, then drop the hardware lock. */
+static void radeonSpanRenderFinish(GLcontext * ctx)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	_swrast_flush(ctx);	/* runs before the unlock below */
+	UNLOCK_HARDWARE(rmesa);
+}
+/* Install the span start/finish hooks in the swrast device-driver table. */
+void radeonInitSpanFuncs(GLcontext * ctx)
+{
+	struct swrast_device_driver *hooks = _swrast_GetDeviceDriverReference(ctx);
+
+	hooks->SpanRenderFinish = radeonSpanRenderFinish;
+	hooks->SpanRenderStart = radeonSpanRenderStart;
+}
+/** Plug in the Get/Put routines that match the driRenderbuffer's format. */
+void radeonSetSpanFunctions(driRenderbuffer * drb, const GLvisual * vis)
+{
+	switch (drb->Base.InternalFormat) {
+	case GL_RGBA:
+		if (vis->redBits != 5 || vis->greenBits != 6 || vis->blueBits != 5)
+			radeonInitPointers_ARGB8888(&drb->Base);
+		else
+			radeonInitPointers_RGB565(&drb->Base);
+		break;
+	case GL_DEPTH_COMPONENT16:
+		radeonInitDepthPointers_z16(&drb->Base);
+		break;
+	case GL_DEPTH_COMPONENT24:
+		radeonInitDepthPointers_z24_s8(&drb->Base);
+		break;
+	case GL_STENCIL_INDEX8_EXT:
+		radeonInitStencilPointers_z24_s8(&drb->Base);
+		break;
+	}
+}
diff --git a/radeon/radeon_span.h b/radeon/radeon_span.h
new file mode 100644
index 0000000..9abe086
--- /dev/null
+++ b/radeon/radeon_span.h
@@ -0,0 +1,50 @@
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Gareth Hughes <gareth@valinux.com>
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *   Kevin E. Martin <martin@valinux.com>
+ */
+
+#ifndef __RADEON_SPAN_H__
+#define __RADEON_SPAN_H__
+
+#include "drirenderbuffer.h"
+/* Span entry points implemented in radeon_span.c. */
+extern void radeonInitSpanFuncs(GLcontext * ctx);
+extern void radeonSetSpanFunctions(driRenderbuffer * rb, const GLvisual * vis);
+
+#endif
diff --git a/radeon/radeon_state.c b/radeon/radeon_state.c
new file mode 100644
index 0000000..4de05c7
--- /dev/null
+++ b/radeon/radeon_state.c
@@ -0,0 +1,2409 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_state.c,v 1.8 2002/12/16 16:18:58 dawes Exp $ */
+/**************************************************************************
+
+Copyright 2000, 2001 VA Linux Systems Inc., Fremont, California.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Gareth Hughes <gareth@valinux.com>
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "api_arrayelt.h"
+#include "enums.h"
+#include "light.h"
+#include "state.h"
+#include "context.h"
+#include "framebuffer.h"
+
+#include "vbo/vbo.h"
+#include "tnl/tnl.h"
+#include "tnl/t_pipeline.h"
+#include "swrast_setup/swrast_setup.h"
+
+#include "radeon_context.h"
+#include "radeon_ioctl.h"
+#include "radeon_state.h"
+#include "radeon_tcl.h"
+#include "radeon_tex.h"
+#include "radeon_swtcl.h"
+#include "drirenderbuffer.h"
+
+static void radeonUpdateSpecular( GLcontext *ctx );
+
+/* =============================================================
+ * Alpha blending
+ */
+/* Store the alpha-test comparison and the 8-bit reference value in the CTX_PP_MISC word. */
+static void radeonAlphaFunc( GLcontext *ctx, GLenum func, GLfloat ref )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   int pp_misc = rmesa->hw.ctx.cmd[CTX_PP_MISC];
+   GLubyte refByte;
+
+   CLAMPED_FLOAT_TO_UBYTE(refByte, ref);
+
+   RADEON_STATECHANGE( rmesa, ctx );
+
+   pp_misc &= ~(RADEON_ALPHA_TEST_OP_MASK | RADEON_REF_ALPHA_MASK);	/* clear the old op and reference */
+   pp_misc |= (refByte & RADEON_REF_ALPHA_MASK);
+
+   switch ( func ) {	/* no default: an unlisted func leaves the op bits cleared */
+   case GL_NEVER:
+      pp_misc |= RADEON_ALPHA_TEST_FAIL;
+      break;
+   case GL_LESS:
+      pp_misc |= RADEON_ALPHA_TEST_LESS;
+      break;
+   case GL_EQUAL:
+      pp_misc |= RADEON_ALPHA_TEST_EQUAL;
+      break;
+   case GL_LEQUAL:
+      pp_misc |= RADEON_ALPHA_TEST_LEQUAL;
+      break;
+   case GL_GREATER:
+      pp_misc |= RADEON_ALPHA_TEST_GREATER;
+      break;
+   case GL_NOTEQUAL:
+      pp_misc |= RADEON_ALPHA_TEST_NEQUAL;
+      break;
+   case GL_GEQUAL:
+      pp_misc |= RADEON_ALPHA_TEST_GEQUAL;
+      break;
+   case GL_ALWAYS:
+      pp_misc |= RADEON_ALPHA_TEST_PASS;
+      break;
+   }
+
+   rmesa->hw.ctx.cmd[CTX_PP_MISC] = pp_misc;
+}
+/* Set the blend combine function; equations with no mapping here fall back while blending is on. */
+static void radeonBlendEquationSeparate( GLcontext *ctx,
+					 GLenum modeRGB, GLenum modeA )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLuint b = rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] & ~RADEON_COMB_FCN_MASK;
+   GLboolean fallback = GL_FALSE;
+
+   assert( modeRGB == modeA );	/* only modeRGB is examined below */
+
+   switch ( modeRGB ) {
+   case GL_FUNC_ADD:
+   case GL_LOGIC_OP:
+      b |= RADEON_COMB_FCN_ADD_CLAMP;
+      break;
+
+   case GL_FUNC_SUBTRACT:
+      b |= RADEON_COMB_FCN_SUB_CLAMP;
+      break;
+
+   default:
+      if (ctx->Color.BlendEnabled)
+	 fallback = GL_TRUE;
+      else
+	 b |= RADEON_COMB_FCN_ADD_CLAMP;	/* blending off: program the ADD_CLAMP default */
+      break;
+   }
+
+   FALLBACK( rmesa, RADEON_FALLBACK_BLEND_EQ, fallback );
+   if ( !fallback ) {
+      RADEON_STATECHANGE( rmesa, ctx );
+      rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] = b;
+      if ( (ctx->Color.ColorLogicOpEnabled || (ctx->Color.BlendEnabled
+	    && ctx->Color.BlendEquationRGB == GL_LOGIC_OP)) ) {
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  RADEON_ROP_ENABLE;
+      } else {
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~RADEON_ROP_ENABLE;
+      }
+   }
+}
+/* Translate ctx->Color.BlendSrcRGB/BlendDstRGB into BLENDCNTL bits; the factor arguments are not read. */
+static void radeonBlendFuncSeparate( GLcontext *ctx,
+				     GLenum sfactorRGB, GLenum dfactorRGB,
+				     GLenum sfactorA, GLenum dfactorA )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLuint b = rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] & 
+      ~(RADEON_SRC_BLEND_MASK | RADEON_DST_BLEND_MASK);
+   GLboolean fallback = GL_FALSE;
+
+   switch ( ctx->Color.BlendSrcRGB ) {
+   case GL_ZERO:
+      b |= RADEON_SRC_BLEND_GL_ZERO;
+      break;
+   case GL_ONE:
+      b |= RADEON_SRC_BLEND_GL_ONE;
+      break;
+   case GL_DST_COLOR:
+      b |= RADEON_SRC_BLEND_GL_DST_COLOR;
+      break;
+   case GL_ONE_MINUS_DST_COLOR:
+      b |= RADEON_SRC_BLEND_GL_ONE_MINUS_DST_COLOR;
+      break;
+   case GL_SRC_COLOR:
+      b |= RADEON_SRC_BLEND_GL_SRC_COLOR;
+      break;
+   case GL_ONE_MINUS_SRC_COLOR:
+      b |= RADEON_SRC_BLEND_GL_ONE_MINUS_SRC_COLOR;
+      break;
+   case GL_SRC_ALPHA:
+      b |= RADEON_SRC_BLEND_GL_SRC_ALPHA;
+      break;
+   case GL_ONE_MINUS_SRC_ALPHA:
+      b |= RADEON_SRC_BLEND_GL_ONE_MINUS_SRC_ALPHA;
+      break;
+   case GL_DST_ALPHA:
+      b |= RADEON_SRC_BLEND_GL_DST_ALPHA;
+      break;
+   case GL_ONE_MINUS_DST_ALPHA:
+      b |= RADEON_SRC_BLEND_GL_ONE_MINUS_DST_ALPHA;
+      break;
+   case GL_SRC_ALPHA_SATURATE:
+      b |= RADEON_SRC_BLEND_GL_SRC_ALPHA_SATURATE;
+      break;
+   case GL_CONSTANT_COLOR:
+   case GL_ONE_MINUS_CONSTANT_COLOR:
+   case GL_CONSTANT_ALPHA:
+   case GL_ONE_MINUS_CONSTANT_ALPHA:
+      if (ctx->Color.BlendEnabled)
+	 fallback = GL_TRUE;	/* constant-colour factors have no mapping here */
+      else
+	 b |= RADEON_SRC_BLEND_GL_ONE;
+      break;
+   default:
+      break;
+   }
+
+   switch ( ctx->Color.BlendDstRGB ) {
+   case GL_ZERO:
+      b |= RADEON_DST_BLEND_GL_ZERO;
+      break;
+   case GL_ONE:
+      b |= RADEON_DST_BLEND_GL_ONE;
+      break;
+   case GL_SRC_COLOR:
+      b |= RADEON_DST_BLEND_GL_SRC_COLOR;
+      break;
+   case GL_ONE_MINUS_SRC_COLOR:
+      b |= RADEON_DST_BLEND_GL_ONE_MINUS_SRC_COLOR;
+      break;
+   case GL_SRC_ALPHA:
+      b |= RADEON_DST_BLEND_GL_SRC_ALPHA;
+      break;
+   case GL_ONE_MINUS_SRC_ALPHA:
+      b |= RADEON_DST_BLEND_GL_ONE_MINUS_SRC_ALPHA;
+      break;
+   case GL_DST_COLOR:
+      b |= RADEON_DST_BLEND_GL_DST_COLOR;
+      break;
+   case GL_ONE_MINUS_DST_COLOR:
+      b |= RADEON_DST_BLEND_GL_ONE_MINUS_DST_COLOR;
+      break;
+   case GL_DST_ALPHA:
+      b |= RADEON_DST_BLEND_GL_DST_ALPHA;
+      break;
+   case GL_ONE_MINUS_DST_ALPHA:
+      b |= RADEON_DST_BLEND_GL_ONE_MINUS_DST_ALPHA;
+      break;
+   case GL_CONSTANT_COLOR:
+   case GL_ONE_MINUS_CONSTANT_COLOR:
+   case GL_CONSTANT_ALPHA:
+   case GL_ONE_MINUS_CONSTANT_ALPHA:
+      if (ctx->Color.BlendEnabled)
+	 fallback = GL_TRUE;
+      else
+	 b |= RADEON_DST_BLEND_GL_ZERO;
+      break;
+   default:
+      break;
+   }
+
+   FALLBACK( rmesa, RADEON_FALLBACK_BLEND_FUNC, fallback );
+   if ( !fallback ) {	/* the register is only rewritten when no fallback is needed */
+      RADEON_STATECHANGE( rmesa, ctx );
+      rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] = b;
+   }
+}
+
+
+/* =============================================================
+ * Depth testing
+ */
+/* Set the Z-test bits from ctx->Depth.Func; the func argument itself is not read. */
+static void radeonDepthFunc( GLcontext *ctx, GLenum func )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+
+   RADEON_STATECHANGE( rmesa, ctx );
+   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] &= ~RADEON_Z_TEST_MASK;	/* clear the previous test */
+
+   switch ( ctx->Depth.Func ) {	/* no default: an unlisted value leaves the test bits cleared */
+   case GL_NEVER:
+      rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_TEST_NEVER;
+      break;
+   case GL_LESS:
+      rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_TEST_LESS;
+      break;
+   case GL_EQUAL:
+      rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_TEST_EQUAL;
+      break;
+   case GL_LEQUAL:
+      rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_TEST_LEQUAL;
+      break;
+   case GL_GREATER:
+      rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_TEST_GREATER;
+      break;
+   case GL_NOTEQUAL:
+      rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_TEST_NEQUAL;
+      break;
+   case GL_GEQUAL:
+      rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_TEST_GEQUAL;
+      break;
+   case GL_ALWAYS:
+      rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_TEST_ALWAYS;
+      break;
+   }
+}
+
+
+/* Mirror ctx->Depth.Mask into the hardware Z write enable bit.
+ * The flag argument is not consulted.
+ */
+static void radeonDepthMask( GLcontext *ctx, GLboolean flag )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+
+   RADEON_STATECHANGE( rmesa, ctx );
+
+   if ( !ctx->Depth.Mask ) {
+      rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] &= ~RADEON_Z_WRITE_ENABLE;
+      return;
+   }
+
+   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_WRITE_ENABLE;
+}
+
+/* Convert the clear depth d into the integer range of the depth format
+ * currently programmed in RB3D_ZSTENCILCNTL and cache it in
+ * state.depth.clear.  Any other format leaves the cached value untouched.
+ */
+static void radeonClearDepth( GLcontext *ctx, GLclampd d )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   const GLuint zfmt = (rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] &
+                        RADEON_DEPTH_FORMAT_MASK);
+
+   if ( zfmt == RADEON_DEPTH_FORMAT_16BIT_INT_Z ) {
+      rmesa->state.depth.clear = d * 0x0000ffff;
+   }
+   else if ( zfmt == RADEON_DEPTH_FORMAT_24BIT_INT_Z ) {
+      rmesa->state.depth.clear = d * 0x00ffffff;
+   }
+}
+
+
+/* =============================================================
+ * Fog
+ */
+
+
+/* Refresh the hardware fog state selected by pname.
+ *
+ * The param argument is never read: every value is taken from ctx->Fog.
+ *
+ *   GL_FOG_MODE                - reprogram the TCL fog mode bits, then
+ *                                fall through to recompute C and D.
+ *   GL_FOG_DENSITY/START/END   - recompute the FOG_C / FOG_D words.
+ *   GL_FOG_COLOR               - repack the fog colour into PP_FOG_COLOR.
+ *   GL_FOG_COORD_SRC           - handled by radeonUpdateSpecular().
+ *
+ * The mode and C/D paths do nothing while ctx->Fog.Enabled is false.
+ */
+static void radeonFogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   /* c and d are computed as floats but loaded, compared and stored
+    * through the int member, i.e. as the raw bits of the command words.
+    */
+   union { int i; float f; } c, d;
+   GLchan col[4];
+
+   switch (pname) {
+   case GL_FOG_MODE:
+      if (!ctx->Fog.Enabled)
+	 return;
+      RADEON_STATECHANGE(rmesa, tcl);
+      rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~RADEON_TCL_FOG_MASK;
+      switch (ctx->Fog.Mode) {
+      case GL_LINEAR:
+	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= RADEON_TCL_FOG_LINEAR;
+	 break;
+      case GL_EXP:
+	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= RADEON_TCL_FOG_EXP;
+	 break;
+      case GL_EXP2:
+	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= RADEON_TCL_FOG_EXP2;
+	 break;
+      default:
+	 /* Unknown mode: the fog mode bits stay cleared and C/D are skipped. */
+	 return;
+      }
+   /* fallthrough - a mode change also needs new C and D values */
+   case GL_FOG_DENSITY:
+   case GL_FOG_START:
+   case GL_FOG_END:
+      if (!ctx->Fog.Enabled)
+	 return;
+      /* Start from the current words so an unhandled mode changes nothing. */
+      c.i = rmesa->hw.fog.cmd[FOG_C];
+      d.i = rmesa->hw.fog.cmd[FOG_D];
+      switch (ctx->Fog.Mode) {
+      case GL_EXP:
+	 c.f = 0.0;
+	 /* While this is the opposite sign from the DDK, it makes the fog test
+	  * pass, and matches r200.
+	  */
+	 d.f = -ctx->Fog.Density;
+	 break;
+      case GL_EXP2:
+	 c.f = 0.0;
+	 d.f = -(ctx->Fog.Density * ctx->Fog.Density);
+	 break;
+      case GL_LINEAR:
+	 if (ctx->Fog.Start == ctx->Fog.End) {
+	    /* Degenerate range: avoid dividing by zero below. */
+	    c.f = 1.0F;
+	    d.f = 1.0F;
+	 } else {
+	    c.f = ctx->Fog.End/(ctx->Fog.End-ctx->Fog.Start);
+	    /* While this is the opposite sign from the DDK, it makes the fog
+	     * test pass, and matches r200.
+	     */
+	    d.f = -1.0/(ctx->Fog.End-ctx->Fog.Start);
+	 }
+	 break;
+      default:
+	 break;
+      }
+      /* Only flag the fog atom when one of the two words really changed. */
+      if (c.i != rmesa->hw.fog.cmd[FOG_C] || d.i != rmesa->hw.fog.cmd[FOG_D]) {
+	 RADEON_STATECHANGE( rmesa, fog );
+	 rmesa->hw.fog.cmd[FOG_C] = c.i;
+	 rmesa->hw.fog.cmd[FOG_D] = d.i;
+      }
+      break;
+   case GL_FOG_COLOR: 
+      RADEON_STATECHANGE( rmesa, ctx );
+      UNCLAMPED_FLOAT_TO_RGB_CHAN( col, ctx->Fog.Color );
+      rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] &= ~RADEON_FOG_COLOR_MASK;
+      /* Packed with a fixed cpp of 4 and a zero alpha component. */
+      rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] |=
+	 radeonPackColor( 4, col[0], col[1], col[2], 0 );
+      break;
+   case GL_FOG_COORD_SRC:
+      radeonUpdateSpecular( ctx );
+      break;
+   default:
+      return;
+   }
+}
+
+
+/* =============================================================
+ * Scissoring
+ */
+
+
+/* Store the intersection of cliprects a and b in *out.
+ * Returns GL_FALSE when the result has no area (x1 >= x2 or y1 >= y2),
+ * GL_TRUE otherwise.  *out is written in either case.
+ */
+static GLboolean intersect_rect( drm_clip_rect_t *out,
+				 drm_clip_rect_t *a,
+				 drm_clip_rect_t *b )
+{
+   /* Begin with a, then pull in every edge where b is the tighter one. */
+   *out = *a;
+   out->x1 = ( b->x1 > out->x1 ) ? b->x1 : out->x1;
+   out->y1 = ( b->y1 > out->y1 ) ? b->y1 : out->y1;
+   out->x2 = ( b->x2 < out->x2 ) ? b->x2 : out->x2;
+   out->y2 = ( b->y2 < out->y2 ) ? b->y2 : out->y2;
+
+   return ( out->x1 < out->x2 && out->y1 < out->y2 ) ? GL_TRUE : GL_FALSE;
+}
+
+
+/* Rebuild state.scissor.pClipRects as the intersection of every window
+ * cliprect (rmesa->pClipRects) with the current scissor rectangle
+ * (state.scissor.rect), and set state.scissor.numClipRects to the number
+ * of non-empty results.
+ *
+ * The store is grown on demand.  If the allocation fails the scissor
+ * cliprect list is left empty (NULL pointer, zero allocated, zero used)
+ * so that no consumer can index a NULL array with a stale count.
+ */
+void radeonRecalcScissorRects( radeonContextPtr rmesa )
+{
+   drm_clip_rect_t *out;
+   int i;
+
+   /* Grow cliprect store?
+    */
+   if (rmesa->state.scissor.numAllocedClipRects < rmesa->numClipRects) {
+      while (rmesa->state.scissor.numAllocedClipRects < rmesa->numClipRects) {
+	 rmesa->state.scissor.numAllocedClipRects += 1;	/* zero case */
+	 rmesa->state.scissor.numAllocedClipRects *= 2;
+      }
+
+      /* Old contents are not needed: everything is recomputed below. */
+      if (rmesa->state.scissor.pClipRects)
+	 FREE(rmesa->state.scissor.pClipRects);
+
+      rmesa->state.scissor.pClipRects = 
+	 MALLOC( rmesa->state.scissor.numAllocedClipRects * 
+		 sizeof(drm_clip_rect_t) );
+
+      if ( rmesa->state.scissor.pClipRects == NULL ) {
+	 rmesa->state.scissor.numAllocedClipRects = 0;
+	 /* The previous count described the buffer that was just freed. */
+	 rmesa->state.scissor.numClipRects = 0;
+	 return;
+      }
+   }
+   
+   out = rmesa->state.scissor.pClipRects;
+   rmesa->state.scissor.numClipRects = 0;
+
+   /* intersect_rect() writes *out even for an empty result; the slot is
+    * simply reused by the next iteration in that case.
+    */
+   for ( i = 0 ; i < rmesa->numClipRects ;  i++ ) {
+      if ( intersect_rect( out, 
+			   &rmesa->pClipRects[i], 
+			   &rmesa->state.scissor.rect ) ) {
+	 rmesa->state.scissor.numClipRects++;
+	 out++;
+      }
+   }
+}
+
+
+/* Translate ctx->Scissor into a screen-space rectangle, flipping Y
+ * against the drawable height and offsetting by the drawable position,
+ * then recompute the scissored cliprects.  Does nothing without a
+ * drawable.
+ */
+static void radeonUpdateScissor( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+   int left, top, right, bottom;
+
+   if ( !dPriv )
+      return;
+
+   /* Inclusive edges in drawable-relative coordinates. */
+   left   = ctx->Scissor.X;
+   top    = dPriv->h - ctx->Scissor.Y - ctx->Scissor.Height;
+   right  = ctx->Scissor.X + ctx->Scissor.Width - 1;
+   bottom = dPriv->h - ctx->Scissor.Y - 1;
+
+   /* x2/y2 are stored one past the inclusive edge. */
+   rmesa->state.scissor.rect.x1 = left + dPriv->x;
+   rmesa->state.scissor.rect.y1 = top + dPriv->y;
+   rmesa->state.scissor.rect.x2 = right + dPriv->x + 1;
+   rmesa->state.scissor.rect.y2 = bottom + dPriv->y + 1;
+
+   radeonRecalcScissorRects( rmesa );
+}
+
+
+/* Scissor box changed.  The x/y/w/h arguments are unused; when scissoring
+ * is enabled the rectangle is re-derived from ctx->Scissor.
+ */
+static void radeonScissor( GLcontext *ctx,
+			   GLint x, GLint y, GLsizei w, GLsizei h )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+
+   if ( !ctx->Scissor.Enabled )
+      return;
+
+   RADEON_FIREVERTICES( rmesa );	/* don't pipeline cliprect changes */
+   radeonUpdateScissor( ctx );
+}
+
+
+/* =============================================================
+ * Culling
+ */
+
+/* Derive the face-culling bits of SE_CNTL and TCL_UCP_VERT_BLEND_CTL from
+ * ctx->Polygon.CullFlag / CullFaceMode.  The enum argument is unused.
+ * Each atom is only flagged when its word actually changes.
+ */
+static void radeonCullFace( GLcontext *ctx, GLenum unused )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   /* Default: both faces drawn solid, nothing culled in TCL. */
+   GLuint se_cntl = rmesa->hw.set.cmd[SET_SE_CNTL] |
+                    RADEON_FFACE_SOLID | RADEON_BFACE_SOLID;
+   GLuint tcl_ctl = rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &
+                    ~(RADEON_CULL_FRONT | RADEON_CULL_BACK);
+
+   if ( ctx->Polygon.CullFlag ) {
+      const GLenum which = ctx->Polygon.CullFaceMode;
+
+      if ( which == GL_FRONT || which == GL_FRONT_AND_BACK ) {
+         se_cntl &= ~RADEON_FFACE_SOLID;
+         tcl_ctl |= RADEON_CULL_FRONT;
+      }
+      if ( which == GL_BACK || which == GL_FRONT_AND_BACK ) {
+         se_cntl &= ~RADEON_BFACE_SOLID;
+         tcl_ctl |= RADEON_CULL_BACK;
+      }
+   }
+
+   if ( se_cntl != rmesa->hw.set.cmd[SET_SE_CNTL] ) {
+      RADEON_STATECHANGE(rmesa, set );
+      rmesa->hw.set.cmd[SET_SE_CNTL] = se_cntl;
+   }
+
+   if ( tcl_ctl != rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] ) {
+      RADEON_STATECHANGE(rmesa, tcl );
+      rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] = tcl_ctl;
+   }
+}
+
+/* Set the front-face winding in both SE_CNTL and the TCL control word.
+ * Both atoms are flagged unconditionally.  A mode other than GL_CW or
+ * GL_CCW leaves the direction bits cleared.
+ */
+static void radeonFrontFace( GLcontext *ctx, GLenum mode )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+
+   RADEON_STATECHANGE( rmesa, set );
+   rmesa->hw.set.cmd[SET_SE_CNTL] &= ~RADEON_FFACE_CULL_DIR_MASK;
+
+   RADEON_STATECHANGE( rmesa, tcl );
+   rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~RADEON_CULL_FRONT_IS_CCW;
+
+   if ( mode == GL_CW ) {
+      rmesa->hw.set.cmd[SET_SE_CNTL] |= RADEON_FFACE_CULL_CW;
+   }
+   else if ( mode == GL_CCW ) {
+      rmesa->hw.set.cmd[SET_SE_CNTL] |= RADEON_FFACE_CULL_CCW;
+      rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= RADEON_CULL_FRONT_IS_CCW;
+   }
+}
+
+
+/* =============================================================
+ * Line state
+ */
+/* Program the line width register and toggle wide-line rendering, which
+ * is enabled only for widths strictly greater than 1.0.
+ */
+static void radeonLineWidth( GLcontext *ctx, GLfloat widthf )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+
+   RADEON_STATECHANGE( rmesa, lin );
+   RADEON_STATECHANGE( rmesa, set );
+
+   /* Line width is stored in U6.4 format, hence the scale by 16. */
+   rmesa->hw.lin.cmd[LIN_SE_LINE_WIDTH] = (GLuint)(widthf * 16.0);
+
+   if ( !(widthf > 1.0) ) {
+      rmesa->hw.set.cmd[SET_SE_CNTL] &= ~RADEON_WIDELINE_ENABLE;
+      return;
+   }
+
+   rmesa->hw.set.cmd[SET_SE_CNTL] |= RADEON_WIDELINE_ENABLE;
+}
+
+/* Pack the line stipple into RE_LINE_PATTERN: the low 8 bits of factor
+ * go to bits 16..23, the 16-bit pattern occupies the low half.
+ */
+static void radeonLineStipple( GLcontext *ctx, GLint factor, GLushort pattern )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   const GLuint repeat_bits  = ((GLuint)factor & 0xff) << 16;
+   const GLuint pattern_bits = (GLuint)pattern;
+
+   RADEON_STATECHANGE( rmesa, lin );
+   rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] = (repeat_bits | pattern_bits);
+}
+
+
+/* =============================================================
+ * Masks
+ */
+/* Rebuild the RB3D plane mask from ctx->Color.ColorMask, packed for the
+ * screen's bytes-per-pixel.  The r/g/b/a arguments are not consulted.
+ * The msk atom is only flagged when the packed value differs.
+ */
+static void radeonColorMask( GLcontext *ctx,
+			     GLboolean r, GLboolean g,
+			     GLboolean b, GLboolean a )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLuint planemask;
+
+   planemask = radeonPackColor( rmesa->radeonScreen->cpp,
+                                ctx->Color.ColorMask[RCOMP],
+                                ctx->Color.ColorMask[GCOMP],
+                                ctx->Color.ColorMask[BCOMP],
+                                ctx->Color.ColorMask[ACOMP] );
+
+   if ( rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK] == planemask )
+      return;
+
+   RADEON_STATECHANGE( rmesa, msk );
+   rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK] = planemask;
+}
+
+
+/* =============================================================
+ * Polygon state
+ */
+
+/* Load the polygon offset registers.  factor is passed through as-is;
+ * units is first multiplied by state.depth.scale.  Both are written as
+ * the 32-bit patterns of their float values.
+ */
+static void radeonPolygonOffset( GLcontext *ctx,
+				 GLfloat factor, GLfloat units )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   float_ui32_type slope = { factor };
+   float_ui32_type bias  = { units * rmesa->state.depth.scale };
+
+   RADEON_STATECHANGE( rmesa, zbs );
+   rmesa->hw.zbs.cmd[ZBS_SE_ZBIAS_CONSTANT] = bias.ui32;
+   rmesa->hw.zbs.cmd[ZBS_SE_ZBIAS_FACTOR]   = slope.ui32;
+}
+
+/* Cache the 32x32 polygon stipple, flipped vertically, in
+ * state.stipple.mask and send it to the kernel with the
+ * DRM_RADEON_STIPPLE command while holding the hardware lock.
+ */
+static void radeonPolygonStipple( GLcontext *ctx, const GLubyte *mask )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   drm_radeon_stipple_t stipple;
+   GLuint src, dst;
+
+   /* Must flip pattern upside down: source row 0 lands in row 31. */
+   for ( src = 0, dst = 31 ; src < 32 ; src++, dst-- ) {
+      rmesa->state.stipple.mask[dst] = ((GLuint *) mask)[src];
+   }
+
+   /* TODO: push this into cmd mechanism
+    */
+   RADEON_FIREVERTICES( rmesa );
+   LOCK_HARDWARE( rmesa );
+
+   /* FIXME: Use window x,y offsets into stipple RAM.
+    */
+   stipple.mask = rmesa->state.stipple.mask;
+   drmCommandWrite( rmesa->dri.fd, DRM_RADEON_STIPPLE, 
+                    &stipple, sizeof(drm_radeon_stipple_t) );
+
+   UNLOCK_HARDWARE( rmesa );
+}
+
+/* Raise or clear the TCL "unfilled" fallback according to
+ * DD_TRI_UNFILLED in ctx->_TriangleCaps.  The face and mode arguments
+ * are not consulted.  When any TCL fallback is active afterwards the
+ * render and vertex state are re-chosen.
+ */
+static void radeonPolygonMode( GLcontext *ctx, GLenum face, GLenum mode )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   const GLboolean unfilled = (ctx->_TriangleCaps & DD_TRI_UNFILLED) != 0;
+
+   /* Can't generally do unfilled via tcl, but some good special
+    * cases work.
+    */
+   TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_UNFILLED, unfilled);
+
+   if (!rmesa->TclFallback)
+      return;
+
+   radeonChooseRenderState( ctx );
+   radeonChooseVertexState( ctx );
+}
+
+
+/* =============================================================
+ * Rendering attributes
+ *
+ * We really don't want to recalculate all this every time we bind a
+ * texture.  These things shouldn't change all that often, so it makes
+ * sense to break them out of the core texture state update routines.
+ */
+
+/* Examine lighting and texture state to determine if separate specular
+ * should be enabled.
+ */
+/* Recompute, from ctx->Light and ctx->Fog, the TCL output select/format
+ * bits, the TCL lighting enable and diffuse/specular combine bits, and
+ * the PP_CNTL specular enable.  Also raises or clears the
+ * RADEON_TCL_FALLBACK_FOGCOORDSPEC fallback and, if any TCL fallback is
+ * active, re-chooses the render and vertex state.
+ *
+ * The tcl atom is flagged unconditionally; the ctx atom only when PP_CNTL
+ * changes.
+ */
+static void radeonUpdateSpecular( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   u_int32_t p = rmesa->hw.ctx.cmd[CTX_PP_CNTL];
+   GLuint flag = 0;
+
+   RADEON_STATECHANGE( rmesa, tcl );
+
+   /* Start from a clean slate: nothing computed or packed, lighting off,
+    * no separate specular, diffuse and specular combined.
+    */
+   rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] &= ~RADEON_TCL_COMPUTE_SPECULAR;
+   rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] &= ~RADEON_TCL_COMPUTE_DIFFUSE;
+   rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] &= ~RADEON_TCL_VTX_PK_SPEC;
+   rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] &= ~RADEON_TCL_VTX_PK_DIFFUSE;
+   rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &= ~RADEON_LIGHTING_ENABLE;
+
+   p &= ~RADEON_SPECULAR_ENABLE;
+
+   rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] |= RADEON_DIFFUSE_SPECULAR_COMBINE;
+
+
+   /* Exactly one of the four branches below applies:
+    *   lighting + separate specular, lighting only, color sum only,
+    *   or neither (plain diffuse colour).
+    */
+   if (ctx->Light.Enabled &&
+       ctx->Light.Model.ColorControl == GL_SEPARATE_SPECULAR_COLOR) {
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] |= RADEON_TCL_COMPUTE_SPECULAR;
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] |= RADEON_TCL_COMPUTE_DIFFUSE;
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_PK_SPEC;
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_PK_DIFFUSE;
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] |= RADEON_LIGHTING_ENABLE;
+      p |=  RADEON_SPECULAR_ENABLE;
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &= 
+	 ~RADEON_DIFFUSE_SPECULAR_COMBINE;
+   }
+   else if (ctx->Light.Enabled) {
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] |= RADEON_TCL_COMPUTE_DIFFUSE;
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_PK_DIFFUSE;
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] |= RADEON_LIGHTING_ENABLE;
+   } else if (ctx->Fog.ColorSumEnabled ) {
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_PK_SPEC;
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_PK_DIFFUSE;
+      p |= RADEON_SPECULAR_ENABLE;
+   } else {
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_PK_DIFFUSE;
+   }
+
+   /* Fog always needs the specular output packed, whatever was chosen
+    * above.
+    */
+   if (ctx->Fog.Enabled) {
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_PK_SPEC;
+      if (ctx->Fog.FogCoordinateSource == GL_FRAGMENT_DEPTH) {
+	 rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] |= RADEON_TCL_COMPUTE_SPECULAR;
+      /* Bizarre: have to leave lighting enabled to get fog. */
+	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] |= RADEON_LIGHTING_ENABLE;
+      }
+      else {
+      /* cannot do tcl fog factor calculation with fog coord source
+       * (send precomputed factors). Cannot use precomputed fog
+       * factors together with tcl spec light (need tcl fallback) */
+	 flag = (rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] &
+	    RADEON_TCL_COMPUTE_SPECULAR) != 0;
+      }
+   }
+ 
+   TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_FOGCOORDSPEC, flag);
+
+   /* Sanity check: the PP specular enable computed above must agree with
+    * NEED_SECONDARY_COLOR().
+    */
+   if (NEED_SECONDARY_COLOR(ctx)) {
+      assert( (p & RADEON_SPECULAR_ENABLE) != 0 );
+   } else {
+      assert( (p & RADEON_SPECULAR_ENABLE) == 0 );
+   }
+
+   if ( rmesa->hw.ctx.cmd[CTX_PP_CNTL] != p ) {
+      RADEON_STATECHANGE( rmesa, ctx );
+      rmesa->hw.ctx.cmd[CTX_PP_CNTL] = p;
+   }
+
+   /* Update vertex/render formats
+    */
+   if (rmesa->TclFallback) { 
+      radeonChooseRenderState( ctx );
+      radeonChooseVertexState( ctx );
+   }
+}
+
+
+/* =============================================================
+ * Materials
+ */
+
+
+/* Update on colormaterial, material emissive/ambient,
+ * lightmodel.globalambient
+ */
+static void update_global_ambient( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   float *fcmd = (float *)RADEON_DB_STATE( glt );
+   const GLuint source_bits = ((3 << RADEON_EMISSIVE_SOURCE_SHIFT) |
+                               (3 << RADEON_AMBIENT_SOURCE_SHIFT));
+
+   if ((rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] & source_bits) != 0) {
+      /* Either source field is non-zero: load the scene ambient alone. */
+      COPY_3V( &fcmd[GLT_RED], ctx->Light.Model.Ambient );
+   }
+   else {
+      /* Need to do more if both emissive & ambient are PREMULT:
+       * Hope this is not needed for MULT.
+       * Result is emission + scene ambient * material ambient.
+       */
+      COPY_3V( &fcmd[GLT_RED],
+               ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_EMISSION]);
+      ACC_SCALE_3V( &fcmd[GLT_RED],
+                    ctx->Light.Model.Ambient,
+                    ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_AMBIENT]);
+   }
+
+   RADEON_DB_STATECHANGE(rmesa, &rmesa->hw.glt);
+}
+
+/* Update on change to 
+ *    - light[p].colors
+ *    - light[p].enabled
+ */
+static void update_light_colors( GLcontext *ctx, GLuint p )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   struct gl_light *l = &ctx->Light.Light[p];
+   float *fcmd;
+
+   /* Disabled lights are left untouched. */
+   if (!l->Enabled)
+      return;
+
+   fcmd = (float *)RADEON_DB_STATE( lit[p] );
+
+   COPY_4V( &fcmd[LIT_AMBIENT_RED], l->Ambient );
+   COPY_4V( &fcmd[LIT_DIFFUSE_RED], l->Diffuse );
+   COPY_4V( &fcmd[LIT_SPECULAR_RED], l->Specular );
+
+   RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.lit[p] );
+}
+
+/* Also fallback for asym colormaterial mode in twoside lighting...
+ */
+/* Decide whether two-sided lighting needs the TCL fallback.
+ *
+ * With lighting and two-side both enabled, the fallback is requested when
+ * either color material tracks different front and back attributes, or
+ * (otherwise) any front material attribute differs from its back
+ * counterpart.
+ */
+static void check_twoside_fallback( GLcontext *ctx )
+{
+   GLboolean need_fallback = GL_FALSE;
+
+   if (ctx->Light.Enabled && ctx->Light.Model.TwoSide) {
+      if (ctx->Light.ColorMaterialEnabled &&
+          (ctx->Light.ColorMaterialBitmask & BACK_MATERIAL_BITS) !=
+          ((ctx->Light.ColorMaterialBitmask & FRONT_MATERIAL_BITS)<<1)) {
+         need_fallback = GL_TRUE;
+      }
+      else {
+         /* Front attributes sit at even indices, the back one follows. */
+         GLint front = MAT_ATTRIB_FRONT_AMBIENT;
+
+         while (!need_fallback && front < MAT_ATTRIB_FRONT_INDEXES) {
+            if (memcmp( ctx->Light.Material.Attrib[front],
+                        ctx->Light.Material.Attrib[front+1],
+                        sizeof(GLfloat)*4) != 0) {
+               need_fallback = GL_TRUE;
+            }
+            front += 2;
+         }
+      }
+   }
+
+   TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_LIGHT_TWOSIDE, need_fallback );
+}
+
+
+/* Select, for each of the emissive / ambient / diffuse / specular terms,
+ * whether the light model takes its colour from the vertex diffuse colour
+ * (attribute tracked by color material) or from material state (MULT).
+ * The face and mode arguments are not consulted.  The tcl atom is flagged
+ * only when the control word changes.
+ */
+static void radeonColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
+{
+   static const struct {
+      GLuint mat_bit;   /* front material bit in ColorMaterialBitmask */
+      GLuint shift;     /* position of the 2-bit source field */
+   } terms[] = {
+      { MAT_BIT_FRONT_EMISSION, RADEON_EMISSIVE_SOURCE_SHIFT },
+      { MAT_BIT_FRONT_AMBIENT,  RADEON_AMBIENT_SOURCE_SHIFT  },
+      { MAT_BIT_FRONT_DIFFUSE,  RADEON_DIFFUSE_SOURCE_SHIFT  },
+      { MAT_BIT_FRONT_SPECULAR, RADEON_SPECULAR_SOURCE_SHIFT },
+   };
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLuint lm_ctl = rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL];
+   /* With color material disabled nothing is tracked: all terms use MULT. */
+   const GLuint tracked = ctx->Light.ColorMaterialEnabled
+                             ? ctx->Light.ColorMaterialBitmask : 0;
+   GLuint n;
+
+   for (n = 0; n < sizeof(terms) / sizeof(terms[0]); n++) {
+      lm_ctl &= ~(3 << terms[n].shift);
+
+      if (tracked & terms[n].mat_bit)
+         lm_ctl |= (RADEON_LM_SOURCE_VERTEX_DIFFUSE << terms[n].shift);
+      else
+         lm_ctl |= (RADEON_LM_SOURCE_STATE_MULT << terms[n].shift);
+   }
+
+   if (lm_ctl != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL]) {
+      RADEON_STATECHANGE( rmesa, tcl );
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] = lm_ctl;
+   }
+}
+
+/* Copy the front material attributes into the mtl state atom, skipping
+ * any attribute currently selected by ColorMaterialBitmask while color
+ * material is enabled, then re-evaluate the two-sided lighting fallback.
+ */
+void radeonUpdateMaterial( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLfloat (*mat)[4] = ctx->Light.Material.Attrib;
+   GLfloat *fcmd = (GLfloat *)RADEON_DB_STATE( mtl );
+   GLuint wanted = ~0;
+
+   if (ctx->Light.ColorMaterialEnabled)
+      wanted &= ~ctx->Light.ColorMaterialBitmask;
+
+   if (RADEON_DEBUG & DEBUG_STATE)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+   if (wanted & MAT_BIT_FRONT_EMISSION) {
+      const GLfloat *emission = mat[MAT_ATTRIB_FRONT_EMISSION];
+
+      fcmd[MTL_EMMISSIVE_RED]   = emission[0];
+      fcmd[MTL_EMMISSIVE_GREEN] = emission[1];
+      fcmd[MTL_EMMISSIVE_BLUE]  = emission[2];
+      fcmd[MTL_EMMISSIVE_ALPHA] = emission[3];
+   }
+
+   if (wanted & MAT_BIT_FRONT_AMBIENT) {
+      const GLfloat *ambient = mat[MAT_ATTRIB_FRONT_AMBIENT];
+
+      fcmd[MTL_AMBIENT_RED]   = ambient[0];
+      fcmd[MTL_AMBIENT_GREEN] = ambient[1];
+      fcmd[MTL_AMBIENT_BLUE]  = ambient[2];
+      fcmd[MTL_AMBIENT_ALPHA] = ambient[3];
+   }
+
+   if (wanted & MAT_BIT_FRONT_DIFFUSE) {
+      const GLfloat *diffuse = mat[MAT_ATTRIB_FRONT_DIFFUSE];
+
+      fcmd[MTL_DIFFUSE_RED]   = diffuse[0];
+      fcmd[MTL_DIFFUSE_GREEN] = diffuse[1];
+      fcmd[MTL_DIFFUSE_BLUE]  = diffuse[2];
+      fcmd[MTL_DIFFUSE_ALPHA] = diffuse[3];
+   }
+
+   if (wanted & MAT_BIT_FRONT_SPECULAR) {
+      const GLfloat *specular = mat[MAT_ATTRIB_FRONT_SPECULAR];
+
+      fcmd[MTL_SPECULAR_RED]   = specular[0];
+      fcmd[MTL_SPECULAR_GREEN] = specular[1];
+      fcmd[MTL_SPECULAR_BLUE]  = specular[2];
+      fcmd[MTL_SPECULAR_ALPHA] = specular[3];
+   }
+
+   if (wanted & MAT_BIT_FRONT_SHININESS) {
+      /* Shininess is a scalar held in the first component. */
+      fcmd[MTL_SHININESS] = mat[MAT_ATTRIB_FRONT_SHININESS][0];
+   }
+
+   RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.mtl );
+
+   check_twoside_fallback( ctx );
+/*   update_global_ambient( ctx );*/
+}
+
+/* _NEW_LIGHT
+ * _NEW_MODELVIEW
+ * _MESA_NEW_NEED_EYE_COORDS
+ *
+ * Uses derived state from mesa:
+ *       _VP_inf_norm
+ *       _h_inf_norm
+ *       _Position
+ *       _NormDirection
+ *       _ModelViewInvScale
+ *       _NeedEyeCoords
+ *       _EyeZDir
+ *
+ * which are calculated in light.c and are correct for the current
+ * lighting space (model or eye), hence dependencies on _NEW_MODELVIEW
+ * and _MESA_NEW_NEED_EYE_COORDS.  
+ */
+/* Upload the lighting state that depends on the current lighting space:
+ * the model-space flag in TCL_LIGHT_MODEL_CTL, the eye vector and rescale
+ * factor, and the position and direction of every enabled light.
+ */
+static void update_light( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+
+   /* Have to check these, or have an automatic shortcircuit mechanism
+    * to remove noop statechanges. (Or just do a better job on the
+    * front end).
+    */
+   {
+      GLuint tmp = rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL];
+
+      /* Light in model space unless Mesa says eye coordinates are needed. */
+      if (ctx->_NeedEyeCoords)
+	 tmp &= ~RADEON_LIGHT_IN_MODELSPACE;
+      else
+	 tmp |= RADEON_LIGHT_IN_MODELSPACE;
+      
+
+      /* Leave this test disabled: (unexplained q3 lockup) (even with
+         new packets)
+         NOTE(review): the test below is in fact active, so this remark
+         looks stale - confirm which behaviour is intended.
+      */
+      if (tmp != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL]) 
+      {
+	 RADEON_STATECHANGE( rmesa, tcl );
+	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] = tmp;
+      }
+   }
+
+   {
+      /* Eye vector: note that only the Z component is negated. */
+      GLfloat *fcmd = (GLfloat *)RADEON_DB_STATE( eye );
+      fcmd[EYE_X] = ctx->_EyeZDir[0];
+      fcmd[EYE_Y] = ctx->_EyeZDir[1];
+      fcmd[EYE_Z] = - ctx->_EyeZDir[2];
+      fcmd[EYE_RESCALE_FACTOR] = ctx->_ModelViewInvScale;
+      RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.eye );
+   }
+
+
+
+   if (ctx->Light.Enabled) {
+      GLint p;
+      for (p = 0 ; p < MAX_LIGHTS; p++) {
+	 if (ctx->Light.Light[p].Enabled) {
+	    struct gl_light *l = &ctx->Light.Light[p];
+	    GLfloat *fcmd = (GLfloat *)RADEON_DB_STATE( lit[p] );
+	    
+	    if (l->EyePosition[3] == 0.0) {
+	       /* w == 0: the position slot gets _VP_inf_norm and the
+	        * direction slot gets _h_inf_norm.
+	        */
+	       COPY_3FV( &fcmd[LIT_POSITION_X], l->_VP_inf_norm ); 
+	       COPY_3FV( &fcmd[LIT_DIRECTION_X], l->_h_inf_norm ); 
+	       fcmd[LIT_POSITION_W] = 0;
+	       fcmd[LIT_DIRECTION_W] = 0;
+	    } else {
+	       /* w != 0: full position, and the negated _NormDirection. */
+	       COPY_4V( &fcmd[LIT_POSITION_X], l->_Position );
+	       fcmd[LIT_DIRECTION_X] = -l->_NormDirection[0];
+	       fcmd[LIT_DIRECTION_Y] = -l->_NormDirection[1];
+	       fcmd[LIT_DIRECTION_Z] = -l->_NormDirection[2];
+	       fcmd[LIT_DIRECTION_W] = 0;
+	    }
+
+	    RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.lit[p] );
+	 }
+      }
+   }
+}
+
+/* Update the hardware state of one light after a glLight* change.
+ *
+ * light is a GL_LIGHTn enum, pname the parameter that changed.  params is
+ * read only for the spot exponent and the three attenuation factors; all
+ * other values come from the gl_light in ctx.
+ *
+ * Two lights share each TCL_PER_LIGHT_CTL word: the word index is p/2
+ * and odd lights use the LIGHT_1 flags, even lights the LIGHT_0 flags.
+ *
+ * The first switch stores the changed value; the second re-evaluates the
+ * range attenuation flags for the parameters that can affect them.
+ * Unknown pnames return before the second switch.
+ */
+static void radeonLightfv( GLcontext *ctx, GLenum light,
+			   GLenum pname, const GLfloat *params )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLint p = light - GL_LIGHT0;
+   struct gl_light *l = &ctx->Light.Light[p];
+   /* Float view of this light's command words. */
+   GLfloat *fcmd = (GLfloat *)rmesa->hw.lit[p].cmd;
+   
+
+   switch (pname) {
+   case GL_AMBIENT:		
+   case GL_DIFFUSE:
+   case GL_SPECULAR:
+      update_light_colors( ctx, p );
+      break;
+
+   case GL_SPOT_DIRECTION: 
+      /* picked up in update_light */	
+      break;
+
+   case GL_POSITION: {
+      /* positions picked up in update_light, but can do flag here */	
+      GLuint flag;
+      GLuint idx = TCL_PER_LIGHT_CTL_0 + p/2;
+
+      /* RANGE_ATTEN is handled by the second switch further down. */
+      if (p&1) 
+	 flag = RADEON_LIGHT_1_IS_LOCAL;
+      else
+	 flag = RADEON_LIGHT_0_IS_LOCAL;
+
+      /* A light is local when its eye-space w is non-zero. */
+      RADEON_STATECHANGE(rmesa, tcl);
+      if (l->EyePosition[3] != 0.0F)
+	 rmesa->hw.tcl.cmd[idx] |= flag;
+      else
+	 rmesa->hw.tcl.cmd[idx] &= ~flag;
+      break;
+   }
+
+   case GL_SPOT_EXPONENT:
+      RADEON_STATECHANGE(rmesa, lit[p]);
+      fcmd[LIT_SPOT_EXPONENT] = params[0];
+      break;
+
+   case GL_SPOT_CUTOFF: {
+      GLuint flag = (p&1) ? RADEON_LIGHT_1_IS_SPOT : RADEON_LIGHT_0_IS_SPOT;
+      GLuint idx = TCL_PER_LIGHT_CTL_0 + p/2;
+
+      /* The hardware word receives the light's _CosCutoff value. */
+      RADEON_STATECHANGE(rmesa, lit[p]);
+      fcmd[LIT_SPOT_CUTOFF] = l->_CosCutoff;
+
+      /* A cutoff of exactly 180 degrees means "not a spotlight". */
+      RADEON_STATECHANGE(rmesa, tcl);
+      if (l->SpotCutoff != 180.0F)
+	 rmesa->hw.tcl.cmd[idx] |= flag;
+      else
+	 rmesa->hw.tcl.cmd[idx] &= ~flag;
+
+      break;
+   }
+
+   case GL_CONSTANT_ATTENUATION:
+      RADEON_STATECHANGE(rmesa, lit[p]);
+      fcmd[LIT_ATTEN_CONST] = params[0];
+      /* The reciprocal is stored too; FLT_MAX stands in for 1/0. */
+      if ( params[0] == 0.0 )
+	 fcmd[LIT_ATTEN_CONST_INV] = FLT_MAX;
+      else
+	 fcmd[LIT_ATTEN_CONST_INV] = 1.0 / params[0];
+      break;
+   case GL_LINEAR_ATTENUATION:
+      RADEON_STATECHANGE(rmesa, lit[p]);
+      fcmd[LIT_ATTEN_LINEAR] = params[0];
+      break;
+   case GL_QUADRATIC_ATTENUATION:
+      RADEON_STATECHANGE(rmesa, lit[p]);
+      fcmd[LIT_ATTEN_QUADRATIC] = params[0];
+      break;
+   default:
+      return;
+   }
+
+   /* Set RANGE_ATTEN only when needed */
+   switch (pname) {
+   case GL_POSITION:
+   case GL_CONSTANT_ATTENUATION:
+   case GL_LINEAR_ATTENUATION:
+   case GL_QUADRATIC_ATTENUATION:
+   {
+      GLuint *icmd = (GLuint *)RADEON_DB_STATE( tcl );
+      GLuint idx = TCL_PER_LIGHT_CTL_0 + p/2;
+      GLuint atten_flag = ( p&1 ) ? RADEON_LIGHT_1_ENABLE_RANGE_ATTEN
+				  : RADEON_LIGHT_0_ENABLE_RANGE_ATTEN;
+      GLuint atten_const_flag = ( p&1 ) ? RADEON_LIGHT_1_CONSTANT_RANGE_ATTEN
+				  : RADEON_LIGHT_0_CONSTANT_RANGE_ATTEN;
+
+      /* No attenuation for a light with w == 0, or when the factors are
+       * trivial: constant 0 or 1 with linear and quadratic both 0.
+       */
+      if ( l->EyePosition[3] == 0.0F ||
+	   ( ( fcmd[LIT_ATTEN_CONST] == 0.0 || fcmd[LIT_ATTEN_CONST] == 1.0 ) &&
+	     fcmd[LIT_ATTEN_QUADRATIC] == 0.0 && fcmd[LIT_ATTEN_LINEAR] == 0.0 ) ) {
+	 /* Disable attenuation */
+	 icmd[idx] &= ~atten_flag;
+      } else {
+	 if ( fcmd[LIT_ATTEN_QUADRATIC] == 0.0 && fcmd[LIT_ATTEN_LINEAR] == 0.0 ) {
+	    /* Enable only constant portion of attenuation calculation */
+	    icmd[idx] |= ( atten_flag | atten_const_flag );
+	 } else {
+	    /* Enable full attenuation calculation */
+	    icmd[idx] &= ~atten_const_flag;
+	    icmd[idx] |= atten_flag;
+	 }
+      }
+
+      RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.tcl );
+      break;
+   }
+   default:
+      break;
+   }
+}
+
+		  
+
+
+/* React to a glLightModel* change.  The param argument is not read; the
+ * new values are taken from ctx->Light.Model.  Unknown pnames are ignored.
+ */
+static void radeonLightModelfv( GLcontext *ctx, GLenum pname,
+				const GLfloat *param )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+
+   if (pname == GL_LIGHT_MODEL_AMBIENT) {
+      update_global_ambient( ctx );
+   }
+   else if (pname == GL_LIGHT_MODEL_LOCAL_VIEWER) {
+      RADEON_STATECHANGE( rmesa, tcl );
+
+      if (!ctx->Light.Model.LocalViewer)
+         rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &= ~RADEON_LOCAL_VIEWER;
+      else
+         rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] |= RADEON_LOCAL_VIEWER;
+   }
+   else if (pname == GL_LIGHT_MODEL_TWO_SIDE) {
+      RADEON_STATECHANGE( rmesa, tcl );
+
+      if (!ctx->Light.Model.TwoSide)
+         rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~RADEON_LIGHT_TWOSIDE;
+      else
+         rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= RADEON_LIGHT_TWOSIDE;
+
+      /* Two-sided lighting may need the TCL fallback; if any fallback
+       * is active afterwards the render paths must be re-chosen.
+       */
+      check_twoside_fallback( ctx );
+
+      if (rmesa->TclFallback) {
+         radeonChooseRenderState( ctx );
+         radeonChooseVertexState( ctx );
+      }
+   }
+   else if (pname == GL_LIGHT_MODEL_COLOR_CONTROL) {
+      radeonUpdateSpecular(ctx);
+   }
+}
+
+/* Switch the diffuse, alpha, specular and fog interpolators between flat
+ * and Gouraud shading.  Any mode other than GL_FLAT or GL_SMOOTH is
+ * ignored.  The set atom is flagged only when SE_CNTL changes.
+ */
+static void radeonShadeModel( GLcontext *ctx, GLenum mode )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   const GLuint shade_fields = (RADEON_DIFFUSE_SHADE_MASK |
+                                RADEON_ALPHA_SHADE_MASK |
+                                RADEON_SPECULAR_SHADE_MASK |
+                                RADEON_FOG_SHADE_MASK);
+   GLuint shade;
+   GLuint se_cntl;
+
+   if ( mode == GL_FLAT ) {
+      shade = (RADEON_DIFFUSE_SHADE_FLAT |
+               RADEON_ALPHA_SHADE_FLAT |
+               RADEON_SPECULAR_SHADE_FLAT |
+               RADEON_FOG_SHADE_FLAT);
+   }
+   else if ( mode == GL_SMOOTH ) {
+      shade = (RADEON_DIFFUSE_SHADE_GOURAUD |
+               RADEON_ALPHA_SHADE_GOURAUD |
+               RADEON_SPECULAR_SHADE_GOURAUD |
+               RADEON_FOG_SHADE_GOURAUD);
+   }
+   else {
+      return;
+   }
+
+   se_cntl = (rmesa->hw.set.cmd[SET_SE_CNTL] & ~shade_fields) | shade;
+
+   if ( se_cntl != rmesa->hw.set.cmd[SET_SE_CNTL] ) {
+      RADEON_STATECHANGE( rmesa, set );
+      rmesa->hw.set.cmd[SET_SE_CNTL] = se_cntl;
+   }
+}
+
+
+/* =============================================================
+ * User clip planes
+ */
+
+/* Upload one user clip plane.  The eq argument is not read: the four
+ * coefficients are taken from ctx->Transform._ClipUserPlane and copied
+ * into the command words through an integer pointer, bit for bit.
+ */
+static void radeonClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   const GLint idx = (GLint) plane - (GLint) GL_CLIP_PLANE0;
+   GLint *bits = (GLint *)ctx->Transform._ClipUserPlane[idx];
+
+   RADEON_STATECHANGE( rmesa, ucp[idx] );
+   rmesa->hw.ucp[idx].cmd[UCP_X] = bits[0];
+   rmesa->hw.ucp[idx].cmd[UCP_Y] = bits[1];
+   rmesa->hw.ucp[idx].cmd[UCP_Z] = bits[2];
+   rmesa->hw.ucp[idx].cmd[UCP_W] = bits[3];
+}
+
+/* Re-upload every enabled user clip plane from
+ * ctx->Transform._ClipUserPlane; disabled planes are skipped.
+ */
+static void radeonUpdateClipPlanes( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLuint p;
+
+   for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
+      GLint *bits;
+
+      if (!(ctx->Transform.ClipPlanesEnabled & (1 << p)))
+         continue;
+
+      /* Copy the coefficients bit for bit via an integer pointer. */
+      bits = (GLint *)ctx->Transform._ClipUserPlane[p];
+
+      RADEON_STATECHANGE( rmesa, ucp[p] );
+      rmesa->hw.ucp[p].cmd[UCP_X] = bits[0];
+      rmesa->hw.ucp[p].cmd[UCP_Y] = bits[1];
+      rmesa->hw.ucp[p].cmd[UCP_Z] = bits[2];
+      rmesa->hw.ucp[p].cmd[UCP_W] = bits[3];
+   }
+}
+
+
+/* =============================================================
+ * Stencil
+ */
+
+/* Program the stencil test function, reference value and value mask.
+ * The face/func/ref/mask arguments are not consulted; element 0 of
+ * ctx->Stencil is used, with ref and mask truncated to 8 bits.
+ */
+static void
+radeonStencilFuncSeparate( GLcontext *ctx, GLenum face, GLenum func,
+                           GLint ref, GLuint mask )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   const GLuint ref_bits =
+      (ctx->Stencil.Ref[0] & 0xff) << RADEON_STENCIL_REF_SHIFT;
+   const GLuint value_bits =
+      (ctx->Stencil.ValueMask[0] & 0xff) << RADEON_STENCIL_MASK_SHIFT;
+   GLuint test_bits = 0;   /* stays 0 when the enum matches no case */
+
+   RADEON_STATECHANGE( rmesa, ctx );
+   RADEON_STATECHANGE( rmesa, msk );
+
+   switch ( ctx->Stencil.Function[0] ) {
+   case GL_NEVER:    test_bits = RADEON_STENCIL_TEST_NEVER;   break;
+   case GL_LESS:     test_bits = RADEON_STENCIL_TEST_LESS;    break;
+   case GL_EQUAL:    test_bits = RADEON_STENCIL_TEST_EQUAL;   break;
+   case GL_LEQUAL:   test_bits = RADEON_STENCIL_TEST_LEQUAL;  break;
+   case GL_GREATER:  test_bits = RADEON_STENCIL_TEST_GREATER; break;
+   case GL_NOTEQUAL: test_bits = RADEON_STENCIL_TEST_NEQUAL;  break;
+   case GL_GEQUAL:   test_bits = RADEON_STENCIL_TEST_GEQUAL;  break;
+   case GL_ALWAYS:   test_bits = RADEON_STENCIL_TEST_ALWAYS;  break;
+   }
+
+   /* Replace only the fields owned by this function in each word. */
+   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] =
+      (rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] & ~RADEON_STENCIL_TEST_MASK) |
+      test_bits;
+
+   rmesa->hw.msk.cmd[MSK_RB3D_STENCILREFMASK] =
+      (rmesa->hw.msk.cmd[MSK_RB3D_STENCILREFMASK] &
+       ~(RADEON_STENCIL_REF_MASK | RADEON_STENCIL_VALUE_MASK)) |
+      (ref_bits | value_bits);
+}
+
+/* Program the stencil write mask from the low 8 bits of
+ * ctx->Stencil.WriteMask[0].  The face and mask arguments are not
+ * consulted.
+ */
+static void
+radeonStencilMaskSeparate( GLcontext *ctx, GLenum face, GLuint mask )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   const GLuint write_bits =
+      (ctx->Stencil.WriteMask[0] & 0xff) << RADEON_STENCIL_WRITEMASK_SHIFT;
+
+   RADEON_STATECHANGE( rmesa, msk );
+   rmesa->hw.msk.cmd[MSK_RB3D_STENCILREFMASK] =
+      (rmesa->hw.msk.cmd[MSK_RB3D_STENCILREFMASK] & ~RADEON_STENCIL_WRITE_MASK) |
+      write_bits;
+}
+
+/**
+ * Driver hook for glStencilOp / glStencilOpSeparate.
+ *
+ * Rebuilds the fail, zfail and zpass operation fields of
+ * RB3D_ZSTENCILCNTL from ctx->Stencil.FailFunc[0], ZFailFunc[0] and
+ * ZPassFunc[0].  The \p face, \p fail, \p zfail and \p zpass arguments
+ * are not consulted.
+ */
+static void radeonStencilOpSeparate( GLcontext *ctx, GLenum face, GLenum fail,
+                                     GLenum zfail, GLenum zpass )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+
+   /* The radeon 7200 has a stencil bug: DEC and INC_WRAP both actually
+    * perform DEC_WRAP, while DEC_WRAP (and INVERT) perform INVERT.  There
+    * is no way to get a correct INC_WRAP or DEC, but DEC_WRAP can be
+    * obtained by programming DEC, and INC_WRAP at least falls back to INC.
+    */
+   const GLboolean brokenStencil =
+      (rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_BROKEN_STENCIL) != 0;
+
+   const GLuint failDecWrap  = brokenStencil ? RADEON_STENCIL_FAIL_DEC
+                                             : RADEON_STENCIL_FAIL_DEC_WRAP;
+   const GLuint failIncWrap  = brokenStencil ? RADEON_STENCIL_FAIL_INC
+                                             : RADEON_STENCIL_FAIL_INC_WRAP;
+   const GLuint zfailDecWrap = brokenStencil ? RADEON_STENCIL_ZFAIL_DEC
+                                             : RADEON_STENCIL_ZFAIL_DEC_WRAP;
+   const GLuint zfailIncWrap = brokenStencil ? RADEON_STENCIL_ZFAIL_INC
+                                             : RADEON_STENCIL_ZFAIL_INC_WRAP;
+   const GLuint zpassDecWrap = brokenStencil ? RADEON_STENCIL_ZPASS_DEC
+                                             : RADEON_STENCIL_ZPASS_DEC_WRAP;
+   const GLuint zpassIncWrap = brokenStencil ? RADEON_STENCIL_ZPASS_INC
+                                             : RADEON_STENCIL_ZPASS_INC_WRAP;
+
+   /* Operation bits gathered from the three switches below. */
+   GLuint ops = 0;
+
+   RADEON_STATECHANGE( rmesa, ctx );
+
+   /* Stencil test failed. */
+   switch ( ctx->Stencil.FailFunc[0] ) {
+   case GL_KEEP:      ops |= RADEON_STENCIL_FAIL_KEEP;    break;
+   case GL_ZERO:      ops |= RADEON_STENCIL_FAIL_ZERO;    break;
+   case GL_REPLACE:   ops |= RADEON_STENCIL_FAIL_REPLACE; break;
+   case GL_INCR:      ops |= RADEON_STENCIL_FAIL_INC;     break;
+   case GL_DECR:      ops |= RADEON_STENCIL_FAIL_DEC;     break;
+   case GL_INCR_WRAP: ops |= failIncWrap;                 break;
+   case GL_DECR_WRAP: ops |= failDecWrap;                 break;
+   case GL_INVERT:    ops |= RADEON_STENCIL_FAIL_INVERT;  break;
+   }
+
+   /* Stencil test passed, depth test failed. */
+   switch ( ctx->Stencil.ZFailFunc[0] ) {
+   case GL_KEEP:      ops |= RADEON_STENCIL_ZFAIL_KEEP;    break;
+   case GL_ZERO:      ops |= RADEON_STENCIL_ZFAIL_ZERO;    break;
+   case GL_REPLACE:   ops |= RADEON_STENCIL_ZFAIL_REPLACE; break;
+   case GL_INCR:      ops |= RADEON_STENCIL_ZFAIL_INC;     break;
+   case GL_DECR:      ops |= RADEON_STENCIL_ZFAIL_DEC;     break;
+   case GL_INCR_WRAP: ops |= zfailIncWrap;                 break;
+   case GL_DECR_WRAP: ops |= zfailDecWrap;                 break;
+   case GL_INVERT:    ops |= RADEON_STENCIL_ZFAIL_INVERT;  break;
+   }
+
+   /* Stencil and depth tests both passed. */
+   switch ( ctx->Stencil.ZPassFunc[0] ) {
+   case GL_KEEP:      ops |= RADEON_STENCIL_ZPASS_KEEP;    break;
+   case GL_ZERO:      ops |= RADEON_STENCIL_ZPASS_ZERO;    break;
+   case GL_REPLACE:   ops |= RADEON_STENCIL_ZPASS_REPLACE; break;
+   case GL_INCR:      ops |= RADEON_STENCIL_ZPASS_INC;     break;
+   case GL_DECR:      ops |= RADEON_STENCIL_ZPASS_DEC;     break;
+   case GL_INCR_WRAP: ops |= zpassIncWrap;                 break;
+   case GL_DECR_WRAP: ops |= zpassDecWrap;                 break;
+   case GL_INVERT:    ops |= RADEON_STENCIL_ZPASS_INVERT;  break;
+   }
+
+   /* Clear the three operation fields, then install the gathered bits. */
+   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] &= ~(RADEON_STENCIL_FAIL_MASK |
+                                                 RADEON_STENCIL_ZFAIL_MASK |
+                                                 RADEON_STENCIL_ZPASS_MASK);
+   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= ops;
+}
+
+/**
+ * Driver hook for glClearStencil.
+ *
+ * Caches the packed stencil clear word in rmesa->state.stencil.clear:
+ * the low 8 bits of ctx->Stencil.Clear, an all-ones value mask, and the
+ * low 8 bits of ctx->Stencil.WriteMask[0].  The \p s argument is not
+ * consulted.
+ */
+static void radeonClearStencil( GLcontext *ctx, GLint s )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   const GLuint clearValue = (GLuint) (ctx->Stencil.Clear & 0xff);
+   const GLuint valueMask = (0xff << RADEON_STENCIL_MASK_SHIFT);
+   const GLuint writeMask =
+      ((ctx->Stencil.WriteMask[0] & 0xff) << RADEON_STENCIL_WRITEMASK_SHIFT);
+
+   rmesa->state.stencil.clear = clearValue | valueMask | writeMask;
+}
+
+
+/* =============================================================
+ * Window position and viewport transformation
+ */
+
+/*
+ * To correctly position primitives, this sub-pixel bias is added to the
+ * viewport X/Y translation computed in radeonUpdateWindow() and
+ * radeonUpdateViewportOffset().
+ */
+#define SUBPIXEL_X 0.125
+#define SUBPIXEL_Y 0.125
+
+
+/**
+ * Recompute the hardware viewport transform.
+ *
+ * Called when the window size or position changes, or when viewport or
+ * depth range state is changed.  The scale/translate terms are taken
+ * from ctx->Viewport._WindowMap, with Y negated and biased by the
+ * drawable's position and height, and Z scaled by the depth scale.
+ */
+void radeonUpdateWindow( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+   const GLfloat *m = ctx->Viewport._WindowMap.m;
+   GLfloat xoffset = (GLfloat)dPriv->x;
+   GLfloat yoffset = (GLfloat)dPriv->y + dPriv->h;
+   float_ui32_type sx, sy, sz;
+   float_ui32_type tx, ty, tz;
+
+   /* Scale terms. */
+   sx.f = m[MAT_SX];
+   sy.f = - m[MAT_SY];
+   sz.f = m[MAT_SZ] * rmesa->state.depth.scale;
+
+   /* Translation terms. */
+   tx.f = m[MAT_TX] + xoffset + SUBPIXEL_X;
+   ty.f = (- m[MAT_TY]) + yoffset + SUBPIXEL_Y;
+   tz.f = m[MAT_TZ] * rmesa->state.depth.scale;
+
+   RADEON_FIREVERTICES( rmesa );
+   RADEON_STATECHANGE( rmesa, vpt );
+
+   /* The floats are stored as their raw 32-bit patterns. */
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_XSCALE]  = sx.ui32;
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_YSCALE]  = sy.ui32;
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_ZSCALE]  = sz.ui32;
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_XOFFSET] = tx.ui32;
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_YOFFSET] = ty.ui32;
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_ZOFFSET] = tz.ui32;
+}
+
+
+/**
+ * Driver hook for glViewport.
+ *
+ * The rectangle arguments are unused; the hardware viewport is rebuilt
+ * by radeonUpdateWindow().
+ */
+static void radeonViewport( GLcontext *ctx, GLint x, GLint y,
+			    GLsizei width, GLsizei height )
+{
+   (void) x;
+   (void) y;
+   (void) width;
+   (void) height;
+
+   /* Viewport changes are not pipelined because they conflict with the
+    * window offset setting.  Alternatives would be to apply deltas to
+    * rescue pipelined viewport values, or to keep the originals around.
+    */
+   radeonUpdateWindow( ctx );
+}
+
+/**
+ * Driver hook for glDepthRange.
+ *
+ * The near/far arguments are unused; the hardware viewport is rebuilt
+ * by radeonUpdateWindow().
+ */
+static void radeonDepthRange( GLcontext *ctx, GLclampd nearval,
+			      GLclampd farval )
+{
+   (void) nearval;
+   (void) farval;
+
+   radeonUpdateWindow( ctx );
+}
+
+/**
+ * Refresh the state that depends on the drawable's screen position.
+ *
+ * If the viewport X/Y offset differs from what is currently programmed,
+ * the offset and the polygon stipple origin are updated.  The scissor
+ * is updated unconditionally.
+ */
+void radeonUpdateViewportOffset( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+   const GLfloat *v = ctx->Viewport._WindowMap.m;
+   GLfloat xoffset = (GLfloat)dPriv->x;
+   GLfloat yoffset = (GLfloat)dPriv->y + dPriv->h;
+   float_ui32_type tx;
+   float_ui32_type ty;
+   GLboolean offsetChanged;
+
+   tx.f = v[MAT_TX] + xoffset + SUBPIXEL_X;
+   ty.f = (- v[MAT_TY]) + yoffset + SUBPIXEL_Y;
+
+   offsetChanged = ( rmesa->hw.vpt.cmd[VPT_SE_VPORT_XOFFSET] != tx.ui32 ||
+                     rmesa->hw.vpt.cmd[VPT_SE_VPORT_YOFFSET] != ty.ui32 );
+
+   if ( offsetChanged ) {
+      GLuint stx, sty;
+      GLuint misc;
+
+      /* Note: this should also modify whatever data the context reset
+       * code uses...
+       */
+      RADEON_STATECHANGE( rmesa, vpt );
+      rmesa->hw.vpt.cmd[VPT_SE_VPORT_XOFFSET] = tx.ui32;
+      rmesa->hw.vpt.cmd[VPT_SE_VPORT_YOFFSET] = ty.ui32;
+
+      /* Update the polygon stipple x/y screen offset: strip the old
+       * offsets from RE_MISC first.
+       */
+      misc = rmesa->hw.msc.cmd[MSC_RE_MISC];
+      misc &= ~(RADEON_STIPPLE_X_OFFSET_MASK |
+                RADEON_STIPPLE_Y_OFFSET_MASK);
+
+      /* add magic offsets, then invert */
+      stx = 31 - ((rmesa->dri.drawable->x - 1) & RADEON_STIPPLE_COORD_MASK);
+      sty = 31 - ((rmesa->dri.drawable->y + rmesa->dri.drawable->h - 1)
+                  & RADEON_STIPPLE_COORD_MASK);
+
+      misc |= ((stx << RADEON_STIPPLE_X_OFFSET_SHIFT) |
+               (sty << RADEON_STIPPLE_Y_OFFSET_SHIFT));
+
+      /* Only dirty the atom when the register value really changes. */
+      if ( rmesa->hw.msc.cmd[MSC_RE_MISC] != misc ) {
+         RADEON_STATECHANGE( rmesa, msc );
+         rmesa->hw.msc.cmd[MSC_RE_MISC] = misc;
+      }
+   }
+
+   radeonUpdateScissor( ctx );
+}
+
+
+
+/* =============================================================
+ * Miscellaneous
+ */
+
+/**
+ * Driver hook for glClearColor.
+ *
+ * Converts the four float components to bytes and caches the result of
+ * radeonPackColor() for the screen's cpp in rmesa->state.color.clear.
+ */
+static void radeonClearColor( GLcontext *ctx, const GLfloat color[4] )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLubyte rgba[4];
+   int comp;
+
+   for (comp = 0; comp < 4; comp++) {
+      CLAMPED_FLOAT_TO_UBYTE(rgba[comp], color[comp]);
+   }
+
+   rmesa->state.color.clear = radeonPackColor( rmesa->radeonScreen->cpp,
+                                               rgba[0], rgba[1],
+                                               rgba[2], rgba[3] );
+}
+
+
+/**
+ * Driver hook for glRenderMode.
+ *
+ * Raises RADEON_FALLBACK_RENDER_MODE for any mode other than GL_RENDER
+ * and clears it for GL_RENDER.
+ */
+static void radeonRenderMode( GLcontext *ctx, GLenum mode )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   const GLboolean needFallback = (mode != GL_RENDER);
+
+   FALLBACK( rmesa, RADEON_FALLBACK_RENDER_MODE, needFallback );
+}
+
+
+/* Hardware ROP codes indexed by (opcode - GL_CLEAR); see
+ * radeonLogicOpCode(), which asserts that the index is below 16,
+ * matching the 16 entries here.  The order of the entries is therefore
+ * significant.
+ */
+static GLuint radeon_rop_tab[] = {
+   RADEON_ROP_CLEAR,
+   RADEON_ROP_AND,
+   RADEON_ROP_AND_REVERSE,
+   RADEON_ROP_COPY,
+   RADEON_ROP_AND_INVERTED,
+   RADEON_ROP_NOOP,
+   RADEON_ROP_XOR,
+   RADEON_ROP_OR,
+   RADEON_ROP_NOR,
+   RADEON_ROP_EQUIV,
+   RADEON_ROP_INVERT,
+   RADEON_ROP_OR_REVERSE,
+   RADEON_ROP_COPY_INVERTED,
+   RADEON_ROP_OR_INVERTED,
+   RADEON_ROP_NAND,
+   RADEON_ROP_SET,
+};
+
+/**
+ * Driver hook for glLogicOp.
+ *
+ * Looks up the hardware ROP code for \p opcode in radeon_rop_tab and
+ * stores it in RB3D_ROPCNTL.  \p opcode is expected to lie within the
+ * 16 values starting at GL_CLEAR.
+ */
+static void radeonLogicOpCode( GLcontext *ctx, GLenum opcode )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   /* Table index: distance of the opcode from GL_CLEAR. */
+   const GLuint rop = (GLuint)opcode - GL_CLEAR;
+
+   ASSERT( rop < 16 );
+
+   RADEON_STATECHANGE( rmesa, msk );
+
+   rmesa->hw.msk.cmd[MSK_RB3D_ROPCNTL] = radeon_rop_tab[rop];
+}
+
+
+/**
+ * Select the cliprect list for front- or back-buffer drawing, resize the
+ * draw/read framebuffers if the drawables changed size, recalculate the
+ * scissor rectangles when scissoring is enabled, and record the
+ * drawable's stamp.
+ */
+void radeonSetCliprects( radeonContextPtr rmesa )
+{
+   __DRIdrawablePrivate *const drawable = rmesa->dri.drawable;
+   __DRIdrawablePrivate *const readable = rmesa->dri.readable;
+   GLframebuffer *const draw_fb = (GLframebuffer*) drawable->driverPrivate;
+   GLframebuffer *const read_fb = (GLframebuffer*) readable->driverPrivate;
+   const GLboolean toBackBuffer =
+      (draw_fb->_ColorDrawBufferMask[0] == BUFFER_BIT_BACK_LEFT);
+
+   /* The back cliprects are used only when drawing to the back buffer,
+    * back cliprects exist, and we are not page flipping (2d windows
+    * can't be ignored while page flipping).  Everything else -- the
+    * front buffer, no buffer, or multiple buffers -- gets the regular
+    * cliprects.
+    */
+   if ( toBackBuffer &&
+        drawable->numBackClipRects != 0 &&
+        !rmesa->doPageFlip ) {
+      rmesa->numClipRects = drawable->numBackClipRects;
+      rmesa->pClipRects = drawable->pBackClipRects;
+   }
+   else {
+      rmesa->numClipRects = drawable->numClipRects;
+      rmesa->pClipRects = drawable->pClipRects;
+   }
+
+   /* Keep the draw framebuffer in step with the drawable's size. */
+   if ((draw_fb->Width != drawable->w) || (draw_fb->Height != drawable->h)) {
+      _mesa_resize_framebuffer(rmesa->glCtx, draw_fb,
+                               drawable->w, drawable->h);
+      draw_fb->Initialized = GL_TRUE;
+   }
+
+   /* Likewise for a distinct read drawable. */
+   if ((drawable != readable) &&
+       ((read_fb->Width != readable->w) || (read_fb->Height != readable->h))) {
+      _mesa_resize_framebuffer(rmesa->glCtx, read_fb,
+                               readable->w, readable->h);
+      read_fb->Initialized = GL_TRUE;
+   }
+
+   if (rmesa->state.scissor.enabled) {
+      radeonRecalcScissorRects( rmesa );
+   }
+
+   rmesa->lastStamp = drawable->lastStamp;
+}
+
+
+/**
+ * Called via glDrawBuffer.
+ *
+ * Drawing to exactly the front-left or back-left buffer is handled in
+ * hardware; any other combination raises RADEON_FALLBACK_DRAW_BUFFER
+ * and returns without touching the cliprects.
+ */
+static void radeonDrawBuffer( GLcontext *ctx, GLenum mode )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLbitfield bufferMask;
+
+   if (RADEON_DEBUG & DEBUG_DRI)
+      fprintf(stderr, "%s %s\n", __FUNCTION__,
+              _mesa_lookup_enum_by_nr( mode ));
+
+   RADEON_FIREVERTICES(rmesa);	/* don't pipeline cliprect changes */
+
+   /*
+    * _ColorDrawBufferMask is easier to cope with than <mode>.
+    * Check for software fallback, update cliprects.
+    */
+   bufferMask = ctx->DrawBuffer->_ColorDrawBufferMask[0];
+
+   if ( bufferMask != BUFFER_BIT_FRONT_LEFT &&
+        bufferMask != BUFFER_BIT_BACK_LEFT ) {
+      /* 0 (GL_NONE) buffers or multiple color drawing buffers */
+      FALLBACK( rmesa, RADEON_FALLBACK_DRAW_BUFFER, GL_TRUE );
+      return;
+   }
+
+   FALLBACK( rmesa, RADEON_FALLBACK_DRAW_BUFFER, GL_FALSE );
+
+   radeonSetCliprects( rmesa );
+
+   /* We'll set the drawing engine's offset/pitch parameters later
+    * when we update other state.
+    */
+}
+
+/**
+ * Driver hook for glReadBuffer.  Intentionally a no-op.
+ */
+static void radeonReadBuffer( GLcontext *ctx, GLenum mode )
+{
+   (void) ctx;
+   (void) mode;
+
+   /* nothing, until we implement h/w glRead/CopyPixels or CopyTexImage */
+}
+
+
+/* =============================================================
+ * State enable/disable
+ */
+
+/**
+ * Driver hook for glEnable / glDisable.
+ *
+ * Translates the capability \p cap into the corresponding hardware state
+ * bits (or into a call to the helper that owns that state).  Capabilities
+ * not listed in the switch are ignored.
+ *
+ * \param ctx    GL context.
+ * \param cap    capability being toggled.
+ * \param state  GL_TRUE to enable, GL_FALSE to disable.
+ */
+static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLuint p, flag;
+
+   if ( RADEON_DEBUG & DEBUG_STATE )
+      fprintf( stderr, "%s( %s = %s )\n", __FUNCTION__,
+	       _mesa_lookup_enum_by_nr( cap ),
+	       state ? "GL_TRUE" : "GL_FALSE" );
+
+   switch ( cap ) {
+      /* Fast track this one...
+       */
+   case GL_TEXTURE_1D:
+   case GL_TEXTURE_2D:
+   case GL_TEXTURE_3D:
+      break;
+
+   case GL_ALPHA_TEST:
+      RADEON_STATECHANGE( rmesa, ctx );
+      if (state) {
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= RADEON_ALPHA_TEST_ENABLE;
+      } else {
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~RADEON_ALPHA_TEST_ENABLE;
+      }
+      break;
+
+   case GL_BLEND:
+      RADEON_STATECHANGE( rmesa, ctx );
+      if (state) {
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  RADEON_ALPHA_BLEND_ENABLE;
+      } else {
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~RADEON_ALPHA_BLEND_ENABLE;
+      }
+      /* ROP is needed for the colour logic op, and also when blending is
+       * enabled with the GL_LOGIC_OP blend equation (same test as in the
+       * GL_COLOR_LOGIC_OP case below).
+       */
+      if ( (ctx->Color.ColorLogicOpEnabled || (ctx->Color.BlendEnabled
+	    && ctx->Color.BlendEquationRGB == GL_LOGIC_OP)) ) {
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  RADEON_ROP_ENABLE;
+      } else {
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~RADEON_ROP_ENABLE;
+      }
+
+      /* Catch a possible fallback:
+       */
+      if (state) {
+	 ctx->Driver.BlendEquationSeparate( ctx,
+					    ctx->Color.BlendEquationRGB,
+					    ctx->Color.BlendEquationA );
+	 ctx->Driver.BlendFuncSeparate( ctx, ctx->Color.BlendSrcRGB,
+					ctx->Color.BlendDstRGB,
+					ctx->Color.BlendSrcA,
+					ctx->Color.BlendDstA );
+      }
+      else {
+	 FALLBACK( rmesa, RADEON_FALLBACK_BLEND_FUNC, GL_FALSE );
+	 FALLBACK( rmesa, RADEON_FALLBACK_BLEND_EQ, GL_FALSE );
+      }
+      break;
+
+   case GL_CLIP_PLANE0:
+   case GL_CLIP_PLANE1:
+   case GL_CLIP_PLANE2:
+   case GL_CLIP_PLANE3:
+   case GL_CLIP_PLANE4:
+   case GL_CLIP_PLANE5: 
+      /* p is the plane index; each plane has its own enable bit. */
+      p = cap-GL_CLIP_PLANE0;
+      RADEON_STATECHANGE( rmesa, tcl );
+      if (state) {
+	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] |= (RADEON_UCP_ENABLE_0<<p);
+	 radeonClipPlane( ctx, cap, NULL );
+      }
+      else {
+	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~(RADEON_UCP_ENABLE_0<<p);
+      }
+      break;
+
+   case GL_COLOR_MATERIAL:
+      radeonColorMaterial( ctx, 0, 0 );
+      radeonUpdateMaterial( ctx );
+      break;
+
+   case GL_CULL_FACE:
+      radeonCullFace( ctx, 0 );
+      break;
+
+   case GL_DEPTH_TEST:
+      RADEON_STATECHANGE(rmesa, ctx );
+      if ( state ) {
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  RADEON_Z_ENABLE;
+      } else {
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~RADEON_Z_ENABLE;
+      }
+      break;
+
+   case GL_DITHER:
+      /* Dithering and the roundEnable bits are toggled in opposition:
+       * the round bits are cleared when dithering is on and set when
+       * it is off.
+       */
+      RADEON_STATECHANGE(rmesa, ctx );
+      if ( state ) {
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  RADEON_DITHER_ENABLE;
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~rmesa->state.color.roundEnable;
+      } else {
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~RADEON_DITHER_ENABLE;
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  rmesa->state.color.roundEnable;
+      }
+      break;
+
+   case GL_FOG:
+      RADEON_STATECHANGE(rmesa, ctx );
+      if ( state ) {
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= RADEON_FOG_ENABLE;
+	 radeonFogfv( ctx, GL_FOG_MODE, NULL );
+      } else {
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~RADEON_FOG_ENABLE;
+	 RADEON_STATECHANGE(rmesa, tcl);
+	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~RADEON_TCL_FOG_MASK;
+      }
+      radeonUpdateSpecular( ctx ); /* for PK_SPEC */
+      _mesa_allow_light_in_model( ctx, !state );
+      break;
+
+   case GL_LIGHT0:
+   case GL_LIGHT1:
+   case GL_LIGHT2:
+   case GL_LIGHT3:
+   case GL_LIGHT4:
+   case GL_LIGHT5:
+   case GL_LIGHT6:
+   case GL_LIGHT7:
+      RADEON_STATECHANGE(rmesa, tcl);
+      p = cap - GL_LIGHT0;
+      /* Two lights share each PER_LIGHT_CTL word (indexed by p/2): odd
+       * lights use the LIGHT_1 bits, even lights the LIGHT_0 bits.
+       */
+      if (p&1) 
+	 flag = (RADEON_LIGHT_1_ENABLE |
+		 RADEON_LIGHT_1_ENABLE_AMBIENT | 
+		 RADEON_LIGHT_1_ENABLE_SPECULAR);
+      else
+	 flag = (RADEON_LIGHT_0_ENABLE |
+		 RADEON_LIGHT_0_ENABLE_AMBIENT | 
+		 RADEON_LIGHT_0_ENABLE_SPECULAR);
+
+      if (state)
+	 rmesa->hw.tcl.cmd[p/2 + TCL_PER_LIGHT_CTL_0] |= flag;
+      else
+	 rmesa->hw.tcl.cmd[p/2 + TCL_PER_LIGHT_CTL_0] &= ~flag;
+
+      /* 
+       */
+      update_light_colors( ctx, p );
+      break;
+
+   case GL_LIGHTING:
+      RADEON_STATECHANGE(rmesa, tcl);
+      radeonUpdateSpecular(ctx);
+      check_twoside_fallback( ctx );
+      break;
+
+   case GL_LINE_SMOOTH:
+      RADEON_STATECHANGE( rmesa, ctx );
+      if ( state ) {
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] |=  RADEON_ANTI_ALIAS_LINE;
+      } else {
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~RADEON_ANTI_ALIAS_LINE;
+      }
+      break;
+
+   case GL_LINE_STIPPLE:
+      RADEON_STATECHANGE( rmesa, ctx );
+      if ( state ) {
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] |=  RADEON_PATTERN_ENABLE;
+      } else {
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~RADEON_PATTERN_ENABLE;
+      }
+      break;
+
+   case GL_COLOR_LOGIC_OP:
+      /* Same ROP enable test as in the GL_BLEND case above. */
+      RADEON_STATECHANGE( rmesa, ctx );
+      if ( (ctx->Color.ColorLogicOpEnabled || (ctx->Color.BlendEnabled
+	    && ctx->Color.BlendEquationRGB == GL_LOGIC_OP)) ) {
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  RADEON_ROP_ENABLE;
+      } else {
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~RADEON_ROP_ENABLE;
+      }
+      break;
+      
+   case GL_NORMALIZE:
+      RADEON_STATECHANGE( rmesa, tcl );
+      if ( state ) {
+	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] |=  RADEON_NORMALIZE_NORMALS;
+      } else {
+	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &= ~RADEON_NORMALIZE_NORMALS;
+      }
+      break;
+
+   case GL_POLYGON_OFFSET_POINT:
+      RADEON_STATECHANGE( rmesa, set );
+      if ( state ) {
+	 rmesa->hw.set.cmd[SET_SE_CNTL] |=  RADEON_ZBIAS_ENABLE_POINT;
+      } else {
+	 rmesa->hw.set.cmd[SET_SE_CNTL] &= ~RADEON_ZBIAS_ENABLE_POINT;
+      }
+      break;
+
+   case GL_POLYGON_OFFSET_LINE:
+      RADEON_STATECHANGE( rmesa, set );
+      if ( state ) {
+	 rmesa->hw.set.cmd[SET_SE_CNTL] |=  RADEON_ZBIAS_ENABLE_LINE;
+      } else {
+	 rmesa->hw.set.cmd[SET_SE_CNTL] &= ~RADEON_ZBIAS_ENABLE_LINE;
+      }
+      break;
+
+   case GL_POLYGON_OFFSET_FILL:
+      RADEON_STATECHANGE( rmesa, set );
+      if ( state ) {
+	 rmesa->hw.set.cmd[SET_SE_CNTL] |=  RADEON_ZBIAS_ENABLE_TRI;
+      } else {
+	 rmesa->hw.set.cmd[SET_SE_CNTL] &= ~RADEON_ZBIAS_ENABLE_TRI;
+      }
+      break;
+
+   case GL_POLYGON_SMOOTH:
+      RADEON_STATECHANGE( rmesa, ctx );
+      if ( state ) {
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] |=  RADEON_ANTI_ALIAS_POLY;
+      } else {
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~RADEON_ANTI_ALIAS_POLY;
+      }
+      break;
+
+   case GL_POLYGON_STIPPLE:
+      RADEON_STATECHANGE(rmesa, ctx );
+      if ( state ) {
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] |=  RADEON_STIPPLE_ENABLE;
+      } else {
+	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~RADEON_STIPPLE_ENABLE;
+      }
+      break;
+
+   case GL_RESCALE_NORMAL_EXT: {
+      /* The hardware bit follows the GL state when eye coordinates are
+       * needed and its inverse otherwise -- the same rule that
+       * radeonLightingSpaceChange() applies.
+       */
+      GLboolean tmp = ctx->_NeedEyeCoords ? state : !state;
+      RADEON_STATECHANGE( rmesa, tcl );
+      if ( tmp ) {
+	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] |=  RADEON_RESCALE_NORMALS;
+      } else {
+	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &= ~RADEON_RESCALE_NORMALS;
+      }
+      break;
+   }
+
+   case GL_SCISSOR_TEST:
+      RADEON_FIREVERTICES( rmesa );
+      rmesa->state.scissor.enabled = state;
+      radeonUpdateScissor( ctx );
+      break;
+
+   case GL_STENCIL_TEST:
+      /* Without a hardware stencil buffer, enabling the stencil test
+       * raises RADEON_FALLBACK_STENCIL instead.
+       */
+      if ( rmesa->state.stencil.hwBuffer ) {
+	 RADEON_STATECHANGE( rmesa, ctx );
+	 if ( state ) {
+	    rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  RADEON_STENCIL_ENABLE;
+	 } else {
+	    rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~RADEON_STENCIL_ENABLE;
+	 }
+      } else {
+	 FALLBACK( rmesa, RADEON_FALLBACK_STENCIL, state );
+      }
+      break;
+
+   case GL_TEXTURE_GEN_Q:
+   case GL_TEXTURE_GEN_R:
+   case GL_TEXTURE_GEN_S:
+   case GL_TEXTURE_GEN_T:
+      /* Picked up in radeonUpdateTextureState.
+       */
+      rmesa->recheck_texgen[ctx->Texture.CurrentUnit] = GL_TRUE; 
+      break;
+
+   case GL_COLOR_SUM_EXT:
+      radeonUpdateSpecular ( ctx );
+      break;
+
+   default:
+      /* Capability not handled by this driver hook. */
+      return;
+   }
+}
+
+
+/**
+ * Driver hook invoked when the lighting space changes.
+ *
+ * Re-derives RADEON_RESCALE_NORMALS in TCL_LIGHT_MODEL_CTL: the bit
+ * follows ctx->Transform.RescaleNormals when eye coordinates are needed
+ * and its inverse otherwise.
+ */
+static void radeonLightingSpaceChange( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLboolean rescale;
+
+   RADEON_STATECHANGE( rmesa, tcl );
+
+   if (RADEON_DEBUG & DEBUG_STATE) {
+      fprintf(stderr, "%s %d BEFORE %x\n", __FUNCTION__, ctx->_NeedEyeCoords,
+              rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL]);
+   }
+
+   rescale = ctx->_NeedEyeCoords ? ctx->Transform.RescaleNormals
+                                 : !ctx->Transform.RescaleNormals;
+
+   if ( rescale )
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] |=  RADEON_RESCALE_NORMALS;
+   else
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &= ~RADEON_RESCALE_NORMALS;
+
+   if (RADEON_DEBUG & DEBUG_STATE) {
+      fprintf(stderr, "%s %d AFTER %x\n", __FUNCTION__, ctx->_NeedEyeCoords,
+              rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL]);
+   }
+}
+
+/* =============================================================
+ * Deferred state management - matrices, textures, other?
+ */
+
+
+/**
+ * Write the texture matrix for one texture unit into its hardware
+ * matrix atom.
+ *
+ * The matrix is read from rmesa->tmpmat[unit] and stored transposed,
+ * with row/column swaps depending on the enabled texture target.
+ *
+ * \param rmesa     radeon context.
+ * \param unit      texture unit index; selects atom mat[TEXMAT_0 + unit].
+ * \param swapcols  when true, and the unit has neither a 3D nor a cube
+ *                  target really enabled, columns 3 and 4 are swapped
+ *                  as well and the unit's bit is set in
+ *                  rmesa->TexMatColSwap.
+ */
+void radeonUploadTexMatrix( radeonContextPtr rmesa,
+			    int unit, GLboolean swapcols )
+{
+/* Here's how this works: on r100, only 3 tex coords can be submitted, so the
+   vector looks like this probably: (s t r|q 0) (not sure if the last coord
+   is hardwired to 0, could be 1 too). Interestingly, it actually looks like
+   texgen generates all 4 coords, at least tests with projtex indicated that.
+   So: if we need the q coord in the end (solely determined by the texture
+   target, i.e. 2d / 1d / texrect targets) we swap the third and 4th row.
+   Additionally, if we don't have texgen but 4 tex coords submitted, we swap
+   column 3 and 4 (for the 2d / 1d / texrect targets) since the q coord
+   will get submitted in the "wrong", i.e. 3rd, slot.
+   If an app submits 3 coords for 2d targets, we assume it is saving on vertex
+   size and using the texture matrix to swap the r and q coords around (ut2k3
+   does exactly that), so we don't need the 3rd / 4th column swap - still need
+   the 3rd / 4th row swap of course. This will potentially break for apps which
+   use TexCoord3x just for fun. Additionally, it will never work if an app uses
+   an "advanced" texture matrix and relies on all 4 texcoord inputs to generate
+   the maximum needed 3. This seems impossible to do with hw tcl on r100, and
+   incredibly hard to detect so we can't just fallback in such a case. Assume
+   it never happens... - rs
+*/
+
+   int idx = TEXMAT_0 + unit;
+   float *dest = ((float *)RADEON_DB_STATE( mat[idx] )) + MAT_ELT_0;
+   int i;
+   /* Only _ReallyEnabled is needed, so look at the unit in place rather
+    * than copying the whole struct gl_texture_unit onto the stack.
+    */
+   const struct gl_texture_unit *tUnit = &rmesa->glCtx->Texture.Unit[unit];
+   GLfloat *src = rmesa->tmpmat[unit].m;
+
+   rmesa->TexMatColSwap &= ~(1 << unit);
+   if ((tUnit->_ReallyEnabled & (TEXTURE_3D_BIT | TEXTURE_CUBE_BIT)) == 0) {
+      if (swapcols) {
+	 rmesa->TexMatColSwap |= 1 << unit;
+	 /* attention some elems are swapped 2 times! */
+	 *dest++ = src[0];
+	 *dest++ = src[4];
+	 *dest++ = src[12];
+	 *dest++ = src[8];
+	 *dest++ = src[1];
+	 *dest++ = src[5];
+	 *dest++ = src[13];
+	 *dest++ = src[9];
+	 *dest++ = src[2];
+	 *dest++ = src[6];
+	 *dest++ = src[15];
+	 *dest++ = src[11];
+	 /* those last 4 are probably never used */
+	 *dest++ = src[3];
+	 *dest++ = src[7];
+	 *dest++ = src[14];
+	 *dest++ = src[10];
+      }
+      else {
+	 /* Rows 0 and 1 in order, then rows 3 and 2: the 3rd / 4th row
+	  * swap without the column swap.
+	  */
+	 for (i = 0; i < 2; i++) {
+	    *dest++ = src[i];
+	    *dest++ = src[i+4];
+	    *dest++ = src[i+8];
+	    *dest++ = src[i+12];
+	 }
+	 for (i = 3; i >= 2; i--) {
+	    *dest++ = src[i];
+	    *dest++ = src[i+4];
+	    *dest++ = src[i+8];
+	    *dest++ = src[i+12];
+	 }
+      }
+   }
+   else {
+      /* 3D / cube targets: plain transpose, no swaps. */
+      for (i = 0 ; i < 4 ; i++) {
+	 *dest++ = src[i];
+	 *dest++ = src[i+4];
+	 *dest++ = src[i+8];
+	 *dest++ = src[i+12];
+      }
+   }
+
+   RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.mat[idx] );
+}
+
+
+/**
+ * Store the transpose of the 4x4 matrix \p src into hardware matrix
+ * atom \p idx and flag the atom through RADEON_DB_STATECHANGE.
+ */
+static void upload_matrix( radeonContextPtr rmesa, GLfloat *src, int idx )
+{
+   float *dest = ((float *)RADEON_DB_STATE( mat[idx] ))+MAT_ELT_0;
+   int row, col;
+
+   /* dest[row][col] = src[col][row] */
+   for (row = 0; row < 4; row++) {
+      for (col = 0; col < 4; col++) {
+         dest[row * 4 + col] = src[col * 4 + row];
+      }
+   }
+
+   RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.mat[idx] );
+}
+
+/**
+ * Copy the 16 floats of \p src unchanged (no transpose) into hardware
+ * matrix atom \p idx and flag the atom through RADEON_DB_STATECHANGE.
+ */
+static void upload_matrix_t( radeonContextPtr rmesa, GLfloat *src, int idx )
+{
+   float *dest = ((float *)RADEON_DB_STATE( mat[idx] ))+MAT_ELT_0;
+
+   memcpy(dest, src, 16 * sizeof(*dest));
+
+   RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.mat[idx] );
+}
+
+
+/**
+ * Recompute the per-unit texture matrices and the related TCL state.
+ *
+ * For every really-enabled unit that has a non-identity texture matrix
+ * and/or an active texgen matrix, the combined matrix is built in
+ * rmesa->tmpmat[unit] and uploaded.  TCL_TEXTURE_PROC_CTL and
+ * TCL_OUTPUT_VTXSEL are rewritten only if their values change.
+ */
+static void update_texturematrix( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   GLuint vs = rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL];
+   GLuint texMatEnabled = 0;
+   GLuint tpc;
+   int unit;
+
+   rmesa->NeedTexMatrix = 0;
+   rmesa->TexMatColSwap = 0;
+
+   for (unit = 0 ; unit < ctx->Const.MaxTextureUnits; unit++) {
+      const GLboolean texgenMat =
+         (rmesa->TexGenEnabled & (RADEON_TEXMAT_0_ENABLE << unit)) != 0;
+
+      if (!ctx->Texture.Unit[unit]._ReallyEnabled)
+         continue;
+
+      if (ctx->TextureMatrixStack[unit].Top->type != MATRIX_IDENTITY) {
+         texMatEnabled |= (RADEON_TEXGEN_TEXMAT_0_ENABLE |
+                           RADEON_TEXMAT_0_ENABLE) << unit;
+
+         if (texgenMat) {
+            /* Need to preconcatenate any active texgen
+             * obj/eyeplane matrices:
+             */
+            _math_matrix_mul_matrix( &rmesa->tmpmat[unit],
+                                     ctx->TextureMatrixStack[unit].Top,
+                                     &rmesa->TexGenMatrix[unit] );
+         }
+         else {
+            _math_matrix_copy( &rmesa->tmpmat[unit],
+                               ctx->TextureMatrixStack[unit].Top );
+         }
+      }
+      else if (texgenMat) {
+         /* Identity texture matrix: the texgen matrix alone is used. */
+         _math_matrix_copy( &rmesa->tmpmat[unit], &rmesa->TexGenMatrix[unit] );
+      }
+      else {
+         /* Neither a texture matrix nor a texgen matrix: nothing to do. */
+         continue;
+      }
+
+      rmesa->NeedTexMatrix |= 1 << unit;
+      radeonUploadTexMatrix( rmesa, unit,
+                             !ctx->Texture.Unit[unit].TexGenEnabled );
+   }
+
+   tpc = (texMatEnabled | rmesa->TexGenEnabled);
+
+   /* TCL_TEX_COMPUTED_x is TCL_TEX_INPUT_x | 0x8 */
+   vs &= ~((RADEON_TCL_TEX_COMPUTED_TEX_0 << RADEON_TCL_TEX_0_OUTPUT_SHIFT) |
+           (RADEON_TCL_TEX_COMPUTED_TEX_0 << RADEON_TCL_TEX_1_OUTPUT_SHIFT) |
+           (RADEON_TCL_TEX_COMPUTED_TEX_0 << RADEON_TCL_TEX_2_OUTPUT_SHIFT));
+
+   vs |= (((tpc & RADEON_TEXGEN_TEXMAT_0_ENABLE) <<
+           (RADEON_TCL_TEX_0_OUTPUT_SHIFT + 3)) |
+          ((tpc & RADEON_TEXGEN_TEXMAT_1_ENABLE) <<
+           (RADEON_TCL_TEX_1_OUTPUT_SHIFT + 2)) |
+          ((tpc & RADEON_TEXGEN_TEXMAT_2_ENABLE) <<
+           (RADEON_TCL_TEX_2_OUTPUT_SHIFT + 1)));
+
+   /* Avoid dirtying the tcl atom when nothing changed. */
+   if (tpc != rmesa->hw.tcl.cmd[TCL_TEXTURE_PROC_CTL] ||
+       vs != rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL]) {
+
+      RADEON_STATECHANGE(rmesa, tcl);
+      rmesa->hw.tcl.cmd[TCL_TEXTURE_PROC_CTL] = tpc;
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] = vs;
+   }
+}
+
+
+/**
+ * Tell the card where to render (offset, pitch).
+ * Affected by glDrawBuffer, etc.
+ *
+ * Does nothing unless exactly the front-left or the back-left colour
+ * buffer is the draw target.
+ */
+void
+radeonUpdateDrawBuffer(GLcontext *ctx)
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   struct gl_framebuffer *fb = ctx->DrawBuffer;
+   const GLbitfield bufferMask = fb->_ColorDrawBufferMask[0];
+   driRenderbuffer *drb;
+   int attachment;
+
+   if (bufferMask == BUFFER_BIT_FRONT_LEFT) {
+      attachment = BUFFER_FRONT_LEFT;   /* draw to front */
+   }
+   else if (bufferMask == BUFFER_BIT_BACK_LEFT) {
+      attachment = BUFFER_BACK_LEFT;    /* draw to back */
+   }
+   else {
+      /* drawing to multiple buffers, or none */
+      return;
+   }
+
+   drb = (driRenderbuffer *) fb->Attachment[attachment].Renderbuffer;
+
+   assert(drb);
+   assert(drb->flippedPitch);
+
+   RADEON_STATECHANGE( rmesa, ctx );
+
+   /* Note: we used the (possibly) page-flipped values */
+   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET] =
+      ((drb->flippedOffset + rmesa->radeonScreen->fbLocation)
+       & RADEON_COLOROFFSET_MASK);
+
+   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = drb->flippedPitch;
+   if (rmesa->sarea->tiling_enabled)
+      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= RADEON_COLOR_TILE_ENABLE;
+}
+
+
+/**
+ * Bring the deferred hardware state (draw buffer, textures, matrices,
+ * lights, clip planes) up to date according to the flags accumulated in
+ * rmesa->NewGLState, then clear those flags.
+ */
+void radeonValidateState( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLuint dirty = rmesa->NewGLState;
+
+   if (dirty & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
+      radeonUpdateDrawBuffer(ctx);
+   }
+
+   if (dirty & _NEW_TEXTURE) {
+      radeonUpdateTextureState( ctx );
+      dirty |= rmesa->NewGLState; /* may add TEXTURE_MATRIX */
+   }
+
+   /* Need an event driven matrix update?
+    */
+   if (dirty & (_NEW_MODELVIEW|_NEW_PROJECTION)) {
+      upload_matrix( rmesa, ctx->_ModelProjectMatrix.m, MODEL_PROJ );
+   }
+
+   /* Need these for lighting (shouldn't upload otherwise)
+    */
+   if (dirty & _NEW_MODELVIEW) {
+      upload_matrix( rmesa, ctx->ModelviewMatrixStack.Top->m, MODEL );
+      upload_matrix_t( rmesa, ctx->ModelviewMatrixStack.Top->inv, MODEL_IT );
+   }
+
+   /* Does this need to be triggered on eg. modelview for
+    * texgen-derived objplane/eyeplane matrices?
+    */
+   if (dirty & _NEW_TEXTURE_MATRIX) {
+      update_texturematrix( ctx );
+   }
+
+   if (dirty & (_NEW_LIGHT|_NEW_MODELVIEW|_MESA_NEW_NEED_EYE_COORDS)) {
+      update_light( ctx );
+   }
+
+   /* emit all active clip planes if projection matrix changes.
+    */
+   if ((dirty & _NEW_PROJECTION) && ctx->Transform.ClipPlanesEnabled) {
+      radeonUpdateClipPlanes( ctx );
+   }
+
+   rmesa->NewGLState = 0;
+}
+
+
+/**
+ * Driver UpdateState hook: forward the new-state flags to the swrast,
+ * swsetup, vbo, tnl and array-element modules, and accumulate them in
+ * the driver's NewGLState for radeonValidateState().
+ */
+static void radeonInvalidateState( GLcontext *ctx, GLuint new_state )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+
+   _swrast_InvalidateState( ctx, new_state );
+   _swsetup_InvalidateState( ctx, new_state );
+   _vbo_InvalidateState( ctx, new_state );
+   _tnl_InvalidateState( ctx, new_state );
+   _ae_invalidate_state( ctx, new_state );
+
+   rmesa->NewGLState |= new_state;
+}
+
+
+/* A hack.  Need a faster way to find this out.
+ *
+ * Returns GL_TRUE if any material attribute in the range
+ * [_TNL_ATTRIB_MAT_FRONT_AMBIENT, _TNL_ATTRIB_MAT_BACK_INDEXES) of the
+ * current vertex buffer has a non-zero stride, GL_FALSE otherwise.
+ */
+static GLboolean check_material( GLcontext *ctx )
+{
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   GLint attr = _TNL_ATTRIB_MAT_FRONT_AMBIENT;
+
+   while (attr < _TNL_ATTRIB_MAT_BACK_INDEXES) {
+      if (tnl->vb.AttribPtr[attr] &&
+          tnl->vb.AttribPtr[attr]->stride) {
+         return GL_TRUE;
+      }
+      attr++;
+   }
+
+   return GL_FALSE;
+}
+      
+
+/**
+ * TNL RunPipeline hook.
+ *
+ * Validates pending driver state, then runs the tnl pipeline.  While the
+ * pipeline runs with lighting enabled and check_material() true, the
+ * RADEON_TCL_FALLBACK_MATERIAL fallback is held active.
+ */
+static void radeonWrapRunPipeline( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLboolean has_material = GL_FALSE;
+
+   if (0)
+      fprintf(stderr, "%s, newstate: %x\n", __FUNCTION__, rmesa->NewGLState);
+
+   /* Validate state:
+    */
+   if (rmesa->NewGLState) {
+      radeonValidateState( ctx );
+   }
+
+   if (ctx->Light.Enabled && check_material( ctx )) {
+      has_material = GL_TRUE;
+      TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_MATERIAL, GL_TRUE );
+   }
+
+   /* Run the pipeline.
+    */
+   _tnl_run_pipeline( ctx );
+
+   /* Drop the temporary fallback again. */
+   if (has_material) {
+      TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_MATERIAL, GL_FALSE );
+   }
+}
+
+
+/* Initialize the driver's state functions.
+ * Many of the ctx->Driver functions might have been initialized to
+ * software defaults in the earlier _mesa_init_driver_functions() call.
+ * The hooks set to NULL below are explicitly left without a driver
+ * implementation.
+ */
+void radeonInitStateFuncs( GLcontext *ctx )
+{
+   struct dd_function_table *drv = &ctx->Driver;
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+
+   /* State tracking */
+   drv->UpdateState           = radeonInvalidateState;
+   drv->LightingSpaceChange   = radeonLightingSpaceChange;
+   drv->Enable                = radeonEnable;
+   drv->RenderMode            = radeonRenderMode;
+   drv->Hint                  = NULL;
+
+   /* Buffers, clears, viewport and scissor */
+   drv->DrawBuffer            = radeonDrawBuffer;
+   drv->ReadBuffer            = radeonReadBuffer;
+   drv->ClearColor            = radeonClearColor;
+   drv->ClearDepth            = radeonClearDepth;
+   drv->ClearIndex            = NULL;
+   drv->ClearStencil          = radeonClearStencil;
+   drv->Viewport              = radeonViewport;
+   drv->DepthRange            = radeonDepthRange;
+   drv->Scissor               = radeonScissor;
+
+   /* Per-fragment operations */
+   drv->AlphaFunc             = radeonAlphaFunc;
+   drv->BlendEquationSeparate = radeonBlendEquationSeparate;
+   drv->BlendFuncSeparate     = radeonBlendFuncSeparate;
+   drv->ColorMask             = radeonColorMask;
+   drv->IndexMask             = NULL;
+   drv->DepthFunc             = radeonDepthFunc;
+   drv->DepthMask             = radeonDepthMask;
+   drv->LogicOpcode           = radeonLogicOpCode;
+   drv->StencilFuncSeparate   = radeonStencilFuncSeparate;
+   drv->StencilMaskSeparate   = radeonStencilMaskSeparate;
+   drv->StencilOpSeparate     = radeonStencilOpSeparate;
+   drv->Fogfv                 = radeonFogfv;
+
+   /* Geometry, lighting and rasterization */
+   drv->ClipPlane             = radeonClipPlane;
+   drv->CullFace              = radeonCullFace;
+   drv->FrontFace             = radeonFrontFace;
+   drv->LightModelfv          = radeonLightModelfv;
+   drv->Lightfv               = radeonLightfv;
+   drv->ShadeModel            = radeonShadeModel;
+   drv->LineStipple           = radeonLineStipple;
+   drv->LineWidth             = radeonLineWidth;
+   drv->PolygonMode           = radeonPolygonMode;
+   drv->PolygonOffset         = radeonPolygonOffset;
+   drv->PolygonStipple        = radeonPolygonStipple;
+
+   /* TNL module hooks */
+   tnl->Driver.NotifyMaterialChange = radeonUpdateMaterial;
+   tnl->Driver.RunPipeline          = radeonWrapRunPipeline;
+}
diff --git a/radeon/radeon_state.h b/radeon/radeon_state.h
new file mode 100644
index 0000000..ad7db3b
--- /dev/null
+++ b/radeon/radeon_state.h
@@ -0,0 +1,77 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_state.h,v 1.5 2002/11/05 17:46:09 tsi Exp $ */
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Kevin E. Martin <martin@valinux.com>
+ *   Gareth Hughes <gareth@valinux.com>
+ *
+ */
+
+#ifndef __RADEON_STATE_H__
+#define __RADEON_STATE_H__
+
+#include "radeon_context.h"
+
+extern void radeonInitState( radeonContextPtr rmesa );
+extern void radeonInitStateFuncs( GLcontext *ctx );
+
+extern void radeonUpdateMaterial( GLcontext *ctx );
+
+extern void radeonSetCliprects( radeonContextPtr rmesa );
+extern void radeonRecalcScissorRects( radeonContextPtr rmesa );
+extern void radeonUpdateViewportOffset( GLcontext *ctx );
+extern void radeonUpdateWindow( GLcontext *ctx );
+extern void radeonUpdateDrawBuffer( GLcontext *ctx );
+extern void radeonUploadTexMatrix( radeonContextPtr rmesa,
+				   int unit, GLboolean swapcols );
+
+extern void radeonValidateState( GLcontext *ctx );
+
+extern void radeonPrintDirty( radeonContextPtr rmesa,
+			      const char *msg );
+
+/* FALLBACK(): call radeonFallback() on rmesa's glCtx; the "if ( 0 )" fprintf is a disabled debug trace. */
+extern void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode );
+#define FALLBACK( rmesa, bit, mode ) do {				\
+   if ( 0 ) fprintf( stderr, "FALLBACK in %s: #%d=%d\n",		\
+		     __FUNCTION__, bit, mode );				\
+   radeonFallback( (rmesa)->glCtx, bit, mode );				\
+} while (0)
+
+/* Matrix slot numbers; the order matches the hw.mat[0..5] atoms set up in radeonInitState() - presumably hw.mat[] indices, TODO confirm. */
+#define MODEL_PROJ 0	/* shifted by RADEON_MODELPROJECT_0_SHIFT into TCL_MATRIX_SELECT_1 */
+#define MODEL      1	/* shifted by RADEON_MODELVIEW_0_SHIFT into TCL_MATRIX_SELECT_0 */
+#define MODEL_IT   2	/* shifted by RADEON_IT_MODELVIEW_0_SHIFT into TCL_MATRIX_SELECT_0 */
+#define TEXMAT_0   3	/* shifted by RADEON_TEXMAT_0_SHIFT into TCL_MATRIX_SELECT_1 */
+#define TEXMAT_1   4	/* shifted by RADEON_TEXMAT_1_SHIFT into TCL_MATRIX_SELECT_1 */
+#define TEXMAT_2   5	/* shifted by RADEON_TEXMAT_2_SHIFT into TCL_MATRIX_SELECT_1 */
+
+#endif
diff --git a/radeon/radeon_state_init.c b/radeon/radeon_state_init.c
new file mode 100644
index 0000000..5fc34f0
--- /dev/null
+++ b/radeon/radeon_state_init.c
@@ -0,0 +1,618 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_state_init.c,v 1.3 2003/02/22 06:21:11 dawes Exp $ */
+/*
+ * Copyright 2000, 2001 VA Linux Systems Inc., Fremont, California.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Gareth Hughes <gareth@valinux.com>
+ *    Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "api_arrayelt.h"
+
+#include "swrast/swrast.h"
+#include "vbo/vbo.h"
+#include "tnl/tnl.h"
+#include "tnl/t_pipeline.h"
+#include "swrast_setup/swrast_setup.h"
+
+#include "radeon_context.h"
+#include "radeon_ioctl.h"
+#include "radeon_state.h"
+#include "radeon_tcl.h"
+#include "radeon_tex.h"
+#include "radeon_swtcl.h"
+
+#include "xmlpool.h"
+
+/* =============================================================
+ * State initialization
+ */
+
+/* Debug aid: write msg, then the name of every dirty state atom (all
+ * atoms when hw.all_dirty is set), then a newline, to stderr.
+ */
+void radeonPrintDirty( radeonContextPtr rmesa, const char *msg )
+{
+   struct radeon_state_atom *l;
+   /* msg is data, not a format: a '%' in it must be printed verbatim. */
+   fprintf(stderr, "%s: ", msg);
+   foreach(l, &rmesa->hw.atomlist) {
+      if (l->dirty || rmesa->hw.all_dirty)
+	 fprintf(stderr, "%s, ", l->name);
+   }
+   fprintf(stderr, "\n");
+}
+/* Return the header word of a RADEON_CMD_PACKET command for packet `id'. */
+static int cmdpkt( int id )
+{
+   drm_radeon_cmd_header_t hdr;
+   hdr.i = 0;
+   hdr.packet.cmd_type = RADEON_CMD_PACKET;
+   hdr.packet.packet_id = id;
+   return hdr.i;
+}
+/* Return the header word of a RADEON_CMD_VECTORS command (offset, stride, count). */
+static int cmdvec( int offset, int stride, int count )
+{
+   drm_radeon_cmd_header_t hdr;
+   hdr.i = 0;
+   hdr.vectors.cmd_type = RADEON_CMD_VECTORS;
+   hdr.vectors.offset = offset;
+   hdr.vectors.stride = stride;
+   hdr.vectors.count = count;
+   return hdr.i;
+}
+/* Return the header word of a RADEON_CMD_SCALARS command (offset, stride, count). */
+static int cmdscl( int offset, int stride, int count )
+{
+   drm_radeon_cmd_header_t hdr;
+   hdr.i = 0;
+   hdr.scalars.cmd_type = RADEON_CMD_SCALARS;
+   hdr.scalars.offset = offset;
+   hdr.scalars.stride = stride;
+   hdr.scalars.count = count;
+   return hdr.i;
+}
+/* CHECK( NM, FLAG ): define check_NM( ctx ), GL_TRUE/GL_FALSE according to whether FLAG is non-zero (no bitmask narrowing). */
+#define CHECK( NM, FLAG )			\
+static GLboolean check_##NM( GLcontext *ctx )	\
+{						\
+   return (FLAG) != 0;				\
+}
+/* TCL_CHECK( NM, FLAG ): define check_NM( ctx ), true only when rmesa->TclFallback is zero and FLAG is non-zero. */
+#define TCL_CHECK( NM, FLAG )				\
+static GLboolean check_##NM( GLcontext *ctx )		\
+{							\
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);	\
+   return !rmesa->TclFallback && (FLAG);		\
+}
+
+/* check_<name>() predicates; radeonInitState() installs them as each atom's .check via ALLOC_STATE(). */
+CHECK( always, GL_TRUE )
+CHECK( never, GL_FALSE )
+CHECK( tex0, ctx->Texture.Unit[0]._ReallyEnabled )
+CHECK( tex1, ctx->Texture.Unit[1]._ReallyEnabled )
+/* tex2 tests the context-wide _EnabledUnits rather than Unit[2]._ReallyEnabled: need this for the cubic_map on disabled unit 2 bug, maybe r100 only? */
+CHECK( tex2, ctx->Texture._EnabledUnits )
+CHECK( cube0, (ctx->Texture.Unit[0]._ReallyEnabled & TEXTURE_CUBE_BIT))
+CHECK( cube1, (ctx->Texture.Unit[1]._ReallyEnabled & TEXTURE_CUBE_BIT))
+CHECK( cube2, (ctx->Texture.Unit[2]._ReallyEnabled & TEXTURE_CUBE_BIT))
+CHECK( fog, ctx->Fog.Enabled )
+TCL_CHECK( tcl, GL_TRUE )
+TCL_CHECK( tcl_tex0, ctx->Texture.Unit[0]._ReallyEnabled )
+TCL_CHECK( tcl_tex1, ctx->Texture.Unit[1]._ReallyEnabled )
+TCL_CHECK( tcl_tex2, ctx->Texture.Unit[2]._ReallyEnabled )
+TCL_CHECK( tcl_lighting, ctx->Light.Enabled )
+TCL_CHECK( tcl_eyespace_or_lighting, ctx->_NeedEyeCoords || ctx->Light.Enabled )
+TCL_CHECK( tcl_lit0, ctx->Light.Enabled && ctx->Light.Light[0].Enabled )
+TCL_CHECK( tcl_lit1, ctx->Light.Enabled && ctx->Light.Light[1].Enabled )
+TCL_CHECK( tcl_lit2, ctx->Light.Enabled && ctx->Light.Light[2].Enabled )
+TCL_CHECK( tcl_lit3, ctx->Light.Enabled && ctx->Light.Light[3].Enabled )
+TCL_CHECK( tcl_lit4, ctx->Light.Enabled && ctx->Light.Light[4].Enabled )
+TCL_CHECK( tcl_lit5, ctx->Light.Enabled && ctx->Light.Light[5].Enabled )
+TCL_CHECK( tcl_lit6, ctx->Light.Enabled && ctx->Light.Light[6].Enabled )
+TCL_CHECK( tcl_lit7, ctx->Light.Enabled && ctx->Light.Light[7].Enabled )
+TCL_CHECK( tcl_ucp0, (ctx->Transform.ClipPlanesEnabled & 0x1) )
+TCL_CHECK( tcl_ucp1, (ctx->Transform.ClipPlanesEnabled & 0x2) )
+TCL_CHECK( tcl_ucp2, (ctx->Transform.ClipPlanesEnabled & 0x4) )
+TCL_CHECK( tcl_ucp3, (ctx->Transform.ClipPlanesEnabled & 0x8) )
+TCL_CHECK( tcl_ucp4, (ctx->Transform.ClipPlanesEnabled & 0x10) )
+TCL_CHECK( tcl_ucp5, (ctx->Transform.ClipPlanesEnabled & 0x20) )
+TCL_CHECK( tcl_eyespace_or_fog, ctx->_NeedEyeCoords || ctx->Fog.Enabled ) 
+/* Predicates for the txr[] atoms: unit has TEXTURE_RECT_BIT set in _ReallyEnabled. */
+CHECK( txr0, (ctx->Texture.Unit[0]._ReallyEnabled & TEXTURE_RECT_BIT))
+CHECK( txr1, (ctx->Texture.Unit[1]._ReallyEnabled & TEXTURE_RECT_BIT))
+CHECK( txr2, (ctx->Texture.Unit[2]._ReallyEnabled & TEXTURE_RECT_BIT))
+
+
+
+/* Initialize the context's hardware state: allocate every state atom, fill in its packet
+ * headers and initial register values; calls exit() on an unsupported cpp or depth size. */
+void radeonInitState( radeonContextPtr rmesa )
+{
+   GLcontext *ctx = rmesa->glCtx;
+   GLuint color_fmt, depth_fmt, i;
+   GLint drawPitch, drawOffset;
+   /* Map the screen's cpp (2 or 4) to the hardware color buffer format. */
+   switch ( rmesa->radeonScreen->cpp ) {
+   case 2:
+      color_fmt = RADEON_COLOR_FORMAT_RGB565;
+      break;
+   case 4:
+      color_fmt = RADEON_COLOR_FORMAT_ARGB8888;
+      break;
+   default:
+      fprintf( stderr, "Error: Unsupported pixel depth... exiting\n" );
+      exit( -1 );
+   }
+
+   rmesa->state.color.clear = 0x00000000;
+   /* Per depth size (16 or 24 bits): depth clear value, depth scale, depth format, stencil clear value. */
+   switch ( ctx->Visual.depthBits ) {
+   case 16:
+      rmesa->state.depth.clear = 0x0000ffff;
+      rmesa->state.depth.scale = 1.0 / (GLfloat)0xffff;
+      depth_fmt = RADEON_DEPTH_FORMAT_16BIT_INT_Z;
+      rmesa->state.stencil.clear = 0x00000000;
+      break;
+   case 24:
+      rmesa->state.depth.clear = 0x00ffffff;
+      rmesa->state.depth.scale = 1.0 / (GLfloat)0xffffff;
+      depth_fmt = RADEON_DEPTH_FORMAT_24BIT_INT_Z;
+      rmesa->state.stencil.clear = 0xffff0000;
+      break;
+   default:
+      fprintf( stderr, "Error: Unsupported depth %d... exiting\n",
+	       ctx->Visual.depthBits );
+      exit( -1 );
+   }
+
+   /* Only have hw stencil when depth buffer is 24 bits deep */
+   rmesa->state.stencil.hwBuffer = ( ctx->Visual.stencilBits > 0 &&
+				     ctx->Visual.depthBits == 24 );
+
+   rmesa->Fallback = 0;
+   /* Draw to the back buffer when double-buffered and sarea->pfCurrentPage is 0, else to the front buffer. */
+   if ( ctx->Visual.doubleBufferMode && rmesa->sarea->pfCurrentPage == 0 ) {
+      drawOffset = rmesa->radeonScreen->backOffset;
+      drawPitch  = rmesa->radeonScreen->backPitch;
+   } else {
+      drawOffset = rmesa->radeonScreen->frontOffset;
+      drawPitch  = rmesa->radeonScreen->frontPitch;
+   }
+
+   rmesa->hw.max_state_size = 0;
+   /* ALLOC_STATE: CALLOC cmd and lastcmd (SZ ints each) for hw.ATOM, set name/is_tcl/check, mark dirty, grow max_state_size. */
+#define ALLOC_STATE( ATOM, CHK, SZ, NM, FLAG )				\
+   do {								\
+      rmesa->hw.ATOM.cmd_size = SZ;				\
+      rmesa->hw.ATOM.cmd = (int *)CALLOC(SZ * sizeof(int));	\
+      rmesa->hw.ATOM.lastcmd = (int *)CALLOC(SZ * sizeof(int));	\
+      rmesa->hw.ATOM.name = NM;					\
+      rmesa->hw.ATOM.is_tcl = FLAG;					\
+      rmesa->hw.ATOM.check = check_##CHK;				\
+      rmesa->hw.ATOM.dirty = GL_TRUE;				\
+      rmesa->hw.max_state_size += SZ * sizeof(int);		\
+   } while (0)
+      
+      
+   /* Allocate state buffers:
+    */
+   ALLOC_STATE( ctx, always, CTX_STATE_SIZE, "CTX/context", 0 );
+   ALLOC_STATE( lin, always, LIN_STATE_SIZE, "LIN/line", 0 );
+   ALLOC_STATE( msk, always, MSK_STATE_SIZE, "MSK/mask", 0 );
+   ALLOC_STATE( vpt, always, VPT_STATE_SIZE, "VPT/viewport", 0 );
+   ALLOC_STATE( set, always, SET_STATE_SIZE, "SET/setup", 0 );
+   ALLOC_STATE( msc, always, MSC_STATE_SIZE, "MSC/misc", 0 );
+   ALLOC_STATE( zbs, always, ZBS_STATE_SIZE, "ZBS/zbias", 0 );
+   ALLOC_STATE( tcl, always, TCL_STATE_SIZE, "TCL/tcl", 1 );
+   ALLOC_STATE( mtl, tcl_lighting, MTL_STATE_SIZE, "MTL/material", 1 );
+   ALLOC_STATE( grd, always, GRD_STATE_SIZE, "GRD/guard-band", 1 );
+   ALLOC_STATE( fog, fog, FOG_STATE_SIZE, "FOG/fog", 1 );
+   ALLOC_STATE( glt, tcl_lighting, GLT_STATE_SIZE, "GLT/light-global", 1 );
+   ALLOC_STATE( eye, tcl_lighting, EYE_STATE_SIZE, "EYE/eye-vector", 1 );
+   ALLOC_STATE( tex[0], tex0, TEX_STATE_SIZE, "TEX/tex-0", 0 );
+   ALLOC_STATE( tex[1], tex1, TEX_STATE_SIZE, "TEX/tex-1", 0 );
+   ALLOC_STATE( tex[2], tex2, TEX_STATE_SIZE, "TEX/tex-2", 0 );
+   if (rmesa->radeonScreen->drmSupportsCubeMapsR100)
+   {
+      ALLOC_STATE( cube[0], cube0, CUBE_STATE_SIZE, "CUBE/cube-0", 0 );
+      ALLOC_STATE( cube[1], cube1, CUBE_STATE_SIZE, "CUBE/cube-1", 0 );
+      ALLOC_STATE( cube[2], cube2, CUBE_STATE_SIZE, "CUBE/cube-2", 0 );
+   }
+   else
+   {
+      ALLOC_STATE( cube[0], never, CUBE_STATE_SIZE, "CUBE/cube-0", 0 );
+      ALLOC_STATE( cube[1], never, CUBE_STATE_SIZE, "CUBE/cube-1", 0 );
+      ALLOC_STATE( cube[2], never, CUBE_STATE_SIZE, "CUBE/cube-2", 0 );
+   }
+   ALLOC_STATE( mat[0], tcl, MAT_STATE_SIZE, "MAT/modelproject", 1 );
+   ALLOC_STATE( mat[1], tcl_eyespace_or_fog, MAT_STATE_SIZE, "MAT/modelview", 1 );
+   ALLOC_STATE( mat[2], tcl_eyespace_or_lighting, MAT_STATE_SIZE, "MAT/it-modelview", 1 );
+   ALLOC_STATE( mat[3], tcl_tex0, MAT_STATE_SIZE, "MAT/texmat0", 1 );
+   ALLOC_STATE( mat[4], tcl_tex1, MAT_STATE_SIZE, "MAT/texmat1", 1 );
+   ALLOC_STATE( mat[5], tcl_tex2, MAT_STATE_SIZE, "MAT/texmat2", 1 );
+   ALLOC_STATE( ucp[0], tcl_ucp0, UCP_STATE_SIZE, "UCP/userclip-0", 1 );
+   ALLOC_STATE( ucp[1], tcl_ucp1, UCP_STATE_SIZE, "UCP/userclip-1", 1 );
+   ALLOC_STATE( ucp[2], tcl_ucp2, UCP_STATE_SIZE, "UCP/userclip-2", 1 );
+   ALLOC_STATE( ucp[3], tcl_ucp3, UCP_STATE_SIZE, "UCP/userclip-3", 1 );
+   ALLOC_STATE( ucp[4], tcl_ucp4, UCP_STATE_SIZE, "UCP/userclip-4", 1 );
+   ALLOC_STATE( ucp[5], tcl_ucp5, UCP_STATE_SIZE, "UCP/userclip-5", 1 );
+   ALLOC_STATE( lit[0], tcl_lit0, LIT_STATE_SIZE, "LIT/light-0", 1 );
+   ALLOC_STATE( lit[1], tcl_lit1, LIT_STATE_SIZE, "LIT/light-1", 1 );
+   ALLOC_STATE( lit[2], tcl_lit2, LIT_STATE_SIZE, "LIT/light-2", 1 );
+   ALLOC_STATE( lit[3], tcl_lit3, LIT_STATE_SIZE, "LIT/light-3", 1 );
+   ALLOC_STATE( lit[4], tcl_lit4, LIT_STATE_SIZE, "LIT/light-4", 1 );
+   ALLOC_STATE( lit[5], tcl_lit5, LIT_STATE_SIZE, "LIT/light-5", 1 );
+   ALLOC_STATE( lit[6], tcl_lit6, LIT_STATE_SIZE, "LIT/light-6", 1 );
+   ALLOC_STATE( lit[7], tcl_lit7, LIT_STATE_SIZE, "LIT/light-7", 1 );
+   ALLOC_STATE( txr[0], txr0, TXR_STATE_SIZE, "TXR/txr-0", 0 );
+   ALLOC_STATE( txr[1], txr1, TXR_STATE_SIZE, "TXR/txr-1", 0 );
+   ALLOC_STATE( txr[2], txr2, TXR_STATE_SIZE, "TXR/txr-2", 0 );
+
+   radeonSetUpAtomList( rmesa );
+
+   /* Fill in the packet headers:
+    */
+   rmesa->hw.ctx.cmd[CTX_CMD_0] = cmdpkt(RADEON_EMIT_PP_MISC);
+   rmesa->hw.ctx.cmd[CTX_CMD_1] = cmdpkt(RADEON_EMIT_PP_CNTL);
+   rmesa->hw.ctx.cmd[CTX_CMD_2] = cmdpkt(RADEON_EMIT_RB3D_COLORPITCH);
+   rmesa->hw.lin.cmd[LIN_CMD_0] = cmdpkt(RADEON_EMIT_RE_LINE_PATTERN);
+   rmesa->hw.lin.cmd[LIN_CMD_1] = cmdpkt(RADEON_EMIT_SE_LINE_WIDTH);
+   rmesa->hw.msk.cmd[MSK_CMD_0] = cmdpkt(RADEON_EMIT_RB3D_STENCILREFMASK);
+   rmesa->hw.vpt.cmd[VPT_CMD_0] = cmdpkt(RADEON_EMIT_SE_VPORT_XSCALE);
+   rmesa->hw.set.cmd[SET_CMD_0] = cmdpkt(RADEON_EMIT_SE_CNTL);
+   rmesa->hw.set.cmd[SET_CMD_1] = cmdpkt(RADEON_EMIT_SE_CNTL_STATUS);
+   rmesa->hw.msc.cmd[MSC_CMD_0] = cmdpkt(RADEON_EMIT_RE_MISC);
+   rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(RADEON_EMIT_PP_TXFILTER_0);
+   rmesa->hw.tex[0].cmd[TEX_CMD_1] = cmdpkt(RADEON_EMIT_PP_BORDER_COLOR_0);
+   rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(RADEON_EMIT_PP_TXFILTER_1);
+   rmesa->hw.tex[1].cmd[TEX_CMD_1] = cmdpkt(RADEON_EMIT_PP_BORDER_COLOR_1);
+   rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(RADEON_EMIT_PP_TXFILTER_2);
+   rmesa->hw.tex[2].cmd[TEX_CMD_1] = cmdpkt(RADEON_EMIT_PP_BORDER_COLOR_2);
+   rmesa->hw.cube[0].cmd[CUBE_CMD_0] = cmdpkt(RADEON_EMIT_PP_CUBIC_FACES_0);
+   rmesa->hw.cube[0].cmd[CUBE_CMD_1] = cmdpkt(RADEON_EMIT_PP_CUBIC_OFFSETS_T0);
+   rmesa->hw.cube[1].cmd[CUBE_CMD_0] = cmdpkt(RADEON_EMIT_PP_CUBIC_FACES_1);
+   rmesa->hw.cube[1].cmd[CUBE_CMD_1] = cmdpkt(RADEON_EMIT_PP_CUBIC_OFFSETS_T1);
+   rmesa->hw.cube[2].cmd[CUBE_CMD_0] = cmdpkt(RADEON_EMIT_PP_CUBIC_FACES_2);
+   rmesa->hw.cube[2].cmd[CUBE_CMD_1] = cmdpkt(RADEON_EMIT_PP_CUBIC_OFFSETS_T2);
+   rmesa->hw.zbs.cmd[ZBS_CMD_0] = cmdpkt(RADEON_EMIT_SE_ZBIAS_FACTOR);
+   rmesa->hw.tcl.cmd[TCL_CMD_0] = cmdpkt(RADEON_EMIT_SE_TCL_OUTPUT_VTX_FMT);
+   rmesa->hw.mtl.cmd[MTL_CMD_0] = 
+      cmdpkt(RADEON_EMIT_SE_TCL_MATERIAL_EMMISSIVE_RED);
+   rmesa->hw.txr[0].cmd[TXR_CMD_0] = cmdpkt(RADEON_EMIT_PP_TEX_SIZE_0);
+   rmesa->hw.txr[1].cmd[TXR_CMD_0] = cmdpkt(RADEON_EMIT_PP_TEX_SIZE_1);
+   rmesa->hw.txr[2].cmd[TXR_CMD_0] = cmdpkt(RADEON_EMIT_PP_TEX_SIZE_2);
+   rmesa->hw.grd.cmd[GRD_CMD_0] = 
+      cmdscl( RADEON_SS_VERT_GUARD_CLIP_ADJ_ADDR, 1, 4 );
+   rmesa->hw.fog.cmd[FOG_CMD_0] = 
+      cmdvec( RADEON_VS_FOG_PARAM_ADDR, 1, 4 );
+   rmesa->hw.glt.cmd[GLT_CMD_0] = 
+      cmdvec( RADEON_VS_GLOBAL_AMBIENT_ADDR, 1, 4 );
+   rmesa->hw.eye.cmd[EYE_CMD_0] = 
+      cmdvec( RADEON_VS_EYE_VECTOR_ADDR, 1, 4 );
+
+   for (i = 0 ; i < 6; i++) {
+      rmesa->hw.mat[i].cmd[MAT_CMD_0] = 
+	 cmdvec( RADEON_VS_MATRIX_0_ADDR + i*4, 1, 16);
+   }
+
+   for (i = 0 ; i < 8; i++) {
+      rmesa->hw.lit[i].cmd[LIT_CMD_0] = 
+	 cmdvec( RADEON_VS_LIGHT_AMBIENT_ADDR + i, 8, 24 );
+      rmesa->hw.lit[i].cmd[LIT_CMD_1] = 
+	 cmdscl( RADEON_SS_LIGHT_DCD_ADDR + i, 8, 6 );
+   }
+
+   for (i = 0 ; i < 6; i++) {
+      rmesa->hw.ucp[i].cmd[UCP_CMD_0] = 
+	 cmdvec( RADEON_VS_UCP_ADDR + i, 1, 4 );
+   }
+
+   rmesa->last_ReallyEnabled = -1;
+
+   /* Initial Hardware state:
+    */
+   rmesa->hw.ctx.cmd[CTX_PP_MISC] = (RADEON_ALPHA_TEST_PASS |
+				     RADEON_CHROMA_FUNC_FAIL |
+				     RADEON_CHROMA_KEY_NEAREST |
+				     RADEON_SHADOW_FUNC_EQUAL |
+				     RADEON_SHADOW_PASS_1 /*|
+				     RADEON_RIGHT_HAND_CUBE_OGL */);
+
+   rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] = (RADEON_FOG_VERTEX |
+					  /* this bit unused for vertex fog */
+					  RADEON_FOG_USE_DEPTH);
+
+   rmesa->hw.ctx.cmd[CTX_RE_SOLID_COLOR] = 0x00000000;
+
+   rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] = (RADEON_COMB_FCN_ADD_CLAMP |
+					    RADEON_SRC_BLEND_GL_ONE |
+					    RADEON_DST_BLEND_GL_ZERO );
+
+   rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHOFFSET] =
+      rmesa->radeonScreen->depthOffset + rmesa->radeonScreen->fbLocation;
+
+   rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHPITCH] = 
+      ((rmesa->radeonScreen->depthPitch &
+	RADEON_DEPTHPITCH_MASK) |
+       RADEON_DEPTH_ENDIAN_NO_SWAP);
+       
+   if (rmesa->using_hyperz)
+       rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHPITCH] |= RADEON_DEPTH_HYPERZ;
+
+   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] = (depth_fmt |
+					       RADEON_Z_TEST_LESS |
+					       RADEON_STENCIL_TEST_ALWAYS |
+					       RADEON_STENCIL_FAIL_KEEP |
+					       RADEON_STENCIL_ZPASS_KEEP |
+					       RADEON_STENCIL_ZFAIL_KEEP |
+					       RADEON_Z_WRITE_ENABLE);
+
+   if (rmesa->using_hyperz) {
+       rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_COMPRESSION_ENABLE |
+						   RADEON_Z_DECOMPRESSION_ENABLE;
+      if (rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+	 /* works for q3, but slight rendering errors with glxgears ? */
+/*	 rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_HIERARCHY_ENABLE;*/
+	 /* need this otherwise get lots of lockups with q3 ??? */
+	 rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_FORCE_Z_DIRTY;
+      } 
+   }
+
+   rmesa->hw.ctx.cmd[CTX_PP_CNTL] = (RADEON_SCISSOR_ENABLE |
+				     RADEON_ANTI_ALIAS_NONE);
+
+   rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] = (RADEON_PLANE_MASK_ENABLE |
+				       color_fmt |
+				       RADEON_ZBLOCK16);
+
+   switch ( driQueryOptioni( &rmesa->optionCache, "dither_mode" ) ) {
+   case DRI_CONF_DITHER_XERRORDIFFRESET:
+      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= RADEON_DITHER_INIT;
+      break;
+   case DRI_CONF_DITHER_ORDERED:
+      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= RADEON_SCALE_DITHER_ENABLE;
+      break;
+   }
+   if ( driQueryOptioni( &rmesa->optionCache, "round_mode" ) ==
+	DRI_CONF_ROUND_ROUND )
+      rmesa->state.color.roundEnable = RADEON_ROUND_ENABLE;
+   else
+      rmesa->state.color.roundEnable = 0;
+   if ( driQueryOptioni (&rmesa->optionCache, "color_reduction" ) ==
+	DRI_CONF_COLOR_REDUCTION_DITHER )
+      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= RADEON_DITHER_ENABLE;
+   else
+      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= rmesa->state.color.roundEnable;
+
+   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET] = ((drawOffset +
+					       rmesa->radeonScreen->fbLocation)
+					      & RADEON_COLOROFFSET_MASK);
+
+   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = ((drawPitch &
+					      RADEON_COLORPITCH_MASK) |
+					     RADEON_COLOR_ENDIAN_NO_SWAP);
+
+
+   /* (fixed size) sarea is initialized to zero afaics so can omit version check. Phew! */
+   if (rmesa->sarea->tiling_enabled) {
+      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= RADEON_COLOR_TILE_ENABLE;
+   }
+
+   rmesa->hw.set.cmd[SET_SE_CNTL] = (RADEON_FFACE_CULL_CCW |
+				     RADEON_BFACE_SOLID |
+				     RADEON_FFACE_SOLID |
+/*  			     RADEON_BADVTX_CULL_DISABLE | */
+				     RADEON_FLAT_SHADE_VTX_LAST |
+				     RADEON_DIFFUSE_SHADE_GOURAUD |
+				     RADEON_ALPHA_SHADE_GOURAUD |
+				     RADEON_SPECULAR_SHADE_GOURAUD |
+				     RADEON_FOG_SHADE_GOURAUD |
+				     RADEON_VPORT_XY_XFORM_ENABLE |
+				     RADEON_VPORT_Z_XFORM_ENABLE |
+				     RADEON_VTX_PIX_CENTER_OGL |
+				     RADEON_ROUND_MODE_TRUNC |
+				     RADEON_ROUND_PREC_8TH_PIX);
+
+   rmesa->hw.set.cmd[SET_SE_CNTL_STATUS] =
+#ifdef MESA_BIG_ENDIAN
+					    RADEON_VC_32BIT_SWAP;
+#else
+  					    RADEON_VC_NO_SWAP;
+#endif
+
+   if (!(rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
+     rmesa->hw.set.cmd[SET_SE_CNTL_STATUS] |= RADEON_TCL_BYPASS;
+   }
+
+   rmesa->hw.set.cmd[SET_SE_COORDFMT] = (
+      RADEON_VTX_W0_IS_NOT_1_OVER_W0 |
+      RADEON_TEX1_W_ROUTING_USE_Q1);
+
+
+   rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] = ((1 << 16) | 0xffff);
+
+   rmesa->hw.lin.cmd[LIN_RE_LINE_STATE] = 
+      ((0 << RADEON_LINE_CURRENT_PTR_SHIFT) |
+       (1 << RADEON_LINE_CURRENT_COUNT_SHIFT));
+
+   rmesa->hw.lin.cmd[LIN_SE_LINE_WIDTH] = (1 << 4);
+
+   rmesa->hw.msk.cmd[MSK_RB3D_STENCILREFMASK] = 
+      ((0x00 << RADEON_STENCIL_REF_SHIFT) |
+       (0xff << RADEON_STENCIL_MASK_SHIFT) |
+       (0xff << RADEON_STENCIL_WRITEMASK_SHIFT));
+
+   rmesa->hw.msk.cmd[MSK_RB3D_ROPCNTL] = RADEON_ROP_COPY;
+   rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK] = 0xffffffff;
+
+   rmesa->hw.msc.cmd[MSC_RE_MISC] = 
+      ((0 << RADEON_STIPPLE_X_OFFSET_SHIFT) |
+       (0 << RADEON_STIPPLE_Y_OFFSET_SHIFT) |
+       RADEON_STIPPLE_BIG_BIT_ORDER);
+
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_XSCALE]  = 0x00000000;
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_XOFFSET] = 0x00000000;
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_YSCALE]  = 0x00000000;
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_YOFFSET] = 0x00000000;
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_ZSCALE]  = 0x00000000;
+   rmesa->hw.vpt.cmd[VPT_SE_VPORT_ZOFFSET] = 0x00000000;
+
+   for ( i = 0 ; i < ctx->Const.MaxTextureUnits ; i++ ) {
+      rmesa->hw.tex[i].cmd[TEX_PP_TXFILTER] = RADEON_BORDER_MODE_OGL;
+      rmesa->hw.tex[i].cmd[TEX_PP_TXFORMAT] = 
+	  (RADEON_TXFORMAT_ENDIAN_NO_SWAP |
+	   RADEON_TXFORMAT_PERSPECTIVE_ENABLE |
+	   (i << 24) | /* This is one of RADEON_TXFORMAT_ST_ROUTE_STQ[012] */
+	   (2 << RADEON_TXFORMAT_WIDTH_SHIFT) |
+	   (2 << RADEON_TXFORMAT_HEIGHT_SHIFT));
+
+      /* Initialize the texture offset to the start of the card texture heap */
+      rmesa->hw.tex[i].cmd[TEX_PP_TXOFFSET] =
+	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+
+      rmesa->hw.tex[i].cmd[TEX_PP_BORDER_COLOR] = 0;
+      rmesa->hw.tex[i].cmd[TEX_PP_TXCBLEND] =  
+	  (RADEON_COLOR_ARG_A_ZERO |
+	   RADEON_COLOR_ARG_B_ZERO |
+	   RADEON_COLOR_ARG_C_CURRENT_COLOR |
+	   RADEON_BLEND_CTL_ADD |
+	   RADEON_SCALE_1X |
+	   RADEON_CLAMP_TX);
+      rmesa->hw.tex[i].cmd[TEX_PP_TXABLEND] = 
+	  (RADEON_ALPHA_ARG_A_ZERO |
+	   RADEON_ALPHA_ARG_B_ZERO |
+	   RADEON_ALPHA_ARG_C_CURRENT_ALPHA |
+	   RADEON_BLEND_CTL_ADD |
+	   RADEON_SCALE_1X |
+	   RADEON_CLAMP_TX);
+      rmesa->hw.tex[i].cmd[TEX_PP_TFACTOR] = 0;
+
+      rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_FACES] = 0;
+      rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_0] =
+	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+      rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_1] =
+	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+      rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_2] =
+	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+      rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_3] =
+	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+      rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_4] =
+	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+   }
+
+   /* Can only add ST1 at the time of doing some multitex but can keep
+    * it after that.  Errors if DIFFUSE is missing.
+    */
+   rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] = 
+      (RADEON_TCL_VTX_Z0 |
+       RADEON_TCL_VTX_W0 |
+       RADEON_TCL_VTX_PK_DIFFUSE
+	 );	/* need to keep this uptodate */
+						   
+   rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL] =
+      ( RADEON_TCL_COMPUTE_XYZW 	|
+	(RADEON_TCL_TEX_INPUT_TEX_0 << RADEON_TCL_TEX_0_OUTPUT_SHIFT) |
+	(RADEON_TCL_TEX_INPUT_TEX_1 << RADEON_TCL_TEX_1_OUTPUT_SHIFT) |
+	(RADEON_TCL_TEX_INPUT_TEX_2 << RADEON_TCL_TEX_2_OUTPUT_SHIFT));
+
+
+   /* XXX */
+   rmesa->hw.tcl.cmd[TCL_MATRIX_SELECT_0] = 
+      ((MODEL << RADEON_MODELVIEW_0_SHIFT) |
+       (MODEL_IT << RADEON_IT_MODELVIEW_0_SHIFT));
+
+   rmesa->hw.tcl.cmd[TCL_MATRIX_SELECT_1] = 
+      ((MODEL_PROJ << RADEON_MODELPROJECT_0_SHIFT) |
+       (TEXMAT_0 << RADEON_TEXMAT_0_SHIFT) |
+       (TEXMAT_1 << RADEON_TEXMAT_1_SHIFT) |
+       (TEXMAT_2 << RADEON_TEXMAT_2_SHIFT));
+
+   rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] = 
+      (RADEON_UCP_IN_CLIP_SPACE |
+       RADEON_CULL_FRONT_IS_CCW);
+
+   rmesa->hw.tcl.cmd[TCL_TEXTURE_PROC_CTL] = 0; 
+
+   rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] = 
+      (RADEON_SPECULAR_LIGHTS |
+       RADEON_DIFFUSE_SPECULAR_COMBINE |
+       RADEON_LOCAL_LIGHT_VEC_GL |
+       (RADEON_LM_SOURCE_STATE_MULT << RADEON_EMISSIVE_SOURCE_SHIFT) |
+       (RADEON_LM_SOURCE_STATE_MULT << RADEON_AMBIENT_SOURCE_SHIFT) |
+       (RADEON_LM_SOURCE_STATE_MULT << RADEON_DIFFUSE_SOURCE_SHIFT) |
+       (RADEON_LM_SOURCE_STATE_MULT << RADEON_SPECULAR_SOURCE_SHIFT));
+   /* Replay each light's current Mesa parameters through the ctx->Driver.Lightfv hook. */
+   for (i = 0 ; i < 8; i++) {
+      struct gl_light *l = &ctx->Light.Light[i];
+      GLenum p = GL_LIGHT0 + i;
+      *(float *)&(rmesa->hw.lit[i].cmd[LIT_RANGE_CUTOFF]) = FLT_MAX;
+
+      ctx->Driver.Lightfv( ctx, p, GL_AMBIENT, l->Ambient );
+      ctx->Driver.Lightfv( ctx, p, GL_DIFFUSE, l->Diffuse );
+      ctx->Driver.Lightfv( ctx, p, GL_SPECULAR, l->Specular );
+      ctx->Driver.Lightfv( ctx, p, GL_POSITION, NULL );
+      ctx->Driver.Lightfv( ctx, p, GL_SPOT_DIRECTION, NULL );
+      ctx->Driver.Lightfv( ctx, p, GL_SPOT_EXPONENT, &l->SpotExponent );
+      ctx->Driver.Lightfv( ctx, p, GL_SPOT_CUTOFF, &l->SpotCutoff );
+      ctx->Driver.Lightfv( ctx, p, GL_CONSTANT_ATTENUATION,
+			   &l->ConstantAttenuation );
+      ctx->Driver.Lightfv( ctx, p, GL_LINEAR_ATTENUATION, 
+			   &l->LinearAttenuation );
+      ctx->Driver.Lightfv( ctx, p, GL_QUADRATIC_ATTENUATION, 
+		     &l->QuadraticAttenuation );
+      *(float *)&(rmesa->hw.lit[i].cmd[LIT_ATTEN_XXX]) = 0.0;
+   }
+
+   ctx->Driver.LightModelfv( ctx, GL_LIGHT_MODEL_AMBIENT, 
+			     ctx->Light.Model.Ambient );
+
+   TNL_CONTEXT(ctx)->Driver.NotifyMaterialChange( ctx );
+
+   for (i = 0 ; i < 6; i++) {
+      ctx->Driver.ClipPlane( ctx, GL_CLIP_PLANE0 + i, NULL );
+   }
+
+   ctx->Driver.Fogfv( ctx, GL_FOG_MODE, NULL );
+   ctx->Driver.Fogfv( ctx, GL_FOG_DENSITY, &ctx->Fog.Density );
+   ctx->Driver.Fogfv( ctx, GL_FOG_START, &ctx->Fog.Start );
+   ctx->Driver.Fogfv( ctx, GL_FOG_END, &ctx->Fog.End );
+   ctx->Driver.Fogfv( ctx, GL_FOG_COLOR, ctx->Fog.Color );
+   ctx->Driver.Fogfv( ctx, GL_FOG_COORDINATE_SOURCE_EXT, NULL );
+   
+   rmesa->hw.grd.cmd[GRD_VERT_GUARD_CLIP_ADJ] = IEEE_ONE;
+   rmesa->hw.grd.cmd[GRD_VERT_GUARD_DISCARD_ADJ] = IEEE_ONE;
+   rmesa->hw.grd.cmd[GRD_HORZ_GUARD_CLIP_ADJ] = IEEE_ONE;
+   rmesa->hw.grd.cmd[GRD_HORZ_GUARD_DISCARD_ADJ] = IEEE_ONE;
+
+   rmesa->hw.eye.cmd[EYE_X] = 0;
+   rmesa->hw.eye.cmd[EYE_Y] = 0;
+   rmesa->hw.eye.cmd[EYE_Z] = IEEE_ONE;
+   rmesa->hw.eye.cmd[EYE_RESCALE_FACTOR] = IEEE_ONE;
+   
+   rmesa->hw.all_dirty = GL_TRUE;
+}
diff --git a/radeon/radeon_swtcl.c b/radeon/radeon_swtcl.c
new file mode 100644
index 0000000..7ce1fa6
--- /dev/null
+++ b/radeon/radeon_swtcl.c
@@ -0,0 +1,890 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_swtcl.c,v 1.6 2003/05/06 23:52:08 daenzer Exp $ */
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "mtypes.h"
+#include "colormac.h"
+#include "enums.h"
+#include "imports.h"
+#include "macros.h"
+
+#include "swrast_setup/swrast_setup.h"
+#include "math/m_translate.h"
+#include "tnl/tnl.h"
+#include "tnl/t_context.h"
+#include "tnl/t_pipeline.h"
+
+#include "radeon_context.h"
+#include "radeon_ioctl.h"
+#include "radeon_state.h"
+#include "radeon_swtcl.h"
+#include "radeon_tcl.h"
+
+
+static void flush_last_swtcl_prim( radeonContextPtr rmesa  );
+
+/* R100: xyzw, c0, c1/fog, stq[0..2]  = 4+1+1+3*3 = 15 dwords */
+/* R200: xyzw, c0, c1/fog, strq[0..5] = 4+1+1+4*6 = 30 */
+#define RADEON_MAX_TNL_VERTEX_SIZE (15 * sizeof(GLfloat))	/* for mesa _tnl stage */
+
+/***********************************************************************
+ *                         Initialization 
+ ***********************************************************************/
+/* Append attribute ATTR (emit style STYLE) to swtcl.vertex_attrs and OR F0 into the caller's fmt_0. */
+#define EMIT_ATTR( ATTR, STYLE, F0 )					\
+do {									\
+   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = (ATTR);	\
+   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = (STYLE);	\
+   rmesa->swtcl.vertex_attr_count++;					\
+   fmt_0 |= F0;								\
+} while (0)
+/* Append a pad entry to swtcl.vertex_attrs; N looks like a byte count (1 and 3 are used) -- TODO confirm. */
+#define EMIT_PAD( N )							\
+do {									\
+   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = 0;		\
+   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = EMIT_PAD;	\
+   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].offset = (N);		\
+   rmesa->swtcl.vertex_attr_count++;					\
+} while (0)
+/* Vertex-format bits per texture unit: [unit][0] is ST only, [unit][1] is ST plus Q. */
+static GLuint radeon_cp_vc_frmts[3][2] =
+{
+   { RADEON_CP_VC_FRMT_ST0, RADEON_CP_VC_FRMT_ST0 | RADEON_CP_VC_FRMT_Q0 },
+   { RADEON_CP_VC_FRMT_ST1, RADEON_CP_VC_FRMT_ST1 | RADEON_CP_VC_FRMT_Q1 },
+   { RADEON_CP_VC_FRMT_ST2, RADEON_CP_VC_FRMT_ST2 | RADEON_CP_VC_FRMT_Q2 },
+};
+/* Build the t_vertex attribute list and the RADEON_CP_VC_FRMT_* mask for the current render inputs. */
+static void radeonSetVertexFormat( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   struct vertex_buffer *VB = &tnl->vb;
+   DECLARE_RENDERINPUTS(index_bitset);
+   int fmt_0 = 0;
+   int offset = 0;
+
+   RENDERINPUTS_COPY( index_bitset, tnl->render_inputs_bitset );
+
+   /* Important: use VB->NdcPtr as the position source when it is set,
+    * otherwise fall back to VB->ClipPtr. */
+   if ( VB->NdcPtr != NULL ) {
+      VB->AttribPtr[VERT_ATTRIB_POS] = VB->NdcPtr;
+   }
+   else {
+      VB->AttribPtr[VERT_ATTRIB_POS] = VB->ClipPtr;
+   }
+
+   assert( VB->AttribPtr[VERT_ATTRIB_POS] != NULL );
+   rmesa->swtcl.vertex_attr_count = 0;
+
+   /* EMIT_ATTR's must be in order as they tell t_vertex.c how to
+    * build up a hardware vertex.
+    */
+   if ( !rmesa->swtcl.needproj ||
+        RENDERINPUTS_TEST_RANGE( index_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) {	/* for projtex */
+      EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_4F, 
+		 RADEON_CP_VC_FRMT_XY |	RADEON_CP_VC_FRMT_Z | RADEON_CP_VC_FRMT_W0 );
+      offset = 4;
+   }
+   else {
+      EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_3F, 
+		 RADEON_CP_VC_FRMT_XY |	RADEON_CP_VC_FRMT_Z );
+      offset = 3;
+   }
+
+   rmesa->swtcl.coloroffset = offset;	/* dword index of the packed color (used as ui[coloroffset]) */
+#if MESA_LITTLE_ENDIAN 
+   EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_RGBA, 
+	      RADEON_CP_VC_FRMT_PKCOLOR );
+#else
+   EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_ABGR,
+	      RADEON_CP_VC_FRMT_PKCOLOR );
+#endif
+   offset += 1;
+
+   rmesa->swtcl.specoffset = 0;	/* 0 means no specular; VERT_SET_SPEC and friends test for it */
+   if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 ) ||
+       RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_FOG )) {
+
+#if MESA_LITTLE_ENDIAN 
+      if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 )) {
+	 rmesa->swtcl.specoffset = offset;
+	 EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_3UB_3F_RGB,
+	 	    RADEON_CP_VC_FRMT_PKSPEC );
+      }
+      else {
+	 EMIT_PAD( 3 );
+      }
+
+      if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_FOG )) {
+	 EMIT_ATTR( _TNL_ATTRIB_FOG, EMIT_1UB_1F,
+	 	    RADEON_CP_VC_FRMT_PKSPEC );
+      }
+      else {
+	 EMIT_PAD( 1 );
+      }
+#else
+      if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_FOG )) {
+	 EMIT_ATTR( _TNL_ATTRIB_FOG, EMIT_1UB_1F,
+	 	    RADEON_CP_VC_FRMT_PKSPEC );
+      }
+      else {
+	 EMIT_PAD( 1 );
+      }
+
+      if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 )) {
+	 rmesa->swtcl.specoffset = offset;
+	 EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_3UB_3F_BGR,
+	 	    RADEON_CP_VC_FRMT_PKSPEC );
+      }
+      else {
+	 EMIT_PAD( 3 );
+      }
+#endif
+   }
+
+   if (RENDERINPUTS_TEST_RANGE( index_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) {
+      int i;
+
+      for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {	/* NOTE(review): radeon_cp_vc_frmts has 3 rows; assumes MaxTextureUnits <= 3 -- confirm */
+	 if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX(i) )) {
+	    GLuint sz = VB->TexCoordPtr[i]->size;
+
+	    switch (sz) {
+	    case 1:
+	    case 2:
+	       EMIT_ATTR( _TNL_ATTRIB_TEX0+i, EMIT_2F,
+			  radeon_cp_vc_frmts[i][0] );
+	       break;
+	    case 3:
+	    case 4:
+	       if (ctx->Texture.Unit[i]._ReallyEnabled & (TEXTURE_CUBE_BIT) ) {
+		  EMIT_ATTR( _TNL_ATTRIB_TEX0+i, EMIT_3F,
+			     radeon_cp_vc_frmts[i][1] );
+	       } else {
+		  EMIT_ATTR( _TNL_ATTRIB_TEX0+i, EMIT_3F_XYW,
+			     radeon_cp_vc_frmts[i][1] );
+	       }
+	       break;
+	    default:
+	       continue;
+	    };
+	 }
+      }
+   }
+
+   if (!RENDERINPUTS_EQUAL( rmesa->tnl_index_bitset, index_bitset ) ||
+	fmt_0 != rmesa->swtcl.vertex_format) {
+      RADEON_NEWPRIM(rmesa);
+      rmesa->swtcl.vertex_format = fmt_0;
+      rmesa->swtcl.vertex_size =
+	  _tnl_install_attrs( ctx,
+			      rmesa->swtcl.vertex_attrs, 
+			      rmesa->swtcl.vertex_attr_count,
+			      NULL, 0 );
+      rmesa->swtcl.vertex_size /= 4;	/* keep the size in 4-byte units (printed as floats below) */
+      RENDERINPUTS_COPY( rmesa->tnl_index_bitset, index_bitset );
+      if (RADEON_DEBUG & DEBUG_VERTS)
+	 fprintf( stderr, "%s: vertex_size= %d floats\n",
+		  __FUNCTION__, rmesa->swtcl.vertex_size);
+   }
+}
+
+/* tnl Render.Start hook: refresh the vertex format, then run a pending dma flush that is not the swtcl one. */
+static void radeonRenderStart( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+
+   radeonSetVertexFormat( ctx );
+   /* No flush pending, or the pending one is the swtcl flush: nothing to run. */
+   if (!rmesa->dma.flush || rmesa->dma.flush == flush_last_swtcl_prim)
+      return;
+   rmesa->dma.flush( rmesa );
+}
+
+
+/**
+ * Set vertex state for SW TCL: decide whether Mesa or the hardware does
+ * the projection divide, tell the tnl module, and update SE_COORDFMT to
+ * match.  Returns without changing anything during a raster fallback.
+ */
+void radeonChooseVertexState( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   GLuint coord_fmt;
+
+   /* _tnl_need_projected_coords must not be called during a rasterization
+    * fallback; this function runs again when the fallback ends.
+    */
+   if (rmesa->Fallback != 0)
+      return;
+
+   /* Current register value with all three divide-control bits cleared. */
+   coord_fmt = rmesa->hw.set.cmd[SET_SE_COORDFMT] &
+      ~(RADEON_VTX_XY_PRE_MULT_1_OVER_W0 |
+	RADEON_VTX_Z_PRE_MULT_1_OVER_W0 |
+	RADEON_VTX_W0_IS_NOT_1_OVER_W0);
+
+   /* HW perspective divide is a win, but tiny vertex formats are a
+    * bigger one: only with textures or specular present (and no twoside /
+    * unfilled triangles) is the projection left out of the Mesa side.
+    */
+   if ((RENDERINPUTS_TEST_RANGE( tnl->render_inputs_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX ) ||
+	RENDERINPUTS_TEST( tnl->render_inputs_bitset, _TNL_ATTRIB_COLOR1 )) &&
+       !(ctx->_TriangleCaps & (DD_TRI_LIGHT_TWOSIDE|DD_TRI_UNFILLED))) {
+      rmesa->swtcl.needproj = GL_FALSE;
+      coord_fmt |= (RADEON_VTX_W0_IS_NOT_1_OVER_W0);
+   }
+   else {
+      rmesa->swtcl.needproj = GL_TRUE;
+      coord_fmt |= (RADEON_VTX_XY_PRE_MULT_1_OVER_W0 |
+		    RADEON_VTX_Z_PRE_MULT_1_OVER_W0);
+   }
+
+   _tnl_need_projected_coords( ctx, rmesa->swtcl.needproj );
+
+   if ( rmesa->hw.set.cmd[SET_SE_COORDFMT] != coord_fmt ) {
+      RADEON_STATECHANGE( rmesa, set );
+      rmesa->hw.set.cmd[SET_SE_COORDFMT] = coord_fmt;
+   }
+}
+
+
+/* Emit the vertices accumulated in the current dma region as one
+ * primitive and reset the swtcl vertex count. */
+static void flush_last_swtcl_prim( radeonContextPtr rmesa  )
+{
+   struct radeon_dma_region *region = &rmesa->dma.current;
+   GLuint gart_offset;
+
+   if (RADEON_DEBUG & DEBUG_IOCTL)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+   rmesa->dma.flush = NULL;
+
+   if (!region->buf)
+      return;
+
+   /* gart_buffer_offset + this buffer's slot + start of the unflushed data. */
+   gart_offset = (rmesa->radeonScreen->gart_buffer_offset +
+		  region->buf->buf->idx * RADEON_BUFFER_SIZE +
+		  region->start);
+
+   assert (!(rmesa->swtcl.hw_primitive & RADEON_CP_VC_CNTL_PRIM_WALK_IND));
+   assert (region->start +
+	   rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
+	   region->ptr);
+
+   /* Only touch the command buffer if something was actually written. */
+   if (region->start != region->ptr) {
+      radeonEnsureCmdBufSpace( rmesa, VERT_AOS_BUFSZ +
+			       rmesa->hw.max_state_size + VBUF_BUFSZ );
+      radeonEmitVertexAOS( rmesa, rmesa->swtcl.vertex_size, gart_offset );
+      radeonEmitVbufPrim( rmesa,
+			  rmesa->swtcl.vertex_format,
+			  rmesa->swtcl.hw_primitive,
+			  rmesa->swtcl.numverts );
+   }
+
+   rmesa->swtcl.numverts = 0;
+   region->start = region->ptr;
+}
+
+
+/* Reserve room for nverts vertices of vsize bytes each in the current dma
+ * region and return a pointer to it; the region is refilled when full. */
+static INLINE void *
+radeonAllocDmaLowVerts( radeonContextPtr rmesa, int nverts, int vsize )
+{
+   struct radeon_dma_region *region = &rmesa->dma.current;
+   GLuint bytes = vsize * nverts;
+   GLubyte *head;
+
+   if ( region->ptr + bytes > region->end )
+      radeonRefillCurrentDmaRegion( rmesa );
+
+   /* Arm the swtcl flush callback unless a flush is already pending. */
+   if (rmesa->dma.flush == NULL) {
+      rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
+      rmesa->dma.flush = flush_last_swtcl_prim;
+   }
+
+   assert( vsize == rmesa->swtcl.vertex_size * 4 );
+   assert( rmesa->dma.flush == flush_last_swtcl_prim );
+   assert( region->start +
+	   rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
+	   region->ptr );
+
+   /* Hand out the space at the write pointer, then advance it. */
+   head = (GLubyte *)(region->address + region->ptr);
+   rmesa->swtcl.numverts += nverts;
+   region->ptr += bytes;
+   return head;
+}
+
+
+/*
+ * Render unclipped vertex buffers by emitting vertices directly to
+ * dma buffers.  Use strip/fan hardware primitives where possible.
+ * Try to simulate missing primitives with indexed vertices.
+ */
+#define HAVE_POINTS      1	/* HAVE_*: presumably read by t_dd_dmatmp.h, included below -- TODO confirm */
+#define HAVE_LINES       1
+#define HAVE_LINE_STRIPS 1
+#define HAVE_TRIANGLES   1
+#define HAVE_TRI_STRIPS  1
+#define HAVE_TRI_STRIP_1 0
+#define HAVE_TRI_FANS    1
+#define HAVE_QUADS       0
+#define HAVE_QUAD_STRIPS 0
+#define HAVE_POLYGONS    0
+/* \todo: is it possible to make "ELTS" work with t_vertex code ? */
+#define HAVE_ELTS        0
+/* Hardware primitive type for each GL mode, indexed by the GLenum value; 0 where this table has no entry. */
+static const GLuint hw_prim[GL_POLYGON+1] = {
+   RADEON_CP_VC_CNTL_PRIM_TYPE_POINT,		/* GL_POINTS */
+   RADEON_CP_VC_CNTL_PRIM_TYPE_LINE,		/* GL_LINES */
+   0,						/* GL_LINE_LOOP */
+   RADEON_CP_VC_CNTL_PRIM_TYPE_LINE_STRIP,	/* GL_LINE_STRIP */
+   RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST,	/* GL_TRIANGLES */
+   RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_STRIP,	/* GL_TRIANGLE_STRIP */
+   RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_FAN,		/* GL_TRIANGLE_FAN */
+   0,						/* GL_QUADS */
+   0,						/* GL_QUAD_STRIP */
+   0						/* GL_POLYGON */
+};
+/* Start a new hardware primitive for GL mode prim; asserts the dma region holds no unflushed vertices. */
+static INLINE void
+radeonDmaPrimitive( radeonContextPtr rmesa, GLenum prim )
+{
+   RADEON_NEWPRIM( rmesa );
+   rmesa->swtcl.hw_primitive = hw_prim[prim];
+   assert(rmesa->dma.current.ptr == rmesa->dma.current.start);
+}
+/* Hooks for tnl_dd/t_dd_dmatmp.h; swtcl.vertex_size is in dwords, hence the "* 4" to get bytes. */
+#define LOCAL_VARS radeonContextPtr rmesa = RADEON_CONTEXT(ctx)
+#define INIT( prim ) radeonDmaPrimitive( rmesa, prim )
+#define FLUSH()  RADEON_NEWPRIM( rmesa )
+#define GET_CURRENT_VB_MAX_VERTS() \
+  (((int)rmesa->dma.current.end - (int)rmesa->dma.current.ptr) / (rmesa->swtcl.vertex_size*4))
+#define GET_SUBSEQUENT_VB_MAX_VERTS() \
+  ((RADEON_BUFFER_SIZE) / (rmesa->swtcl.vertex_size*4))
+#define ALLOC_VERTS( nr ) \
+  radeonAllocDmaLowVerts( rmesa, nr, rmesa->swtcl.vertex_size * 4 )
+#define EMIT_VERTS( ctx, j, nr, buf ) \
+  _tnl_emit_vertices_to_buffer(ctx, j, (j)+(nr), buf)
+
+#define TAG(x) radeon_dma_##x
+#include "tnl_dd/t_dd_dmatmp.h"
+
+
+/**********************************************************************/
+/*                          Render pipeline stage                     */
+/**********************************************************************/
+
+/* Pipeline stage: draw the VB through the dma templates.  Returns GL_TRUE to hand it to the next stage. */
+static GLboolean radeon_run_render( GLcontext *ctx,
+				    struct tnl_pipeline_stage *stage )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   struct vertex_buffer *VB = &tnl->vb;
+   tnl_render_func *tab = TAG(render_tab_verts);
+   GLuint i;
+
+   if (rmesa->swtcl.indexed_verts.buf) 
+      RELEASE_ELT_VERTS();
+   /* Any RenderIndex other than 0, or a VB the dma templates reject, is left to the next stage. */
+   if (rmesa->swtcl.RenderIndex != 0 ||   
+       !radeon_dma_validate_render( ctx, VB ))
+      return GL_TRUE;		
+
+   tnl->Driver.Render.Start( ctx );
+
+   for (i = 0 ; i < VB->PrimitiveCount ; i++)
+   {
+      GLuint prim = VB->Primitive[i].mode;
+      GLuint start = VB->Primitive[i].start;
+      GLuint length = VB->Primitive[i].count;
+
+      if (!length)
+	 continue;
+
+      if (RADEON_DEBUG & DEBUG_PRIMS)
+	 fprintf(stderr, "%s: prim %s %u..%u\n", __FUNCTION__,
+		 _mesa_lookup_enum_by_nr(prim & PRIM_MODE_MASK), 
+		 start, start+length);
+
+      /* Empty primitives were skipped above, so length is non-zero here. */
+      tab[prim & PRIM_MODE_MASK]( ctx, start, start + length, prim );
+   }
+
+   tnl->Driver.Render.Finish( ctx );
+
+   return GL_FALSE;		/* finished the pipe */
+}
+
+
+/* tnl pipeline stage whose run hook is radeon_run_render; the four slots between name and run are NULL. */
+const struct tnl_pipeline_stage _radeon_render_stage =
+{
+   "radeon render",
+   NULL,
+   NULL,
+   NULL,
+   NULL,
+   radeon_run_render		/* run */
+};
+
+
+/**************************************************************************/
+
+/* Reduced hardware primitive for each GL mode (indexed by GLenum): always POINT, LINE or TRI_LIST. */
+static const GLuint reduced_hw_prim[GL_POLYGON+1] = {
+   RADEON_CP_VC_CNTL_PRIM_TYPE_POINT,		/* GL_POINTS */
+   RADEON_CP_VC_CNTL_PRIM_TYPE_LINE,		/* GL_LINES */
+   RADEON_CP_VC_CNTL_PRIM_TYPE_LINE,		/* GL_LINE_LOOP */
+   RADEON_CP_VC_CNTL_PRIM_TYPE_LINE,		/* GL_LINE_STRIP */
+   RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST,	/* GL_TRIANGLES */
+   RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST,	/* GL_TRIANGLE_STRIP */
+   RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST,	/* GL_TRIANGLE_FAN */
+   RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST,	/* GL_QUADS */
+   RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST,	/* GL_QUAD_STRIP */
+   RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST		/* GL_POLYGON */
+};
+
+static void radeonRasterPrimitive( GLcontext *ctx, GLuint hwprim );
+static void radeonRenderPrimitive( GLcontext *ctx, GLenum prim );
+static void radeonResetLineStipple( GLcontext *ctx );
+
+
+/***********************************************************************
+ *                    Emit primitives as inline vertices               *
+ ***********************************************************************/
+/* Parameters for t_dd_triemit.h, which presumably generates the radeon_point/line/triangle/quad emitters. */
+#undef LOCAL_VARS
+#undef ALLOC_VERTS
+#define CTX_ARG radeonContextPtr rmesa
+#define GET_VERTEX_DWORDS() rmesa->swtcl.vertex_size
+#define ALLOC_VERTS( n, size ) radeonAllocDmaLowVerts( rmesa, n, (size) * 4 )
+#undef LOCAL_VARS
+#define LOCAL_VARS						\
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);		\
+   const char *radeonverts = (char *)rmesa->swtcl.verts;
+#define VERT(x) (radeonVertex *)(radeonverts + ((x) * (vertsize) * sizeof(int)))
+#define VERTEX radeonVertex 
+#undef TAG
+#define TAG(x) radeon_##x
+#include "tnl_dd/t_dd_triemit.h"
+
+
+/***********************************************************************
+ *          Macros for t_dd_tritmp.h to draw basic primitives          *
+ ***********************************************************************/
+/* Each of these expects a radeonContextPtr named rmesa to be in scope where it is expanded. */
+#define QUAD( a, b, c, d ) radeon_quad( rmesa, a, b, c, d )
+#define TRI( a, b, c )     radeon_triangle( rmesa, a, b, c )
+#define LINE( a, b )       radeon_line( rmesa, a, b )
+#define POINT( a )         radeon_point( rmesa, a )
+
+/***********************************************************************
+ *              Build render functions from dd templates               *
+ ***********************************************************************/
+/* rast_tab is indexed by an OR of the *_BIT flags below; RADEON_MAX_TRIFUNC is the table size. */
+#define RADEON_TWOSIDE_BIT	0x01
+#define RADEON_UNFILLED_BIT	0x02
+#define RADEON_MAX_TRIFUNC	0x04
+
+
+static struct {
+   tnl_points_func	        points;
+   tnl_line_func		line;
+   tnl_triangle_func	triangle;
+   tnl_quad_func		quad;
+} rast_tab[RADEON_MAX_TRIFUNC];
+
+/* Switches and vertex accessors for the t_dd_unfilled.h / t_dd_tritmp.h templates included below. */
+#define DO_FALLBACK  0
+#define DO_OFFSET    0
+#define DO_UNFILLED (IND & RADEON_UNFILLED_BIT)
+#define DO_TWOSIDE  (IND & RADEON_TWOSIDE_BIT)
+#define DO_FLAT      0
+#define DO_TRI       1
+#define DO_QUAD      1
+#define DO_LINE      1
+#define DO_POINTS    1
+#define DO_FULL_QUAD 1
+
+#define HAVE_RGBA   1
+#define HAVE_SPEC   1
+#define HAVE_BACK_COLORS  0
+#define HAVE_HW_FLATSHADE 1
+#define TAB rast_tab
+
+#define DEPTH_SCALE 1.0
+#define UNFILLED_TRI unfilled_tri
+#define UNFILLED_QUAD unfilled_quad
+#define VERT_X(_v) _v->v.x
+#define VERT_Y(_v) _v->v.y
+#define VERT_Z(_v) _v->v.z
+#define AREA_IS_CCW( a ) (a < 0)
+#define GET_VERTEX(e) (rmesa->swtcl.verts + ((e) * rmesa->swtcl.vertex_size * sizeof(int)))
+
+#define VERT_SET_RGBA( v, c )  					\
+do {								\
+   radeon_color_t *color = (radeon_color_t *)&((v)->ui[coloroffset]);	\
+   UNCLAMPED_FLOAT_TO_UBYTE(color->red, (c)[0]);		\
+   UNCLAMPED_FLOAT_TO_UBYTE(color->green, (c)[1]);		\
+   UNCLAMPED_FLOAT_TO_UBYTE(color->blue, (c)[2]);		\
+   UNCLAMPED_FLOAT_TO_UBYTE(color->alpha, (c)[3]);		\
+} while (0)
+
+#define VERT_COPY_RGBA( v0, v1 ) v0->ui[coloroffset] = v1->ui[coloroffset]
+
+#define VERT_SET_SPEC( v, c )					\
+do {								\
+   if (specoffset) {						\
+      radeon_color_t *spec = (radeon_color_t *)&((v)->ui[specoffset]);	\
+      UNCLAMPED_FLOAT_TO_UBYTE(spec->red, (c)[0]);	\
+      UNCLAMPED_FLOAT_TO_UBYTE(spec->green, (c)[1]);	\
+      UNCLAMPED_FLOAT_TO_UBYTE(spec->blue, (c)[2]);	\
+   }								\
+} while (0)
+#define VERT_COPY_SPEC( v0, v1 )			\
+do {							\
+   if (specoffset) {					\
+      radeon_color_t *spec0 = (radeon_color_t *)&((v0)->ui[specoffset]);	\
+      radeon_color_t *spec1 = (radeon_color_t *)&((v1)->ui[specoffset]);	\
+      spec0->red   = spec1->red;	\
+      spec0->green = spec1->green;	\
+      spec0->blue  = spec1->blue; 	\
+   }							\
+} while (0)
+
+/* These don't need LE32_TO_CPU() as they are used to save and restore
+ * colors which are already in the correct format.
+ */
+#define VERT_SAVE_RGBA( idx )    color[idx] = v[idx]->ui[coloroffset]
+#define VERT_RESTORE_RGBA( idx ) v[idx]->ui[coloroffset] = color[idx]
+#define VERT_SAVE_SPEC( idx )    if (specoffset) spec[idx] = v[idx]->ui[specoffset]
+#define VERT_RESTORE_SPEC( idx ) if (specoffset) v[idx]->ui[specoffset] = spec[idx]
+
+#undef LOCAL_VARS
+#undef TAG
+#undef INIT
+
+#define LOCAL_VARS(n)							\
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);			\
+   GLuint color[n], spec[n];						\
+   GLuint coloroffset = rmesa->swtcl.coloroffset;	\
+   GLuint specoffset = rmesa->swtcl.specoffset;			\
+   (void) color; (void) spec; (void) coloroffset; (void) specoffset;
+
+/***********************************************************************
+ *                Helpers for rendering unfilled primitives            *
+ ***********************************************************************/
+/* RASTERIZE(x) selects the reduced hardware primitive for GL mode x via radeonRasterPrimitive(). */
+#define RASTERIZE(x) radeonRasterPrimitive( ctx, reduced_hw_prim[x] )
+#define RENDER_PRIMITIVE rmesa->swtcl.render_primitive
+#undef TAG
+#define TAG(x) x
+#include "tnl_dd/t_dd_unfilled.h"
+#undef IND
+
+
+/***********************************************************************
+ *                      Generate GL render functions                   *
+ ***********************************************************************/
+
+/* One t_dd_tritmp.h expansion per rast_tab index; presumably each yields the TAG(init) called by init_rast_tab(). */
+#define IND (0)
+#define TAG(x) x
+#include "tnl_dd/t_dd_tritmp.h"
+
+#define IND (RADEON_TWOSIDE_BIT)
+#define TAG(x) x##_twoside
+#include "tnl_dd/t_dd_tritmp.h"
+
+#define IND (RADEON_UNFILLED_BIT)
+#define TAG(x) x##_unfilled
+#include "tnl_dd/t_dd_tritmp.h"
+
+#define IND (RADEON_TWOSIDE_BIT|RADEON_UNFILLED_BIT)
+#define TAG(x) x##_twoside_unfilled
+#include "tnl_dd/t_dd_tritmp.h"
+
+/* Call the four init functions from the t_dd_tritmp.h expansions; presumably each fills one rast_tab entry. */
+static void init_rast_tab( void )
+{
+   init();
+   init_twoside();
+   init_unfilled();
+   init_twoside_unfilled();
+}
+
+/**********************************************************************/
+/*               Render unclipped begin/end objects                   */
+/**********************************************************************/
+/* Expand tnl/t_vb_rendertmp.h twice: radeon_*_verts (ELT(x) is x) and radeon_*_elts (ELT(x) is vb.Elts[x]). */
+#define RENDER_POINTS( start, count )		\
+   for ( ; start < count ; start++)		\
+      radeon_point( rmesa, VERT(start) )
+#define RENDER_LINE( v0, v1 ) \
+   radeon_line( rmesa, VERT(v0), VERT(v1) )
+#define RENDER_TRI( v0, v1, v2 )  \
+   radeon_triangle( rmesa, VERT(v0), VERT(v1), VERT(v2) )
+#define RENDER_QUAD( v0, v1, v2, v3 ) \
+   radeon_quad( rmesa, VERT(v0), VERT(v1), VERT(v2), VERT(v3) )
+#undef INIT
+#define INIT(x) do {					\
+   radeonRenderPrimitive( ctx, x );			\
+} while (0)
+#undef LOCAL_VARS
+#define LOCAL_VARS						\
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);		\
+   const GLuint vertsize = rmesa->swtcl.vertex_size;		\
+   const char *radeonverts = (char *)rmesa->swtcl.verts;		\
+   const GLuint * const elt = TNL_CONTEXT(ctx)->vb.Elts;	\
+   const GLboolean stipple = ctx->Line.StippleFlag;		\
+   (void) elt; (void) stipple;
+#define RESET_STIPPLE	if ( stipple ) radeonResetLineStipple( ctx );
+#define RESET_OCCLUSION
+#define PRESERVE_VB_DEFS
+#define ELT(x) (x)
+#define TAG(x) radeon_##x##_verts
+#include "tnl/t_vb_rendertmp.h"
+#undef ELT
+#undef TAG
+#define TAG(x) radeon_##x##_elts
+#define ELT(x) elt[x]
+#include "tnl/t_vb_rendertmp.h"
+
+
+
+/**********************************************************************/
+/*                    Choose render functions                         */
+/**********************************************************************/
+/* Install the tnl render functions matching the current twoside/unfilled state (TCL fallback only). */
+void radeonChooseRenderState( GLcontext *ctx )
+{
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   const GLuint caps = ctx->_TriangleCaps;
+   GLuint idx = 0;
+
+   if (rmesa->Fallback || !rmesa->TclFallback)
+      return;
+
+   if (caps & DD_TRI_UNFILLED)      idx |= RADEON_UNFILLED_BIT;
+   if (caps & DD_TRI_LIGHT_TWOSIDE) idx |= RADEON_TWOSIDE_BIT;
+
+   if (idx == rmesa->swtcl.RenderIndex)
+      return;
+
+   tnl->Driver.Render.Points = rast_tab[idx].points;
+   tnl->Driver.Render.Line = rast_tab[idx].line;
+   tnl->Driver.Render.ClippedLine = rast_tab[idx].line;
+   tnl->Driver.Render.Triangle = rast_tab[idx].triangle;
+   tnl->Driver.Render.Quad = rast_tab[idx].quad;
+   /* Index 0 uses the radeon render tables; any other index uses the generic tnl ones. */
+   if (idx != 0) {
+      tnl->Driver.Render.PrimTabVerts = _tnl_render_tab_verts;
+      tnl->Driver.Render.PrimTabElts = _tnl_render_tab_elts;
+      tnl->Driver.Render.ClippedPolygon = _tnl_RenderClippedPolygon;
+   } else {
+      tnl->Driver.Render.PrimTabVerts = radeon_render_tab_verts;
+      tnl->Driver.Render.PrimTabElts = radeon_render_tab_elts;
+      tnl->Driver.Render.ClippedPolygon = radeon_fast_clipped_poly;
+   }
+   rmesa->swtcl.RenderIndex = idx;
+}
+
+
+/**********************************************************************/
+/*                 High level hooks for t_vb_render.c                 */
+/**********************************************************************/
+
+/* Switch the hardware primitive type; a new primitive is started only when the type changes. */
+static void radeonRasterPrimitive( GLcontext *ctx, GLuint hwprim )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+
+   if (hwprim == rmesa->swtcl.hw_primitive)
+      return;
+   RADEON_NEWPRIM( rmesa );
+   rmesa->swtcl.hw_primitive = hwprim;
+}
+/* Render.PrimitiveNotify hook: record prim; the hw primitive is left alone for unfilled triangle-class modes. */
+static void radeonRenderPrimitive( GLcontext *ctx, GLenum prim )
+{
+   RADEON_CONTEXT(ctx)->swtcl.render_primitive = prim;
+   if (prim >= GL_TRIANGLES && (ctx->_TriangleCaps & DD_TRI_UNFILLED))
+      return;
+   radeonRasterPrimitive( ctx, reduced_hw_prim[prim] );
+}
+/* Render.Finish hook (installed in radeonInitSwtcl and radeonFallback); the body is empty. */
+static void radeonRenderFinish( GLcontext *ctx )
+{
+}
+/* Render.ResetLineStipple hook: flags the "lin" state atom as changed -- presumably to re-emit line state. */
+static void radeonResetLineStipple( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   RADEON_STATECHANGE( rmesa, lin );
+}
+
+
+/**********************************************************************/
+/*           Transition to/from hardware rasterization.               */
+/**********************************************************************/
+/* Fallback reasons, indexed by the bit position of the flag passed to radeonFallback(). */
+static const char * const fallbackStrings[] = {
+   "Texture mode",
+   "glDrawBuffer(GL_FRONT_AND_BACK)",
+   "glEnable(GL_STENCIL) without hw stencil buffer",
+   "glRenderMode(selection or feedback)",
+   "glBlendEquation",
+   "glBlendFunc",
+   "RADEON_NO_RAST",
+   "Mixing GL_CLAMP_TO_BORDER and GL_CLAMP (or GL_MIRROR_CLAMP_ATI)"
+};
+
+/* Name the highest set bit of 'bit'; returns "unknown" instead of reading past the end of the table. */
+static const char *getFallbackString(GLuint bit)
+{
+   GLuint i;
+   for (i = 0; bit > 1; bit >>= 1)	/* position of the highest set bit */
+      i++;
+   if (i >= sizeof(fallbackStrings) / sizeof(fallbackStrings[0]))
+      return "unknown";
+   return fallbackStrings[i];
+}
+
+/* Set (mode true) or clear one rasterization fallback bit; acts on the first bit set and the last bit cleared. */
+void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   GLuint oldfallback = rmesa->Fallback;
+
+   if (mode) {
+      rmesa->Fallback |= bit;
+      if (oldfallback == 0) {	/* no fallback was active before this call */
+	 RADEON_FIREVERTICES( rmesa );
+	 TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_RASTER, GL_TRUE );
+	 _swsetup_Wakeup( ctx );
+	 rmesa->swtcl.RenderIndex = ~0;
+         if (RADEON_DEBUG & DEBUG_FALLBACKS) {
+            fprintf(stderr, "Radeon begin rasterization fallback: 0x%x %s\n",
+                    bit, getFallbackString(bit));
+         }
+      }
+   }
+   else {
+      rmesa->Fallback &= ~bit;
+      if (oldfallback == bit) {	/* this was the only active fallback bit */
+	 _swrast_flush( ctx );
+	 tnl->Driver.Render.Start = radeonRenderStart;
+	 tnl->Driver.Render.PrimitiveNotify = radeonRenderPrimitive;
+	 tnl->Driver.Render.Finish = radeonRenderFinish;
+
+	 tnl->Driver.Render.BuildVertices = _tnl_build_vertices;
+	 tnl->Driver.Render.CopyPV = _tnl_copy_pv;
+	 tnl->Driver.Render.Interp = _tnl_interp;
+
+	 tnl->Driver.Render.ResetLineStipple = radeonResetLineStipple;
+	 TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_RASTER, GL_FALSE );
+	 if (rmesa->TclFallback) {
+	    /* These are already done if rmesa->TclFallback goes to
+	     * zero above. But not if it doesn't (RADEON_NO_TCL for
+	     * example?)
+	     */
+	    _tnl_invalidate_vertex_state( ctx, ~0 );
+	    _tnl_invalidate_vertices( ctx, ~0 );
+	    RENDERINPUTS_ZERO( rmesa->tnl_index_bitset );
+	    radeonChooseVertexState( ctx );
+	    radeonChooseRenderState( ctx );
+	 }
+         if (RADEON_DEBUG & DEBUG_FALLBACKS) {
+            fprintf(stderr, "Radeon end rasterization fallback: 0x%x %s\n",
+                    bit, getFallbackString(bit));
+         }
+      }
+   }
+}
+
+
+/**********************************************************************/
+/*                            Initialization.                         */
+/**********************************************************************/
+/* One-time rast_tab setup plus per-context swtcl init: tnl render hooks, vertex storage and defaults. */
+void radeonInitSwtcl( GLcontext *ctx )
+{
+   static int tables_built = 0;
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+
+   if (!tables_built) {
+      init_rast_tab();
+      tables_built = 1;
+   }
+
+   tnl->Driver.Render.Start = radeonRenderStart;
+   tnl->Driver.Render.PrimitiveNotify = radeonRenderPrimitive;
+   tnl->Driver.Render.Finish = radeonRenderFinish;
+   tnl->Driver.Render.ResetLineStipple = radeonResetLineStipple;
+   tnl->Driver.Render.BuildVertices = _tnl_build_vertices;
+   tnl->Driver.Render.CopyPV = _tnl_copy_pv;
+   tnl->Driver.Render.Interp = _tnl_interp;
+
+   _tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12,
+		       RADEON_MAX_TNL_VERTEX_SIZE );
+
+   rmesa->swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
+   rmesa->swtcl.RenderIndex = ~0;	/* matches no rast_tab index, so the first choice always installs */
+   rmesa->swtcl.render_primitive = GL_TRIANGLES;
+   rmesa->swtcl.hw_primitive = 0;
+}
+
+/* Release the indexed-vertex dma region if one is still held. */
+void radeonDestroySwtcl( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+
+   if (!rmesa->swtcl.indexed_verts.buf)
+      return;
+   radeonReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, __FUNCTION__ );
+}
diff --git a/radeon/radeon_swtcl.h b/radeon/radeon_swtcl.h
new file mode 100644
index 0000000..64f9019
--- /dev/null
+++ b/radeon/radeon_swtcl.h
@@ -0,0 +1,68 @@
+/* $XFree86$ */
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ATI, VA LINUX SYSTEMS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *
+ */
+
+#ifndef __RADEON_TRIS_H__
+#define __RADEON_TRIS_H__
+/* NOTE(review): the guard is named after "tris" although this file is radeon_swtcl.h. */
+#include "mtypes.h"
+#include "swrast/swrast.h"
+#include "radeon_context.h"
+/* Software-TCL setup and teardown. */
+extern void radeonInitSwtcl( GLcontext *ctx );
+extern void radeonDestroySwtcl( GLcontext *ctx );
+/* Selection of the tnl render functions and of the projection-divide mode. */
+extern void radeonChooseRenderState( GLcontext *ctx );
+extern void radeonChooseVertexState( GLcontext *ctx );
+/* NOTE(review): nothing declared below is defined in radeon_swtcl.c -- possibly stale; confirm before use. */
+extern void radeonCheckTexSizes( GLcontext *ctx );
+
+extern void radeonBuildVertices( GLcontext *ctx, GLuint start, GLuint count,
+				 GLuint newinputs );
+
+extern void radeonPrintSetupFlags(char *msg, GLuint flags );
+
+
+extern void radeon_emit_indexed_verts( GLcontext *ctx,
+				       GLuint start,
+				       GLuint count );
+
+extern void radeon_translate_vertex( GLcontext *ctx, 
+				     const radeonVertex *src, 
+				     SWvertex *dst );
+
+extern void radeon_print_vertex( GLcontext *ctx, const radeonVertex *v );
+
+
+#endif /* __RADEON_TRIS_H__ */
diff --git a/radeon/radeon_tcl.c b/radeon/radeon_tcl.c
new file mode 100644
index 0000000..0f4baf2
--- /dev/null
+++ b/radeon/radeon_tcl.c
@@ -0,0 +1,575 @@
+/* $XFree86$ */
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     Tungsten Graphics Inc., Austin, Texas.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "light.h"
+#include "mtypes.h"
+#include "enums.h"
+
+#include "vbo/vbo.h"
+#include "tnl/tnl.h"
+#include "tnl/t_pipeline.h"
+
+#include "radeon_context.h"
+#include "radeon_state.h"
+#include "radeon_ioctl.h"
+#include "radeon_tex.h"
+#include "radeon_tcl.h"
+#include "radeon_swtcl.h"
+#include "radeon_maos.h"
+
+
+
+/*
+ * Render unclipped vertex buffers by emitting vertices directly to
+ * dma buffers.  Use strip/fan hardware primitives where possible.
+ * Try to simulate missing primitives with indexed vertices.
+ */
+#define HAVE_POINTS      1
+#define HAVE_LINES       1
+#define HAVE_LINE_LOOP   0
+#define HAVE_LINE_STRIPS 1
+#define HAVE_TRIANGLES   1
+#define HAVE_TRI_STRIPS  1
+#define HAVE_TRI_STRIP_1 0
+#define HAVE_TRI_FANS    1
+#define HAVE_QUADS       0
+#define HAVE_QUAD_STRIPS 0
+#define HAVE_POLYGONS    1
+#define HAVE_ELTS        1
+
+
+#define HW_POINTS           RADEON_CP_VC_CNTL_PRIM_TYPE_POINT
+#define HW_LINES            RADEON_CP_VC_CNTL_PRIM_TYPE_LINE
+#define HW_LINE_LOOP        0
+#define HW_LINE_STRIP       RADEON_CP_VC_CNTL_PRIM_TYPE_LINE_STRIP
+#define HW_TRIANGLES        RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST
+#define HW_TRIANGLE_STRIP_0 RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_STRIP
+#define HW_TRIANGLE_STRIP_1 0
+#define HW_TRIANGLE_FAN     RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_FAN
+#define HW_QUADS            0
+#define HW_QUAD_STRIP       0
+#define HW_POLYGON          RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_FAN
+
+
+static GLboolean discrete_prim[0x10] = {
+   0,				/* 0 none */
+   1,				/* 1 points */
+   1,				/* 2 lines */
+   0,				/* 3 line_strip */
+   1,				/* 4 tri_list */
+   0,				/* 5 tri_fan */
+   0,				/* 6 tri_type2 */
+   1,				/* 7 rect list (unused) */
+   1,				/* 8 3vert point */
+   1,				/* 9 3vert line */
+   0,
+   0,
+   0,
+   0,
+   0,
+   0,
+};
+   
+
+#define LOCAL_VARS radeonContextPtr rmesa = RADEON_CONTEXT(ctx)
+#define ELT_TYPE  GLushort
+
+#define ELT_INIT(prim, hw_prim) \
+   radeonTclPrimitive( ctx, prim, hw_prim | RADEON_CP_VC_CNTL_PRIM_WALK_IND )
+
+#define GET_MESA_ELTS() rmesa->tcl.Elts
+
+
+/* Don't really know how many elts will fit in what's left of cmdbuf,
+ * as there is state to emit, etc:
+ */
+
+/* Testing on isosurf shows a maximum around here.  Don't know if it's
+ * the card or driver or kernel module that is causing the behaviour.
+ */
+#define GET_MAX_HW_ELTS() 300
+
+
+#define RESET_STIPPLE() do {			\
+   RADEON_STATECHANGE( rmesa, lin );		\
+   radeonEmitState( rmesa );			\
+} while (0)
+
+#define AUTO_STIPPLE( mode )  do {		\
+   RADEON_STATECHANGE( rmesa, lin );		\
+   if (mode)					\
+      rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] |=	\
+	 RADEON_LINE_PATTERN_AUTO_RESET;	\
+   else						\
+      rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] &=	\
+	 ~RADEON_LINE_PATTERN_AUTO_RESET;	\
+   radeonEmitState( rmesa );			\
+} while (0)
+
+
+
+#define ALLOC_ELTS(nr)	radeonAllocElts( rmesa, nr )
+
+static GLushort *radeonAllocElts( radeonContextPtr rmesa, GLuint nr ) 
+{
+   if (rmesa->dma.flush)
+      rmesa->dma.flush( rmesa );	/* run the pending dma flush callback, if one is installed */
+   /* Ask for room for the AOS packet, a worst-case state emit and nr elts. */
+   radeonEnsureCmdBufSpace(rmesa, AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
+			   rmesa->hw.max_state_size + ELTS_BUFSZ(nr));
+   /* Last argument is 0 here, while radeonEmitPrim() passes its start index in the same slot. */
+   radeonEmitAOS( rmesa,
+		rmesa->tcl.aos_components,
+		rmesa->tcl.nr_aos_components, 0 );
+   /* Returns whatever radeonAllocEltsOpenEnded() hands back for nr elts of the current hw primitive. */
+   return radeonAllocEltsOpenEnded( rmesa,
+				    rmesa->tcl.vertex_format, 
+				    rmesa->tcl.hw_primitive, nr );
+}
+
+#define CLOSE_ELTS()  RADEON_NEWPRIM( rmesa )
+
+
+
+/* TODO: Try to extend existing primitive if both are identical,
+ * discrete and there are no intervening state changes.  (Somewhat
+ * duplicates changes to DrawArrays code)
+ */
+static void radeonEmitPrim( GLcontext *ctx, 
+		       GLenum prim, 
+		       GLuint hwprim, 
+		       GLuint start, 
+		       GLuint count)	/* used as an end index: count - start vertices are emitted below */
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   radeonTclPrimitive( ctx, prim, hwprim );
+   /* Ask for room for the AOS packet, a worst-case state emit and one vbuf packet. */
+   radeonEnsureCmdBufSpace( rmesa, AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
+			    rmesa->hw.max_state_size + VBUF_BUFSZ );
+   /* The arrays are re-emitted with start as their last argument for every primitive. */
+   radeonEmitAOS( rmesa,
+		  rmesa->tcl.aos_components,
+		  rmesa->tcl.nr_aos_components,
+		  start );
+   
+   /* Why couldn't this packet have taken an offset param?
+    */
+   radeonEmitVbufPrim( rmesa,
+		       rmesa->tcl.vertex_format,
+		       rmesa->tcl.hw_primitive,
+		       count - start );
+}
+
+#define EMIT_PRIM( ctx, prim, hwprim, start, count ) do {       \
+   radeonEmitPrim( ctx, prim, hwprim, start, count );           \
+   (void) rmesa; } while (0)
+
+/* Try & join small primitives
+ */
+#if 0
+#define PREFER_DISCRETE_ELT_PRIM( NR, PRIM ) 0
+#else
+#define PREFER_DISCRETE_ELT_PRIM( NR, PRIM )			\
+  ((NR) < 20 ||							\
+   ((NR) < 40 &&						\
+    rmesa->tcl.hw_primitive == (PRIM|				\
+			    RADEON_CP_VC_CNTL_PRIM_WALK_IND|	\
+			    RADEON_CP_VC_CNTL_TCL_ENABLE)))
+#endif
+
+#ifdef MESA_BIG_ENDIAN
+/* We could do without (most of) this ugliness if dest was always 32 bit word aligned... */
+#define EMIT_ELT(dest, offset, x) do {				\
+	int off = offset + ( ( (GLuint)dest & 0x2 ) >> 1 );	\
+	GLushort *des = (GLushort *)( (GLuint)dest & ~0x2 );	\
+	(des)[ off + 1 - 2 * ( off & 1 ) ] = (GLushort)(x); 	\
+	(void)rmesa; } while (0)
+#else
+#define EMIT_ELT(dest, offset, x) do {				\
+	(dest)[offset] = (GLushort) (x);			\
+	(void)rmesa; } while (0)
+#endif
+
+#define EMIT_TWO_ELTS(dest, offset, x, y)  *(GLuint *)(dest+offset) = ((y)<<16)|(x);
+
+
+
+#define TAG(x) tcl_##x
+#include "tnl_dd/t_dd_dmatmp2.h"
+
+/**********************************************************************/
+/*                          External entrypoints                     */
+/**********************************************************************/
+
+void radeonEmitPrimitive( GLcontext *ctx, 
+			  GLuint first,
+			  GLuint last,
+			  GLuint flags )
+{  /* Dispatch on the primitive-mode bits of flags into tcl_render_tab_verts[]. */
+   tcl_render_tab_verts[flags&PRIM_MODE_MASK]( ctx, first, last, flags );
+}
+
+void radeonEmitEltPrimitive( GLcontext *ctx, 
+			     GLuint first,
+			     GLuint last,
+			     GLuint flags )
+{  /* Dispatch on the primitive-mode bits of flags into tcl_render_tab_elts[] (indexed path). */
+   tcl_render_tab_elts[flags&PRIM_MODE_MASK]( ctx, first, last, flags );
+}
+
+void radeonTclPrimitive( GLcontext *ctx, 
+			 GLenum prim,
+			 int hw_prim )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLuint se_cntl;
+   GLuint newprim = hw_prim | RADEON_CP_VC_CNTL_TCL_ENABLE;
+   /* Begin a new primitive when the hw primitive changes, and always for types not marked in discrete_prim[]. */
+   if (newprim != rmesa->tcl.hw_primitive ||
+       !discrete_prim[hw_prim&0xf]) {
+      RADEON_NEWPRIM( rmesa );
+      rmesa->tcl.hw_primitive = newprim;
+   }
+   /* Flat-shade vertex select: VTX_0 for flat-shaded GL_POLYGON, VTX_LAST for everything else. */
+   se_cntl = rmesa->hw.set.cmd[SET_SE_CNTL];
+   se_cntl &= ~RADEON_FLAT_SHADE_VTX_LAST;
+
+   if (prim == GL_POLYGON && (ctx->_TriangleCaps & DD_FLATSHADE)) 
+      se_cntl |= RADEON_FLAT_SHADE_VTX_0;
+   else
+      se_cntl |= RADEON_FLAT_SHADE_VTX_LAST;
+   /* Touch the 'set' state atom only when SE_CNTL actually changes. */
+   if (se_cntl != rmesa->hw.set.cmd[SET_SE_CNTL]) {
+      RADEON_STATECHANGE( rmesa, set );
+      rmesa->hw.set.cmd[SET_SE_CNTL] = se_cntl;
+   }
+}
+
+/**********************************************************************/
+/*             Fog blend factor computation for hw tcl                */
+/*             same calculation used as in t_vb_fog.c                 */
+/**********************************************************************/
+
+#define FOG_EXP_TABLE_SIZE 256
+#define FOG_MAX (10.0)
+#define EXP_FOG_MAX .0006595
+#define FOG_INCR (FOG_MAX/FOG_EXP_TABLE_SIZE)
+static GLfloat exp_table[FOG_EXP_TABLE_SIZE];
+
+#if 1
+#define NEG_EXP( result, narg )						\
+do {	/* bound f before the int cast: overflow/NaN conversion is UB */	\
+   GLfloat f = (GLfloat) ((narg) * (1.0/FOG_INCR));			\
+   GLint k = (f < FOG_EXP_TABLE_SIZE-1) ? (GLint) f : FOG_EXP_TABLE_SIZE-1; \
+   if (k > FOG_EXP_TABLE_SIZE-2) 					\
+      result = (GLfloat) EXP_FOG_MAX;					\
+   else									\
+      result = exp_table[k] + (f-k)*(exp_table[k+1]-exp_table[k]);	\
+} while (0)
+#else
+#define NEG_EXP( result, narg )					\
+do {								\
+   result = exp(-narg);						\
+} while (0)
+#endif
+
+
+/**
+ * Initialize the exp_table[] lookup table for approximating exp().
+ */
+void
+radeonInitStaticFogData( void )
+{
+   GLfloat f = 0.0F;
+   GLint i = 0;
+   for ( ; i < FOG_EXP_TABLE_SIZE ; i++, f += FOG_INCR) {	/* f advances by FOG_INCR per entry */
+      exp_table[i] = (GLfloat) exp(-f);	/* entry i samples exp(-x) at x = i * FOG_INCR (accumulated in float) */
+   }
+}
+
+
+/**
+ * Compute per-vertex fog blend factors from fog coordinates by
+ * evaluating the GL_LINEAR, GL_EXP or GL_EXP2 fog function.
+ * Fog coordinates are distances from the eye (typically between the
+ * near and far clip plane distances).
+ * Note the fog (eye Z) coords may be negative so we use ABS(z) below.
+ * Fog blend factors are in the range [0,1].
+ */
+float
+radeonComputeFogBlendFactor( GLcontext *ctx, GLfloat fogcoord )
+{
+   const GLfloat start = ctx->Fog.Start;
+   const GLfloat end = ctx->Fog.End;
+   const GLfloat z = FABSF(fogcoord);	/* fog (eye Z) coords may be negative */
+   GLfloat d, temp;
+
+   switch (ctx->Fog.Mode) {
+   case GL_LINEAR:
+      /* f = (end - z) / (end - start), clamped to [0,1]; a degenerate
+       * range (start == end) uses a scale of 1 to avoid dividing by zero.
+       */
+      d = (start == end) ? 1.0F : 1.0F / (end - start);
+      temp = (end - z) * d;
+      return CLAMP(temp, 0.0F, 1.0F);
+   case GL_EXP:
+      /* f = exp(-density * z), evaluated through NEG_EXP */
+      d = ctx->Fog.Density;
+      NEG_EXP( temp, d * z );
+      return temp;
+   case GL_EXP2:
+      /* f = exp(-(density * z)^2), evaluated through NEG_EXP */
+      d = ctx->Fog.Density*ctx->Fog.Density;
+      NEG_EXP( temp, d * z * z );
+      return temp;
+   default:
+      _mesa_problem(ctx, "Bad fog mode in %s", __FUNCTION__);
+      return 0;
+   }
+}
+
+/**********************************************************************/
+/*                          Render pipeline stage                     */
+/**********************************************************************/
+
+
+/* TCL render.
+ */
+static GLboolean radeon_run_tcl_render( GLcontext *ctx,
+					struct tnl_pipeline_stage *stage )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   struct vertex_buffer *VB = &tnl->vb;
+   GLuint inputs = VERT_BIT_POS | VERT_BIT_COLOR0;	/* position and primary color are always requested */
+   GLuint i;
+
+   /* TODO: separate this from the swtnl pipeline 
+    */
+   if (rmesa->TclFallback)
+      return GL_TRUE;	/* fallback to software t&l */
+
+   if (VB->Count == 0)
+      return GL_FALSE;	/* nothing to draw; same return value as a finished pipe */
+
+   /* NOTE: inputs != tnl->render_inputs - these are the untransformed
+    * inputs.
+    */
+   if (ctx->Light.Enabled) {
+      inputs |= VERT_BIT_NORMAL;
+   }
+
+   if (ctx->_TriangleCaps & DD_SEPARATE_SPECULAR) {
+      inputs |= VERT_BIT_COLOR1;
+   }
+
+   if ( (ctx->Fog.FogCoordinateSource == GL_FOG_COORD) && ctx->Fog.Enabled ) {
+      inputs |= VERT_BIT_FOG;
+   }
+
+   for (i = 0 ; i < ctx->Const.MaxTextureUnits; i++) {
+      if (ctx->Texture.Unit[i]._ReallyEnabled) {
+      /* TODO: probably should not emit texture coords when texgen is enabled */
+	 if (rmesa->TexGenNeedNormals[i]) {
+	    inputs |= VERT_BIT_NORMAL;
+	 }
+	 inputs |= VERT_BIT_TEX(i);
+      }
+   }
+   /* Release every previously emitted array, then emit the set selected above. */
+   radeonReleaseArrays( ctx, ~0 );
+   radeonEmitArrays( ctx, inputs );
+   /* A non-NULL Elts selects the indexed emit path for every primitive below. */
+   rmesa->tcl.Elts = VB->Elts;
+
+   for (i = 0 ; i < VB->PrimitiveCount ; i++)
+   {
+      GLuint prim = _tnl_translate_prim(&VB->Primitive[i]);
+      GLuint start = VB->Primitive[i].start;
+      GLuint length = VB->Primitive[i].count;
+
+      if (!length)
+	 continue;
+      /* The emit entry points take (first, last) indices, hence start+length. */
+      if (rmesa->tcl.Elts)
+	 radeonEmitEltPrimitive( ctx, start, start+length, prim );
+      else
+	 radeonEmitPrimitive( ctx, start, start+length, prim );
+   }
+
+   return GL_FALSE;		/* finished the pipe */
+}
+
+
+
+/* Initial state for tcl stage.  
+ */
+const struct tnl_pipeline_stage _radeon_tcl_stage =
+{
+   "radeon render",
+   NULL,
+   NULL,
+   NULL,
+   NULL,
+   radeon_run_tcl_render	/* run */
+};
+
+
+
+/**********************************************************************/
+/*                 Validate state at pipeline start                   */
+/**********************************************************************/
+
+
+/*-----------------------------------------------------------------------
+ * Manage TCL fallbacks
+ */
+
+
+static void transition_to_swtnl( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   GLuint se_cntl;
+
+   RADEON_NEWPRIM( rmesa );
+   rmesa->swtcl.vertex_format = 0;
+   /* With the swtcl vertex format zeroed, re-run vertex and render state selection. */
+   radeonChooseVertexState( ctx );
+   radeonChooseRenderState( ctx );
+
+   _mesa_validate_all_lighting_tables( ctx ); 
+   /* From here on material changes are routed to core Mesa's lighting-table validation. */
+   tnl->Driver.NotifyMaterialChange = 
+      _mesa_validate_all_lighting_tables;
+
+   radeonReleaseArrays( ctx, ~0 );
+
+   se_cntl = rmesa->hw.set.cmd[SET_SE_CNTL];
+   se_cntl |= RADEON_FLAT_SHADE_VTX_LAST;
+   /* radeonTclPrimitive() may have selected VTX_0 for polygons; VTX_LAST is ORed back in here. */
+   if (se_cntl != rmesa->hw.set.cmd[SET_SE_CNTL]) {
+      RADEON_STATECHANGE( rmesa, set );
+      rmesa->hw.set.cmd[SET_SE_CNTL] = se_cntl;
+   }
+}
+
+
+static void transition_to_hwtnl( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   GLuint se_coord_fmt = rmesa->hw.set.cmd[SET_SE_COORDFMT];
+   /* Clear the XY/Z pre-multiply bits; of the three, only W0_IS_NOT_1_OVER_W0 ends up set. */
+   se_coord_fmt &= ~(RADEON_VTX_XY_PRE_MULT_1_OVER_W0 |
+		     RADEON_VTX_Z_PRE_MULT_1_OVER_W0 |
+		     RADEON_VTX_W0_IS_NOT_1_OVER_W0);
+   se_coord_fmt |= RADEON_VTX_W0_IS_NOT_1_OVER_W0;
+
+   if ( se_coord_fmt != rmesa->hw.set.cmd[SET_SE_COORDFMT] ) {
+      RADEON_STATECHANGE( rmesa, set );
+      rmesa->hw.set.cmd[SET_SE_COORDFMT] = se_coord_fmt;
+      _tnl_need_projected_coords( ctx, GL_FALSE );
+   }
+
+   radeonUpdateMaterial( ctx );
+   /* From here on material changes are routed to the driver's radeonUpdateMaterial(). */
+   tnl->Driver.NotifyMaterialChange = radeonUpdateMaterial;
+
+   if ( rmesa->dma.flush )			
+      rmesa->dma.flush( rmesa );	
+   /* Run the pending flush callback once (above), then uninstall it. */
+   rmesa->dma.flush = NULL;
+   rmesa->swtcl.vertex_format = 0;
+   
+   if (rmesa->swtcl.indexed_verts.buf) 
+      radeonReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, 
+			      __FUNCTION__ );
+
+   if (RADEON_DEBUG & DEBUG_FALLBACKS) 
+      fprintf(stderr, "Radeon end tcl fallback\n");
+}
+
+static char *fallbackStrings[] = {
+   "Rasterization fallback",
+   "Unfilled triangles",
+   "Twosided lighting, differing materials",
+   "Materials in VB (maybe between begin/end)",
+   "Texgen unit 0",
+   "Texgen unit 1",
+   "Texgen unit 2",
+   "User disable",
+   "Fogcoord with separate specular lighting"
+};
+
+
+static char *getFallbackString(GLuint bit)
+{
+   const GLuint nr = sizeof(fallbackStrings) / sizeof(fallbackStrings[0]);
+   GLuint i;
+   /* i = index of the highest set bit; a bit without a name must not index past the table */
+   for (i = 0; bit > 1; bit >>= 1)
+      i++;
+   return (i < nr) ? fallbackStrings[i] : "Unknown fallback";
+}
+
+
+
+void radeonTclFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLuint oldfallback = rmesa->TclFallback;
+   /* TclFallback is a mask of fallback reasons; only the 0 <-> non-zero edges switch between hw and sw tnl. */
+   if (mode) {
+      rmesa->TclFallback |= bit;
+      if (oldfallback == 0) {	/* first reason raised */
+	 if (RADEON_DEBUG & DEBUG_FALLBACKS) 
+	    fprintf(stderr, "Radeon begin tcl fallback %s\n",
+		    getFallbackString( bit ));
+	 transition_to_swtnl( ctx );
+      }
+   }
+   else {
+      rmesa->TclFallback &= ~bit;
+      if (oldfallback == bit) {	/* bit was the only reason still set */
+	 if (RADEON_DEBUG & DEBUG_FALLBACKS) 
+	    fprintf(stderr, "Radeon end tcl fallback %s\n",
+		    getFallbackString( bit ));
+	 transition_to_hwtnl( ctx );
+      }
+   }
+}
diff --git a/radeon/radeon_tcl.h b/radeon/radeon_tcl.h
new file mode 100644
index 0000000..168ab95
--- /dev/null
+++ b/radeon/radeon_tcl.h
@@ -0,0 +1,68 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_tcl.h,v 1.2 2003/02/08 21:26:45 dawes Exp $ */
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     Tungsten Graphics Inc., Austin, Texas.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *
+ */
+
+#ifndef __RADEON_TCL_H__
+#define __RADEON_TCL_H__
+
+#include "radeon_context.h"
+
+extern void radeonTclPrimitive( GLcontext *ctx, GLenum prim, int hw_prim );
+extern void radeonEmitEltPrimitive( GLcontext *ctx, GLuint first, GLuint last,
+				    GLuint flags );
+extern void radeonEmitPrimitive( GLcontext *ctx, GLuint first, GLuint last,
+				 GLuint flags );
+
+extern void radeonTclFallback( GLcontext *ctx, GLuint bit, GLboolean mode );
+
+extern void radeonInitStaticFogData( void );
+extern float radeonComputeFogBlendFactor( GLcontext *ctx, GLfloat fogcoord );
+
+#define RADEON_TCL_FALLBACK_RASTER            0x1 /* rasterization */
+#define RADEON_TCL_FALLBACK_UNFILLED          0x2 /* unfilled tris */
+#define RADEON_TCL_FALLBACK_LIGHT_TWOSIDE     0x4 /* twoside tris */
+#define RADEON_TCL_FALLBACK_MATERIAL          0x8 /* material in vb */
+#define RADEON_TCL_FALLBACK_TEXGEN_0          0x10 /* texgen, unit 0 */
+#define RADEON_TCL_FALLBACK_TEXGEN_1          0x20 /* texgen, unit 1 */
+#define RADEON_TCL_FALLBACK_TEXGEN_2          0x40 /* texgen, unit 2 */
+#define RADEON_TCL_FALLBACK_TCL_DISABLE       0x80 /* user disable */
+#define RADEON_TCL_FALLBACK_FOGCOORDSPEC      0x100 /* fogcoord, sep. spec light */
+
+/* max maos_verts vertex format has a size of 18 floats */
+#define RADEON_MAX_TCL_VERTSIZE (18*4)
+
+#define TCL_FALLBACK( ctx, bit, mode )	radeonTclFallback( ctx, bit, mode )
+
+#endif
diff --git a/radeon/radeon_tex.c b/radeon/radeon_tex.c
new file mode 100644
index 0000000..edaea6c
--- /dev/null
+++ b/radeon/radeon_tex.c
@@ -0,0 +1,883 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_tex.c,v 1.6 2002/09/16 18:05:20 eich Exp $ */
+/*
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+/*
+ * Authors:
+ *    Gareth Hughes <gareth@valinux.com>
+ *    Brian Paul <brianp@valinux.com>
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "colormac.h"
+#include "context.h"
+#include "enums.h"
+#include "image.h"
+#include "simple_list.h"
+#include "texformat.h"
+#include "texstore.h"
+#include "teximage.h"
+#include "texobj.h"
+
+
+#include "radeon_context.h"
+#include "radeon_state.h"
+#include "radeon_ioctl.h"
+#include "radeon_swtcl.h"
+#include "radeon_tex.h"
+
+#include "xmlpool.h"
+
+
+
+/**
+ * Set the texture wrap modes.
+ * 
+ * \param t Texture object whose wrap modes are to be set
+ * \param swrap Wrap mode for the \a s texture coordinate
+ * \param twrap Wrap mode for the \a t texture coordinate
+ */
+
+static void radeonSetTexWrap( radeonTexObjPtr t, GLenum swrap, GLenum twrap )
+{
+   GLboolean  is_clamp = GL_FALSE;
+   GLboolean  is_clamp_to_border = GL_FALSE;
+   /* Start clean: clear both clamp fields and the D3D border-mode bit. */
+   t->pp_txfilter &= ~(RADEON_CLAMP_S_MASK | RADEON_CLAMP_T_MASK | RADEON_BORDER_MODE_D3D);
+   /* GL_CLAMP and GL_CLAMP_TO_BORDER share the same *_CLAMP_GL value; the flags record which one was asked for. */
+   switch ( swrap ) {
+   case GL_REPEAT:
+      t->pp_txfilter |= RADEON_CLAMP_S_WRAP;
+      break;
+   case GL_CLAMP:
+      t->pp_txfilter |= RADEON_CLAMP_S_CLAMP_GL;
+      is_clamp = GL_TRUE;
+      break;
+   case GL_CLAMP_TO_EDGE:
+      t->pp_txfilter |= RADEON_CLAMP_S_CLAMP_LAST;
+      break;
+   case GL_CLAMP_TO_BORDER:
+      t->pp_txfilter |= RADEON_CLAMP_S_CLAMP_GL;
+      is_clamp_to_border = GL_TRUE;
+      break;
+   case GL_MIRRORED_REPEAT:
+      t->pp_txfilter |= RADEON_CLAMP_S_MIRROR;
+      break;
+   case GL_MIRROR_CLAMP_EXT:
+      t->pp_txfilter |= RADEON_CLAMP_S_MIRROR_CLAMP_GL;
+      is_clamp = GL_TRUE;
+      break;
+   case GL_MIRROR_CLAMP_TO_EDGE_EXT:
+      t->pp_txfilter |= RADEON_CLAMP_S_MIRROR_CLAMP_LAST;
+      break;
+   case GL_MIRROR_CLAMP_TO_BORDER_EXT:
+      t->pp_txfilter |= RADEON_CLAMP_S_MIRROR_CLAMP_GL;
+      is_clamp_to_border = GL_TRUE;
+      break;
+   default:
+      _mesa_problem(NULL, "bad S wrap mode in %s", __FUNCTION__);
+   }
+   /* Same mapping for the T coordinate. */
+   switch ( twrap ) {
+   case GL_REPEAT:
+      t->pp_txfilter |= RADEON_CLAMP_T_WRAP;
+      break;
+   case GL_CLAMP:
+      t->pp_txfilter |= RADEON_CLAMP_T_CLAMP_GL;
+      is_clamp = GL_TRUE;
+      break;
+   case GL_CLAMP_TO_EDGE:
+      t->pp_txfilter |= RADEON_CLAMP_T_CLAMP_LAST;
+      break;
+   case GL_CLAMP_TO_BORDER:
+      t->pp_txfilter |= RADEON_CLAMP_T_CLAMP_GL;
+      is_clamp_to_border = GL_TRUE;
+      break;
+   case GL_MIRRORED_REPEAT:
+      t->pp_txfilter |= RADEON_CLAMP_T_MIRROR;
+      break;
+   case GL_MIRROR_CLAMP_EXT:
+      t->pp_txfilter |= RADEON_CLAMP_T_MIRROR_CLAMP_GL;
+      is_clamp = GL_TRUE;
+      break;
+   case GL_MIRROR_CLAMP_TO_EDGE_EXT:
+      t->pp_txfilter |= RADEON_CLAMP_T_MIRROR_CLAMP_LAST;
+      break;
+   case GL_MIRROR_CLAMP_TO_BORDER_EXT:
+      t->pp_txfilter |= RADEON_CLAMP_T_MIRROR_CLAMP_GL;
+      is_clamp_to_border = GL_TRUE;
+      break;
+   default:
+      _mesa_problem(NULL, "bad T wrap mode in %s", __FUNCTION__);
+   }
+   /* One border-mode bit serves both coordinates: set it if either asked for clamp-to-border. */
+   if ( is_clamp_to_border ) {
+      t->pp_txfilter |= RADEON_BORDER_MODE_D3D;
+   }
+   /* Flag a fallback when clamp and clamp-to-border are mixed; presumably the single bit cannot express both - TODO confirm. */
+   t->border_fallback = (is_clamp && is_clamp_to_border);
+}
+
+static void radeonSetTexMaxAnisotropy( radeonTexObjPtr t, GLfloat max )
+{
+   GLuint aniso;	/* RADEON_MAX_ANISO_* value chosen from the requested ratio */
+   if ( max == 1.0 )
+      aniso = RADEON_MAX_ANISO_1_TO_1;
+   else if ( max <= 2.0 )
+      aniso = RADEON_MAX_ANISO_2_TO_1;
+   else if ( max <= 4.0 )
+      aniso = RADEON_MAX_ANISO_4_TO_1;
+   else if ( max <= 8.0 )
+      aniso = RADEON_MAX_ANISO_8_TO_1;
+   else
+      aniso = RADEON_MAX_ANISO_16_TO_1;
+   /* Replace the anisotropy field, leaving every other filter bit untouched. */
+   t->pp_txfilter = (t->pp_txfilter & ~RADEON_MAX_ANISO_MASK) | aniso;
+}
+
+/**
+ * Set the texture magnification and minification modes.
+ * 
+ * \param t Texture whose filter modes are to be set
+ * \param minf Texture minification mode
+ * \param magf Texture magnification mode
+ */
+
+static void radeonSetTexFilter( radeonTexObjPtr t, GLenum minf, GLenum magf )
+{
+   GLuint anisotropy = (t->pp_txfilter & RADEON_MAX_ANISO_MASK);	/* read back the setting already stored in pp_txfilter */
+
+   t->pp_txfilter &= ~(RADEON_MIN_FILTER_MASK | RADEON_MAG_FILTER_MASK);
+
+   /* r100 chips can't handle mipmaps/aniso for cubemap/volume textures */
+   if ( t->base.tObj->Target == GL_TEXTURE_CUBE_MAP ) {	/* mipmapped modes collapse to plain NEAREST / LINEAR */
+      switch ( minf ) {
+      case GL_NEAREST:
+      case GL_NEAREST_MIPMAP_NEAREST:
+      case GL_NEAREST_MIPMAP_LINEAR:
+	 t->pp_txfilter |= RADEON_MIN_FILTER_NEAREST;
+	 break;
+      case GL_LINEAR:
+      case GL_LINEAR_MIPMAP_NEAREST:
+      case GL_LINEAR_MIPMAP_LINEAR:
+	 t->pp_txfilter |= RADEON_MIN_FILTER_LINEAR;
+	 break;
+      default:
+	 break;
+      }
+   }
+   else if ( anisotropy == RADEON_MAX_ANISO_1_TO_1 ) {	/* NOTE(review): the *_MIP_* names below read swapped against the GL enums; presumably the register name puts the mip filter first - confirm before changing */
+      switch ( minf ) {
+      case GL_NEAREST:
+	 t->pp_txfilter |= RADEON_MIN_FILTER_NEAREST;
+	 break;
+      case GL_LINEAR:
+	 t->pp_txfilter |= RADEON_MIN_FILTER_LINEAR;
+	 break;
+      case GL_NEAREST_MIPMAP_NEAREST:
+	 t->pp_txfilter |= RADEON_MIN_FILTER_NEAREST_MIP_NEAREST;
+	 break;
+      case GL_NEAREST_MIPMAP_LINEAR:
+	 t->pp_txfilter |= RADEON_MIN_FILTER_LINEAR_MIP_NEAREST;
+	 break;
+      case GL_LINEAR_MIPMAP_NEAREST:
+	 t->pp_txfilter |= RADEON_MIN_FILTER_NEAREST_MIP_LINEAR;
+	 break;
+      case GL_LINEAR_MIPMAP_LINEAR:
+	 t->pp_txfilter |= RADEON_MIN_FILTER_LINEAR_MIP_LINEAR;
+	 break;
+      }
+   } else {	/* anisotropic: the two *_MIPMAP_NEAREST modes share one value, as do the two *_MIPMAP_LINEAR modes */
+      switch ( minf ) {
+      case GL_NEAREST:
+	 t->pp_txfilter |= RADEON_MIN_FILTER_ANISO_NEAREST;
+	 break;
+      case GL_LINEAR:
+	 t->pp_txfilter |= RADEON_MIN_FILTER_ANISO_LINEAR;
+	 break;
+      case GL_NEAREST_MIPMAP_NEAREST:
+      case GL_LINEAR_MIPMAP_NEAREST:
+	 t->pp_txfilter |= RADEON_MIN_FILTER_ANISO_NEAREST_MIP_NEAREST;
+	 break;
+      case GL_NEAREST_MIPMAP_LINEAR:
+      case GL_LINEAR_MIPMAP_LINEAR:
+	 t->pp_txfilter |= RADEON_MIN_FILTER_ANISO_NEAREST_MIP_LINEAR;
+	 break;
+      }
+   }
+   /* Magnification has only two modes; any other magf leaves the field cleared. */
+   switch ( magf ) {
+   case GL_NEAREST:
+      t->pp_txfilter |= RADEON_MAG_FILTER_NEAREST;
+      break;
+   case GL_LINEAR:
+      t->pp_txfilter |= RADEON_MAG_FILTER_LINEAR;
+      break;
+   }
+}
+
+static void radeonSetTexBorderColor( radeonTexObjPtr t, GLubyte c[4] )
+{  /* NOTE(review): the leading 4 is presumably bytes per pixel - confirm against radeonPackColor() */
+   t->pp_border_color = radeonPackColor( 4, c[0], c[1], c[2], c[3] );
+}
+
+
+/**
+ * Allocate the driver-private texture object for a GL texture object,
+ * attach it as texObj->DriverData and initialize the state that does not
+ * depend on the texture images (wrap, anisotropy, filter, border color).
+ */
+
+static radeonTexObjPtr radeonAllocTexObj( struct gl_texture_object *texObj )
+{
+   radeonTexObjPtr t;
+
+   t = CALLOC_STRUCT( radeon_tex_obj );
+   texObj->DriverData = t;	/* stored even when the allocation failed (t == NULL) */
+   if ( t != NULL ) {
+      if ( RADEON_DEBUG & DEBUG_TEXTURE ) {
+	 fprintf( stderr, "%s( %p, %p )\n", __FUNCTION__, (void *)texObj, (void *)t );
+      }
+
+      /* Initialize non-image-dependent parts of the state:
+       */
+      t->base.tObj = texObj;
+      t->border_fallback = GL_FALSE;
+
+      t->pp_txfilter = RADEON_BORDER_MODE_OGL;
+      t->pp_txformat = (RADEON_TXFORMAT_ENDIAN_NO_SWAP |
+			RADEON_TXFORMAT_PERSPECTIVE_ENABLE);
+
+      make_empty_list( & t->base );
+      /* Anisotropy is set before the filter: radeonSetTexFilter() reads it back from pp_txfilter. */
+      radeonSetTexWrap( t, texObj->WrapS, texObj->WrapT );
+      radeonSetTexMaxAnisotropy( t, texObj->MaxAnisotropy );
+      radeonSetTexFilter( t, texObj->MinFilter, texObj->MagFilter );
+      radeonSetTexBorderColor( t, texObj->_BorderChan );
+   }
+
+   return t;	/* NULL when CALLOC_STRUCT failed; callers must check */
+}
+
+
+static const struct gl_texture_format *
+radeonChooseTextureFormat( GLcontext *ctx, GLint internalFormat,
+                           GLenum format, GLenum type )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   const GLboolean do32bpt =
+       ( rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_32 );
+   const GLboolean force16bpt =
+       ( rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FORCE_16 );
+   (void) format;
+   /* Unsized and generic-compressed RGB(A) use 32 bpt only when do32bpt is set; sized 8..16-bit ones use it unless force16bpt is set. */
+   switch ( internalFormat ) {
+   case 4:
+   case GL_RGBA:
+   case GL_COMPRESSED_RGBA:
+      switch ( type ) {
+      case GL_UNSIGNED_INT_10_10_10_2:
+      case GL_UNSIGNED_INT_2_10_10_10_REV:
+	 return do32bpt ? _dri_texformat_argb8888 : _dri_texformat_argb1555;
+      case GL_UNSIGNED_SHORT_4_4_4_4:
+      case GL_UNSIGNED_SHORT_4_4_4_4_REV:
+	 return _dri_texformat_argb4444;
+      case GL_UNSIGNED_SHORT_5_5_5_1:
+      case GL_UNSIGNED_SHORT_1_5_5_5_REV:
+	 return _dri_texformat_argb1555;
+      default:
+         return do32bpt ? _dri_texformat_argb8888 : _dri_texformat_argb4444;
+      }
+
+   case 3:
+   case GL_RGB:
+   case GL_COMPRESSED_RGB:
+      switch ( type ) {
+      case GL_UNSIGNED_SHORT_4_4_4_4:
+      case GL_UNSIGNED_SHORT_4_4_4_4_REV:
+	 return _dri_texformat_argb4444;
+      case GL_UNSIGNED_SHORT_5_5_5_1:
+      case GL_UNSIGNED_SHORT_1_5_5_5_REV:
+	 return _dri_texformat_argb1555;
+      case GL_UNSIGNED_SHORT_5_6_5:
+      case GL_UNSIGNED_SHORT_5_6_5_REV:
+	 return _dri_texformat_rgb565;
+      default:
+         return do32bpt ? _dri_texformat_argb8888 : _dri_texformat_rgb565;
+      }
+
+   case GL_RGBA8:
+   case GL_RGB10_A2:
+   case GL_RGBA12:
+   case GL_RGBA16:
+      return !force16bpt ?
+	  _dri_texformat_argb8888 : _dri_texformat_argb4444;
+
+   case GL_RGBA4:
+   case GL_RGBA2:
+      return _dri_texformat_argb4444;
+
+   case GL_RGB5_A1:
+      return _dri_texformat_argb1555;
+
+   case GL_RGB8:
+   case GL_RGB10:
+   case GL_RGB12:
+   case GL_RGB16:
+      return !force16bpt ? _dri_texformat_argb8888 : _dri_texformat_rgb565;
+
+   case GL_RGB5:
+   case GL_RGB4:
+   case GL_R3_G3_B2:
+      return _dri_texformat_rgb565;
+
+   case GL_ALPHA:
+   case GL_ALPHA4:
+   case GL_ALPHA8:
+   case GL_ALPHA12:
+   case GL_ALPHA16:
+   case GL_COMPRESSED_ALPHA:
+      return _dri_texformat_a8;
+
+   case 1:
+   case GL_LUMINANCE:
+   case GL_LUMINANCE4:
+   case GL_LUMINANCE8:
+   case GL_LUMINANCE12:
+   case GL_LUMINANCE16:
+   case GL_COMPRESSED_LUMINANCE:
+      return _dri_texformat_l8;
+
+   case 2:
+   case GL_LUMINANCE_ALPHA:
+   case GL_LUMINANCE4_ALPHA4:
+   case GL_LUMINANCE6_ALPHA2:
+   case GL_LUMINANCE8_ALPHA8:
+   case GL_LUMINANCE12_ALPHA4:
+   case GL_LUMINANCE12_ALPHA12:
+   case GL_LUMINANCE16_ALPHA16:
+   case GL_COMPRESSED_LUMINANCE_ALPHA:
+      return _dri_texformat_al88;
+
+   case GL_INTENSITY:
+   case GL_INTENSITY4:
+   case GL_INTENSITY8:
+   case GL_INTENSITY12:
+   case GL_INTENSITY16:
+   case GL_COMPRESSED_INTENSITY:
+      return _dri_texformat_i8;
+
+   case GL_YCBCR_MESA:
+      if (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
+          type == GL_UNSIGNED_BYTE)
+         return &_mesa_texformat_ycbcr;
+      else
+         return &_mesa_texformat_ycbcr_rev;
+
+   case GL_RGB_S3TC:
+   case GL_RGB4_S3TC:
+   case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
+      return &_mesa_texformat_rgb_dxt1;
+
+   case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
+      return &_mesa_texformat_rgba_dxt1;
+
+   case GL_RGBA_S3TC:
+   case GL_RGBA4_S3TC:
+   case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
+      return &_mesa_texformat_rgba_dxt3;
+
+   case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
+      return &_mesa_texformat_rgba_dxt5;
+
+   default:
+      _mesa_problem(ctx, "unexpected texture format in %s", __FUNCTION__);
+      return NULL;
+   }
+
+   return NULL; /* never get here */
+}
+
+
+/**
+ * ctx->Driver.TexImage1D hook: swap out any resident copy, store the new
+ * image through core Mesa and flag the level for re-upload.
+ */
+static void radeonTexImage1D( GLcontext *ctx, GLenum target, GLint level,
+                              GLint internalFormat,
+                              GLint width, GLint border,
+                              GLenum format, GLenum type, const GLvoid *pixels,
+                              const struct gl_pixelstore_attrib *packing,
+                              struct gl_texture_object *texObj,
+                              struct gl_texture_image *texImage )
+{
+   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+
+   if ( t ) {
+      driSwapOutTextureObject( t );
+   }
+   else if ( !(t = (driTextureObject *) radeonAllocTexObj( texObj )) ) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage1D");
+      return;
+   }
+   /* Note, this will call ChooseTextureFormat.  Use the unpack state handed
+    * in by the caller, like the TexSubImage hooks, not ctx->Unpack directly. */
+   _mesa_store_teximage1d(ctx, target, level, internalFormat,
+                          width, border, format, type, pixels,
+                          packing, texObj, texImage);
+   t->dirty_images[0] |= (1 << level);
+}
+
+
+/**
+ * ctx->Driver.TexSubImage1D hook: swap the texture out of texture memory,
+ * let core Mesa store the sub-image and flag the level for re-upload.
+ */
+static void radeonTexSubImage1D( GLcontext *ctx, GLenum target, GLint level,
+                                 GLint xoffset,
+                                 GLsizei width,
+                                 GLenum format, GLenum type,
+                                 const GLvoid *pixels,
+                                 const struct gl_pixelstore_attrib *packing,
+                                 struct gl_texture_object *texObj,
+                                 struct gl_texture_image *texImage )
+{
+   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+
+   assert( t ); /* this _should_ be true */
+   if ( t ) {
+      driSwapOutTextureObject( t );
+   }
+   else if ( !(t = (driTextureObject *) radeonAllocTexObj( texObj )) ) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage1D");
+      return;
+   }
+
+   _mesa_store_texsubimage1d(ctx, target, level, xoffset, width,
+			     format, type, pixels, packing, texObj,
+			     texImage);
+   t->dirty_images[0] |= (1 << level);
+}
+
+
+/**
+ * ctx->Driver.TexImage2D hook, also used for the faces of a cube map: swap
+ * out any resident copy, store the image through core Mesa and flag the
+ * level of the affected face (face 0 for other targets) for re-upload.
+ */
+static void radeonTexImage2D( GLcontext *ctx, GLenum target, GLint level,
+                              GLint internalFormat,
+                              GLint width, GLint height, GLint border,
+                              GLenum format, GLenum type, const GLvoid *pixels,
+                              const struct gl_pixelstore_attrib *packing,
+                              struct gl_texture_object *texObj,
+                              struct gl_texture_image *texImage )
+{
+   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+   GLuint face = 0;
+
+   /* which cube face or ordinary 2D image */
+   switch (target) {
+   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+      ASSERT(face < 6);
+      break;
+   }
+
+   if ( t != NULL ) {
+      driSwapOutTextureObject( t );
+   }
+   else if ( !(t = (driTextureObject *) radeonAllocTexObj( texObj )) ) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage2D");
+      return;
+   }
+
+   /* Note, this will call ChooseTextureFormat.  Use the unpack state handed
+    * in by the caller, like the TexSubImage hooks, not ctx->Unpack directly. */
+   _mesa_store_teximage2d(ctx, target, level, internalFormat,
+                          width, height, border, format, type, pixels,
+                          packing, texObj, texImage);
+   t->dirty_images[face] |= (1 << level);
+}
+
+
+/**
+ * ctx->Driver.TexSubImage2D hook, also used for the faces of a cube map:
+ * swap the texture out of texture memory, let core Mesa store the sub-image
+ * and flag the level of the affected face for re-upload.
+ */
+static void radeonTexSubImage2D( GLcontext *ctx, GLenum target, GLint level,
+                                 GLint xoffset, GLint yoffset,
+                                 GLsizei width, GLsizei height,
+                                 GLenum format, GLenum type,
+                                 const GLvoid *pixels,
+                                 const struct gl_pixelstore_attrib *packing,
+                                 struct gl_texture_object *texObj,
+                                 struct gl_texture_image *texImage )
+{
+   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+   GLuint face = 0;
+
+   /* face stays 0 for an ordinary 2D image; cube faces are numbered 0..5 */
+   switch (target) {
+   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+      ASSERT(face < 6);
+      break;
+   }
+
+   assert( t ); /* this _should_ be true */
+   if ( t ) {
+      driSwapOutTextureObject( t );
+   }
+   else if ( !(t = (driTextureObject *) radeonAllocTexObj( texObj )) ) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage2D");
+      return;
+   }
+
+   _mesa_store_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
+			     height, format, type, pixels, packing, texObj,
+			     texImage);
+
+   t->dirty_images[face] |= (1 << level);
+}
+
+/**
+ * ctx->Driver.CompressedTexImage2D hook, also used for cube map faces: swap
+ * out any resident copy, store the compressed image through core Mesa and
+ * flag the level of the affected face for re-upload.
+ */
+static void radeonCompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
+                              GLint internalFormat,
+                              GLint width, GLint height, GLint border,
+                              GLsizei imageSize, const GLvoid *data,
+                              struct gl_texture_object *texObj,
+                              struct gl_texture_image *texImage )
+{
+   driTextureObject * drvTex = (driTextureObject *) texObj->DriverData;
+   GLuint face = 0;
+
+   /* face stays 0 for an ordinary 2D image; cube faces are numbered 0..5 */
+   switch (target) {
+   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+      ASSERT(face < 6);
+      break;
+   }
+
+   if ( drvTex != NULL ) {
+      driSwapOutTextureObject( drvTex );
+   }
+   else if ( !(drvTex = (driTextureObject *) radeonAllocTexObj( texObj )) ) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexImage2D");
+      return;
+   }
+
+   /* Note, this will call ChooseTextureFormat */
+   _mesa_store_compressed_teximage2d(ctx, target, level, internalFormat, width,
+                                 height, border, imageSize, data, texObj, texImage);
+
+   drvTex->dirty_images[face] |= (1 << level);
+}
+
+
+/**
+ * ctx->Driver.CompressedTexSubImage2D hook, also used for cube map faces:
+ * swap the texture out of texture memory, let core Mesa store the compressed
+ * sub-image and flag the level of the affected face for re-upload.
+ */
+static void radeonCompressedTexSubImage2D( GLcontext *ctx, GLenum target, GLint level,
+                                 GLint xoffset, GLint yoffset,
+                                 GLsizei width, GLsizei height,
+                                 GLenum format,
+                                 GLsizei imageSize, const GLvoid *data,
+                                 struct gl_texture_object *texObj,
+                                 struct gl_texture_image *texImage )
+{
+   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+   GLuint face = 0;
+
+   /* face stays 0 for an ordinary 2D image; cube faces are numbered 0..5 */
+   switch (target) {
+   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+      ASSERT(face < 6);
+      break;
+   }
+
+   assert( t ); /* this _should_ be true */
+   if ( t ) {
+      driSwapOutTextureObject( t );
+   }
+   else if ( !(t = (driTextureObject *) radeonAllocTexObj( texObj )) ) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexSubImage2D");
+      return;
+   }
+
+   /* core Mesa stores the data; the hardware copy is refreshed later */
+   _mesa_store_compressed_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
+                                 height, format, imageSize, data, texObj, texImage);
+
+   t->dirty_images[face] |= (1 << level);
+}
+
+/* Scale x/scale to 0..127; a negative x wraps so that the low byte holds the
+ * signed value.  The GLint step avoids an undefined float-to-unsigned cast. */
+#define SCALED_FLOAT_TO_BYTE( x, scale ) \
+		(((GLuint)(GLint)((255.0F / (scale)) * (x))) / 2)
+
+/**
+ * ctx->Driver.TexEnv hook: caches the env colour and LOD bias of the current
+ * texture unit in the hardware state; other parameter names are ignored.
+ */
+static void radeonTexEnv( GLcontext *ctx, GLenum target,
+			  GLenum pname, const GLfloat *param )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLuint unit = ctx->Texture.CurrentUnit;
+   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+
+   if ( RADEON_DEBUG & DEBUG_STATE ) {
+      fprintf( stderr, "%s( %s )\n",
+	       __FUNCTION__, _mesa_lookup_enum_by_nr( pname ) );
+   }
+
+   switch ( pname ) {
+   case GL_TEXTURE_ENV_COLOR: {
+      GLubyte c[4];
+      GLuint envColor;
+      UNCLAMPED_FLOAT_TO_RGBA_CHAN( c, texUnit->EnvColor );
+      envColor = radeonPackColor( 4, c[0], c[1], c[2], c[3] );
+      if ( rmesa->hw.tex[unit].cmd[TEX_PP_TFACTOR] != envColor ) {
+	 RADEON_STATECHANGE( rmesa, tex[unit] );
+	 rmesa->hw.tex[unit].cmd[TEX_PP_TFACTOR] = envColor;
+      }
+      break;
+   }
+
+   case GL_TEXTURE_LOD_BIAS_EXT: {
+      GLfloat bias, min;
+      GLuint b;
+      /* The Radeon's LOD bias is a signed 2's complement value with a range
+       * of -1.0 <= bias < 4.0.  Two linear functions are used: [-1.0,0.0]
+       * maps to [-128,0] and [0.0,4.0] maps to [0,127]. */
+      min = driQueryOptionb (&rmesa->optionCache, "no_neg_lod_bias") ?
+	  0.0 : -1.0;
+      bias = CLAMP( *param, min, 4.0 );
+      if ( bias == 0 )
+	 b = 0;
+      else if ( bias > 0 )
+	 b = ((GLuint)SCALED_FLOAT_TO_BYTE( bias, 4.0 )) << RADEON_LOD_BIAS_SHIFT;
+      else
+	 b = ((GLuint)SCALED_FLOAT_TO_BYTE( bias, 1.0 )) << RADEON_LOD_BIAS_SHIFT;
+      b &= RADEON_LOD_BIAS_MASK;   /* so the comparison sees only the field */
+      if ( (rmesa->hw.tex[unit].cmd[TEX_PP_TXFILTER] & RADEON_LOD_BIAS_MASK) != b ) {
+	 RADEON_STATECHANGE( rmesa, tex[unit] );
+	 rmesa->hw.tex[unit].cmd[TEX_PP_TXFILTER] &= ~RADEON_LOD_BIAS_MASK;
+	 rmesa->hw.tex[unit].cmd[TEX_PP_TXFILTER] |= b;
+      }
+      break;
+   }
+   }
+}
+
+
+/**
+ * Changes variables and flags for a state update, which will happen at the
+ * next UpdateTextureState.  The new values are taken from \a texObj, not
+ * from \a params; a texture without driver data is left untouched.
+ */
+static void radeonTexParameter( GLcontext *ctx, GLenum target,
+				struct gl_texture_object *texObj,
+				GLenum pname, const GLfloat *params )
+{
+   radeonTexObjPtr t = (radeonTexObjPtr) texObj->DriverData;
+
+   if ( RADEON_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) ) {
+      fprintf( stderr, "%s( %s )\n", __FUNCTION__,
+	       _mesa_lookup_enum_by_nr( pname ) );
+   }
+
+   if ( !t )	/* no driver object attached: nothing to update */
+      return;
+
+   switch ( pname ) {
+   case GL_TEXTURE_MIN_FILTER:
+   case GL_TEXTURE_MAG_FILTER:
+   case GL_TEXTURE_MAX_ANISOTROPY_EXT:
+      radeonSetTexMaxAnisotropy( t, texObj->MaxAnisotropy );
+      radeonSetTexFilter( t, texObj->MinFilter, texObj->MagFilter );
+      break;
+
+   case GL_TEXTURE_WRAP_S:
+   case GL_TEXTURE_WRAP_T:
+      radeonSetTexWrap( t, texObj->WrapS, texObj->WrapT );
+      break;
+
+   case GL_TEXTURE_BORDER_COLOR:
+      radeonSetTexBorderColor( t, texObj->_BorderChan );
+      break;
+
+   case GL_TEXTURE_BASE_LEVEL:
+   case GL_TEXTURE_MAX_LEVEL:
+   case GL_TEXTURE_MIN_LOD:
+   case GL_TEXTURE_MAX_LOD:
+      /* This isn't the most efficient solution but there doesn't appear to
+       * be a nice alternative.  Since there's no LOD clamping, we just have
+       * to rely on loading the right subset of mipmap levels to simulate a
+       * clamped LOD.  */
+      driSwapOutTextureObject( (driTextureObject *) t );
+      break;
+
+   default:
+      return;
+   }
+   /* Mark this texobj as dirty (one bit per tex unit) */
+   t->dirty_state = TEX_ALL;
+}
+
+
+/* ctx->Driver.BindTexture hook: only traces the bind and asserts that 1D, 2D,
+ * rectangle and cube map texture objects already carry driver data. */
+static void radeonBindTexture( GLcontext *ctx, GLenum target,
+			       struct gl_texture_object *texObj )
+{
+   if ( RADEON_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) )
+      fprintf( stderr, "%s( %p ) unit=%d\n", __FUNCTION__, (void *)texObj,
+	       ctx->Texture.CurrentUnit );
+   assert( (target != GL_TEXTURE_1D && target != GL_TEXTURE_2D &&
+            target != GL_TEXTURE_RECTANGLE_NV && target != GL_TEXTURE_CUBE_MAP) ||
+           (texObj->DriverData != NULL) );
+}
+
+
+/* ctx->Driver.DeleteTexture hook: destroys the driver texture object, if one
+ * is attached, and then has core Mesa free the texture object itself. */
+static void radeonDeleteTexture( GLcontext *ctx,
+				 struct gl_texture_object *texObj )
+{
+   driTextureObject * const drvTex = (driTextureObject *) texObj->DriverData;
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+
+   if ( RADEON_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) )
+      fprintf( stderr, "%s( %p (target = %s) )\n", __FUNCTION__, (void *)texObj,
+	       _mesa_lookup_enum_by_nr( texObj->Target ) );
+
+   if ( drvTex ) {
+      if ( rmesa ) {
+         RADEON_FIREVERTICES( rmesa );
+      }
+      driDestroyTextureObject( drvTex );
+   }
+
+   /* Free mipmap images and the texture object itself */
+   _mesa_delete_texture_object(ctx, texObj);
+}
+
+/* Requirements for texgen:
+ *  - Same GEN_MODE for all active bits
+ *  - Same EyePlane/ObjPlane for all active bits when using Eye/Obj
+ *  - STRQ presumably all supported (matrix means incoming R values
+ *    can end up in STQ, this has implications for vertex support,
+ *    presumably ok if maos is used, though?)
+ *
+ * Basically impossible to do this on the fly - just flag the current unit
+ * here & do the checks from ValidateState().
+ */
+static void radeonTexGen( GLcontext *ctx,
+			  GLenum coord,
+			  GLenum pname,
+			  const GLfloat *params )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   const GLuint curUnit = ctx->Texture.CurrentUnit;
+   rmesa->recheck_texgen[curUnit] = GL_TRUE;
+}
+
+/**
+ * Allocate a new texture object (called via ctx->Driver.NewTextureObject).
+ * The driver-specific object is allocated separately; it is not derived from
+ * the core mesa gl_texture_object by containment.
+ */
+static struct gl_texture_object *
+radeonNewTextureObject( GLcontext *ctx, GLuint name, GLenum target )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   struct gl_texture_object *obj = _mesa_new_texture_object(ctx, name, target);
+
+   if ( obj != NULL ) {
+      obj->MaxAnisotropy = rmesa->initialMaxAnisotropy;
+      /* NOTE(review): an allocation failure is not reported from here */
+      (void) radeonAllocTexObj( obj );
+   }
+   return obj;
+}
+
+
+/* Plug the radeon texture entry points into the driver function table. */
+void radeonInitTextureFuncs( struct dd_function_table *functions )
+{
+   /* texture object handling */
+   functions->NewTextureObject		= radeonNewTextureObject;
+   functions->BindTexture		= radeonBindTexture;
+   functions->DeleteTexture		= radeonDeleteTexture;
+   functions->IsTextureResident		= driIsTextureResident;
+   /* image specification */
+   functions->ChooseTextureFormat	= radeonChooseTextureFormat;
+   functions->TexImage1D		= radeonTexImage1D;
+   functions->TexSubImage1D		= radeonTexSubImage1D;
+   functions->TexImage2D		= radeonTexImage2D;
+   functions->TexSubImage2D		= radeonTexSubImage2D;
+   functions->CompressedTexImage2D	= radeonCompressedTexImage2D;
+   functions->CompressedTexSubImage2D	= radeonCompressedTexSubImage2D;
+   /* texture state */
+   functions->TexEnv			= radeonTexEnv;
+   functions->TexParameter		= radeonTexParameter;
+   functions->TexGen			= radeonTexGen;
+   driInitTextureFormats();
+}
diff --git a/radeon/radeon_tex.h b/radeon/radeon_tex.h
new file mode 100644
index 0000000..a806981
--- /dev/null
+++ b/radeon/radeon_tex.h
@@ -0,0 +1,50 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_tex.h,v 1.3 2002/02/22 21:45:01 dawes Exp $ */
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Kevin E. Martin <martin@valinux.com>
+ *   Gareth Hughes <gareth@valinux.com>
+ *
+ */
+
+#ifndef __RADEON_TEX_H__
+#define __RADEON_TEX_H__
+
+extern void radeonUpdateTextureState( GLcontext *ctx );
+/* Returns 0, or -1 if no texture memory could be allocated for t. */
+extern int radeonUploadTexImages( radeonContextPtr rmesa, radeonTexObjPtr t,
+				  GLuint face );
+/* Clears the texture unit pointers in rmesa that still refer to t. */
+extern void radeonDestroyTexObj( radeonContextPtr rmesa, radeonTexObjPtr t );
+/* Fills in the texture related entries of the driver function table. */
+extern void radeonInitTextureFuncs( struct dd_function_table *functions );
+
+#endif /* __RADEON_TEX_H__ */
diff --git a/radeon/radeon_texmem.c b/radeon/radeon_texmem.c
new file mode 100644
index 0000000..20f25dd
--- /dev/null
+++ b/radeon/radeon_texmem.c
@@ -0,0 +1,405 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_texmem.c,v 1.7 2002/12/16 16:18:59 dawes Exp $ */
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation on the rights to use, copy, modify, merge, publish,
+distribute, sub license, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NON-INFRINGEMENT. IN NO EVENT SHALL ATI, VA LINUX SYSTEMS AND/OR THEIR
+SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Kevin E. Martin <martin@valinux.com>
+ *   Gareth Hughes <gareth@valinux.com>
+ *
+ */
+#include <errno.h> 
+
+#include "glheader.h"
+#include "imports.h"
+#include "context.h"
+#include "macros.h"
+
+#include "radeon_context.h"
+#include "radeon_ioctl.h"
+#include "radeon_tex.h"
+
+#include <unistd.h>  /* for usleep() */
+
+
+/**
+ * Destroy any device-dependent state associated with the texture.  Here that
+ * means NULLing out every per-unit hardware state pointer that still refers
+ * to \a t; nothing is done when no context is given.
+ */
+void
+radeonDestroyTexObj( radeonContextPtr rmesa, radeonTexObjPtr t )
+{
+   unsigned unit;
+
+   if ( RADEON_DEBUG & DEBUG_TEXTURE ) {
+      fprintf( stderr, "%s( %p, %p )\n", __FUNCTION__, (void *)t, (void *)t->base.tObj );
+   }
+
+   if ( rmesa == NULL )
+      return;
+
+   for ( unit = 0 ; unit < rmesa->glCtx->Const.MaxTextureUnits ; unit++ ) {
+      if ( rmesa->state.texture.unit[unit].texobj == t )
+	 rmesa->state.texture.unit[unit].texobj = NULL;
+   }
+}
+
+
+/* ------------------------------------------------------------
+ * Texture image conversions
+ */
+
+
+/**
+ * Upload a rectangle texture image: the texel rows are copied into DMA
+ * regions, which are handed to radeonEmitBlit() together with t->bufAddr.
+ * x and y are unused and width/height are overwritten with the image size.
+ */
+static void radeonUploadRectSubImage( radeonContextPtr rmesa,
+				      radeonTexObjPtr t, 
+				      struct gl_texture_image *texImage,
+				      GLint x, GLint y, 
+				      GLint width, GLint height )
+{
+   const struct gl_texture_format *texFormat = texImage->TexFormat;
+   int blit_format, dstPitch, done;
+
+   switch ( texFormat->TexelBytes ) {
+   case 1:
+      blit_format = RADEON_GMC_DST_8BPP_CI;
+      break;
+   case 2:
+      blit_format = RADEON_GMC_DST_16BPP;
+      break;
+   case 4:
+      blit_format = RADEON_GMC_DST_32BPP;
+      break;
+   default:
+      fprintf( stderr, "radeonUploadRectSubImage: unknown blit_format (texelbytes=%d)\n", 
+      	       texFormat->TexelBytes);
+      return;
+   }
+
+   t->image[0][0].data = texImage->Data;
+
+   /* Currently don't need to cope with small pitches.
+    */
+   width = texImage->Width;
+   height = texImage->Height;
+   dstPitch = t->pp_txpitch + 32;
+
+   {	/* FIXME: prefer GART-texturing if possible */
+      /* Data not in GART memory, or bad pitch.
+       */
+      for (done = 0; done < height ; ) {
+	 struct radeon_dma_region region;
+	 int lines = MIN2( height - done, RADEON_BUFFER_SIZE / dstPitch );
+	 int src_pitch;
+	 char *tex;
+
+         src_pitch = texImage->RowStride * texFormat->TexelBytes;
+	 tex = (char *)texImage->Data + done * src_pitch;
+
+	 memset(&region, 0, sizeof(region));
+	 radeonAllocDmaRegion( rmesa, &region, lines * dstPitch, 1024 );
+
+	 /* Copy texdata to dma:
+	  */
+	 if (0)
+	    fprintf(stderr, "%s: src_pitch %d dst_pitch %d\n",
+		    __FUNCTION__, src_pitch, dstPitch);
+
+	 if (src_pitch == dstPitch) {
+	    memcpy( region.address + region.start, tex, lines * src_pitch );
+	 } 
+	 else {
+	    char *buf = region.address + region.start;
+	    int i;
+	    for (i = 0 ; i < lines ; i++) {
+	       memcpy( buf, tex, src_pitch );
+	       buf += dstPitch;
+	       tex += src_pitch;
+	    }
+	 }
+
+	 radeonEmitWait( rmesa, RADEON_WAIT_3D );
+
+	 /* Blit to framebuffer
+	  */
+	 radeonEmitBlit( rmesa,
+		       blit_format,
+		       dstPitch, GET_START( &region ),
+		       dstPitch, t->bufAddr,
+		       0, 0,
+		       0, done,
+		       width, lines );
+	 radeonEmitWait( rmesa, RADEON_WAIT_2D );
+	 radeonReleaseDmaRegion( rmesa, &region, __FUNCTION__ );
+	 done += lines;
+      }
+   }
+}
+
+
+/**
+ * Upload one mipmap level of one face of texture \a t with the
+ * DRM_RADEON_TEXTURE ioctl.  \a hwlevel is relative to t->base.firstLevel.
+ * Rectangle textures are passed on to radeonUploadRectSubImage(); only that
+ * path and the debug output use x, y, width and height.
+ */
+static void uploadSubImage( radeonContextPtr rmesa, radeonTexObjPtr t, 
+			    GLint hwlevel,
+			    GLint x, GLint y, GLint width, GLint height,
+			    GLuint face )
+{
+   struct gl_texture_image *texImage = NULL;
+   GLuint offset;
+   GLint imageWidth, imageHeight;
+   GLint ret;
+   drm_radeon_texture_t tex;
+   drm_radeon_tex_image_t tmp;
+   const int level = hwlevel + t->base.firstLevel;
+
+   if ( RADEON_DEBUG & DEBUG_TEXTURE ) {
+      fprintf( stderr, "%s( %p, %p ) level/width/height/face = %d/%d/%d/%u\n", 
+	       __FUNCTION__, (void *)t, (void *)t->base.tObj, level, width, height, face );
+   }
+
+   ASSERT(face < 6);
+
+   /* Ensure we have a valid texture to upload */
+   if ( ( hwlevel < 0 ) || ( hwlevel >= RADEON_MAX_TEXTURE_LEVELS ) ) {
+      _mesa_problem(NULL, "bad texture level in %s", __FUNCTION__);
+      return;
+   }
+
+   texImage = t->base.tObj->Image[face][level];
+
+   if ( !texImage ) {
+      if ( RADEON_DEBUG & DEBUG_TEXTURE )
+	 fprintf( stderr, "%s: texImage %d is NULL!\n", __FUNCTION__, level );
+      return;
+   }
+   if ( !texImage->Data ) {
+      if ( RADEON_DEBUG & DEBUG_TEXTURE )
+	 fprintf( stderr, "%s: image data is NULL!\n", __FUNCTION__ );
+      return;
+   }
+
+   if (t->base.tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+      assert(level == 0);
+      assert(hwlevel == 0);
+      if ( RADEON_DEBUG & DEBUG_TEXTURE )
+	 fprintf( stderr, "%s: image data is rectangular\n", __FUNCTION__);
+      radeonUploadRectSubImage( rmesa, t, texImage, x, y, width, height );
+      return;
+   }
+
+   imageWidth = texImage->Width;
+   imageHeight = texImage->Height;
+
+   offset = t->bufAddr + t->base.totalSize * face / 6;
+
+   if ( RADEON_DEBUG & (DEBUG_TEXTURE|DEBUG_IOCTL) ) {
+      GLint imageX = 0;
+      GLint imageY = 0;
+      GLint blitX = t->image[face][hwlevel].x;
+      GLint blitY = t->image[face][hwlevel].y;
+      GLint blitWidth = t->image[face][hwlevel].width;
+      GLint blitHeight = t->image[face][hwlevel].height;
+      fprintf( stderr, "   upload image: %d,%d at %d,%d\n",
+	       imageWidth, imageHeight, imageX, imageY );
+      fprintf( stderr, "   upload  blit: %d,%d at %d,%d\n",
+	       blitWidth, blitHeight, blitX, blitY );
+      fprintf( stderr, "       blit ofs: 0x%07x level: %d/%d\n",
+	       (GLuint)offset, hwlevel, level );
+   }
+
+   t->image[face][hwlevel].data = texImage->Data;
+
+   /* Init the DRM_RADEON_TEXTURE command / drm_radeon_texture_t struct.
+    * NOTE: we always use a 1KB-wide blit and I8 texture format.
+    * We used to use 1, 2 and 4-byte texels and used to use the texture
+    * width to dictate the blit width - but that won't work for compressed
+    * textures. (Brian)
+    * NOTE: can't do that with texture tiling. (sroland)
+    */
+   tex.offset = offset;
+   tex.image = &tmp;
+   /* copy (x,y,width,height,data) */
+   memcpy( &tmp, &t->image[face][hwlevel], sizeof(drm_radeon_tex_image_t) );
+
+   if (texImage->TexFormat->TexelBytes) {
+      /* use multi-byte upload scheme */
+      tex.height = imageHeight;
+      tex.width = imageWidth;
+      tex.format = t->pp_txformat & RADEON_TXFORMAT_FORMAT_MASK;
+      tex.pitch = MAX2((texImage->Width * texImage->TexFormat->TexelBytes) / 64, 1);
+      tex.offset += tmp.x & ~1023;
+      tmp.x = tmp.x % 1024;
+      if (t->tile_bits & RADEON_TXO_MICRO_TILE_X2) {
+	 /* need something like "tiled coordinates" ? */
+	 tmp.y = tmp.x / (tex.pitch * 128) * 2;
+	 tmp.x = tmp.x % (tex.pitch * 128) / 2 / texImage->TexFormat->TexelBytes;
+	 tex.pitch |= RADEON_DST_TILE_MICRO >> 22;
+      }
+      else {
+	 tmp.x = tmp.x >> (texImage->TexFormat->TexelBytes >> 1);
+      }
+      if ((t->tile_bits & RADEON_TXO_MACRO_TILE) &&
+	 (texImage->Width * texImage->TexFormat->TexelBytes >= 256)) {
+	 /* radeon switches off macro tiling for small textures/mipmaps it seems */
+	 tex.pitch |= RADEON_DST_TILE_MACRO >> 22;
+      }
+   }
+   else {
+      /* In case of for instance 8x8 texture (2x2 dxt blocks), padding after the first two blocks is
+         needed (only with dxt1 since 2 dxt3/dxt5 blocks already use 32 Byte). */
+      /* set tex.height to 1/4 since 1 "macropixel" (dxt-block) has 4 real pixels. Needed
+         so the kernel module reads the right amount of data. */
+      tex.format = RADEON_TXFORMAT_I8; /* any 1-byte texel format */
+      tex.pitch = (BLIT_WIDTH_BYTES / 64);
+      tex.height = (imageHeight + 3) / 4;
+      tex.width = (imageWidth + 3) / 4;
+      switch (t->pp_txformat & RADEON_TXFORMAT_FORMAT_MASK) {
+      case RADEON_TXFORMAT_DXT1:
+         tex.width *= 8;
+         break;
+      case RADEON_TXFORMAT_DXT23:
+      case RADEON_TXFORMAT_DXT45:
+         tex.width *= 16;
+         break;
+      }
+   }
+
+   LOCK_HARDWARE( rmesa );
+   do {
+      ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_TEXTURE,
+                                 &tex, sizeof(drm_radeon_texture_t) );
+   } while ( ret == -EAGAIN );	/* retried for as long as -EAGAIN comes back */
+   UNLOCK_HARDWARE( rmesa );
+
+   if ( ret ) {
+      fprintf( stderr, "DRM_RADEON_TEXTURE: return = %d\n", ret );
+      fprintf( stderr, "   offset=0x%08x\n",
+	       offset );
+      fprintf( stderr, "   image width=%d height=%d\n",
+	       imageWidth, imageHeight );
+      fprintf( stderr, "    blit width=%d height=%d data=%p\n",
+	       t->image[face][hwlevel].width, t->image[face][hwlevel].height,
+	       t->image[face][hwlevel].data );
+      exit( 1 );	/* any other ioctl failure terminates the process */
+   }
+}
+
+
+/**
+ * Upload the texture images associated with texture \a t.  This might
+ * require the allocation of texture memory.
+ * 
+ * \param rmesa Context pointer
+ * \param t Texture to be uploaded
+ * \param face Cube map face to be uploaded.  Zero for non-cube maps.
+ * \return 0 on success or when there is nothing to upload, -1 when no
+ *         texture memory could be allocated.
+ */
+int radeonUploadTexImages( radeonContextPtr rmesa, radeonTexObjPtr t, GLuint face )
+{
+   int numLevels;
+
+   if ( !t || t->base.totalSize == 0 )
+      return 0;
+
+   if ( RADEON_DEBUG & (DEBUG_TEXTURE|DEBUG_IOCTL) ) {
+      fprintf( stderr, "%s( %p, %p ) sz=%d lvls=%d-%d\n", __FUNCTION__,
+	       (void *)rmesa->glCtx, (void *)t->base.tObj, t->base.totalSize,
+	       t->base.firstLevel, t->base.lastLevel );
+   }
+
+   numLevels = t->base.lastLevel - t->base.firstLevel + 1;
+
+   if (RADEON_DEBUG & DEBUG_SYNC) {
+      fprintf(stderr, "%s: Syncing\n", __FUNCTION__ );
+      radeonFinish( rmesa->glCtx );
+   }
+
+   LOCK_HARDWARE( rmesa );
+
+   if ( t->base.memBlock == NULL ) {
+      const struct gl_texture_image *level0 = t->base.tObj->Image[0][0];
+      int heap;
+
+      heap = driAllocateTexture( rmesa->texture_heaps, rmesa->nr_heaps,
+				 (driTextureObject *) t );
+      if ( heap == -1 ) {
+	 UNLOCK_HARDWARE( rmesa );
+	 return -1;
+      }
+
+      /* Set the base offset of the texture image */
+      t->bufAddr = rmesa->radeonScreen->texOffset[heap] 
+	   + t->base.memBlock->ofs;
+      t->pp_txoffset = t->bufAddr;
+
+      /* Level 0 need not exist (base level above 0), so test the pointer
+       * first; tiling is only left out for an existing client-data image.
+       * hope it's safe to add that here... */
+      if ( !level0 || !level0->IsClientData ) {
+	 t->pp_txoffset |= t->tile_bits;
+      }
+
+      /* Mark this texobj as dirty on all units */
+      t->dirty_state = TEX_ALL;
+   }
+
+   /* Let the world know we've used this memory recently. */
+   driUpdateTextureLRU( (driTextureObject *) t );
+   UNLOCK_HARDWARE( rmesa );
+
+   /* Upload any images that are new */
+   if (t->base.dirty_images[face]) {
+      int i;
+      for ( i = 0 ; i < numLevels ; i++ ) {
+         if ( (t->base.dirty_images[face] & (1 << (i+t->base.firstLevel))) != 0 ) {
+            uploadSubImage( rmesa, t, i, 0, 0, t->image[face][i].width,
+			    t->image[face][i].height, face );
+         }
+      }
+      t->base.dirty_images[face] = 0;
+   }
+
+   if (RADEON_DEBUG & DEBUG_SYNC) {
+      fprintf(stderr, "%s: Syncing\n", __FUNCTION__ );
+      radeonFinish( rmesa->glCtx );
+   }
+
+   return 0;
+}
diff --git a/radeon/radeon_texstate.c b/radeon/radeon_texstate.c
new file mode 100644
index 0000000..37bb749
--- /dev/null
+++ b/radeon/radeon_texstate.c
@@ -0,0 +1,1334 @@
+/* $XFree86: xc/lib/GL/mesa/src/drv/radeon/radeon_texstate.c,v 1.6 2002/12/16 16:18:59 dawes Exp $ */
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Kevin E. Martin <martin@valinux.com>
+ *   Gareth Hughes <gareth@valinux.com>
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "colormac.h"
+#include "context.h"
+#include "macros.h"
+#include "texformat.h"
+#include "enums.h"
+
+#include "radeon_context.h"
+#include "radeon_state.h"
+#include "radeon_ioctl.h"
+#include "radeon_swtcl.h"
+#include "radeon_tex.h"
+#include "radeon_tcl.h"
+
+
+#define RADEON_TXFORMAT_A8        RADEON_TXFORMAT_I8	/* aliases: give each Mesa format name a RADEON_TXFORMAT_<name> for the token pasting below */
+#define RADEON_TXFORMAT_L8        RADEON_TXFORMAT_I8
+#define RADEON_TXFORMAT_AL88      RADEON_TXFORMAT_AI88
+#define RADEON_TXFORMAT_YCBCR     RADEON_TXFORMAT_YVYU422
+#define RADEON_TXFORMAT_YCBCR_REV RADEON_TXFORMAT_VYUY422
+#define RADEON_TXFORMAT_RGB_DXT1  RADEON_TXFORMAT_DXT1
+#define RADEON_TXFORMAT_RGBA_DXT1 RADEON_TXFORMAT_DXT1
+#define RADEON_TXFORMAT_RGBA_DXT3 RADEON_TXFORMAT_DXT23
+#define RADEON_TXFORMAT_RGBA_DXT5 RADEON_TXFORMAT_DXT45
+/* tx_table initializers: [MESA_FORMAT_x] = { bits OR'ed into pp_txformat, bits OR'ed into pp_txfilter } */
+#define _COLOR(f) \
+    [ MESA_FORMAT_ ## f ] = { RADEON_TXFORMAT_ ## f, 0 }
+#define _COLOR_REV(f) \
+    [ MESA_FORMAT_ ## f ## _REV ] = { RADEON_TXFORMAT_ ## f, 0 }
+#define _ALPHA(f) \
+    [ MESA_FORMAT_ ## f ] = { RADEON_TXFORMAT_ ## f | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }
+#define _ALPHA_REV(f) \
+    [ MESA_FORMAT_ ## f ## _REV ] = { RADEON_TXFORMAT_ ## f | RADEON_TXFORMAT_ALPHA_IN_MAP, 0 }
+#define _YUV(f) \
+   [ MESA_FORMAT_ ## f ] = { RADEON_TXFORMAT_ ## f, RADEON_YUV_TO_RGB }
+#define _INVALID(f) \
+    [ MESA_FORMAT_ ## f ] = { 0xffffffff, 0 }
+#define VALID_FORMAT(f) ( ((f) <= MESA_FORMAT_RGBA_DXT5) \
+			     && (tx_table[f].format != 0xffffffff) )	/* inside the table and not marked _INVALID */
+
+static const struct {
+   GLuint format, filter;
+}
+tx_table[] =	/* NOTE(review): an index with no initializer below is zero-filled and would pass VALID_FORMAT - confirm the list covers every format <= RGBA_DXT5 */
+{
+   _ALPHA(RGBA8888),
+   _ALPHA_REV(RGBA8888),
+   _ALPHA(ARGB8888),
+   _ALPHA_REV(ARGB8888),
+   _INVALID(RGB888),
+   _COLOR(RGB565),
+   _COLOR_REV(RGB565),
+   _ALPHA(ARGB4444),
+   _ALPHA_REV(ARGB4444),
+   _ALPHA(ARGB1555),
+   _ALPHA_REV(ARGB1555),
+   _ALPHA(AL88),
+   _ALPHA_REV(AL88),
+   _ALPHA(A8),
+   _COLOR(L8),
+   _ALPHA(I8),
+   _INVALID(CI8),
+   _YUV(YCBCR),
+   _YUV(YCBCR_REV),
+   _INVALID(RGB_FXT1),
+   _INVALID(RGBA_FXT1),
+   _COLOR(RGB_DXT1),
+   _ALPHA(RGBA_DXT1),
+   _ALPHA(RGBA_DXT3),
+   _ALPHA(RGBA_DXT5),
+};
+
+#undef _COLOR
+#undef _ALPHA
+#undef _INVALID	/* NOTE(review): _COLOR_REV, _ALPHA_REV and _YUV are not #undef'd and stay defined for the rest of the file */
+
+/**
+ * Compute the storage (\c t->base.totalSize, covering all uploaded mipmap
+ * levels and all cube faces) needed for the given texture object, the
+ * \c image[face][level].x/y/width/height rectangles used for upload/blitting,
+ * the tiling bits and the \c pp_txfilter / \c pp_txformat / \c pp_txsize /
+ * \c pp_txpitch register values.  Returns early if the format is unsupported.
+ * 
+ * \param rmesa Context pointer (only \c texmicrotile is read here)
+ * \param tObj GL texture object whose images are to be posted to
+ *                 hardware state; its DriverData is used as a radeonTexObj.
+ */
+static void radeonSetTexImages( radeonContextPtr rmesa,
+				struct gl_texture_object *tObj )
+{
+   radeonTexObjPtr t = (radeonTexObjPtr)tObj->DriverData;
+   const struct gl_texture_image *baseImage = tObj->Image[0][tObj->BaseLevel];
+   GLint curOffset, blitWidth;
+   GLint i, texelBytes;
+   GLint numLevels;
+   GLint log2Width, log2Height, log2Depth;
+
+   /* Set the hardware texture format: clear the old format bits first,
+    * then OR in the tx_table entry for the base image's Mesa format. */
+
+   t->pp_txformat &= ~(RADEON_TXFORMAT_FORMAT_MASK |
+		       RADEON_TXFORMAT_ALPHA_IN_MAP);
+   t->pp_txfilter &= ~RADEON_YUV_TO_RGB;
+
+   if ( VALID_FORMAT( baseImage->TexFormat->MesaFormat ) ) {
+      t->pp_txformat |= tx_table[ baseImage->TexFormat->MesaFormat ].format;
+      t->pp_txfilter |= tx_table[ baseImage->TexFormat->MesaFormat ].filter;
+   }
+   else {
+      _mesa_problem(NULL, "unexpected texture format in %s", __FUNCTION__);
+      return;
+   }
+
+   texelBytes = baseImage->TexFormat->TexelBytes; /* tested against 0 below to pick the byte-based layout path */
+
+   /* Compute which mipmap levels we really want to send to the hardware.
+    */
+
+   if (tObj->Target != GL_TEXTURE_CUBE_MAP)
+      driCalculateTextureFirstLastLevel( (driTextureObject *) t );
+   else {
+      /* r100 can't handle mipmaps for cube/3d textures, so don't waste
+         memory for them */
+      t->base.firstLevel = t->base.lastLevel = tObj->BaseLevel;
+   }
+   log2Width  = tObj->Image[0][t->base.firstLevel]->WidthLog2;
+   log2Height = tObj->Image[0][t->base.firstLevel]->HeightLog2;
+   log2Depth  = tObj->Image[0][t->base.firstLevel]->DepthLog2; /* not read again in this function */
+
+   numLevels = t->base.lastLevel - t->base.firstLevel + 1;
+
+   assert(numLevels <= RADEON_MAX_TEXTURE_LEVELS);
+
+   /* Calculate mipmap offsets and dimensions for blitting (uploading)
+    * The idea is that we lay out the mipmap levels within a block of
+    * memory organized as a rectangle of width BLIT_WIDTH_BYTES.
+    */
+   curOffset = 0;
+   blitWidth = BLIT_WIDTH_BYTES;
+   t->tile_bits = 0;
+
+   /* figure out if this texture is suitable for tiling. */
+   if (texelBytes && (tObj->Target != GL_TEXTURE_RECTANGLE_NV)) {
+      if (rmesa->texmicrotile && (baseImage->Height > 1)) {
+	 /* allow 32 (bytes) x 1 mip (which will use two times the space
+	    the non-tiled version would use) max if base texture is large enough */
+	 if ((numLevels == 1) ||
+	   (((baseImage->Width * texelBytes / baseImage->Height) <= 32) &&
+	       (baseImage->Width * texelBytes > 64)) ||
+	    ((baseImage->Width * texelBytes / baseImage->Height) <= 16)) {
+	    /* R100 has two microtile bits (only the txoffset reg, not the blitter)
+	       weird: X2 + OPT: 32bit correct, 16bit completely hosed
+		      X2: 32bit correct, 16bit correct
+		      OPT: 32bit large mips correct, small mips hosed, 16bit completely hosed */
+	    t->tile_bits |= RADEON_TXO_MICRO_TILE_X2 /*| RADEON_TXO_MICRO_TILE_OPT*/;
+	 }
+      }
+      if ((baseImage->Width * texelBytes >= 256) && (baseImage->Height >= 16)) {
+	 /* R100 disables macro tiling only if mip width is smaller than 256 bytes, and not
+	    in the case if height is smaller than 16 (not 100% sure), as does the r200,
+	    so need to disable macro tiling in that case */
+	 if ((numLevels == 1) || ((baseImage->Width * texelBytes / baseImage->Height) <= 4)) {
+	    t->tile_bits |= RADEON_TXO_MACRO_TILE;
+	 }
+      }
+   }
+
+   for (i = 0; i < numLevels; i++) {
+      const struct gl_texture_image *texImage;
+      GLuint size;
+
+      texImage = tObj->Image[0][i + t->base.firstLevel];
+      if ( !texImage )
+	 break; /* NOTE(review): numLevels is not reduced, so the mip count programmed below still includes the missing levels - confirm */
+
+      /* find image size in bytes */
+      if (texImage->IsCompressed) {
+      /* need to calculate the size AFTER padding even though the texture is
+         submitted without padding.
+         Only handle power-of-two (pot) textures currently - don't know if npot is
+         even possible, size calculation would certainly need (trivial) adjustments.
+         Align (and later pad) to 32byte, not sure what that 64byte blit width is
+         good for? */
+         if ((t->pp_txformat & RADEON_TXFORMAT_FORMAT_MASK) == RADEON_TXFORMAT_DXT1) {
+            /* RGB_DXT1/RGBA_DXT1, 8 bytes per block */
+            if ((texImage->Width + 3) < 8) /* width one block */
+               size = texImage->CompressedSize * 4;
+            else if ((texImage->Width + 3) < 16)
+               size = texImage->CompressedSize * 2;
+            else size = texImage->CompressedSize;
+         }
+         else /* DXT3/5, 16 bytes per block */
+            if ((texImage->Width + 3) < 8)
+               size = texImage->CompressedSize * 2;
+            else size = texImage->CompressedSize;
+      }
+      else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+	 size = ((texImage->Width * texelBytes + 63) & ~63) * texImage->Height; /* rows padded to 64 bytes */
+      }
+      else if (t->tile_bits & RADEON_TXO_MICRO_TILE_X2) {
+	 /* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
+	    though the actual offset may be different (if texture is less than
+	    32 bytes width) to the untiled case */
+	 int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
+	 size = (w * ((texImage->Height + 1) / 2)) * texImage->Depth;
+	 blitWidth = MAX2(texImage->Width, 64 / texelBytes);
+      }
+      else {
+	 int w = (texImage->Width * texelBytes + 31) & ~31; /* rows padded to 32 bytes */
+	 size = w * texImage->Height * texImage->Depth;
+	 blitWidth = MAX2(texImage->Width, 64 / texelBytes);
+      }
+      assert(size > 0);
+
+      /* Align to 32-byte offset.  It is faster to do this unconditionally
+       * (no branch penalty).
+       */
+
+      curOffset = (curOffset + 0x1f) & ~0x1f;
+
+      if (texelBytes) {
+	 t->image[0][i].x = curOffset; /* fix x and y coords up later together with offset */
+	 t->image[0][i].y = 0;
+	 t->image[0][i].width = MIN2(size / texelBytes, blitWidth);
+	 t->image[0][i].height = (size / texelBytes) / t->image[0][i].width;
+      }
+      else {
+         t->image[0][i].x = curOffset % BLIT_WIDTH_BYTES;
+         t->image[0][i].y = curOffset / BLIT_WIDTH_BYTES;
+         t->image[0][i].width  = MIN2(size, BLIT_WIDTH_BYTES);
+         t->image[0][i].height = size / t->image[0][i].width;     
+      }
+
+#if 0
+      /* for debugging only and only applicable to non-rectangle targets */
+      assert(size % t->image[0][i].width == 0);
+      assert(t->image[0][i].x == 0
+             || (size < BLIT_WIDTH_BYTES && t->image[0][i].height == 1));
+#endif
+
+      if (0)
+         fprintf(stderr,
+                 "level %d: %dx%d x=%d y=%d w=%d h=%d size=%d at %d\n",
+                 i, texImage->Width, texImage->Height,
+                 t->image[0][i].x, t->image[0][i].y,
+                 t->image[0][i].width, t->image[0][i].height, size, curOffset);
+
+      curOffset += size;
+
+   }
+
+   /* Align the total size of texture memory block.
+    */
+   t->base.totalSize = (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
+
+   /* Setup remaining cube face blits, if needed */
+   if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
+      const GLuint faceSize = t->base.totalSize;
+      GLuint face;
+      /* reuse face 0 x/y/width/height - just update the offset when uploading */
+      for (face = 1; face < 6; face++) {
+         for (i = 0; i < numLevels; i++) {
+            t->image[face][i].x =  t->image[0][i].x;
+            t->image[face][i].y =  t->image[0][i].y;
+            t->image[face][i].width  = t->image[0][i].width;
+            t->image[face][i].height = t->image[0][i].height;
+         }
+      }
+      t->base.totalSize = 6 * faceSize; /* total texmem needed */
+   }
+
+   /* Hardware state:
+    */
+   t->pp_txfilter &= ~RADEON_MAX_MIP_LEVEL_MASK;
+   t->pp_txfilter |= (numLevels - 1) << RADEON_MAX_MIP_LEVEL_SHIFT;
+
+   t->pp_txformat &= ~(RADEON_TXFORMAT_WIDTH_MASK |
+		       RADEON_TXFORMAT_HEIGHT_MASK |
+                       RADEON_TXFORMAT_CUBIC_MAP_ENABLE |
+                       RADEON_TXFORMAT_F5_WIDTH_MASK |
+                       RADEON_TXFORMAT_F5_HEIGHT_MASK);
+   t->pp_txformat |= ((log2Width << RADEON_TXFORMAT_WIDTH_SHIFT) |
+		      (log2Height << RADEON_TXFORMAT_HEIGHT_SHIFT));
+
+   if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
+      assert(log2Width == log2Height);
+      t->pp_txformat |= ((log2Width << RADEON_TXFORMAT_F5_WIDTH_SHIFT) |
+                         (log2Height << RADEON_TXFORMAT_F5_HEIGHT_SHIFT) |
+                         (RADEON_TXFORMAT_CUBIC_MAP_ENABLE));
+      t->pp_cubic_faces = ((log2Width << RADEON_FACE_WIDTH_1_SHIFT) |
+                           (log2Height << RADEON_FACE_HEIGHT_1_SHIFT) |
+                           (log2Width << RADEON_FACE_WIDTH_2_SHIFT) |
+                           (log2Height << RADEON_FACE_HEIGHT_2_SHIFT) |
+                           (log2Width << RADEON_FACE_WIDTH_3_SHIFT) |
+                           (log2Height << RADEON_FACE_HEIGHT_3_SHIFT) |
+                           (log2Width << RADEON_FACE_WIDTH_4_SHIFT) |
+                           (log2Height << RADEON_FACE_HEIGHT_4_SHIFT));
+   }
+
+   t->pp_txsize = (((tObj->Image[0][t->base.firstLevel]->Width - 1) << 0) |
+                   ((tObj->Image[0][t->base.firstLevel]->Height - 1) << 16));
+
+   /* Only need to round to nearest 32 for textures, but the blitter
+    * requires 64-byte aligned pitches, and we may/may not need the
+    * blitter.   NPOT only!
+    */
+   if (baseImage->IsCompressed)
+      t->pp_txpitch = (tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
+   else
+      t->pp_txpitch = ((tObj->Image[0][t->base.firstLevel]->Width * texelBytes) + 63) & ~(63);
+   t->pp_txpitch -= 32; /* NOTE(review): presumably the register stores the pitch minus 32 - confirm against the hardware docs */
+
+   t->dirty_state = TEX_ALL; /* every unit must re-import this object's state */
+
+   /* FYI: radeonUploadTexImages( rmesa, t ); used to be called here */
+}
+
+
+
+/* ================================================================
+ * Texture combine functions
+ */
+
+/* GL_ARB_texture_env_combine support
+ */
+
+/* The color tables hold slot-A argument encodings indexed by operand
+ * (OperandRGB - GL_SRC_COLOR): GL_SRC_COLOR, GL_ONE_MINUS_SRC_COLOR,
+ * GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA.  The texture table adds a unit index. */
+static GLuint radeon_texture_color[][RADEON_MAX_TEXTURE_UNITS] =
+{
+   {
+      RADEON_COLOR_ARG_A_T0_COLOR,
+      RADEON_COLOR_ARG_A_T1_COLOR,
+      RADEON_COLOR_ARG_A_T2_COLOR
+   },
+   {
+      RADEON_COLOR_ARG_A_T0_COLOR | RADEON_COMP_ARG_A,
+      RADEON_COLOR_ARG_A_T1_COLOR | RADEON_COMP_ARG_A,
+      RADEON_COLOR_ARG_A_T2_COLOR | RADEON_COMP_ARG_A
+   },
+   {
+      RADEON_COLOR_ARG_A_T0_ALPHA,
+      RADEON_COLOR_ARG_A_T1_ALPHA,
+      RADEON_COLOR_ARG_A_T2_ALPHA
+   },
+   {
+      RADEON_COLOR_ARG_A_T0_ALPHA | RADEON_COMP_ARG_A,
+      RADEON_COLOR_ARG_A_T1_ALPHA | RADEON_COMP_ARG_A,
+      RADEON_COLOR_ARG_A_T2_ALPHA | RADEON_COMP_ARG_A
+   },
+};
+
+static GLuint radeon_tfactor_color[] =
+{
+   RADEON_COLOR_ARG_A_TFACTOR_COLOR,
+   RADEON_COLOR_ARG_A_TFACTOR_COLOR | RADEON_COMP_ARG_A,
+   RADEON_COLOR_ARG_A_TFACTOR_ALPHA,
+   RADEON_COLOR_ARG_A_TFACTOR_ALPHA | RADEON_COMP_ARG_A
+};
+
+static GLuint radeon_primary_color[] =
+{
+   RADEON_COLOR_ARG_A_DIFFUSE_COLOR,
+   RADEON_COLOR_ARG_A_DIFFUSE_COLOR | RADEON_COMP_ARG_A,
+   RADEON_COLOR_ARG_A_DIFFUSE_ALPHA,
+   RADEON_COLOR_ARG_A_DIFFUSE_ALPHA | RADEON_COMP_ARG_A
+};
+
+static GLuint radeon_previous_color[] =
+{
+   RADEON_COLOR_ARG_A_CURRENT_COLOR,
+   RADEON_COLOR_ARG_A_CURRENT_COLOR | RADEON_COMP_ARG_A,
+   RADEON_COLOR_ARG_A_CURRENT_ALPHA,
+   RADEON_COLOR_ARG_A_CURRENT_ALPHA | RADEON_COMP_ARG_A
+};
+
+/* GL_ZERO table - indices 0-3
+ * GL_ONE  table - indices 1-4 (looked up at op+1, i.e. with the complement bit flipped)
+ */
+static GLuint radeon_zero_color[] =
+{
+   RADEON_COLOR_ARG_A_ZERO,
+   RADEON_COLOR_ARG_A_ZERO | RADEON_COMP_ARG_A,
+   RADEON_COLOR_ARG_A_ZERO,
+   RADEON_COLOR_ARG_A_ZERO | RADEON_COMP_ARG_A,
+   RADEON_COLOR_ARG_A_ZERO
+};
+
+
+/* The alpha tables only have GL_SRC_ALPHA and GL_ONE_MINUS_SRC_ALPHA,
+ * indexed by (OperandA - GL_SRC_ALPHA); the texture table adds a unit index. */
+static GLuint radeon_texture_alpha[][RADEON_MAX_TEXTURE_UNITS] =
+{
+   {
+      RADEON_ALPHA_ARG_A_T0_ALPHA,
+      RADEON_ALPHA_ARG_A_T1_ALPHA,
+      RADEON_ALPHA_ARG_A_T2_ALPHA
+   },
+   {
+      RADEON_ALPHA_ARG_A_T0_ALPHA | RADEON_COMP_ARG_A,
+      RADEON_ALPHA_ARG_A_T1_ALPHA | RADEON_COMP_ARG_A,
+      RADEON_ALPHA_ARG_A_T2_ALPHA | RADEON_COMP_ARG_A
+   },
+};
+
+static GLuint radeon_tfactor_alpha[] =
+{
+   RADEON_ALPHA_ARG_A_TFACTOR_ALPHA,
+   RADEON_ALPHA_ARG_A_TFACTOR_ALPHA | RADEON_COMP_ARG_A
+};
+
+static GLuint radeon_primary_alpha[] =
+{
+   RADEON_ALPHA_ARG_A_DIFFUSE_ALPHA,
+   RADEON_ALPHA_ARG_A_DIFFUSE_ALPHA | RADEON_COMP_ARG_A
+};
+
+static GLuint radeon_previous_alpha[] =
+{
+   RADEON_ALPHA_ARG_A_CURRENT_ALPHA,
+   RADEON_ALPHA_ARG_A_CURRENT_ALPHA | RADEON_COMP_ARG_A
+};
+
+/* GL_ZERO table - indices 0-1
+ * GL_ONE  table - indices 1-2 (looked up at op+1, i.e. with the complement bit flipped)
+ */
+static GLuint radeon_zero_alpha[] =
+{
+   RADEON_ALPHA_ARG_A_ZERO,
+   RADEON_ALPHA_ARG_A_ZERO | RADEON_COMP_ARG_A,
+   RADEON_ALPHA_ARG_A_ZERO
+};
+
+
+/* Extract the arg from slot A, shift it into the correct argument slot
+ * (arg is A, B or C) and set the corresponding complement bit.  Expects
+ * color_arg[]/color_combine (resp. alpha_arg[]/alpha_combine) in scope. */
+#define RADEON_COLOR_ARG( n, arg )			\
+do {							\
+   color_combine |=					\
+      ((color_arg[n] & RADEON_COLOR_ARG_MASK)		\
+       << RADEON_COLOR_ARG_##arg##_SHIFT);		\
+   color_combine |=					\
+      ((color_arg[n] >> RADEON_COMP_ARG_SHIFT)		\
+       << RADEON_COMP_ARG_##arg##_SHIFT);		\
+} while (0)
+
+#define RADEON_ALPHA_ARG( n, arg )			\
+do {							\
+   alpha_combine |=					\
+      ((alpha_arg[n] & RADEON_ALPHA_ARG_MASK)		\
+       << RADEON_ALPHA_ARG_##arg##_SHIFT);		\
+   alpha_combine |=					\
+      ((alpha_arg[n] >> RADEON_COMP_ARG_SHIFT)		\
+       << RADEON_COMP_ARG_##arg##_SHIFT);		\
+} while (0)
+
+
+/* ================================================================
+ * Texture unit state management
+ */
+
+/**
+ * Compute the PP_TXCBLEND/PP_TXABLEND combiner words for texture unit
+ * \p unit from its current combine state and store them in
+ * rmesa->hw.tex[unit] if they changed.
+ *
+ * \return GL_TRUE on success, GL_FALSE if a combine source, mode or DOT3
+ *         scale is not handled here.
+ */
+static GLboolean radeonUpdateTextureEnv( GLcontext *ctx, int unit )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+   GLuint color_combine, alpha_combine;
+   const GLuint color_combine0 = RADEON_COLOR_ARG_A_ZERO | RADEON_COLOR_ARG_B_ZERO
+         | RADEON_COLOR_ARG_C_CURRENT_COLOR | RADEON_BLEND_CTL_ADD
+         | RADEON_SCALE_1X | RADEON_CLAMP_TX;
+   const GLuint alpha_combine0 = RADEON_ALPHA_ARG_A_ZERO | RADEON_ALPHA_ARG_B_ZERO
+         | RADEON_ALPHA_ARG_C_CURRENT_ALPHA | RADEON_BLEND_CTL_ADD
+         | RADEON_SCALE_1X | RADEON_CLAMP_TX;
+
+   /* texUnit->_Current can be NULL if and only if the texture unit is
+    * not actually enabled.
+    */
+   assert( (texUnit->_ReallyEnabled == 0)
+	   || (texUnit->_Current != NULL) );
+
+   if ( RADEON_DEBUG & DEBUG_TEXTURE ) {
+      fprintf( stderr, "%s( %p, %d )\n", __FUNCTION__, (void *)ctx, unit );
+   }
+
+   /* Set the texture environment state.  Isn't this nice and clean?
+    * The chip will automagically set the texture alpha to 0xff when
+    * the texture format does not include an alpha component. This
+    * reduces the amount of special-casing we have to do, alpha-only
+    * textures being a notable exception. Doesn't work for luminance
+    * textures realized with I8 and ALPHA_IN_MAP not set neither (on r100).
+    */
+   /* Don't cache these results. */
+   rmesa->state.texture.unit[unit].format = 0;
+   rmesa->state.texture.unit[unit].envMode = 0;
+
+   if ( !texUnit->_ReallyEnabled ) {
+      color_combine = color_combine0;
+      alpha_combine = alpha_combine0;
+   }
+   else {
+      GLuint color_arg[3], alpha_arg[3];
+      GLuint i;
+      const GLuint numColorArgs = texUnit->_CurrentCombine->_NumArgsRGB;
+      const GLuint numAlphaArgs = texUnit->_CurrentCombine->_NumArgsA;
+      GLuint RGBshift = texUnit->_CurrentCombine->ScaleShiftRGB;
+      GLuint Ashift = texUnit->_CurrentCombine->ScaleShiftA;
+
+      /* Step 1: extract the color and alpha combine function arguments. */
+      for ( i = 0 ; i < numColorArgs ; i++ ) {
+	 const GLint op = texUnit->_CurrentCombine->OperandRGB[i] - GL_SRC_COLOR;
+	 const GLuint srcRGBi = texUnit->_CurrentCombine->SourceRGB[i];
+	 assert(op >= 0);
+	 assert(op <= 3);
+	 switch ( srcRGBi ) {
+	 case GL_TEXTURE:
+	    if (texUnit->_Current->Image[0][0]->_BaseFormat == GL_ALPHA)
+	       color_arg[i] = radeon_zero_color[op];
+	    else
+	       color_arg[i] = radeon_texture_color[op][unit];
+	    break;
+	 case GL_CONSTANT:
+	    color_arg[i] = radeon_tfactor_color[op];
+	    break;
+	 case GL_PRIMARY_COLOR:
+	    color_arg[i] = radeon_primary_color[op];
+	    break;
+	 case GL_PREVIOUS:
+	    color_arg[i] = radeon_previous_color[op];
+	    break;
+	 case GL_ZERO:
+	    color_arg[i] = radeon_zero_color[op];
+	    break;
+	 case GL_ONE:
+	    color_arg[i] = radeon_zero_color[op+1];
+	    break;
+	 case GL_TEXTURE0:
+	 case GL_TEXTURE1:
+	 case GL_TEXTURE2: {
+	    const GLuint txunit = srcRGBi - GL_TEXTURE0;
+	    const struct gl_texture_object *srcObj = ctx->Texture.Unit[txunit]._Current;
+	    /* implement ogl 1.4/1.5 core spec here, not specification of
+	     * GL_ARB_texture_env_crossbar (which would require disabling blending
+	     * instead of undefined results when referencing not enabled texunit).
+	     * A unit that is not enabled has a NULL _Current: don't dereference it. */
+	    if (srcObj && srcObj->Image[0][0]->_BaseFormat == GL_ALPHA)
+	       color_arg[i] = radeon_zero_color[op];
+	    else
+	       color_arg[i] = radeon_texture_color[op][txunit];
+	    }
+	    break;
+	 default:
+	    return GL_FALSE;
+	 }
+      }
+
+      for ( i = 0 ; i < numAlphaArgs ; i++ ) {
+	 const GLint op = texUnit->_CurrentCombine->OperandA[i] - GL_SRC_ALPHA;
+	 const GLuint srcAi = texUnit->_CurrentCombine->SourceA[i];
+	 assert(op >= 0);
+	 assert(op <= 1);
+	 switch ( srcAi ) {
+	 case GL_TEXTURE:
+	    if (texUnit->_Current->Image[0][0]->_BaseFormat == GL_LUMINANCE)
+	       alpha_arg[i] = radeon_zero_alpha[op+1];
+	    else
+	       alpha_arg[i] = radeon_texture_alpha[op][unit];
+	    break;
+	 case GL_CONSTANT:
+	    alpha_arg[i] = radeon_tfactor_alpha[op];
+	    break;
+	 case GL_PRIMARY_COLOR:
+	    alpha_arg[i] = radeon_primary_alpha[op];
+	    break;
+	 case GL_PREVIOUS:
+	    alpha_arg[i] = radeon_previous_alpha[op];
+	    break;
+	 case GL_ZERO:
+	    alpha_arg[i] = radeon_zero_alpha[op];
+	    break;
+	 case GL_ONE:
+	    alpha_arg[i] = radeon_zero_alpha[op+1];
+	    break;
+	 case GL_TEXTURE0:
+	 case GL_TEXTURE1:
+	 case GL_TEXTURE2: {
+	    const GLuint txunit = srcAi - GL_TEXTURE0;
+	    const struct gl_texture_object *srcObj = ctx->Texture.Unit[txunit]._Current;
+	    /* A unit that is not enabled has a NULL _Current: don't dereference it. */
+	    if (srcObj && srcObj->Image[0][0]->_BaseFormat == GL_LUMINANCE)
+	       alpha_arg[i] = radeon_zero_alpha[op+1];
+	    else
+	       alpha_arg[i] = radeon_texture_alpha[op][txunit];
+	    }
+	    break;
+	 default:
+	    return GL_FALSE;
+	 }
+      }
+
+      /* Step 2: build up the color and alpha combine functions. */
+      switch ( texUnit->_CurrentCombine->ModeRGB ) {
+      case GL_REPLACE:
+	 color_combine = (RADEON_COLOR_ARG_A_ZERO |
+			  RADEON_COLOR_ARG_B_ZERO |
+			  RADEON_BLEND_CTL_ADD |
+			  RADEON_CLAMP_TX);
+	 RADEON_COLOR_ARG( 0, C );
+	 break;
+      case GL_MODULATE:
+	 color_combine = (RADEON_COLOR_ARG_C_ZERO |
+			  RADEON_BLEND_CTL_ADD |
+			  RADEON_CLAMP_TX);
+	 RADEON_COLOR_ARG( 0, A );
+	 RADEON_COLOR_ARG( 1, B );
+	 break;
+      case GL_ADD:
+	 color_combine = (RADEON_COLOR_ARG_B_ZERO |
+			  RADEON_COMP_ARG_B |
+			  RADEON_BLEND_CTL_ADD |
+			  RADEON_CLAMP_TX);
+	 RADEON_COLOR_ARG( 0, A );
+	 RADEON_COLOR_ARG( 1, C );
+	 break;
+      case GL_ADD_SIGNED:
+	 color_combine = (RADEON_COLOR_ARG_B_ZERO |
+			  RADEON_COMP_ARG_B |
+			  RADEON_BLEND_CTL_ADDSIGNED |
+			  RADEON_CLAMP_TX);
+	 RADEON_COLOR_ARG( 0, A );
+	 RADEON_COLOR_ARG( 1, C );
+	 break;
+      case GL_SUBTRACT:
+	 color_combine = (RADEON_COLOR_ARG_B_ZERO |
+			  RADEON_COMP_ARG_B |
+			  RADEON_BLEND_CTL_SUBTRACT |
+			  RADEON_CLAMP_TX);
+	 RADEON_COLOR_ARG( 0, A );
+	 RADEON_COLOR_ARG( 1, C );
+	 break;
+      case GL_INTERPOLATE:
+	 color_combine = (RADEON_BLEND_CTL_BLEND |
+			  RADEON_CLAMP_TX);
+	 RADEON_COLOR_ARG( 0, B );
+	 RADEON_COLOR_ARG( 1, A );
+	 RADEON_COLOR_ARG( 2, C );
+	 break;
+
+      case GL_DOT3_RGB_EXT:
+      case GL_DOT3_RGBA_EXT:
+	 /* The EXT version of the DOT3 extension does not support the
+	  * scale factor, but the ARB version (and the version in OpenGL
+	  * 1.3) does.
+	  */
+	 RGBshift = 0;
+	 /* FALLTHROUGH */
+
+      case GL_DOT3_RGB:
+      case GL_DOT3_RGBA:
+	 /* The R100 / RV200 only support a 1X multiplier in hardware
+	  * w/the ARB version.
+	  */
+	 if ( RGBshift != (RADEON_SCALE_1X >> RADEON_SCALE_SHIFT) ) {
+	    return GL_FALSE;
+	 }
+
+	 RGBshift += 2;
+	 if ( (texUnit->_CurrentCombine->ModeRGB == GL_DOT3_RGBA_EXT)
+	    || (texUnit->_CurrentCombine->ModeRGB == GL_DOT3_RGBA) ) {
+            /* is it necessary to set this or will it be ignored anyway? */
+	    Ashift = RGBshift;
+	 }
+
+	 color_combine = (RADEON_COLOR_ARG_C_ZERO |
+			  RADEON_BLEND_CTL_DOT3 |
+			  RADEON_CLAMP_TX);
+	 RADEON_COLOR_ARG( 0, A );
+	 RADEON_COLOR_ARG( 1, B );
+	 break;
+
+      case GL_MODULATE_ADD_ATI:
+	 color_combine = (RADEON_BLEND_CTL_ADD |
+			  RADEON_CLAMP_TX);
+	 RADEON_COLOR_ARG( 0, A );
+	 RADEON_COLOR_ARG( 1, C );
+	 RADEON_COLOR_ARG( 2, B );
+	 break;
+      case GL_MODULATE_SIGNED_ADD_ATI:
+	 color_combine = (RADEON_BLEND_CTL_ADDSIGNED |
+			  RADEON_CLAMP_TX);
+	 RADEON_COLOR_ARG( 0, A );
+	 RADEON_COLOR_ARG( 1, C );
+	 RADEON_COLOR_ARG( 2, B );
+	 break;
+      case GL_MODULATE_SUBTRACT_ATI:
+	 color_combine = (RADEON_BLEND_CTL_SUBTRACT |
+			  RADEON_CLAMP_TX);
+	 RADEON_COLOR_ARG( 0, A );
+	 RADEON_COLOR_ARG( 1, C );
+	 RADEON_COLOR_ARG( 2, B );
+	 break;
+      default:
+	 return GL_FALSE;
+      }
+
+      switch ( texUnit->_CurrentCombine->ModeA ) {
+      case GL_REPLACE:
+	 alpha_combine = (RADEON_ALPHA_ARG_A_ZERO |
+			  RADEON_ALPHA_ARG_B_ZERO |
+			  RADEON_BLEND_CTL_ADD |
+			  RADEON_CLAMP_TX);
+	 RADEON_ALPHA_ARG( 0, C );
+	 break;
+      case GL_MODULATE:
+	 alpha_combine = (RADEON_ALPHA_ARG_C_ZERO |
+			  RADEON_BLEND_CTL_ADD |
+			  RADEON_CLAMP_TX);
+	 RADEON_ALPHA_ARG( 0, A );
+	 RADEON_ALPHA_ARG( 1, B );
+	 break;
+      case GL_ADD:
+	 alpha_combine = (RADEON_ALPHA_ARG_B_ZERO |
+			  RADEON_COMP_ARG_B |
+			  RADEON_BLEND_CTL_ADD |
+			  RADEON_CLAMP_TX);
+	 RADEON_ALPHA_ARG( 0, A );
+	 RADEON_ALPHA_ARG( 1, C );
+	 break;
+      case GL_ADD_SIGNED:
+	 alpha_combine = (RADEON_ALPHA_ARG_B_ZERO |
+			  RADEON_COMP_ARG_B |
+			  RADEON_BLEND_CTL_ADDSIGNED |
+			  RADEON_CLAMP_TX);
+	 RADEON_ALPHA_ARG( 0, A );
+	 RADEON_ALPHA_ARG( 1, C );
+	 break;
+      case GL_SUBTRACT:
+	 alpha_combine = (RADEON_ALPHA_ARG_B_ZERO |
+			  RADEON_COMP_ARG_B |
+			  RADEON_BLEND_CTL_SUBTRACT |
+			  RADEON_CLAMP_TX);
+	 RADEON_ALPHA_ARG( 0, A );
+	 RADEON_ALPHA_ARG( 1, C );
+	 break;
+      case GL_INTERPOLATE:
+	 alpha_combine = (RADEON_BLEND_CTL_BLEND |
+			  RADEON_CLAMP_TX);
+	 RADEON_ALPHA_ARG( 0, B );
+	 RADEON_ALPHA_ARG( 1, A );
+	 RADEON_ALPHA_ARG( 2, C );
+	 break;
+
+      case GL_MODULATE_ADD_ATI:
+	 alpha_combine = (RADEON_BLEND_CTL_ADD |
+			  RADEON_CLAMP_TX);
+	 RADEON_ALPHA_ARG( 0, A );
+	 RADEON_ALPHA_ARG( 1, C );
+	 RADEON_ALPHA_ARG( 2, B );
+	 break;
+      case GL_MODULATE_SIGNED_ADD_ATI:
+	 alpha_combine = (RADEON_BLEND_CTL_ADDSIGNED |
+			  RADEON_CLAMP_TX);
+	 RADEON_ALPHA_ARG( 0, A );
+	 RADEON_ALPHA_ARG( 1, C );
+	 RADEON_ALPHA_ARG( 2, B );
+	 break;
+      case GL_MODULATE_SUBTRACT_ATI:
+	 alpha_combine = (RADEON_BLEND_CTL_SUBTRACT |
+			  RADEON_CLAMP_TX);
+	 RADEON_ALPHA_ARG( 0, A );
+	 RADEON_ALPHA_ARG( 1, C );
+	 RADEON_ALPHA_ARG( 2, B );
+	 break;
+      default:
+	 return GL_FALSE;
+      }
+
+      if ( (texUnit->_CurrentCombine->ModeRGB == GL_DOT3_RGB_EXT)
+	   || (texUnit->_CurrentCombine->ModeRGB == GL_DOT3_RGB) ) {
+	 alpha_combine |= RADEON_DOT_ALPHA_DONT_REPLICATE;
+      }
+
+      /* Step 3: apply the scale factor. */
+      color_combine |= (RGBshift << RADEON_SCALE_SHIFT);
+      alpha_combine |= (Ashift   << RADEON_SCALE_SHIFT);
+   }
+
+   if ( rmesa->hw.tex[unit].cmd[TEX_PP_TXCBLEND] != color_combine ||
+	rmesa->hw.tex[unit].cmd[TEX_PP_TXABLEND] != alpha_combine ) {
+      RADEON_STATECHANGE( rmesa, tex[unit] );
+      rmesa->hw.tex[unit].cmd[TEX_PP_TXCBLEND] = color_combine;
+      rmesa->hw.tex[unit].cmd[TEX_PP_TXABLEND] = alpha_combine;
+   }
+
+   return GL_TRUE;
+}
+
+#define TEXOBJ_TXFILTER_MASK (RADEON_MAX_MIP_LEVEL_MASK |	\
+			      RADEON_MIN_FILTER_MASK | 		\
+			      RADEON_MAG_FILTER_MASK |		\
+			      RADEON_MAX_ANISO_MASK |		\
+			      RADEON_YUV_TO_RGB |		\
+			      RADEON_YUV_TEMPERATURE_MASK |	\
+			      RADEON_CLAMP_S_MASK | 		\
+			      RADEON_CLAMP_T_MASK | 		\
+			      RADEON_BORDER_MODE_D3D )	/* PP_TXFILTER bits copied from the texture object */
+/* PP_TXFORMAT bits copied from the texture object; import_tex_obj_state() keeps every other bit of both registers. */
+#define TEXOBJ_TXFORMAT_MASK (RADEON_TXFORMAT_WIDTH_MASK |	\
+			      RADEON_TXFORMAT_HEIGHT_MASK |	\
+			      RADEON_TXFORMAT_FORMAT_MASK |	\
+                              RADEON_TXFORMAT_F5_WIDTH_MASK |	\
+                              RADEON_TXFORMAT_F5_HEIGHT_MASK |	\
+			      RADEON_TXFORMAT_ALPHA_IN_MAP |	\
+			      RADEON_TXFORMAT_CUBIC_MAP_ENABLE |	\
+                              RADEON_TXFORMAT_NON_POWER2)
+
+
+/* Copy the texture object's cached register values into the hardware state
+ * atoms of texture unit \p unit, then clear the object's dirty bit for it. */
+static void import_tex_obj_state( radeonContextPtr rmesa,
+				  int unit,
+				  radeonTexObjPtr texobj )
+{
+   /* do not use RADEON_DB_STATE to avoid stale texture caches */
+   int *tex_cmd = &rmesa->hw.tex[unit].cmd[TEX_CMD_0];
+   const GLenum target = texobj->base.tObj->Target;
+   GLuint coord_fmt = rmesa->hw.set.cmd[SET_SE_COORDFMT];
+
+   RADEON_STATECHANGE( rmesa, tex[unit] );
+
+   /* Replace only the register bits that belong to the texture object. */
+   tex_cmd[TEX_PP_TXFILTER] = (tex_cmd[TEX_PP_TXFILTER] & ~TEXOBJ_TXFILTER_MASK)
+			      | (texobj->pp_txfilter & TEXOBJ_TXFILTER_MASK);
+   tex_cmd[TEX_PP_TXFORMAT] = (tex_cmd[TEX_PP_TXFORMAT] & ~TEXOBJ_TXFORMAT_MASK)
+			      | (texobj->pp_txformat & TEXOBJ_TXFORMAT_MASK);
+   tex_cmd[TEX_PP_TXOFFSET] = texobj->pp_txoffset;
+   tex_cmd[TEX_PP_BORDER_COLOR] = texobj->pp_border_color;
+
+   if (target == GL_TEXTURE_RECTANGLE_NV) {
+      GLuint *rect_cmd = RADEON_DB_STATE( txr[unit] );
+      rect_cmd[TXR_PP_TEX_SIZE] = texobj->pp_txsize; /* NPOT only! */
+      rect_cmd[TXR_PP_TEX_PITCH] = texobj->pp_txpitch; /* NPOT only! */
+      RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.txr[unit] );
+      coord_fmt |= RADEON_VTX_ST0_NONPARAMETRIC << unit;
+   }
+   else
+      coord_fmt &= ~(RADEON_VTX_ST0_NONPARAMETRIC << unit);
+
+   if (target == GL_TEXTURE_CUBE_MAP) {
+      int *faces_cmd = &rmesa->hw.cube[unit].cmd[CUBE_CMD_0];
+      const GLuint stride = texobj->base.totalSize / 6; /* bytes per face */
+      ASSERT(texobj->base.totalSize % 6 == 0);
+      RADEON_STATECHANGE( rmesa, cube[unit] );
+      faces_cmd[CUBE_PP_CUBIC_FACES] = texobj->pp_cubic_faces;
+      /* dont know if this setup conforms to OpenGL.. at least it matches the behavior of mesa software renderer */
+      faces_cmd[CUBE_PP_CUBIC_OFFSET_0] = texobj->pp_txoffset;              /* right */
+      faces_cmd[CUBE_PP_CUBIC_OFFSET_1] = texobj->pp_txoffset + 1 * stride; /* left */
+      faces_cmd[CUBE_PP_CUBIC_OFFSET_2] = texobj->pp_txoffset + 2 * stride; /* top */
+      faces_cmd[CUBE_PP_CUBIC_OFFSET_3] = texobj->pp_txoffset + 3 * stride; /* bottom */
+      faces_cmd[CUBE_PP_CUBIC_OFFSET_4] = texobj->pp_txoffset + 4 * stride; /* front */
+      tex_cmd[TEX_PP_TXOFFSET] = texobj->pp_txoffset + 5 * stride;          /* back */
+   }
+
+   if (coord_fmt != rmesa->hw.set.cmd[SET_SE_COORDFMT]) {
+      RADEON_STATECHANGE( rmesa, set );
+      rmesa->hw.set.cmd[SET_SE_COORDFMT] = coord_fmt;
+   }
+
+   texobj->dirty_state &= ~(1<<unit);
+}
+
+
+
+
+/* Store the s/t/r/q texgen planes in TexGenMatrix[unit] (plane k fills
+ * elements m[k], m[k+4], m[k+8] and m[k+12]), OR
+ * RADEON_TEXMAT_0_ENABLE << unit into TexGenEnabled and set
+ * _NEW_TEXTURE_MATRIX in NewGLState.
+ */
+static void set_texgen_matrix( radeonContextPtr rmesa, 
+			       GLuint unit,
+			       const GLfloat *s_plane,
+			       const GLfloat *t_plane,
+			       const GLfloat *r_plane,
+			       const GLfloat *q_plane )
+{
+   const GLfloat *plane[4];
+   GLuint row, col;
+
+   plane[0] = s_plane;
+   plane[1] = t_plane;
+   plane[2] = r_plane;
+   plane[3] = q_plane;
+
+   /* element (row, col) of the matrix lives at m[4 * col + row] */
+   for (row = 0; row < 4; row++) {
+      for (col = 0; col < 4; col++) {
+	 rmesa->TexGenMatrix[unit].m[4 * col + row] = plane[row][col];
+      }
+   }
+
+   rmesa->TexGenEnabled |= RADEON_TEXMAT_0_ENABLE << unit;
+   rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
+}
+
+/* Derive the texgen state of a texture unit (rmesa->TexGenEnabled, the
+ * texgen matrix) from GL state.  Returns GL_FALSE if fallback required. */
+static GLboolean radeon_validate_texgen( GLcontext *ctx, GLuint unit )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+   GLuint inputshift = RADEON_TEXGEN_0_INPUT_SHIFT + unit*4;
+   GLuint tmp = rmesa->TexGenEnabled;
+   static const GLfloat reflect[16] = {
+      -1,  0,  0,  0,
+       0, -1,  0,  0,
+       0,  0,  -1, 0,
+       0,  0,  0,  1 };
+   /* Clear this unit's texgen bits; the code below sets them again as needed. */
+   rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_TEXMAT_0_ENABLE << unit);
+   rmesa->TexGenEnabled &= ~(RADEON_TEXMAT_0_ENABLE << unit);
+   rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_INPUT_MASK << inputshift);
+   rmesa->TexGenNeedNormals[unit] = 0;
+
+   if ((texUnit->TexGenEnabled & (S_BIT|T_BIT|R_BIT|Q_BIT)) == 0) {
+      /* Disabled, no fallback: the unit's input is its own texcoord set.
+       */
+      rmesa->TexGenEnabled |=
+	 (RADEON_TEXGEN_INPUT_TEXCOORD_0 + unit) << inputshift;
+      return GL_TRUE;
+   }
+   /* the r100 cannot do texgen for some coords and not for others
+    * we do not detect such cases (certainly can't do it here) and just
+    * ASSUME that when S and T are texgen enabled we do not need other
+    * non-texgen enabled coords, no matter if the R and Q bits are texgen
+    * enabled. Still check for mixed mode texgen for all coords.
+    */
+   else if ( (texUnit->TexGenEnabled & S_BIT) &&
+	     (texUnit->TexGenEnabled & T_BIT) &&
+	     (texUnit->GenModeS == texUnit->GenModeT) ) {
+      if ( ((texUnit->TexGenEnabled & R_BIT) &&
+	    (texUnit->GenModeS != texUnit->GenModeR)) ||
+	   ((texUnit->TexGenEnabled & Q_BIT) &&
+	    (texUnit->GenModeS != texUnit->GenModeQ)) ) {
+	 /* Mixed modes, fallback:
+	  */
+	 if (RADEON_DEBUG & DEBUG_FALLBACKS)
+	    fprintf(stderr, "fallback mixed texgen\n");
+	 return GL_FALSE;
+      }
+      rmesa->TexGenEnabled |= RADEON_TEXGEN_TEXMAT_0_ENABLE << unit;
+   }
+   else {
+   /* some texgen mode not including both S and T bits */
+      if (RADEON_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "fallback mixed texgen/nontexgen\n");
+      return GL_FALSE;
+   }
+
+   if ((texUnit->TexGenEnabled & (R_BIT | Q_BIT)) != 0) {
+      /* need this here for vtxfmt presumably. Argh we need to set
+         this from way too many places, would be much easier if we could leave
+         tcl q coord always enabled (as on r200) */
+      RADEON_STATECHANGE( rmesa, tcl );
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_Q_BIT(unit);
+   }
+
+   switch (texUnit->GenModeS) {
+   case GL_OBJECT_LINEAR:
+      rmesa->TexGenEnabled |= RADEON_TEXGEN_INPUT_OBJ << inputshift;
+      set_texgen_matrix( rmesa, unit,
+			 texUnit->ObjectPlaneS,
+			 texUnit->ObjectPlaneT,
+			 texUnit->ObjectPlaneR,
+			 texUnit->ObjectPlaneQ);
+      break;
+
+   case GL_EYE_LINEAR:
+      rmesa->TexGenEnabled |= RADEON_TEXGEN_INPUT_EYE << inputshift;
+      set_texgen_matrix( rmesa, unit,
+			 texUnit->EyePlaneS,
+			 texUnit->EyePlaneT,
+			 texUnit->EyePlaneR,
+			 texUnit->EyePlaneQ);
+      break;
+
+   case GL_REFLECTION_MAP_NV:
+      rmesa->TexGenNeedNormals[unit] = GL_TRUE;
+      rmesa->TexGenEnabled |= RADEON_TEXGEN_INPUT_EYE_REFLECT << inputshift;
+      /* TODO: unknown if this is needed/correct */
+      set_texgen_matrix( rmesa, unit, reflect, reflect + 4,
+			reflect + 8, reflect + 12 );
+      break;
+
+   case GL_NORMAL_MAP_NV:
+      rmesa->TexGenNeedNormals[unit] = GL_TRUE;
+      rmesa->TexGenEnabled |= RADEON_TEXGEN_INPUT_EYE_NORMAL << inputshift;
+      break;
+
+   case GL_SPHERE_MAP:
+      /* the mode which everyone uses :-( */
+   default:
+      /* Unsupported mode, fallback (the message below names GL_SPHERE_MAP
+       * even when another unsupported mode took the default branch): */
+      if (RADEON_DEBUG & DEBUG_FALLBACKS) 
+	 fprintf(stderr, "fallback GL_SPHERE_MAP\n");
+      return GL_FALSE;
+   }
+   /* Cases that did not call set_texgen_matrix() still need the flag set. */
+   if (tmp != rmesa->TexGenEnabled) {
+      rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
+   }
+
+   return GL_TRUE;
+}
+
+
+static void disable_tex( GLcontext *ctx, int unit )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   /* Nothing to do unless the unit is currently enabled in PP_CNTL. */
+   if (rmesa->hw.ctx.cmd[CTX_PP_CNTL] & (RADEON_TEX_0_ENABLE<<unit)) {
+      /* Texture unit disabled */
+      if ( rmesa->state.texture.unit[unit].texobj != NULL ) {
+	 /* The old texture is no longer bound to this texture unit.
+	  * Mark it as such.
+	  */
+
+	 rmesa->state.texture.unit[unit].texobj->base.bound &= ~(1UL << unit);
+	 rmesa->state.texture.unit[unit].texobj = NULL;
+      }
+      /* Clear the unit's texture and texture-blend enable bits. */
+      RADEON_STATECHANGE( rmesa, ctx );
+      rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= 
+	  ~((RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE) << unit);
+      /* Drop the unit's ST and Q bits from the TCL output vertex format. */
+      RADEON_STATECHANGE( rmesa, tcl );
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] &= ~(RADEON_ST_BIT(unit) |
+						RADEON_Q_BIT(unit));
+      /* Lift a texgen TCL fallback held for this unit; recheck texgen later. */
+      if (rmesa->TclFallback & (RADEON_TCL_FALLBACK_TEXGEN_0<<unit)) {
+	 TCL_FALLBACK( ctx, (RADEON_TCL_FALLBACK_TEXGEN_0<<unit), GL_FALSE);
+	 rmesa->recheck_texgen[unit] = GL_TRUE;
+      }
+
+      if (rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] & RADEON_TXFORMAT_CUBIC_MAP_ENABLE) {
+      /* this seems to be a genuine (r100 only?) hw bug. Need to remove the
+         cubic_map bit on unit 2 when the unit is disabled, otherwise every
+	 2nd (2d) mipmap on unit 0 will be broken (may not be needed for other
+	 units, better be safe than sorry though).*/
+	 RADEON_STATECHANGE( rmesa, tex[unit] );
+	 rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] &= ~RADEON_TXFORMAT_CUBIC_MAP_ENABLE;
+      }
+      /* Reset the unit's texgen bits so its input is its own texcoord set. */
+      {
+	 GLuint inputshift = RADEON_TEXGEN_0_INPUT_SHIFT + unit*4;
+	 GLuint tmp = rmesa->TexGenEnabled;
+
+	 rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_TEXMAT_0_ENABLE<<unit);
+	 rmesa->TexGenEnabled &= ~(RADEON_TEXMAT_0_ENABLE<<unit);
+	 rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_INPUT_MASK<<inputshift);
+	 rmesa->TexGenNeedNormals[unit] = 0;
+	 rmesa->TexGenEnabled |= 
+	     (RADEON_TEXGEN_INPUT_TEXCOORD_0+unit) << inputshift;
+	 /* Only flag a recheck / matrix update if the bits actually changed. */
+	 if (tmp != rmesa->TexGenEnabled) {
+	    rmesa->recheck_texgen[unit] = GL_TRUE;
+	    rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
+	 }
+      }
+   }
+}
+
+/* Prepare a 1D/2D texture for use on a unit: make sure it is not flagged
+ * as non-power-of-two and upload its images if any are dirty.
+ * Returns GL_FALSE if the texture has no memory block after the upload.
+ */
+static GLboolean enable_tex_2d( GLcontext *ctx, int unit )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
+   radeonTexObjPtr t = (radeonTexObjPtr) tObj->DriverData;
+
+   /* The NON_POWER2 flag is set: clear it and mark the images dirty. */
+   if (t->pp_txformat & RADEON_TXFORMAT_NON_POWER2) {
+      t->base.dirty_images[0] = ~0;
+      t->pp_txformat &= ~RADEON_TXFORMAT_NON_POWER2;
+   }
+
+   ASSERT(tObj->Target == GL_TEXTURE_2D || tObj->Target == GL_TEXTURE_1D);
+
+   if ( !t->base.dirty_images[0] )
+      return GL_TRUE;
+
+   RADEON_FIREVERTICES( rmesa );
+   radeonSetTexImages( rmesa, tObj );
+   radeonUploadTexImages( rmesa, (radeonTexObjPtr) tObj->DriverData, 0 );
+   return t->base.memBlock ? GL_TRUE : GL_FALSE;
+}
+
+/* Prepare a cube map for use on a unit: lay out texture memory once for
+ * all six faces, then upload whichever faces are dirty.
+ * Returns GL_FALSE when the texture has no memory block afterwards
+ * (texmem alloc failed), so that the s/w fallback gets used.
+ */
+static GLboolean enable_tex_cube( GLcontext *ctx, int unit )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
+   radeonTexObjPtr t = (radeonTexObjPtr) tObj->DriverData;
+   GLboolean any_dirty = GL_FALSE;
+   GLuint face;
+
+   /* The NON_POWER2 flag is set: clear it and mark every face dirty. */
+   if (t->pp_txformat & RADEON_TXFORMAT_NON_POWER2) {
+      t->pp_txformat &= ~RADEON_TXFORMAT_NON_POWER2;
+      for (face = 0; face < 6; face++)
+         t->base.dirty_images[face] = ~0;
+   }
+
+   ASSERT(tObj->Target == GL_TEXTURE_CUBE_MAP);
+
+   for (face = 0; face < 6; face++) {
+      if (t->base.dirty_images[face])
+         any_dirty = GL_TRUE;
+   }
+
+   if (any_dirty) {
+      /* flush, then layout memory space, once for all faces */
+      RADEON_FIREVERTICES( rmesa );
+      radeonSetTexImages( rmesa, tObj );
+   }
+
+   /* upload (per face) */
+   for (face = 0; face < 6; face++) {
+      if (t->base.dirty_images[face])
+         radeonUploadTexImages( rmesa, (radeonTexObjPtr) tObj->DriverData, face );
+   }
+
+   return t->base.memBlock ? GL_TRUE : GL_FALSE;
+}
+
+/* Prepare a rectangle texture for use on a unit: flag it non-power-of-two
+ * and upload its image if dirty.  Returns GL_FALSE if the upload failed.
+ */
+static GLboolean enable_tex_rect( GLcontext *ctx, int unit )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
+   radeonTexObjPtr t = (radeonTexObjPtr) tObj->DriverData;
+
+   if ((t->pp_txformat & RADEON_TXFORMAT_NON_POWER2) == 0) {
+      t->base.dirty_images[0] = ~0;	/* so the image is uploaded again */
+      t->pp_txformat |= RADEON_TXFORMAT_NON_POWER2;
+   }
+
+   ASSERT(tObj->Target == GL_TEXTURE_RECTANGLE_NV);
+
+   if ( !t->base.dirty_images[0] )
+      return GL_TRUE;
+   RADEON_FIREVERTICES( rmesa );
+   radeonSetTexImages( rmesa, tObj );
+   radeonUploadTexImages( rmesa, (radeonTexObjPtr) tObj->DriverData, 0 );
+   if ( t->base.memBlock /* || rmesa->prefer_gart_client_texturing  FIXME */ )
+      return GL_TRUE;
+   fprintf(stderr, "%s: upload failed\n", __FUNCTION__);
+   return GL_FALSE;
+}
+
+
+static GLboolean update_tex_common( GLcontext *ctx, int unit )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+   struct gl_texture_object *tObj = texUnit->_Current;
+   radeonTexObjPtr t = (radeonTexObjPtr) tObj->DriverData;
+   GLenum format;
+   /* Common per-unit setup; GL_FALSE makes the caller raise a texture fallback. */
+   /* Fallback if there's a texture border */
+   if ( tObj->Image[0][tObj->BaseLevel]->Border > 0 ) {
+      fprintf(stderr, "%s: border\n", __FUNCTION__);
+      return GL_FALSE;
+   }
+   /* yuv conversion only works in first unit */
+   if (unit != 0 && (t->pp_txfilter & RADEON_YUV_TO_RGB))
+      return GL_FALSE;
+
+   /* Update state if this is a different texture object to last
+    * time.
+    */
+   if ( rmesa->state.texture.unit[unit].texobj != t ) {
+      if ( rmesa->state.texture.unit[unit].texobj != NULL ) {
+	 /* The old texture is no longer bound to this texture unit.
+	  * Mark it as such.
+	  */
+
+	 rmesa->state.texture.unit[unit].texobj->base.bound &= 
+	     ~(1UL << unit);
+      }
+      /* Bind the new object to the unit and mark its state dirty for it. */
+      rmesa->state.texture.unit[unit].texobj = t;
+      t->base.bound |= (1UL << unit);
+      t->dirty_state |= 1<<unit;
+      driUpdateTextureLRU( (driTextureObject *) t ); /* XXX: should be locked! */
+   }
+
+
+   /* Newly enabled?
+    */
+   if ( !(rmesa->hw.ctx.cmd[CTX_PP_CNTL] & (RADEON_TEX_0_ENABLE<<unit))) {
+      RADEON_STATECHANGE( rmesa, ctx );
+      rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= 
+	  (RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE) << unit;
+
+      RADEON_STATECHANGE( rmesa, tcl );
+      /* Add the unit's ST bit to the TCL output vertex format. */
+      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_ST_BIT(unit);
+      /* Texgen has to be validated again for a newly enabled unit. */
+      rmesa->recheck_texgen[unit] = GL_TRUE;
+   }
+
+   if (t->dirty_state & (1<<unit)) {
+      import_tex_obj_state( rmesa, unit, t );
+      /* may need to update texture matrix (for texrect adjustments) */
+      rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
+   }
+
+   if (rmesa->recheck_texgen[unit]) {
+      GLboolean fallback = !radeon_validate_texgen( ctx, unit );
+      TCL_FALLBACK( ctx, (RADEON_TCL_FALLBACK_TEXGEN_0<<unit), fallback);
+      rmesa->recheck_texgen[unit] = 0;
+      rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
+   }
+   /* Redo the texture environment when the base format or env mode changed. */
+   format = tObj->Image[0][tObj->BaseLevel]->_BaseFormat;
+   if ( rmesa->state.texture.unit[unit].format != format ||
+	rmesa->state.texture.unit[unit].envMode != texUnit->EnvMode ) {
+      rmesa->state.texture.unit[unit].format = format;
+      rmesa->state.texture.unit[unit].envMode = texUnit->EnvMode;
+      if ( ! radeonUpdateTextureEnv( ctx, unit ) ) {
+	 return GL_FALSE;
+      }
+   }
+   /* The border-mode fallback follows the texture object's border_fallback. */
+   FALLBACK( rmesa, RADEON_FALLBACK_BORDER_MODE, t->border_fallback );
+   return !t->border_fallback;
+}
+
+
+
+/* Bring one texture unit's driver state in line with the GL state: enabled
+ * units are prepared according to their target and then run through
+ * update_tex_common(), units with nothing enabled are disabled.
+ */
+static GLboolean radeonUpdateTextureUnit( GLcontext *ctx, int unit )
+{
+   const GLuint enabled = ctx->Texture.Unit[unit]._ReallyEnabled;
+   GLboolean ready;
+
+   if ( !enabled ) {
+      disable_tex( ctx, unit );
+      return GL_TRUE;
+   }
+   if ( enabled & TEXTURE_RECT_BIT )
+      ready = enable_tex_rect( ctx, unit );
+   else if ( enabled & (TEXTURE_1D_BIT | TEXTURE_2D_BIT) )
+      ready = enable_tex_2d( ctx, unit );
+   else if ( enabled & TEXTURE_CUBE_BIT )
+      ready = enable_tex_cube( ctx, unit );
+   else
+      return GL_FALSE;	/* a target not handled here is enabled */
+
+   return (ready && update_tex_common( ctx, unit ));
+}
+
+/* Update the three texture units (stopping at the first failure). */
+void radeonUpdateTextureState( GLcontext *ctx )
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   GLboolean ok = GL_TRUE;
+   int unit;
+
+   for (unit = 0; ok && unit < 3; unit++)
+      ok = radeonUpdateTextureUnit( ctx, unit );
+
+   FALLBACK( rmesa, RADEON_FALLBACK_TEXTURE, !ok );
+   if (rmesa->TclFallback)
+      radeonChooseVertexState( ctx );
+}
diff --git a/radeon/server/radeon.h b/radeon/server/radeon.h
new file mode 100644
index 0000000..6f6c2e6
--- /dev/null
+++ b/radeon/server/radeon.h
@@ -0,0 +1,209 @@
+/**
+ * \file server/radeon.h
+ * \brief Radeon 2D driver data structures.
+ */
+
+/*
+ * Copyright 2000 ATI Technologies Inc., Markham, Ontario, and
+ *                VA Linux Systems Inc., Fremont, California.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation on the rights to use, copy, modify, merge,
+ * publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT.  IN NO EVENT SHALL ATI, VA LINUX SYSTEMS AND/OR
+ * THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* $XFree86: xc/programs/Xserver/hw/xfree86/drivers/ati/radeon.h,v 1.29 2002/10/12 01:38:07 martin Exp $ */
+
+#ifndef _RADEON_H_
+#define _RADEON_H_
+
+#include "xf86drm.h"		/* drm_handle_t, etc */
+
+#       define RADEON_AGP_1X_MODE           0x01
+#       define RADEON_AGP_2X_MODE           0x02
+#       define RADEON_AGP_4X_MODE           0x04
+#       define RADEON_AGP_FW_MODE           0x10
+#       define RADEON_AGP_MODE_MASK         0x17 /* 1X | 2X | 4X | FW */
+#define RADEON_CP_CSQ_CNTL                  0x0740
+#       define RADEON_CSQ_CNT_PRIMARY_MASK     (0xff << 0)
+#       define RADEON_CSQ_PRIDIS_INDDIS        (0    << 28)
+#       define RADEON_CSQ_PRIPIO_INDDIS        (1    << 28)
+#       define RADEON_CSQ_PRIBM_INDDIS         (2    << 28)
+#       define RADEON_CSQ_PRIPIO_INDBM         (3    << 28)
+#       define RADEON_CSQ_PRIBM_INDBM          (4    << 28)
+#       define RADEON_CSQ_PRIPIO_INDPIO        (15U  << 28) /* unsigned: 15 << 28 overflows int */
+
+#define RADEON_PCIGART_TABLE_SIZE       32768
+/* PCI device IDs, listed in ascending numeric order. */
+#define PCI_CHIP_R200_BB                0x4242
+#define PCI_CHIP_RV250_Id               0x4964
+#define PCI_CHIP_RV250_Ie               0x4965
+#define PCI_CHIP_RV250_If               0x4966
+#define PCI_CHIP_RV250_Ig               0x4967
+#define PCI_CHIP_RADEON_LW		0x4C57
+#define PCI_CHIP_RADEON_LX		0x4C58
+#define PCI_CHIP_RADEON_LY		0x4C59
+#define PCI_CHIP_RADEON_LZ		0x4C5A
+#define PCI_CHIP_RV250_Ld		0x4C64
+#define PCI_CHIP_RV250_Le		0x4C65
+#define PCI_CHIP_RV250_Lf		0x4C66
+#define PCI_CHIP_RV250_Lg		0x4C67
+#define PCI_CHIP_R300_ND		0x4E44
+#define PCI_CHIP_R300_NE		0x4E45
+#define PCI_CHIP_R300_NF		0x4E46
+#define PCI_CHIP_R300_NG		0x4E47
+#define PCI_CHIP_RADEON_QD		0x5144
+#define PCI_CHIP_RADEON_QE		0x5145
+#define PCI_CHIP_RADEON_QF		0x5146
+#define PCI_CHIP_RADEON_QG		0x5147
+#define PCI_CHIP_R200_QL		0x514C
+#define PCI_CHIP_R200_QN		0x514E
+#define PCI_CHIP_R200_QO		0x514F
+#define PCI_CHIP_RV200_QW		0x5157
+#define PCI_CHIP_RV200_QX		0x5158
+#define PCI_CHIP_RADEON_QY		0x5159
+#define PCI_CHIP_RADEON_QZ		0x515A
+#define PCI_CHIP_R200_Ql		0x516C
+#define PCI_CHIP_RV370_5460             0x5460
+#define PCI_CHIP_RV280_Y_		0x5960
+#define PCI_CHIP_RV280_Ya		0x5961
+#define PCI_CHIP_RV280_Yb		0x5962
+#define PCI_CHIP_RV280_Yc		0x5963
+
+/**
+ * \brief Chip families. The order matters: code compares families with '<'.
+ */
+typedef enum {
+    CHIP_FAMILY_UNKNOW,
+    CHIP_FAMILY_LEGACY,
+    CHIP_FAMILY_R128,
+    CHIP_FAMILY_M3,
+    CHIP_FAMILY_RADEON,
+    CHIP_FAMILY_VE,
+    CHIP_FAMILY_M6,
+    CHIP_FAMILY_RV200,
+    CHIP_FAMILY_M7,
+    CHIP_FAMILY_R200,
+    CHIP_FAMILY_RV250,
+    CHIP_FAMILY_M9,
+    CHIP_FAMILY_RV280,
+    CHIP_FAMILY_R300,
+    CHIP_FAMILY_R350,
+    CHIP_FAMILY_RV350,
+    CHIP_FAMILY_RV380,  /* RV370/RV380/M22/M24 */
+    CHIP_FAMILY_R420,   /* R420/R423/M18 */
+} RADEONChipFamily;
+
+
+typedef unsigned long memType;
+
+
+/**
+ * \brief Radeon DDX driver private data.
+ */
+typedef struct {
+   int               Chipset;          /**< \brief Chipset number */
+   RADEONChipFamily  ChipFamily;       /**< \brief Chip family */
+
+   unsigned long     LinearAddr;       /**< \brief Frame buffer physical address */
+
+
+   drmSize           registerSize;     /**< \brief MMIO register map size */
+   drm_handle_t         registerHandle;   /**< \brief MMIO register map handle */
+
+   int               IsPCI;            /**< \brief Current card is a PCI card */
+   
+   /**
+    * \name AGP
+    */
+   /*@{*/
+   drmSize           gartSize;          /**< \brief AGP map size (in MB) */
+   drm_handle_t         gartMemHandle;     /**< \brief AGP map handle */
+   unsigned long     gartOffset;        /**< \brief AGP offset */
+   int               gartMode;          /**< \brief AGP mode */
+   int               gartFastWrite;
+   /*@}*/
+
+   /**
+    * \name CP ring buffer data
+    */
+   /*@{*/
+   unsigned long     ringStart;        /**< \brief Offset into AGP space */
+   drm_handle_t         ringHandle;       /**< \brief Handle from drmAddMap() */
+   drmSize           ringMapSize;      /**< \brief Size of map */
+   int               ringSize;         /**< \brief Size of ring (in MB) */
+
+   unsigned long     ringReadOffset;   /**< \brief Read offset into AGP space */
+   drm_handle_t         ringReadPtrHandle;/**< \brief Handle from drmAddMap() */
+   drmSize           ringReadMapSize;  /**< \brief Size of map */
+   /*@}*/
+
+   /**
+    * \name CP vertex/indirect buffer data
+    */
+   /*@{*/
+   unsigned long     bufStart;         /**< \brief Offset into AGP space */
+   drm_handle_t         bufHandle;        /**< \brief Handle from drmAddMap() */
+   drmSize           bufMapSize;       /**< \brief Size of map */
+   int               bufSize;          /**< \brief Size of buffers (in MB) */
+   int               bufNumBufs;       /**< \brief Number of buffers */
+   /*@}*/
+
+   /**
+    * \name CP AGP Texture data
+    */
+   /*@{*/
+   unsigned long     gartTexStart;      /**< \brief Offset into AGP space */
+   drm_handle_t         gartTexHandle;     /**< \brief Handle from drmAddMap() */
+   drmSize           gartTexMapSize;    /**< \brief Size of map */
+   int               gartTexSize;       /**< \brief Size of AGP tex space (in MB) */
+   int               log2GARTTexGran;   /**< \brief AGP texture granularity in base 2 log */
+   /*@}*/
+
+   int               drmMinor;         /**< \brief DRM device minor number */
+
+   int               frontOffset;      /**< \brief Front color buffer offset */
+   int               frontPitch;       /**< \brief Front color buffer pitch */
+   int               backOffset;       /**< \brief Back color buffer offset */
+   int               backPitch;        /**< \brief Back color buffer pitch */
+   int               depthOffset;      /**< \brief Depth buffer offset */
+   int               depthPitch;       /**< \brief Depth buffer pitch */
+   int               textureOffset;    /**< \brief Texture area offset */
+   int               textureSize;      /**< \brief Texture area size */
+   int               log2TexGran;      /**< \brief Texture granularity in base 2 log */
+
+   unsigned int      frontPitchOffset;
+   unsigned int      backPitchOffset;
+   unsigned int      depthPitchOffset;
+   
+   int               colorTiling;      /**< \brief Enable color tiling */
+
+   int               irq;              /**< \brief IRQ number */
+   int               page_flip_enable; /**< \brief Page Flip enable */
+   unsigned int      gen_int_cntl;     /**< \brief Value written to RADEON_GEN_INT_CNTL */
+   unsigned int      crtc_offset_cntl; /**< \brief Value written to RADEON_CRTC_OFFSET_CNTL */
+
+   unsigned long     pcieGartTableOffset;
+} RADEONInfoRec, *RADEONInfoPtr;
+
+
+#endif /* _RADEON_H_ */
diff --git a/radeon/server/radeon_dri.c b/radeon/server/radeon_dri.c
new file mode 100644
index 0000000..7ead588
--- /dev/null
+++ b/radeon/server/radeon_dri.c
@@ -0,0 +1,1337 @@
+/**
+ * \file server/radeon_dri.c
+ * \brief File to perform the device-specific initialization tasks typically
+ * done in the X server.
+ *
+ * Here they are converted to run in the client (or perhaps a standalone
+ * process), and to work with the frame buffer device rather than the X
+ * server infrastructure.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include "driver.h"
+#include "drm.h"
+#include "memops.h"
+
+#include "radeon.h"
+#include "radeon_dri.h"
+#include "radeon_macros.h"
+#include "radeon_reg.h"
+#include "drm_sarea.h"
+
+static size_t radeon_drm_page_size;	/* used to size the ring and ring-read-pointer maps */
+
+/* Issue DRM_RADEON_SETPARAM for one parameter; 0 on success, -1 on error. */
+static int RadeonSetParam(const DRIDriverContext *ctx, int param, int value)
+{
+   drm_radeon_setparam_t request;
+   int status;
+
+   memset(&request, 0, sizeof(request));
+   request.value = value;
+   request.param = param;
+
+   status = drmCommandWrite(ctx->drmFD, DRM_RADEON_SETPARAM,
+			    &request, sizeof(request));
+   return status ? -1 : 0;
+}
+
+/**
+ * \brief Wait until the command FIFO has room.
+ *
+ * \param ctx display handle.
+ * \param entries number of free FIFO entries required.
+ *
+ * Reads the free-entry count from RADEON_RBBM_STATUS up to 3000 times and
+ * returns as soon as it is at least \p entries.  If that never happens, the
+ * status is printed to stderr and the program exits.
+ */
+static void RADEONWaitForFifo( const DRIDriverContext *ctx,
+			       int entries )
+{
+   unsigned char *RADEONMMIO = ctx->MMIOAddress;
+   int attempts = 3000;
+
+   while (attempts-- > 0) {
+      int avail = INREG(RADEON_RBBM_STATUS) & RADEON_RBBM_FIFOCNT_MASK;
+      if (avail >= entries)
+	 return;
+   }
+
+   /* Recoveries are possible in principle, but have not been seen to work
+    * in practice, so report the state and give up. */
+   fprintf(stderr, "FIFO timed out: %d entries, stat=0x%08x\n",
+	   INREG(RADEON_RBBM_STATUS) & RADEON_RBBM_FIFOCNT_MASK,
+	   INREG(RADEON_RBBM_STATUS));
+   exit(1);
+}
+
+/**
+ * \brief Read a PLL register.
+ *
+ * \param ctx display handle.
+ * \param addr PLL register index; only the low six bits are used.
+ *
+ * \return the value read back from RADEON_CLOCK_CNTL_DATA after selecting
+ * the register through RADEON_CLOCK_CNTL_INDEX.
+ */
+static unsigned int RADEONINPLL( const DRIDriverContext *ctx, int addr)
+{
+    /* RADEONMMIO is presumably picked up by the register access macros. */
+    unsigned char *RADEONMMIO = ctx->MMIOAddress;
+    const int pll_index = addr & 0x3f;
+
+    OUTREG8(RADEON_CLOCK_CNTL_INDEX, pll_index);
+    return INREG(RADEON_CLOCK_CNTL_DATA);
+}
+
+/**
+ * \brief Reset graphics card to known state.
+ *
+ * \param ctx display handle.
+ *
+ * Flushes the 2D cache, soft-resets the engine blocks and the HDP, and
+ * restores the clock index, MCLK and soft-reset registers saved on entry. */
+static void RADEONEngineReset( const DRIDriverContext *ctx )
+{
+   unsigned char *RADEONMMIO = ctx->MMIOAddress;
+   unsigned int clock_cntl_index;
+   unsigned int mclk_cntl;
+   unsigned int rbbm_soft_reset;
+   unsigned int host_path_cntl;
+   int i;
+   /* Request a 2D destination cache flush, then poll (up to 512 reads) for idle. */
+   OUTREGP(RADEON_RB2D_DSTCACHE_CTLSTAT,
+	   RADEON_RB2D_DC_FLUSH_ALL,
+	   ~RADEON_RB2D_DC_FLUSH_ALL);
+   for (i = 0; i < 512; i++) {
+      if (!(INREG(RADEON_RB2D_DSTCACHE_CTLSTAT) & RADEON_RB2D_DC_BUSY))
+	 break;
+   }
+   /* Save the clock index and MCLK settings; both are written back at the end. */
+   clock_cntl_index = INREG(RADEON_CLOCK_CNTL_INDEX);
+
+   mclk_cntl = INPLL(ctx, RADEON_MCLK_CNTL);
+   OUTPLL(RADEON_MCLK_CNTL, (mclk_cntl |
+			     RADEON_FORCEON_MCLKA |
+			     RADEON_FORCEON_MCLKB |
+			     RADEON_FORCEON_YCLKA |
+			     RADEON_FORCEON_YCLKB |
+			     RADEON_FORCEON_MC |
+			     RADEON_FORCEON_AIC));
+
+   /* Soft resetting HDP thru RBBM_SOFT_RESET register can cause some
+    * unexpected behaviour on some machines.  Here we use
+    * RADEON_HOST_PATH_CNTL to reset it.
+    */
+   host_path_cntl = INREG(RADEON_HOST_PATH_CNTL);
+   rbbm_soft_reset = INREG(RADEON_RBBM_SOFT_RESET);
+   /* Set the soft-reset bits, read back, then clear them and read back again. */
+   OUTREG(RADEON_RBBM_SOFT_RESET, (rbbm_soft_reset |
+				   RADEON_SOFT_RESET_CP |
+				   RADEON_SOFT_RESET_HI |
+				   RADEON_SOFT_RESET_SE |
+				   RADEON_SOFT_RESET_RE |
+				   RADEON_SOFT_RESET_PP |
+				   RADEON_SOFT_RESET_E2 |
+				   RADEON_SOFT_RESET_RB));
+   INREG(RADEON_RBBM_SOFT_RESET);
+   OUTREG(RADEON_RBBM_SOFT_RESET, (rbbm_soft_reset & 
+				   (unsigned int) ~(RADEON_SOFT_RESET_CP |
+						    RADEON_SOFT_RESET_HI |
+						    RADEON_SOFT_RESET_SE |
+						    RADEON_SOFT_RESET_RE |
+						    RADEON_SOFT_RESET_PP |
+						    RADEON_SOFT_RESET_E2 |
+						    RADEON_SOFT_RESET_RB)));
+   INREG(RADEON_RBBM_SOFT_RESET);
+   /* Pulse the HDP soft-reset bit through RADEON_HOST_PATH_CNTL. */
+   OUTREG(RADEON_HOST_PATH_CNTL, host_path_cntl | RADEON_HDP_SOFT_RESET);
+   INREG(RADEON_HOST_PATH_CNTL);
+   OUTREG(RADEON_HOST_PATH_CNTL, host_path_cntl);
+
+   OUTREG(RADEON_RBBM_SOFT_RESET, rbbm_soft_reset);
+   /* Write back the values saved at the top of the function. */
+   OUTREG(RADEON_CLOCK_CNTL_INDEX, clock_cntl_index);
+   OUTPLL(RADEON_MCLK_CNTL, mclk_cntl);
+}
+
+/**
+ * \brief Restore the drawing engine.
+ *
+ * \param ctx display handle
+ * \return one on success; zero on an unsupported bpp or if the CP fails to start.
+ *
+ * Resets the graphics card and sets initial values for several registers of
+ * the card's drawing engine, then turns on the radeon command processor
+ * engine (i.e., the ringbuffer).
+ */
+static int RADEONEngineRestore( const DRIDriverContext *ctx )
+{
+   RADEONInfoPtr info = ctx->driverPrivate;
+   unsigned char *RADEONMMIO = ctx->MMIOAddress;
+   int pitch64, datatype, dp_gui_master_cntl, err;
+
+   fprintf(stderr, "%s\n", __FUNCTION__);
+
+   OUTREG(RADEON_RB3D_CNTL, 0);
+   RADEONEngineReset( ctx );
+   /* GMC destination datatype code for the framebuffer depth. */
+   switch (ctx->bpp) {
+   case 16: datatype = 4; break;
+   case 32: datatype = 6; break;
+   default: return 0;
+   }
+
+   dp_gui_master_cntl =
+      ((datatype << RADEON_GMC_DST_DATATYPE_SHIFT)
+       | RADEON_GMC_CLR_CMP_CNTL_DIS);
+   /* Framebuffer pitch in 64-byte units, rounded up. */
+   pitch64 = ((ctx->shared.virtualWidth * (ctx->bpp / 8) + 0x3f)) >> 6;
+
+   RADEONWaitForFifo(ctx, 1);
+   OUTREG(RADEON_DEFAULT_OFFSET, ((INREG(RADEON_DEFAULT_OFFSET) & 0xC0000000)
+				  | (pitch64 << 22)));
+
+   RADEONWaitForFifo(ctx, 1);
+   OUTREG(RADEON_SURFACE_CNTL, RADEON_SURF_TRANSLATION_DIS);
+
+   RADEONWaitForFifo(ctx, 1);
+   OUTREG(RADEON_DEFAULT_SC_BOTTOM_RIGHT, (RADEON_DEFAULT_SC_RIGHT_MAX
+					   | RADEON_DEFAULT_SC_BOTTOM_MAX));
+
+   RADEONWaitForFifo(ctx, 1);
+   OUTREG(RADEON_DP_GUI_MASTER_CNTL, (dp_gui_master_cntl
+				      | RADEON_GMC_BRUSH_SOLID_COLOR
+				      | RADEON_GMC_SRC_DATATYPE_COLOR));
+   /* Eight register writes follow, so wait for eight free FIFO entries. */
+   RADEONWaitForFifo(ctx, 8);
+   OUTREG(RADEON_DST_LINE_START,    0);
+   OUTREG(RADEON_DST_LINE_END,      0);
+   OUTREG(RADEON_DP_BRUSH_FRGD_CLR, 0xffffffff);
+   OUTREG(RADEON_DP_BRUSH_BKGD_CLR, 0);
+   OUTREG(RADEON_DP_SRC_FRGD_CLR,   0xffffffff);
+   OUTREG(RADEON_DP_SRC_BKGD_CLR,   0);
+   OUTREG(RADEON_DP_WRITE_MASK,     0xffffffff);
+   OUTREG(RADEON_AUX_SC_CNTL,       0);
+
+/*    RADEONWaitForIdleMMIO(ctx); */
+   usleep(100);
+
+   /* Restore the interrupt control and CRTC offset control registers. */
+   OUTREG(RADEON_GEN_INT_CNTL, info->gen_int_cntl);
+   if (info->colorTiling)
+	   info->crtc_offset_cntl |= RADEON_CRTC_TILE_EN;
+   OUTREG(RADEON_CRTC_OFFSET_CNTL, info->crtc_offset_cntl);
+
+   /* Initialize and start the CP if required */
+   if ((err = drmCommandNone(ctx->drmFD, DRM_RADEON_CP_START)) != 0) {
+      fprintf(stderr, "%s: CP start %d\n", __FUNCTION__, err);
+      return 0;
+   }
+
+   return 1;
+}
+
+
+/**
+ * \brief Shutdown the drawing engine.  Turns off the command processor
+ * engine & restores the graphics card to a state that fbdev understands.
+ *
+ * \param ctx display handle
+ *
+ * \return zero on success, or a negated errno value on failure.
+ */
+static int RADEONEngineShutdown( const DRIDriverContext *ctx )
+{
+   drm_radeon_cp_stop_t  stop;
+   int              ret, i;
+   /* First try: stop the CP with both the flush and idle flags set. */
+   stop.flush = 1;
+   stop.idle  = 1;
+
+   ret = drmCommandWrite(ctx->drmFD, DRM_RADEON_CP_STOP, &stop, 
+			 sizeof(drm_radeon_cp_stop_t));
+
+   if (ret == 0) {
+      return 0;
+   } else if (errno != EBUSY) {
+      return -errno;
+   }
+   /* Busy: retry with the flush flag cleared, at most 11 times. */
+   stop.flush = 0;
+ 
+   i = 0;
+   do {
+      ret = drmCommandWrite(ctx->drmFD, DRM_RADEON_CP_STOP, &stop, 
+			    sizeof(drm_radeon_cp_stop_t));
+   } while (ret && errno == EBUSY && i++ < 10);
+
+   if (ret == 0) {
+      return 0;
+   } else if (errno != EBUSY) {
+      return -errno;
+   }
+   /* Last resort: clear the idle flag as well and try once more. */
+   stop.idle = 0;
+
+   if (drmCommandWrite(ctx->drmFD, DRM_RADEON_CP_STOP,
+		       &stop, sizeof(drm_radeon_cp_stop_t))) {
+      return -errno;
+   } else {
+      return 0;
+   }
+}
+
+/**
+ * \brief Compute the minimum number of bits needed to represent a value.
+ *
+ * \param val value.
+ *
+ * \return number of significant bits in \p val (1 for zero or negative values).
+ */
+static int RADEONMinBits(int val)
+{
+   int  bits;
+
+   if (val <= 0) return 1;	/* a negative value may never shift down to zero */
+   for (bits = 0; val; val >>= 1, ++bits);
+   return bits;
+}
+
+/**
+ * \brief Initialize the AGP state
+ *
+ * \param ctx display handle.
+ * \param info driver private data.
+ *
+ * \return one on success, or zero on failure.
+ * 
+ * Acquires and enables the AGP device. Reserves memory in the AGP space for
+ * the ring buffer, vertex buffers and textures. Initialize the Radeon
+ * registers to point to that memory and add client mappings.
+ */
+static int RADEONDRIAgpInit( const DRIDriverContext *ctx, RADEONInfoPtr info)
+{
+   /* Not referenced by name below; presumably the INREG()/OUTREG() macros
+    * pick it up implicitly -- confirm against their definition.
+    */
+   unsigned char *RADEONMMIO = ctx->MMIOAddress;
+   unsigned long  mode;
+   int            ret;
+   int            s, l;
+
+   if (drmAgpAcquire(ctx->drmFD) < 0) {
+      fprintf(stderr, "[gart] AGP not available\n");
+      return 0;
+   }
+
+   /* Modify the mode if the default mode is not appropriate for this
+    * particular combination of graphics card and AGP chipset.
+    */
+   mode   = drmAgpGetMode(ctx->drmFD);	/* Default mode */
+
+   /* Disable fast write entirely - too many lockups.
+    */
+   mode &= ~RADEON_AGP_MODE_MASK;
+
+   /* NOTE(review): there is no `break` between the cases, so a higher
+    * requested rate also sets the bits of every lower rate.  The layout of
+    * the switch suggests this is intentional.
+    */
+   switch (ctx->agpmode) {
+   case 4:          mode |= RADEON_AGP_4X_MODE;   /* fall through */
+   case 2:          mode |= RADEON_AGP_2X_MODE;   /* fall through */
+   case 1: default: mode |= RADEON_AGP_1X_MODE;
+   }
+
+   if (drmAgpEnable(ctx->drmFD, mode) < 0) {
+      fprintf(stderr, "[gart] AGP not enabled\n");
+      drmAgpRelease(ctx->drmFD);
+      return 0;
+   }
+   else
+     fprintf(stderr, "[gart] AGP enabled at %dx\n", ctx->agpmode);
+
+   /* Workaround for some hardware bugs */
+   if (info->ChipFamily < CHIP_FAMILY_R200)
+      OUTREG(RADEON_AGP_CNTL, INREG(RADEON_AGP_CNTL) | 0x000e0000);
+
+   info->gartOffset = 0;
+
+   /* gartSize is in MB; allocate the whole aperture in one piece. */
+   if ((ret = drmAgpAlloc(ctx->drmFD, info->gartSize*1024*1024, 0, NULL,
+			  &info->gartMemHandle)) < 0) {
+      fprintf(stderr, "[gart] Out of memory (%d)\n", ret);
+      drmAgpRelease(ctx->drmFD);
+      return 0;
+   }
+   /* Handles are cast to unsigned long and printed with %lx everywhere so
+    * that the conversion always matches the argument, whatever the width
+    * of the handle type is.
+    */
+   fprintf(stderr,
+	   "[gart] %d kB allocated with handle 0x%08lx\n",
+	   info->gartSize*1024, (unsigned long)info->gartMemHandle);
+
+   if (drmAgpBind(ctx->drmFD,
+		  info->gartMemHandle, info->gartOffset) < 0) {
+      fprintf(stderr, "[gart] Could not bind\n");
+      drmAgpFree(ctx->drmFD, info->gartMemHandle);
+      drmAgpRelease(ctx->drmFD);
+      return 0;
+   }
+
+   /* The aperture is carved up back to back, starting at gartOffset:
+    *    ring buffer (+ one page) | ring read pointer page |
+    *    vertex/indirect buffers  | textures (whatever is left)
+    */
+
+   /* Initialize the CP ring buffer data */
+   info->ringStart       = info->gartOffset;
+   info->ringMapSize     = info->ringSize*1024*1024 + radeon_drm_page_size;
+
+   info->ringReadOffset  = info->ringStart + info->ringMapSize;
+   info->ringReadMapSize = radeon_drm_page_size;
+
+   /* Reserve space for vertex/indirect buffers */
+   info->bufStart        = info->ringReadOffset + info->ringReadMapSize;
+   info->bufMapSize      = info->bufSize*1024*1024;
+
+   /* Reserve the rest for AGP textures.  The granularity is derived from
+    * the remaining size split over RADEON_NR_TEX_REGIONS, but never below
+    * RADEON_LOG_TEX_GRANULARITY; the map size is then truncated to a whole
+    * number of granules.
+    */
+   info->gartTexStart     = info->bufStart + info->bufMapSize;
+   s = (info->gartSize*1024*1024 - info->gartTexStart);
+   l = RADEONMinBits((s-1) / RADEON_NR_TEX_REGIONS);
+   if (l < RADEON_LOG_TEX_GRANULARITY) l = RADEON_LOG_TEX_GRANULARITY;
+   info->gartTexMapSize   = (s >> l) << l;
+   info->log2GARTTexGran  = l;
+
+   /* NOTE(review): unlike the bind failure above, the drmAddMap() failure
+    * paths below return without unbinding/freeing the AGP memory or
+    * releasing the device.  Left as is because unwinding would also have
+    * to remove the maps that were already added.
+    */
+   if (drmAddMap(ctx->drmFD, info->ringStart, info->ringMapSize,
+		 DRM_AGP, DRM_READ_ONLY, &info->ringHandle) < 0) {
+      fprintf(stderr, "[gart] Could not add ring mapping\n");
+      return 0;
+   }
+   fprintf(stderr, "[gart] ring handle = 0x%08lx\n",
+	   (unsigned long)info->ringHandle);
+
+   if (drmAddMap(ctx->drmFD, info->ringReadOffset, info->ringReadMapSize,
+		 DRM_AGP, DRM_READ_ONLY, &info->ringReadPtrHandle) < 0) {
+      fprintf(stderr,
+	      "[gart] Could not add ring read ptr mapping\n");
+      return 0;
+   }
+   fprintf(stderr,
+	   "[gart] ring read ptr handle = 0x%08lx\n",
+	   (unsigned long)info->ringReadPtrHandle);
+
+   if (drmAddMap(ctx->drmFD, info->bufStart, info->bufMapSize,
+		 DRM_AGP, 0, &info->bufHandle) < 0) {
+      fprintf(stderr,
+	      "[gart] Could not add vertex/indirect buffers mapping\n");
+      return 0;
+   }
+   fprintf(stderr,
+	   "[gart] vertex/indirect buffers handle = 0x%08lx\n",
+	   (unsigned long)info->bufHandle);
+
+   if (drmAddMap(ctx->drmFD, info->gartTexStart, info->gartTexMapSize,
+		 DRM_AGP, 0, &info->gartTexHandle) < 0) {
+      fprintf(stderr,
+	      "[gart] Could not add AGP texture map mapping\n");
+      return 0;
+   }
+   fprintf(stderr,
+	   "[gart] AGP texture map handle = 0x%08lx\n",
+	   (unsigned long)info->gartTexHandle);
+
+   /* Initialize Radeon's AGP registers */
+   /* Ring buffer is at AGP offset 0 */
+   OUTREG(RADEON_AGP_BASE, info->ringHandle);
+
+   return 1;
+}
+
+/**
+ * \brief Initialize the PCI GART state.
+ *
+ * \param ctx display handle.
+ * \param info driver private data.
+ *
+ * \return one on success, or zero on failure.
+ *
+ * Requests scatter/gather memory for use in PCI space, lays out the ring
+ * buffer, ring read pointer page, vertex/indirect buffers and texture area
+ * inside it, and adds a DRM map for each of them.
+ */
+static int RADEONDRIPciInit(const DRIDriverContext *ctx, RADEONInfoPtr info)
+{
+   int  ret;
+   int  flags = DRM_READ_ONLY | DRM_LOCKED | DRM_KERNEL;
+   int  s, l;
+
+   /* gartSize is in MB. */
+   ret = drmScatterGatherAlloc(ctx->drmFD, info->gartSize*1024*1024,
+                               &info->gartMemHandle);
+   if (ret < 0) {
+      fprintf(stderr, "[pci] Out of memory (%d)\n", ret);
+      return 0;
+   }
+   /* Handles are cast to unsigned long and printed with %lx everywhere so
+    * that the conversion always matches the argument, whatever the width
+    * of the handle type is.
+    */
+   fprintf(stderr,
+           "[pci] %d kB allocated with handle 0x%08lx\n",
+           info->gartSize*1024, (unsigned long)info->gartMemHandle);
+
+   info->gartOffset = 0;
+
+   /* Initialize the CP ring buffer data */
+   info->ringStart       = info->gartOffset;
+   info->ringMapSize     = info->ringSize*1024*1024 + radeon_drm_page_size;
+
+   info->ringReadOffset  = info->ringStart + info->ringMapSize;
+   info->ringReadMapSize = radeon_drm_page_size;
+
+   /* Reserve space for vertex/indirect buffers */
+   info->bufStart        = info->ringReadOffset + info->ringReadMapSize;
+   info->bufMapSize      = info->bufSize*1024*1024;
+
+   /* Reserve the rest for GART textures.  The granularity is derived from
+    * the remaining size split over RADEON_NR_TEX_REGIONS, but never below
+    * RADEON_LOG_TEX_GRANULARITY; the map size is then truncated to a whole
+    * number of granules.
+    */
+   info->gartTexStart     = info->bufStart + info->bufMapSize;
+   s = (info->gartSize*1024*1024 - info->gartTexStart);
+   l = RADEONMinBits((s-1) / RADEON_NR_TEX_REGIONS);
+   if (l < RADEON_LOG_TEX_GRANULARITY) l = RADEON_LOG_TEX_GRANULARITY;
+   info->gartTexMapSize   = (s >> l) << l;
+   info->log2GARTTexGran  = l;
+
+   /* NOTE(review): none of the failure paths below undo the
+    * scatter/gather allocation or the maps that were already added.
+    */
+   if (drmAddMap(ctx->drmFD, info->ringStart, info->ringMapSize,
+                 DRM_SCATTER_GATHER, flags, &info->ringHandle) < 0) {
+      fprintf(stderr,
+              "[pci] Could not add ring mapping\n");
+      return 0;
+   }
+   fprintf(stderr,
+           "[pci] ring handle = 0x%08lx\n",
+           (unsigned long)info->ringHandle);
+
+   if (drmAddMap(ctx->drmFD, info->ringReadOffset, info->ringReadMapSize,
+                 DRM_SCATTER_GATHER, flags, &info->ringReadPtrHandle) < 0) {
+      fprintf(stderr,
+              "[pci] Could not add ring read ptr mapping\n");
+      return 0;
+   }
+   fprintf(stderr,
+           "[pci] ring read ptr handle = 0x%08lx\n",
+           (unsigned long)info->ringReadPtrHandle);
+
+   if (drmAddMap(ctx->drmFD, info->bufStart, info->bufMapSize,
+                 DRM_SCATTER_GATHER, 0, &info->bufHandle) < 0) {
+      fprintf(stderr,
+              "[pci] Could not add vertex/indirect buffers mapping\n");
+      return 0;
+   }
+   fprintf(stderr,
+           "[pci] vertex/indirect buffers handle = 0x%08lx\n",
+           (unsigned long)info->bufHandle);
+
+   if (drmAddMap(ctx->drmFD, info->gartTexStart, info->gartTexMapSize,
+                 DRM_SCATTER_GATHER, 0, &info->gartTexHandle) < 0) {
+      fprintf(stderr,
+              "[pci] Could not add GART texture map mapping\n");
+      return 0;
+   }
+   fprintf(stderr,
+           "[pci] GART texture map handle = 0x%08lx\n",
+           (unsigned long)info->gartTexHandle);
+
+   return 1;
+}
+
+
+/**
+ * \brief Hand the buffer layout to the kernel module and initialize the CP.
+ *
+ * \param ctx display handle.
+ * \param info driver private data.
+ *
+ * \return non-zero on success, or zero on failure.
+ *
+ * Builds a zeroed drm_radeon_init_t from \p ctx and \p info and submits it
+ * with the DRM_RADEON_CP_INIT command.
+ */
+static int RADEONDRIKernelInit( const DRIDriverContext *ctx,
+                                RADEONInfoPtr info)
+{
+   const int bytes_per_pixel = ctx->bpp / 8;
+   /* These four families get the R200 flavour of the init command. */
+   const int r200_class = (info->ChipFamily == CHIP_FAMILY_R200  ||
+                           info->ChipFamily == CHIP_FAMILY_RV250 ||
+                           info->ChipFamily == CHIP_FAMILY_M9    ||
+                           info->ChipFamily == CHIP_FAMILY_RV280);
+   drm_radeon_init_t init;
+
+   memset(&init, 0, sizeof(init));
+
+   init.func = r200_class ? RADEON_INIT_R200_CP : RADEON_INIT_CP;
+
+   /* General setup: sizes are converted from MB to bytes. */
+   init.sarea_priv_offset    = sizeof(drm_sarea_t);
+   init.is_pci               = ctx->isPCI;
+   init.cp_mode              = RADEON_DEFAULT_CP_BM_MODE;
+   init.gart_size            = info->gartSize*1024*1024;
+   init.ring_size            = info->ringSize*1024*1024;
+   init.usec_timeout         = 1000;
+
+   /* Colour and depth buffers share the same depth; pitches go from
+    * pixels to bytes.
+    */
+   init.fb_bpp               = ctx->bpp;
+   init.depth_bpp            = ctx->bpp;
+   init.front_offset         = info->frontOffset;
+   init.front_pitch          = info->frontPitch * bytes_per_pixel;
+   init.back_offset          = info->backOffset;
+   init.back_pitch           = info->backPitch * bytes_per_pixel;
+   init.depth_offset         = info->depthOffset;
+   init.depth_pitch          = info->depthPitch * bytes_per_pixel;
+
+   /* Handles of the maps that were added earlier. */
+   init.fb_offset            = info->LinearAddr;
+   init.mmio_offset          = info->registerHandle;
+   init.ring_offset          = info->ringHandle;
+   init.ring_rptr_offset     = info->ringReadPtrHandle;
+   init.buffers_offset       = info->bufHandle;
+   init.gart_textures_offset = info->gartTexHandle;
+
+   return drmCommandWrite(ctx->drmFD, DRM_RADEON_CP_INIT,
+                          &init, sizeof(init)) >= 0;
+}
+
+
+/**
+ * \brief Initialize the kernel heap manager for the GART texture area.
+ *
+ * \param ctx display handle.
+ * \param info driver private data.
+ *
+ * This function is a wrapper around the DRM_RADEON_INIT_HEAP command, passing
+ * all the parameters in a drm_radeon_mem_init_heap structure.  Despite the
+ * name it is used for both the AGP and the PCI GART setup.  Failure is only
+ * reported on stderr; nothing is returned to the caller.
+ */
+static void RADEONDRIAgpHeapInit(const DRIDriverContext *ctx,
+				 RADEONInfoPtr info)
+{
+   drm_radeon_mem_init_heap_t drmHeap;
+
+   /* Zero the ioctl argument first, as RADEONDRIKernelInit() does, so no
+    * uninitialized bytes are handed to the kernel.
+    */
+   memset(&drmHeap, 0, sizeof(drmHeap));
+
+   /* Start up the simple memory manager for gart space */
+   drmHeap.region = RADEON_MEM_REGION_GART;
+   drmHeap.start  = 0;
+   drmHeap.size   = info->gartTexMapSize;
+
+   if (drmCommandWrite(ctx->drmFD, DRM_RADEON_INIT_HEAP,
+		       &drmHeap, sizeof(drmHeap))) {
+      fprintf(stderr,
+	      "[drm] Failed to initialize gart heap manager\n");
+   } else {
+      fprintf(stderr,
+	      "[drm] Initialized kernel gart heap manager, %d\n",
+	      info->gartTexMapSize);
+   }
+}
+
+/**
+ * \brief Register the vertex/indirect buffers with the DRM.
+ *
+ * \param ctx display handle.
+ * \param info driver private data.
+ *
+ * \return one on success, or zero on failure.
+ *
+ * Splits the buffer area (info->bufMapSize bytes at info->bufStart) into
+ * RADEON_BUFFER_SIZE sized buffers and hands them to drmAddBufs(), as
+ * scatter/gather buffers on PCI and as AGP buffers otherwise.  The number
+ * of buffers added is stored in info->bufNumBufs.
+ */
+static int RADEONDRIBufInit( const DRIDriverContext *ctx, RADEONInfoPtr info )
+{
+   info->bufNumBufs = drmAddBufs(ctx->drmFD,
+                                 info->bufMapSize / RADEON_BUFFER_SIZE,
+                                 RADEON_BUFFER_SIZE,
+                                 ctx->isPCI ? DRM_SG_BUFFER : DRM_AGP_BUFFER,
+                                 info->bufStart);
+
+   if (info->bufNumBufs > 0) {
+      fprintf(stderr,
+              "[drm] Added %d %d byte vertex/indirect buffers\n",
+              info->bufNumBufs, RADEON_BUFFER_SIZE);
+      return 1;
+   }
+
+   /* Zero buffers or an error code: either way there is nothing to use. */
+   fprintf(stderr,
+           "[drm] Could not create vertex/indirect buffers list\n");
+   return 0;
+}
+
+/**
+ * \brief Set up interrupt handling.
+ *
+ * \param ctx display handle.
+ * \param info driver private data.
+ *
+ * When no IRQ is recorded in \p info yet, looks one up from the PCI bus ID
+ * and tries to install a handler for it with drmCtlInstHandler().  If that
+ * fails, info->irq is reset to zero and the driver runs without interrupts.
+ */
+static void RADEONDRIIrqInit(const DRIDriverContext *ctx,
+                             RADEONInfoPtr info)
+{
+   if (info->irq == 0) {
+      info->irq = drmGetInterruptFromBusID(ctx->drmFD,
+                                           ctx->pciBus,
+                                           ctx->pciDevice,
+                                           ctx->pciFunc);
+
+      if (drmCtlInstHandler(ctx->drmFD, info->irq) != 0) {
+         fprintf(stderr,
+                 "[drm] failure adding irq handler, "
+                 "there is a device already using that irq\n"
+                 "[drm] falling back to irq-free operation\n");
+         info->irq = 0;
+      }
+   }
+
+   /* Nothing to report when running without an interrupt. */
+   if (info->irq == 0)
+      return;
+
+   fprintf(stderr,
+           "[drm] dma control initialized, using IRQ %d\n",
+           info->irq);
+}
+
+/**
+ * \brief Check that the radeon kernel module is recent enough.
+ *
+ * \param ctx display handle.
+ * \param info driver private data.
+ *
+ * \return one if the version is acceptable, or zero on a mismatch.
+ *
+ * Accepts major version 1 with at least the required minor/patchlevel and
+ * records the minor version in info->drmMinor.  If the version cannot be
+ * queried at all, one is returned and info->drmMinor is left untouched.
+ */
+static int RADEONCheckDRMVersion( const DRIDriverContext *ctx,
+                                  RADEONInfoPtr info )
+{
+   /* Need 1.8.x for proper cleanup-on-client-exit behaviour.
+    */
+   const int req_minor = 8;
+   const int req_patch = 0;
+   drmVersionPtr version = drmGetVersion(ctx->drmFD);
+   int mismatch;
+
+   if (!version)
+      return 1;
+
+   mismatch = (version->version_major != 1 ||
+               version->version_minor < req_minor ||
+               (version->version_minor == req_minor &&
+                version->version_patchlevel < req_patch));
+
+   if (!mismatch) {
+      info->drmMinor = version->version_minor;
+      drmFreeVersion(version);
+      return 1;
+   }
+
+   /* Incompatible drm version */
+   fprintf(stderr,
+           "[dri] RADEONDRIScreenInit failed because of a version "
+           "mismatch.\n"
+           "[dri] radeon.o kernel module version is %d.%d.%d "
+           "but version 1.%d.%d or newer is needed.\n"
+           "[dri] Disabling DRI.\n",
+           version->version_major,
+           version->version_minor,
+           version->version_patchlevel,
+           req_minor,
+           req_patch);
+   drmFreeVersion(version);
+   return 0;
+}
+
+static int RADEONMemoryInit( const DRIDriverContext *ctx, RADEONInfoPtr info )
+{  /* Lay out video memory: the front buffer sits at offset 0, while the
+    * texture heap, the depth buffer and the back buffer are stacked
+    * downwards from the end of the framebuffer (ctx->shared.fbSize).
+    * Fills in the offset, pitch, texture size/granularity and the packed
+    * pitch/offset fields of info.
+    *
+    * Returns 1 on success, or 0 when the computed texture size is negative,
+    * i.e. three screen-sized buffers do not fit in fbSize.
+    */
+   int        width_bytes = ctx->shared.virtualWidth * ctx->cpp;
+   int        cpp         = ctx->cpp;
+   int        bufferSize  = ((((ctx->shared.virtualHeight+15) & ~15) * width_bytes			     + RADEON_BUFFER_ALIGN) & ~RADEON_BUFFER_ALIGN);  /* height padded to a multiple of 16 rows; RADEON_BUFFER_ALIGN is presumably an alignment mask -- confirm */
+   int        depthSize   = ((((ctx->shared.virtualHeight+15) & ~15) * width_bytes
+			     + RADEON_BUFFER_ALIGN) & ~RADEON_BUFFER_ALIGN);  /* same expression as bufferSize */
+   int        l;
+
+   info->frontOffset = 0;
+   info->frontPitch = ctx->shared.virtualWidth;
+
+   fprintf(stderr, 
+	   "Using %d MB AGP aperture\n", info->gartSize);
+   fprintf(stderr, 
+	   "Using %d MB for the ring buffer\n", info->ringSize);
+   fprintf(stderr, 
+	   "Using %d MB for vertex/indirect buffers\n", info->bufSize);
+   fprintf(stderr, 
+	   "Using %d MB for AGP textures\n", info->gartTexSize);
+
+   /* Front, back and depth buffers - everything else texture??
+    * (two colour buffers plus one depth buffer are subtracted from fbSize)
+    */
+   info->textureSize = ctx->shared.fbSize - 2 * bufferSize - depthSize;
+
+   if (ctx->colorTiling==1)  /* round the non-texture area up to a multiple of 16 scanlines */
+   {
+	info->textureSize = ctx->shared.fbSize - ((ctx->shared.fbSize - info->textureSize + width_bytes * 16 - 1) / (width_bytes * 16)) * (width_bytes*16);
+   }
+
+   if (info->textureSize < 0) 
+      return 0;
+
+   l = RADEONMinBits((info->textureSize-1) / RADEON_NR_TEX_REGIONS);
+   if (l < RADEON_LOG_TEX_GRANULARITY) l = RADEON_LOG_TEX_GRANULARITY;
+
+   /* Truncate the texture size to a whole number of texture regions of
+    * 2^l bytes each.  NOTE(review): this comment used to say "round up
+    * ... don't round down", but the shift pair below rounds down.
+    */
+   info->log2TexGran = l;
+   info->textureSize = (info->textureSize >> l) << l;
+
+   /* Set a minimum usable local texture heap size.  This will fit
+    * two 256x256x32bpp textures.  Anything smaller disables the local
+    * texture heap.  The textureOffset store here is overwritten by the
+    * unconditional assignment just below.
+    */
+   if (info->textureSize < 512 * 1024) {
+      info->textureOffset = 0;
+      info->textureSize = 0;
+   }
+
+   /* Reserve space for textures at the top of the framebuffer; with colour
+    * tiling the offset is truncated to a multiple of 16 scanlines.
+    */
+   if (ctx->colorTiling==1)
+   {
+      info->textureOffset = ((ctx->shared.fbSize - info->textureSize) / 
+			(width_bytes * 16)) * (width_bytes*16);
+   }
+   else
+   {
+      info->textureOffset = ((ctx->shared.fbSize - info->textureSize +
+   	 		   RADEON_BUFFER_ALIGN) &
+			  ~RADEON_BUFFER_ALIGN);
+   }
+   /* Reserve space for the shared depth
+    * buffer, directly below the textures.
+    */
+   info->depthOffset = ((info->textureOffset - depthSize +
+			 RADEON_BUFFER_ALIGN) &
+			~RADEON_BUFFER_ALIGN);
+   info->depthPitch = ctx->shared.virtualWidth;
+
+   info->backOffset = ((info->depthOffset - bufferSize +
+			RADEON_BUFFER_ALIGN) &
+		       ~RADEON_BUFFER_ALIGN);  /* back buffer goes directly below the depth buffer */
+   info->backPitch = ctx->shared.virtualWidth;
+
+
+   fprintf(stderr, 
+	   "Will use back buffer at offset 0x%x\n",
+	   info->backOffset);
+   fprintf(stderr, 
+	   "Will use depth buffer at offset 0x%x\n",
+	   info->depthOffset);
+   fprintf(stderr, 
+	   "Will use %d kb for textures at offset 0x%x\n",
+	   info->textureSize/1024, info->textureOffset);
+
+   info->frontPitchOffset = (((info->frontPitch * cpp / 64) << 22) |
+			     (info->frontOffset >> 10));  /* packed word: pitch in 64-byte units from bit 22 up, offset in 1 kB units below */
+
+   info->backPitchOffset = (((info->backPitch * cpp / 64) << 22) |
+			    (info->backOffset >> 10));
+
+   info->depthPitchOffset = (((info->depthPitch * cpp / 64) << 22) |
+			     (info->depthOffset >> 10));
+
+   return 1;
+}
+
+/**
+ * \brief Switch on colour tiling and allocate the tiled surfaces.
+ *
+ * \param ctx display handle.
+ * \param info driver private data.
+ *
+ * \return always one.
+ *
+ * Colour tiling is turned off in \p info when the DRM minor version is
+ * below 14.  Otherwise, if it is requested, tiling is switched on in the
+ * kernel and one surface each is allocated for the front and the back
+ * buffer.  The results of the surface allocations are not checked.
+ */
+static int RADEONColorTilingInit( const DRIDriverContext *ctx, RADEONInfoPtr info )
+{
+   const int pitch_bytes  = ctx->shared.virtualWidth * ctx->cpp;
+   const int padded_rows  = (ctx->shared.virtualHeight + 15) & ~15;
+   const int surface_size = (padded_rows * pitch_bytes + RADEON_BUFFER_ALIGN)
+                            & ~RADEON_BUFFER_ALIGN;
+   drm_radeon_surface_alloc_t surface;
+   int tile_flag;
+
+   /* Setup color tiling */
+   if (info->drmMinor < 14)
+      info->colorTiling = 0;
+
+   if (!info->colorTiling)
+      return 1;
+
+   RadeonSetParam(ctx, RADEON_SETPARAM_SWITCH_TILING, 1);
+
+   /* The macro tiling flag differs between pre-R200 and later chips. */
+   tile_flag = (info->ChipFamily < CHIP_FAMILY_R200)
+               ? RADEON_SURF_TILE_COLOR_MACRO
+               : R200_SURF_TILE_COLOR_MACRO;
+
+   /* Front buffer surface */
+   surface.address = info->frontOffset;
+   surface.size    = surface_size;
+   surface.flags   = (pitch_bytes) | tile_flag;
+   drmCommandWrite(ctx->drmFD, DRM_RADEON_SURF_ALLOC, &surface, sizeof(surface));
+
+   /* Back buffer surface: same size and flags, different address */
+   surface.address = info->backOffset;
+   drmCommandWrite(ctx->drmFD, DRM_RADEON_SURF_ALLOC, &surface, sizeof(surface));
+
+   return 1;
+}
+
+
+
+/**
+ * Called at the start of each server generation.
+ *
+ * \param ctx display handle.
+ * \param info driver private data.
+ *
+ * \return non-zero on success, or zero on failure.
+ *
+ * Performs static frame buffer allocation. Opens the DRM device and adds maps
+ * for the SAREA, framebuffer and MMIO regions. Fills in \p info with more
+ * information. Creates a \e server context to grab the lock for the
+ * initialization ioctls and calls the other initialization functions in this
+ * file.
+ *
+ * Sets up a RADEONDRIRec structure to be passed to radeon_dri.so for its
+ * initialization.  On success the hardware lock is still held when this
+ * function returns.
+ */
+static int RADEONScreenInit( DRIDriverContext *ctx, RADEONInfoPtr info )
+{
+   RADEONDRIPtr   pRADEONDRI;
+   int err;
+
+   usleep(100);
+   /*assert(!ctx->IsClient);*/
+
+   /* Front, back and depth buffer must all fit in video memory. */
+   {
+      int  width_bytes = (ctx->shared.virtualWidth * ctx->cpp);
+      int  maxy        = ctx->shared.fbSize / width_bytes;
+
+
+      if (maxy <= ctx->shared.virtualHeight * 3) {
+	 fprintf(stderr, 
+		 "Static buffer allocation failed -- "
+		 "need at least %d kB video memory (have %d kB)\n",
+		 (ctx->shared.virtualWidth * ctx->shared.virtualHeight *
+		  ctx->cpp * 3 + 1023) / 1024,
+		 ctx->shared.fbSize / 1024);
+	 return 0;
+      } 
+   }
+
+
+   if (info->ChipFamily >= CHIP_FAMILY_R300) {
+      fprintf(stderr, 
+	      "Direct rendering not yet supported on "
+	      "Radeon 9700 and newer cards\n");
+      return 0;
+   }
+   
+   radeon_drm_page_size = getpagesize();   
+
+   info->registerSize = ctx->MMIOSize;
+   ctx->shared.SAREASize = SAREA_MAX;
+
+   /* NOTE(review): the failure paths between drmOpen() and the lock being
+    * taken return without closing ctx->drmFD or removing maps that were
+    * already added.
+    */
+
+   /* Note that drmOpen will try to load the kernel module, if needed. */
+   ctx->drmFD = drmOpen("radeon", NULL );
+   if (ctx->drmFD < 0) {
+      fprintf(stderr, "[drm] drmOpen failed\n");
+      return 0;
+   }
+
+   if ((err = drmSetBusid(ctx->drmFD, ctx->pciBusID)) < 0) {
+      fprintf(stderr, "[drm] drmSetBusid failed (%d, %s), %s\n",
+	      ctx->drmFD, ctx->pciBusID, strerror(-err));
+      return 0;
+   }
+
+   if (drmAddMap( ctx->drmFD,
+		  0,
+		  ctx->shared.SAREASize,
+		  DRM_SHM,
+		  DRM_CONTAINS_LOCK,
+		  &ctx->shared.hSAREA) < 0)
+   {
+      fprintf(stderr, "[drm] drmAddMap failed\n");
+      return 0;
+   }
+   fprintf(stderr, "[drm] added %d byte SAREA at 0x%08lx\n",
+	   ctx->shared.SAREASize, ctx->shared.hSAREA);
+
+   if (drmMap( ctx->drmFD,
+	       ctx->shared.hSAREA,
+	       ctx->shared.SAREASize,
+	       (drmAddressPtr)(&ctx->pSAREA)) < 0)
+   {
+      fprintf(stderr, "[drm] drmMap failed\n");
+      return 0;
+   }
+   memset(ctx->pSAREA, 0, ctx->shared.SAREASize);
+   fprintf(stderr, "[drm] mapped SAREA 0x%08lx to %p, size %d\n",
+	   ctx->shared.hSAREA, ctx->pSAREA, ctx->shared.SAREASize);
+   
+   /* Need to AddMap the framebuffer and mmio regions here:
+    */
+   if (drmAddMap( ctx->drmFD,
+		  (drm_handle_t)ctx->FBStart,
+		  ctx->FBSize,
+		  DRM_FRAME_BUFFER,
+#ifndef _EMBEDDED
+		  0,
+#else
+		  DRM_READ_ONLY,
+#endif
+		  &ctx->shared.hFrameBuffer) < 0)
+   {
+      fprintf(stderr, "[drm] drmAddMap framebuffer failed\n");
+      return 0;
+   }
+
+   fprintf(stderr, "[drm] framebuffer handle = 0x%08lx\n",
+	   ctx->shared.hFrameBuffer);
+
+
+
+   if (drmAddMap(ctx->drmFD, 
+		 ctx->MMIOStart,
+		 ctx->MMIOSize,
+		 DRM_REGISTERS, 
+		 DRM_READ_ONLY, 
+		 &info->registerHandle) < 0) {
+      fprintf(stderr, "[drm] drmAddMap mmio failed\n");	
+      return 0;
+   }
+   fprintf(stderr,
+	   "[drm] register handle = 0x%08lx\n", info->registerHandle);
+
+   /* Check the radeon DRM version */
+   if (!RADEONCheckDRMVersion(ctx, info)) {
+      return 0;
+   }
+
+   if (ctx->isPCI) {
+      /* Initialize PCI */
+      if (!RADEONDRIPciInit(ctx, info))
+         return 0;
+   }
+   else {
+      /* Initialize AGP */
+      if (!RADEONDRIAgpInit(ctx, info))
+         return 0;
+   }
+
+   /* Memory manager setup */
+   if (!RADEONMemoryInit(ctx, info)) {
+      return 0;
+   }
+
+   /* Create a 'server' context so we can grab the lock for
+    * initialization ioctls.
+    */
+   if ((err = drmCreateContext(ctx->drmFD, &ctx->serverContext)) != 0) {
+      fprintf(stderr, "%s: drmCreateContext failed %d\n", __FUNCTION__, err);
+      return 0;
+   }
+
+   DRM_LOCK(ctx->drmFD, ctx->pSAREA, ctx->serverContext, 0); 
+
+   /* Initialize the kernel data structures */
+   if (!RADEONDRIKernelInit(ctx, info)) {
+      fprintf(stderr, "RADEONDRIKernelInit failed\n");
+      DRM_UNLOCK(ctx->drmFD, ctx->pSAREA, ctx->serverContext);
+      return 0;
+   }
+
+   /* Initialize the vertex buffers list */
+   if (!RADEONDRIBufInit(ctx, info)) {
+      fprintf(stderr, "RADEONDRIBufInit failed\n");
+      DRM_UNLOCK(ctx->drmFD, ctx->pSAREA, ctx->serverContext);
+      return 0;
+   }
+
+   RADEONColorTilingInit(ctx, info);
+
+   /* Initialize IRQ */
+   RADEONDRIIrqInit(ctx, info);
+
+   /* Initialize kernel gart memory manager */
+   RADEONDRIAgpHeapInit(ctx, info);
+
+   fprintf(stderr,"color tiling %sabled\n", info->colorTiling?"en":"dis");
+   fprintf(stderr,"page flipping %sabled\n", info->page_flip_enable?"en":"dis");
+   /* Initialize the SAREA private data structure, which lives directly
+    * behind the generic drm_sarea_t.
+    */
+   {
+      drm_radeon_sarea_t *pSAREAPriv;
+      pSAREAPriv = (drm_radeon_sarea_t *)(((char*)ctx->pSAREA) + 
+					sizeof(drm_sarea_t));
+      memset(pSAREAPriv, 0, sizeof(*pSAREAPriv));
+      pSAREAPriv->pfState = info->page_flip_enable;
+   }
+
+
+   /* Quick hack to clear the front & back buffers.  Could also use
+    * the clear ioctl to do this, but would need to setup hw state
+    * first.
+    */
+   drimemsetio((char *)ctx->FBAddress + info->frontOffset,
+	  0,
+	  info->frontPitch * ctx->cpp * ctx->shared.virtualHeight );
+
+   drimemsetio((char *)ctx->FBAddress + info->backOffset,
+	  0,
+	  info->backPitch * ctx->cpp * ctx->shared.virtualHeight );
+
+   /* This is the struct passed to radeon_dri.so for its initialization */
+   ctx->driverClientMsg = malloc(sizeof(RADEONDRIRec));
+   if (!ctx->driverClientMsg) {
+      /* Bail out like the other post-lock failures instead of writing
+       * through a NULL pointer below.
+       */
+      ctx->driverClientMsgSize = 0;
+      fprintf(stderr, "[dri] Out of memory for the client message\n");
+      DRM_UNLOCK(ctx->drmFD, ctx->pSAREA, ctx->serverContext);
+      return 0;
+   }
+   ctx->driverClientMsgSize = sizeof(RADEONDRIRec);
+   pRADEONDRI                    = (RADEONDRIPtr)ctx->driverClientMsg;
+   pRADEONDRI->deviceID          = info->Chipset;
+   pRADEONDRI->width             = ctx->shared.virtualWidth;
+   pRADEONDRI->height            = ctx->shared.virtualHeight;
+   pRADEONDRI->depth             = ctx->bpp; /* XXX: depth */
+   pRADEONDRI->bpp               = ctx->bpp;
+   pRADEONDRI->IsPCI             = ctx->isPCI;
+   pRADEONDRI->AGPMode           = ctx->agpmode;
+   pRADEONDRI->frontOffset       = info->frontOffset;
+   pRADEONDRI->frontPitch        = info->frontPitch;
+   pRADEONDRI->backOffset        = info->backOffset;
+   pRADEONDRI->backPitch         = info->backPitch;
+   pRADEONDRI->depthOffset       = info->depthOffset;
+   pRADEONDRI->depthPitch        = info->depthPitch;
+   pRADEONDRI->textureOffset     = info->textureOffset;
+   pRADEONDRI->textureSize       = info->textureSize;
+   pRADEONDRI->log2TexGran       = info->log2TexGran;
+   pRADEONDRI->registerHandle    = info->registerHandle;
+   pRADEONDRI->registerSize      = info->registerSize; 
+   pRADEONDRI->statusHandle      = info->ringReadPtrHandle;
+   pRADEONDRI->statusSize        = info->ringReadMapSize;
+   pRADEONDRI->gartTexHandle      = info->gartTexHandle;
+   pRADEONDRI->gartTexMapSize     = info->gartTexMapSize;
+   pRADEONDRI->log2GARTTexGran    = info->log2GARTTexGran;
+   pRADEONDRI->gartTexOffset      = info->gartTexStart;
+   pRADEONDRI->sarea_priv_offset = sizeof(drm_sarea_t);
+
+   /* Don't release the lock now - let the VT switch handler do it. */
+
+   return 1;
+}
+
+
+/**
+ * \brief Get Radeon chip family from chipset number.
+ *
+ * \param info driver private data.
+ *
+ * \return always one.
+ *
+ * Called by radeonInitFBDev() to set RADEONInfoRec::ChipFamily
+ * according to the value of RADEONInfoRec::Chipset.  A chipset that is
+ * not listed is treated as an original Radeon/7200 (CHIP_FAMILY_RADEON),
+ * so the lookup itself never fails.
+ */
+static int get_chipfamily_from_chipset( RADEONInfoPtr info )
+{
+    /* PCI device id -> chip family */
+    static const struct {
+	int chipset;
+	int family;
+    } known_chips[] = {
+	{ PCI_CHIP_RADEON_LY, CHIP_FAMILY_M6 },
+	{ PCI_CHIP_RADEON_LZ, CHIP_FAMILY_M6 },
+
+	{ PCI_CHIP_RADEON_QY, CHIP_FAMILY_VE },
+	{ PCI_CHIP_RADEON_QZ, CHIP_FAMILY_VE },
+
+	{ PCI_CHIP_R200_QL,   CHIP_FAMILY_R200 },
+	{ PCI_CHIP_R200_QN,   CHIP_FAMILY_R200 },
+	{ PCI_CHIP_R200_QO,   CHIP_FAMILY_R200 },
+	{ PCI_CHIP_R200_Ql,   CHIP_FAMILY_R200 },
+	{ PCI_CHIP_R200_BB,   CHIP_FAMILY_R200 },
+
+	{ PCI_CHIP_RV200_QW,  CHIP_FAMILY_RV200 },	/* RV200 desktop */
+	{ PCI_CHIP_RV200_QX,  CHIP_FAMILY_RV200 },
+
+	{ PCI_CHIP_RADEON_LW, CHIP_FAMILY_M7 },
+	{ PCI_CHIP_RADEON_LX, CHIP_FAMILY_M7 },
+
+	{ PCI_CHIP_RV250_Id,  CHIP_FAMILY_RV250 },
+	{ PCI_CHIP_RV250_Ie,  CHIP_FAMILY_RV250 },
+	{ PCI_CHIP_RV250_If,  CHIP_FAMILY_RV250 },
+	{ PCI_CHIP_RV250_Ig,  CHIP_FAMILY_RV250 },
+
+	{ PCI_CHIP_RV250_Ld,  CHIP_FAMILY_M9 },
+	{ PCI_CHIP_RV250_Le,  CHIP_FAMILY_M9 },
+	{ PCI_CHIP_RV250_Lf,  CHIP_FAMILY_M9 },
+	{ PCI_CHIP_RV250_Lg,  CHIP_FAMILY_M9 },
+
+	{ PCI_CHIP_RV280_Y_,  CHIP_FAMILY_RV280 },
+	{ PCI_CHIP_RV280_Ya,  CHIP_FAMILY_RV280 },
+	{ PCI_CHIP_RV280_Yb,  CHIP_FAMILY_RV280 },
+	{ PCI_CHIP_RV280_Yc,  CHIP_FAMILY_RV280 },
+
+	{ PCI_CHIP_R300_ND,   CHIP_FAMILY_R300 },
+	{ PCI_CHIP_R300_NE,   CHIP_FAMILY_R300 },
+	{ PCI_CHIP_R300_NF,   CHIP_FAMILY_R300 },
+	{ PCI_CHIP_R300_NG,   CHIP_FAMILY_R300 },
+    };
+    const int num_known = (int)(sizeof(known_chips) / sizeof(known_chips[0]));
+    int family = CHIP_FAMILY_RADEON;	/* Original Radeon/7200 */
+    int i;
+
+    for (i = 0; i < num_known; i++) {
+	if (info->Chipset == known_chips[i].chipset) {
+	    family = known_chips[i].family;
+	    break;
+	}
+    }
+
+    info->ChipFamily = family;
+    return 1;
+}
+
+
+/**
+ * \brief Validate the fbdev mode.
+ *
+ * \param ctx display handle.
+ *
+ * \return always one.
+ *
+ * Saves the current RADEON_GEN_INT_CNTL and RADEON_CRTC_OFFSET_CNTL
+ * register values in the driver private data, adding the tile enable bit
+ * to the saved CRTC offset control when colour tiling is in use.
+ *
+ * \sa radeonPostValidateMode().
+ */
+static int radeonValidateMode( const DRIDriverContext *ctx )
+{
+   RADEONInfoPtr info = ctx->driverPrivate;
+   unsigned char *RADEONMMIO = ctx->MMIOAddress;
+
+   /* Snapshot the registers; radeonPostValidateMode() writes them back. */
+   info->gen_int_cntl     = INREG(RADEON_GEN_INT_CNTL);
+   info->crtc_offset_cntl = INREG(RADEON_CRTC_OFFSET_CNTL);
+
+   if (info->colorTiling) {
+      info->crtc_offset_cntl |= RADEON_CRTC_TILE_EN;
+   }
+
+   return 1;
+}
+
+
+/**
+ * \brief Examine mode returned by fbdev.
+ *
+ * \param ctx display handle.
+ *
+ * \return always one.
+ *
+ * Re-runs the colour tiling setup and then writes the register values saved
+ * by radeonValidateMode() back to RADEON_GEN_INT_CNTL and
+ * RADEON_CRTC_OFFSET_CNTL.
+ *
+ * \sa radeonValidateMode().
+ */
+static int radeonPostValidateMode( const DRIDriverContext *ctx )
+{
+   RADEONInfoPtr info = ctx->driverPrivate;
+   unsigned char *RADEONMMIO = ctx->MMIOAddress;
+
+   /* Must come first: it may clear info->colorTiling, which is tested
+    * below.
+    */
+   RADEONColorTilingInit(ctx, info);
+
+   OUTREG(RADEON_GEN_INT_CNTL, info->gen_int_cntl);
+
+   if (info->colorTiling) {
+      info->crtc_offset_cntl |= RADEON_CRTC_TILE_EN;
+   }
+   OUTREG(RADEON_CRTC_OFFSET_CNTL, info->crtc_offset_cntl);
+
+   return 1;
+}
+
+
+/**
+ * \brief Initialize the framebuffer device mode
+ *
+ * \param ctx display handle.
+ *
+ * \return one on success, or zero on failure.
+ *
+ * Allocates the driver private data, pads the virtual width to the pitch
+ * alignment needed for the pixel size (a coarser alignment is used when
+ * colour tiling is requested), fills in \p info with some default values
+ * and some information from \p ctx and then calls RADEONScreenInit() for
+ * the screen initialization.
+ *
+ * The private data is stored in ctx->driverPrivate and is not freed here
+ * on failure; radeonHaltFBDev() releases it.
+ */
+static int radeonInitFBDev( DRIDriverContext *ctx )
+{
+   RADEONInfoPtr info = calloc(1, sizeof(*info));
+
+   /* Without the private data nothing below can work. */
+   if (!info) {
+      fprintf(stderr, "Out of memory allocating driver private data\n");
+      return 0;
+   }
+
+   {
+      /* Stays at the unpadded width for pixel sizes not listed below. */
+      int  dummy = ctx->shared.virtualWidth;
+
+      if (ctx->colorTiling==1)
+      {
+         switch (ctx->bpp / 8) {
+         case 1: dummy = (ctx->shared.virtualWidth + 255) & ~255; break;
+         case 2: dummy = (ctx->shared.virtualWidth + 127) & ~127; break;
+         case 3:
+         case 4: dummy = (ctx->shared.virtualWidth +  63) &  ~63; break;
+         }
+      } else {
+         switch (ctx->bpp / 8) {
+         case 1: dummy = (ctx->shared.virtualWidth + 127) & ~127; break;
+         case 2: dummy = (ctx->shared.virtualWidth +  31) &  ~31; break;
+         case 3:
+         case 4: dummy = (ctx->shared.virtualWidth +  15) &  ~15; break;
+         }
+      }
+
+      ctx->shared.virtualWidth = dummy;
+      ctx->shared.Width = dummy;
+   }
+
+   fprintf(stderr,"shared virtual width is %d\n", ctx->shared.virtualWidth);
+   ctx->driverPrivate = (void *)info;
+   
+   info->gartFastWrite  = RADEON_DEFAULT_AGP_FAST_WRITE;
+   info->gartSize       = RADEON_DEFAULT_AGP_SIZE;
+   info->gartTexSize    = RADEON_DEFAULT_AGP_TEX_SIZE;
+   info->bufSize       = RADEON_DEFAULT_BUFFER_SIZE;
+   info->ringSize      = RADEON_DEFAULT_RING_SIZE;
+   info->page_flip_enable = RADEON_DEFAULT_PAGE_FLIP;
+   info->colorTiling = ctx->colorTiling;
+  
+   info->Chipset = ctx->chipset;
+
+   if (!get_chipfamily_from_chipset( info )) {
+      fprintf(stderr, "Unknown or non-radeon chipset -- cannot continue\n");
+      fprintf(stderr, "==> Verify PCI BusID is correct in miniglx.conf\n");
+      return 0;
+   }
+
+   info->frontPitch = ctx->shared.virtualWidth;
+   info->LinearAddr = ctx->FBStart & 0xfc000000;
+    
+
+   if (!RADEONScreenInit( ctx, info ))
+      return 0;
+
+
+   return 1;
+}
+
+
+/**
+ * \brief Tear down the DRI state when the screen is closed.
+ *
+ * \param ctx display handle.
+ *
+ * Unmaps the SAREA, closes the DRM device file descriptor and releases the
+ * driver private data, leaving ctx->driverPrivate cleared.
+ */
+static void radeonHaltFBDev( DRIDriverContext *ctx )
+{
+    drmUnmap( ctx->pSAREA, ctx->shared.SAREASize );
+    drmClose(ctx->drmFD);
+
+    /* free() ignores a null pointer, so no separate check is needed. */
+    free(ctx->driverPrivate);
+    ctx->driverPrivate = NULL;
+}
+
+
+extern void radeonNotifyFocus( int );  /* only referenced below when _EMBEDDED is defined */
+
+/**
+ * \brief Exported driver interface for Mini GLX.
+ *
+ * The initializers are positional, so their order has to match the member
+ * order of struct DRIDriverRec (declared outside this file section --
+ * TODO confirm the member names there).
+ *
+ * \sa DRIDriverRec.
+ */
+const struct DRIDriverRec __driDriver = {
+   radeonValidateMode,
+   radeonPostValidateMode,
+   radeonInitFBDev,
+   radeonHaltFBDev,
+   RADEONEngineShutdown,
+   RADEONEngineRestore,  
+#ifndef _EMBEDDED
+   0,  /* last slot is left empty in non-embedded builds */
+#else
+   radeonNotifyFocus, 
+#endif
+};
diff --git a/radeon/server/radeon_dri.h b/radeon/server/radeon_dri.h
new file mode 100644
index 0000000..ecd5323
--- /dev/null
+++ b/radeon/server/radeon_dri.h
@@ -0,0 +1,116 @@
+/**
+ * \file server/radeon_dri.h
+ * \brief Radeon server-side structures.
+ * 
+ * \author Kevin E. Martin <martin@xfree86.org>
+ * \author Rickard E. Faith <faith@valinux.com>
+ */
+
+/*
+ * Copyright 2000 ATI Technologies Inc., Markham, Ontario,
+ *                VA Linux Systems Inc., Fremont, California.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation on the rights to use, copy, modify, merge,
+ * publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT.  IN NO EVENT SHALL ATI, VA LINUX SYSTEMS AND/OR
+ * THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* $XFree86: xc/programs/Xserver/hw/xfree86/drivers/ati/radeon_dri.h,v 1.3 2002/04/24 16:20:40 martin Exp $ */
+
+#ifndef _RADEON_DRI_
+#define _RADEON_DRI_
+
+#include "xf86drm.h"
+#include "drm.h"
+#include "radeon_drm.h"
+
+/* DRI Driver defaults */
+#define RADEON_DEFAULT_CP_PIO_MODE    RADEON_CSQ_PRIPIO_INDPIO
+#define RADEON_DEFAULT_CP_BM_MODE     RADEON_CSQ_PRIBM_INDBM
+#define RADEON_DEFAULT_AGP_MODE       1
+#define RADEON_DEFAULT_AGP_FAST_WRITE 0
+#define RADEON_DEFAULT_AGP_SIZE       8 /* MB (must be 2^n and > 4MB) */
+#define RADEON_DEFAULT_RING_SIZE      1 /* MB (must be page aligned) */
+#define RADEON_DEFAULT_BUFFER_SIZE    2 /* MB (must be page aligned) */
+#define RADEON_DEFAULT_AGP_TEX_SIZE   1 /* MB (must be page aligned) */
+#define RADEON_DEFAULT_CP_TIMEOUT     10000  /* usecs */
+#define RADEON_DEFAULT_PAGE_FLIP      0 /* page flipping disabled */
+#define RADEON_BUFFER_ALIGN           0x00000fff
+
+/**
+ * \brief Radeon DRI driver private data.
+ */
+typedef struct {
+    /**
+     * \name DRI screen private data
+     */
+    /*@{*/
+    int           deviceID;	 /**< \brief PCI device ID */
+    int           width;	 /**< \brief width in pixels of display */
+    int           height;	 /**< \brief height in scanlines of display */
+    int           depth;	 /**< \brief depth of display (8, 15, 16, 24) */
+    int           bpp;		 /**< \brief bit depth of display (8, 16, 24, 32) */
+
+    int           IsPCI;	 /**< \brief is current card a PCI card? */
+    int           AGPMode;	 /**< \brief AGP mode */
+
+    int           frontOffset;   /**< \brief front buffer offset */
+    int           frontPitch;	 /**< \brief front buffer pitch */
+    int           backOffset;    /**< \brief shared back buffer offset */
+    int           backPitch;     /**< \brief shared back buffer pitch */
+    int           depthOffset;   /**< \brief shared depth buffer offset */
+    int           depthPitch;    /**< \brief shared depth buffer pitch */
+    int           textureOffset; /**< \brief start of texture data in frame buffer */
+    int           textureSize;   /**< \brief size of texture data */
+    int           log2TexGran;   /**< \brief log2 texture granularity */
+    /*@}*/
+
+    /**
+     * \name MMIO register data
+     */
+    /*@{*/
+    drm_handle_t     registerHandle; /**< \brief MMIO register map handle */
+    drmSize       registerSize;   /**< \brief MMIO register map size */
+    /*@}*/
+
+    /**
+     * \name CP in-memory status information
+     */
+    /*@{*/
+    drm_handle_t     statusHandle;   /**< \brief status map handle */
+    drmSize       statusSize;     /**< \brief status map size */
+    /*@}*/
+
+    /**
+     * \name CP AGP Texture data
+     */
+    /*@{*/
+    drm_handle_t     gartTexHandle;   /**< \brief AGP texture area map handle */
+    drmSize       gartTexMapSize;  /**< \brief AGP texture area map size */
+    int           log2GARTTexGran; /**< \brief AGP texture granularity in log base 2 */
+    int           gartTexOffset;   /**< \brief AGP texture area offset in AGP space */
+    /*@}*/
+
+    unsigned int  sarea_priv_offset; /**< \brief offset of the private SAREA data */
+} RADEONDRIRec, *RADEONDRIPtr;
+
+#endif
diff --git a/radeon/server/radeon_egl.c b/radeon/server/radeon_egl.c
new file mode 100644
index 0000000..2f6ea55
--- /dev/null
+++ b/radeon/server/radeon_egl.c
@@ -0,0 +1,1088 @@
+/*
+ * EGL driver for radeon_dri.so
+ */
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include "eglconfig.h"
+#include "eglcontext.h"
+#include "egldisplay.h"
+#include "egldriver.h"
+#include "eglglobals.h"
+#include "egllog.h"
+#include "eglmode.h"
+#include "eglscreen.h"
+#include "eglsurface.h"
+#include "egldri.h"
+
+#include "mtypes.h"
+#include "memops.h"
+#include "drm.h"
+#include "drm_sarea.h"
+#include "radeon_drm.h"
+#include "radeon_dri.h"
+#include "radeon.h"
+
+static size_t radeon_drm_page_size;
+
+/**
+ * radeon driver-specific driver class derived from _EGLDriver
+ */
+typedef struct radeon_driver
+{
+   _EGLDriver Base;  /* base class/object (presumably kept first so the struct can be cast to _EGLDriver - confirm) */
+   GLuint radeonStuff;  /* NOTE(review): purpose not evident from the visible code - confirm it is used */
+} radeonDriver;
+
+static int
+RADEONSetParam(driDisplay  *disp, int param, int value)
+{
+   drm_radeon_setparam_t sp;
+   int ret;
+   /* Issue DRM_RADEON_SETPARAM with (param, value); returns 0 on success, -1 on failure. */
+   memset(&sp, 0, sizeof(sp));
+   sp.param = param;
+   sp.value = value;
+
+   if ((ret=drmCommandWrite(disp->drmFD, DRM_RADEON_SETPARAM, &sp, sizeof(sp)))) {
+      fprintf(stderr, "Set param failed (%d)\n", ret);  /* format previously had no conversion for ret */
+      return -1;
+   }
+
+   return 0;
+}
+
+static int
+RADEONCheckDRMVersion(driDisplay *disp, RADEONInfoPtr info)
+{
+   /* Need 1.21.x for card type detection getparam */
+   const int need_minor = 21;
+   const int need_patch = 0;
+   drmVersionPtr  drm_ver = drmGetVersion(disp->drmFD);
+   int too_old;
+
+   /* No version information available: accept, leaving info->drmMinor as is. */
+   if (!drm_ver)
+      return 1;
+
+   /* Major must be exactly 1; minor.patchlevel must reach the requirement. */
+   too_old = drm_ver->version_minor < need_minor ||
+             (drm_ver->version_minor == need_minor &&
+              drm_ver->version_patchlevel < need_patch);
+
+   if (drm_ver->version_major != 1 || too_old) {
+      /* Incompatible drm version */
+      fprintf(stderr,
+              "[dri] RADEONDRIScreenInit failed because of a version "
+              "mismatch.\n"
+              "[dri] radeon.o kernel module version is %d.%d.%d "
+              "but version 1.%d.%d or newer is needed.\n"
+              "[dri] Disabling DRI.\n",
+              drm_ver->version_major,
+              drm_ver->version_minor,
+              drm_ver->version_patchlevel,
+              need_minor,
+              need_patch);
+      drmFreeVersion(drm_ver);
+      return 0;
+   }
+
+   /* Record the DRM minor version in the driver-private info. */
+   info->drmMinor = drm_ver->version_minor;
+   drmFreeVersion(drm_ver);
+   return 1;
+}
+
+
+/**
+ * \brief Compute the number of bits needed to represent a value.
+ *
+ * \param val value; zero and negative values are treated alike.
+ *
+ * \return index of the highest set bit of \p val plus one, or 1 when \p val <= 0.
+ */
+static int RADEONMinBits(int val)
+{
+   int  bits;
+
+   if (val <= 0) return 1;   /* right-shifting a negative int may never reach 0 */
+   for (bits = 0; val; val >>= 1, ++bits);
+   return bits;
+}
+
+
+/* Initialize the PCI GART state.  Allocates scatter/gather memory, lays out
+ * the ring, ring read pointer, buffer and GART texture regions inside it and
+ * adds a DRM map for each.  Returns 1 on success, 0 on failure. */
+static int RADEONDRIPciInit(driDisplay *disp, RADEONInfoPtr info)
+{
+    int  ret;
+    int  flags = DRM_READ_ONLY | DRM_LOCKED | DRM_KERNEL;
+    int            s, l;
+    /* Handles are cast to unsigned long below so the argument always matches %lx. */
+    ret = drmScatterGatherAlloc(disp->drmFD, info->gartSize*1024*1024,
+                                &info->gartMemHandle);
+    if (ret < 0) {
+        fprintf(stderr, "[pci] Out of memory (%d)\n", ret);
+        return 0;
+    }
+    fprintf(stderr,
+               "[pci] %d kB allocated with handle 0x%04lx\n",
+            info->gartSize*1024, (unsigned long) info->gartMemHandle);
+
+   info->gartOffset = 0;
+
+   /* Initialize the CP ring buffer data */
+   info->ringStart       = info->gartOffset;
+   info->ringMapSize     = info->ringSize*1024*1024 + radeon_drm_page_size;
+
+   info->ringReadOffset  = info->ringStart + info->ringMapSize;
+   info->ringReadMapSize = radeon_drm_page_size;
+
+   /* Reserve space for vertex/indirect buffers */
+   info->bufStart        = info->ringReadOffset + info->ringReadMapSize;
+   info->bufMapSize      = info->bufSize*1024*1024;
+
+   /* Reserve the rest for AGP textures */
+   info->gartTexStart     = info->bufStart + info->bufMapSize;
+   s = (info->gartSize*1024*1024 - info->gartTexStart);
+   l = RADEONMinBits((s-1) / RADEON_NR_TEX_REGIONS);
+   if (l < RADEON_LOG_TEX_GRANULARITY) l = RADEON_LOG_TEX_GRANULARITY;
+   info->gartTexMapSize   = (s >> l) << l;
+   info->log2GARTTexGran  = l;
+   /* NOTE(review): failure paths below return without releasing the allocation or earlier maps - confirm caller cleans up. */
+    if (drmAddMap(disp->drmFD, info->ringStart, info->ringMapSize,
+                  DRM_SCATTER_GATHER, flags, &info->ringHandle) < 0) {
+        fprintf(stderr,
+                   "[pci] Could not add ring mapping\n");
+        return 0;
+    }
+    fprintf(stderr,
+               "[pci] ring handle = 0x%08lx\n", (unsigned long) info->ringHandle);
+
+    if (drmAddMap(disp->drmFD, info->ringReadOffset, info->ringReadMapSize,
+                  DRM_SCATTER_GATHER, flags, &info->ringReadPtrHandle) < 0) {
+        fprintf(stderr,
+                   "[pci] Could not add ring read ptr mapping\n");
+        return 0;
+    }
+    fprintf(stderr,
+               "[pci] ring read ptr handle = 0x%08lx\n",
+               (unsigned long) info->ringReadPtrHandle);
+
+    if (drmAddMap(disp->drmFD, info->bufStart, info->bufMapSize,
+                  DRM_SCATTER_GATHER, 0, &info->bufHandle) < 0) {
+        fprintf(stderr,
+                   "[pci] Could not add vertex/indirect buffers mapping\n");
+        return 0;
+    }
+    fprintf(stderr,
+               "[pci] vertex/indirect buffers handle = 0x%08lx\n",
+               (unsigned long) info->bufHandle);
+
+    if (drmAddMap(disp->drmFD, info->gartTexStart, info->gartTexMapSize,
+                  DRM_SCATTER_GATHER, 0, &info->gartTexHandle) < 0) {
+        fprintf(stderr,
+                   "[pci] Could not add GART texture map mapping\n");
+        return 0;
+    }
+    fprintf(stderr,
+               "[pci] GART texture map handle = 0x%08lx\n",
+               (unsigned long) info->gartTexHandle);
+
+    return 1;
+}
+
+
+/**
+ * \brief Initialize the AGP state
+ *
+ * \param disp display handle.
+ * \param info driver private data.
+ *
+ * \return one on success, or zero on failure.
+ *
+ * Acquires and enables the AGP device. Allocates and binds memory in the AGP
+ * space, lays out the ring buffer, ring read pointer, vertex buffers and
+ * textures inside it and adds a DRM map for each region.
+ */
+static int RADEONDRIAgpInit( driDisplay *disp, RADEONInfoPtr info)
+{
+   int            mode, ret;
+   int            s, l;
+   int agpmode = 1;
+
+   if (drmAgpAcquire(disp->drmFD) < 0) {
+      fprintf(stderr, "[gart] AGP not available\n");
+      return 0;
+   }
+
+   mode = drmAgpGetMode(disp->drmFD);	/* Default mode */
+   /* Disable fast write entirely - too many lockups.
+    */
+   mode &= ~RADEON_AGP_MODE_MASK;
+   switch (agpmode) {   /* NOTE(review): cases fall through, so a higher mode also sets the lower-mode bits - confirm intended */
+   case 4:          mode |= RADEON_AGP_4X_MODE;
+   case 2:          mode |= RADEON_AGP_2X_MODE;
+   case 1: default: mode |= RADEON_AGP_1X_MODE;
+   }
+
+   if (drmAgpEnable(disp->drmFD, mode) < 0) {
+      fprintf(stderr, "[gart] AGP not enabled\n");
+      drmAgpRelease(disp->drmFD);
+      return 0;
+   }
+
+#if 0
+   /* Workaround for some hardware bugs */
+   if (info->ChipFamily < CHIP_FAMILY_R200)
+      OUTREG(RADEON_AGP_CNTL, INREG(RADEON_AGP_CNTL) | 0x000e0000);
+#endif
+   info->gartOffset = 0;
+
+   if ((ret = drmAgpAlloc(disp->drmFD, info->gartSize*1024*1024, 0, NULL,
+                          &info->gartMemHandle)) < 0) {
+      fprintf(stderr, "[gart] Out of memory (%d)\n", ret);
+      drmAgpRelease(disp->drmFD);
+      return 0;
+   }
+   fprintf(stderr,
+           "[gart] %d kB allocated with handle 0x%08x\n",
+           info->gartSize*1024, (unsigned)info->gartMemHandle);
+
+   if (drmAgpBind(disp->drmFD,
+                  info->gartMemHandle, info->gartOffset) < 0) {
+      fprintf(stderr, "[gart] Could not bind\n");
+      drmAgpFree(disp->drmFD, info->gartMemHandle);
+      drmAgpRelease(disp->drmFD);
+      return 0;
+   }
+
+   /* Initialize the CP ring buffer data */
+   info->ringStart       = info->gartOffset;
+   info->ringMapSize     = info->ringSize*1024*1024 + radeon_drm_page_size;
+
+   info->ringReadOffset  = info->ringStart + info->ringMapSize;
+   info->ringReadMapSize = radeon_drm_page_size;
+
+   /* Reserve space for vertex/indirect buffers */
+   info->bufStart        = info->ringReadOffset + info->ringReadMapSize;
+   info->bufMapSize      = info->bufSize*1024*1024;
+
+   /* Reserve the rest for AGP textures */
+   info->gartTexStart     = info->bufStart + info->bufMapSize;
+   s = (info->gartSize*1024*1024 - info->gartTexStart);
+   l = RADEONMinBits((s-1) / RADEON_NR_TEX_REGIONS);
+   if (l < RADEON_LOG_TEX_GRANULARITY) l = RADEON_LOG_TEX_GRANULARITY;
+   info->gartTexMapSize   = (s >> l) << l;
+   info->log2GARTTexGran  = l;
+   /* NOTE(review): the map failures below return without unbinding/freeing/releasing AGP - confirm caller cleans up. */
+   if (drmAddMap(disp->drmFD, info->ringStart, info->ringMapSize,
+                 DRM_AGP, DRM_READ_ONLY, &info->ringHandle) < 0) {
+      fprintf(stderr, "[gart] Could not add ring mapping\n");
+      return 0;
+   }
+   fprintf(stderr, "[gart] ring handle = 0x%08lx\n", (unsigned long)info->ringHandle);
+
+   /* Handles are cast to unsigned long so the argument always matches %lx. */
+   if (drmAddMap(disp->drmFD, info->ringReadOffset, info->ringReadMapSize,
+                 DRM_AGP, DRM_READ_ONLY, &info->ringReadPtrHandle) < 0) {
+      fprintf(stderr,
+              "[gart] Could not add ring read ptr mapping\n");
+      return 0;
+   }
+
+   fprintf(stderr,
+           "[gart] ring read ptr handle = 0x%08lx\n",
+           (unsigned long)info->ringReadPtrHandle);
+
+   if (drmAddMap(disp->drmFD, info->bufStart, info->bufMapSize,
+                 DRM_AGP, 0, &info->bufHandle) < 0) {
+      fprintf(stderr,
+              "[gart] Could not add vertex/indirect buffers mapping\n");
+      return 0;
+   }
+   fprintf(stderr,
+           "[gart] vertex/indirect buffers handle = 0x%08lx\n",
+           (unsigned long)info->bufHandle);
+
+   if (drmAddMap(disp->drmFD, info->gartTexStart, info->gartTexMapSize,
+                 DRM_AGP, 0, &info->gartTexHandle) < 0) {
+      fprintf(stderr,
+              "[gart] Could not add AGP texture map mapping\n");
+      return 0;
+   }
+   fprintf(stderr,
+           "[gart] AGP texture map handle = 0x%08lx\n",
+           (unsigned long)info->gartTexHandle);
+
+   return 1;
+}
+
+
+/**
+ * Initialize the memory-related 'offset', 'pitch' and 'size' fields of the RADEONInfo
+ * object.  Returns 1, or 0 when front, back and depth buffers exceed the frame buffer.
+ */
+static int
+RADEONMemoryInit(driDisplay *disp, RADEONInfoPtr info)
+{
+   int        width_bytes = disp->virtualWidth * disp->cpp;
+   int        cpp         = disp->cpp;   /* only referenced by the #if 0 block below */
+   int        bufferSize  = ((disp->virtualHeight * width_bytes
+                              + RADEON_BUFFER_ALIGN)
+                             & ~RADEON_BUFFER_ALIGN);
+   int        depthSize   = ((((disp->virtualHeight+15) & ~15) * width_bytes
+                              + RADEON_BUFFER_ALIGN)
+                             & ~RADEON_BUFFER_ALIGN);
+   int        l;
+   int        pcie_gart_table_size = 0;
+
+   info->frontOffset = 0;
+   info->frontPitch = disp->virtualWidth;
+
+   if (disp->card_type==RADEON_CARD_PCIE)
+     pcie_gart_table_size  = RADEON_PCIGART_TABLE_SIZE;
+
+   /* Front, back and depth buffers - everything else texture??
+    */
+   info->textureSize = disp->fbSize - pcie_gart_table_size - 2 * bufferSize - depthSize;
+
+   if (info->textureSize < 0)
+      return 0;
+
+   l = RADEONMinBits((info->textureSize-1) / RADEON_NR_TEX_REGIONS);
+   if (l < RADEON_LOG_TEX_GRANULARITY) l = RADEON_LOG_TEX_GRANULARITY;
+
+   /* Truncate the texture size to a whole number of texture
+    * regions: (size >> l) << l clears the low l bits, i.e. it
+    * rounds down to a multiple of 2^l.
+    */
+   info->log2TexGran = l;
+   info->textureSize = (info->textureSize >> l) << l;
+
+   /* Set a minimum usable local texture heap size.  This will fit
+    * two 256x256x32bpp textures.
+    */
+   if (info->textureSize < 512 * 1024) {
+      info->textureOffset = 0;   /* overwritten unconditionally just below */
+      info->textureSize = 0;
+   }
+
+   /* Reserve space for textures */
+   info->textureOffset = ((disp->fbSize - pcie_gart_table_size - info->textureSize +
+                           RADEON_BUFFER_ALIGN) &
+                          ~RADEON_BUFFER_ALIGN);
+
+   /* Reserve space for the shared depth
+    * buffer.
+    */
+   info->depthOffset = ((info->textureOffset - depthSize +
+                         RADEON_BUFFER_ALIGN) &
+                        ~RADEON_BUFFER_ALIGN);
+   info->depthPitch = disp->virtualWidth;
+
+   info->backOffset = ((info->depthOffset - bufferSize +
+                        RADEON_BUFFER_ALIGN) &
+                       ~RADEON_BUFFER_ALIGN);
+   info->backPitch = disp->virtualWidth;
+
+   if (pcie_gart_table_size)
+     info->pcieGartTableOffset = disp->fbSize - pcie_gart_table_size;
+
+   fprintf(stderr,
+           "Will use back buffer at offset 0x%x, pitch %d\n",
+           info->backOffset, info->backPitch);
+   fprintf(stderr,
+           "Will use depth buffer at offset 0x%x, pitch %d\n",
+           info->depthOffset, info->depthPitch);
+   fprintf(stderr,
+           "Will use %d kb for textures at offset 0x%x\n",
+           info->textureSize/1024, info->textureOffset);
+   if (pcie_gart_table_size)
+   { 
+     fprintf(stderr,
+	     "Will use %d kb for PCIE GART Table at offset 0x%x\n",
+	     pcie_gart_table_size/1024, info->pcieGartTableOffset);
+   }
+
+   /* XXX I don't think these are needed. */
+#if 0
+   info->frontPitchOffset = (((info->frontPitch * cpp / 64) << 22) |
+                             (info->frontOffset >> 10));
+
+   info->backPitchOffset = (((info->backPitch * cpp / 64) << 22) |
+                            (info->backOffset >> 10));
+
+   info->depthPitchOffset = (((info->depthPitch * cpp / 64) << 22) |
+                             (info->depthOffset >> 10));
+#endif
+   /* NOTE(review): the RADEONSetParam() result is ignored, so a failed PCIE GART setup still returns 1 - confirm. */
+   if (pcie_gart_table_size)
+     RADEONSetParam(disp, RADEON_SETPARAM_PCIGART_LOCATION, info->pcieGartTableOffset);
+
+   return 1;
+}
+
+
+/**
+ * \brief Hand the CP configuration to the kernel module.
+ *
+ * \param disp display handle.
+ * \param info driver private data.
+ *
+ * \return non-zero on success, or zero on failure.
+ *
+ * Fills a drm_radeon_init_t from \p disp and \p info and submits it with the
+ * DRM_RADEON_CP_INIT command.
+ */
+static int RADEONDRIKernelInit( driDisplay *disp,
+                               RADEONInfoPtr info)
+{
+   const int bytes_per_pixel = disp->bpp / 8;
+   drm_radeon_init_t  init;
+
+   memset(&init, 0, sizeof(init));
+
+   /* Select the CP init variant for the chip generation. */
+   if (info->ChipFamily >= CHIP_FAMILY_R300) {
+      init.func = RADEON_INIT_R300_CP;
+   } else if (info->ChipFamily == CHIP_FAMILY_R200 ||
+              info->ChipFamily == CHIP_FAMILY_RV250 ||
+              info->ChipFamily == CHIP_FAMILY_M9 ||
+              info->ChipFamily == CHIP_FAMILY_RV280) {
+      init.func = RADEON_INIT_R200_CP;
+   } else {
+      init.func = RADEON_INIT_CP;
+   }
+
+   /* XXX (carried over): the front/back/depth offset and pitch fields may
+    * change depending upon which drawing surface is in use, so setting
+    * them once at initialization may not suffice; a new ioctl may be
+    * needed to update them for drawing to other surfaces.
+    */
+   /* Command processor, GART and timing parameters */
+   init.sarea_priv_offset    = sizeof(drm_sarea_t);
+   init.cp_mode              = RADEON_DEFAULT_CP_BM_MODE;
+   init.gart_size            = info->gartSize*1024*1024;
+   init.ring_size            = info->ringSize*1024*1024;
+   init.usec_timeout         = 1000;
+
+   /* Frame buffer layout; each pitch is multiplied by bytes per pixel */
+   init.fb_bpp               = disp->bpp;
+   init.depth_bpp            = disp->bpp;
+   init.front_offset         = info->frontOffset;
+   init.front_pitch          = info->frontPitch * bytes_per_pixel;
+   init.back_offset          = info->backOffset;
+   init.back_pitch           = info->backPitch * bytes_per_pixel;
+   init.depth_offset         = info->depthOffset;
+   init.depth_pitch          = info->depthPitch * bytes_per_pixel;
+
+   /* Map handles for the ring, ring read pointer, buffers and GART textures */
+   init.ring_offset          = info->ringHandle;
+   init.ring_rptr_offset     = info->ringReadPtrHandle;
+   init.buffers_offset       = info->bufHandle;
+   init.gart_textures_offset = info->gartTexHandle;
+
+   return drmCommandWrite(disp->drmFD, DRM_RADEON_CP_INIT, &init, sizeof(init)) >= 0;
+}
+
+
+/**
+ * \brief Register the vertex/indirect buffers with the DRM so that
+ * DRI-based clients can use them.
+ *
+ * \param disp display handle.
+ * \param info driver private data.
+ *
+ * \return one on success, or zero on failure.
+ *
+ * Calls drmAddBufs() for the region starting at RADEONInfoRec::bufStart.
+ */
+static int RADEONDRIBufInit( driDisplay *disp, RADEONInfoPtr info )
+{
+   int buf_kind = DRM_AGP_BUFFER;
+
+   /* Anything that is not AGP gets scatter/gather buffers. */
+   if (disp->card_type != RADEON_CARD_AGP)
+      buf_kind = DRM_SG_BUFFER;
+
+   info->bufNumBufs = drmAddBufs(disp->drmFD,
+                                 info->bufMapSize / RADEON_BUFFER_SIZE,
+                                 RADEON_BUFFER_SIZE, buf_kind, info->bufStart);
+   if (info->bufNumBufs > 0) {
+      fprintf(stderr,
+              "[drm] Added %d %d byte vertex/indirect buffers\n",
+              info->bufNumBufs, RADEON_BUFFER_SIZE);
+      return 1;
+   }
+   fprintf(stderr, "[drm] Could not create vertex/indirect buffers list\n");
+   return 0;
+}
+
+
+/**
+ * \brief Try to install the DRM IRQ handler.
+ *
+ * \param disp display handle.
+ * \param info driver private data (not read by this function).
+ *
+ * Calls drmCtlInstHandler() with IRQ number 0; a failure is only reported on
+ * stderr, so the function has no return value.
+ */
+static void RADEONDRIIrqInit(driDisplay *disp, RADEONInfoPtr info)
+{
+   if (drmCtlInstHandler(disp->drmFD, 0) == 0) return;   /* installed, nothing to report */
+   fprintf(stderr, "[drm] failure adding irq handler, "
+           "there is a device already using that irq\n"
+           "[drm] falling back to irq-free operation\n");
+}
+
+
+/**
+ * \brief Start the kernel memory manager for the GART texture heap.
+ *
+ * \param disp display handle.
+ * \param info driver private data.
+ *
+ * Fills a drm_radeon_mem_init_heap_t and submits it with the
+ * DRM_RADEON_INIT_HEAP command; the outcome is only logged to stderr.
+ */
+static void RADEONDRIAgpHeapInit(driDisplay *disp,
+                                 RADEONInfoPtr info)
+{
+   drm_radeon_mem_init_heap_t heap_req;
+   int failed;
+   /* Start up the simple memory manager for gart space */
+   heap_req.region = RADEON_MEM_REGION_GART;
+   heap_req.start  = 0;
+   heap_req.size   = info->gartTexMapSize;
+
+   failed = drmCommandWrite(disp->drmFD, DRM_RADEON_INIT_HEAP,
+                            &heap_req, sizeof(heap_req));
+   if (!failed) {
+      fprintf(stderr,
+              "[drm] Initialized kernel gart heap manager, %d\n",
+              info->gartTexMapSize);
+      return;
+   }
+   fprintf(stderr, "[drm] Failed to initialized gart heap manager\n");
+}
+
+static int RADEONGetCardType(driDisplay *disp, RADEONInfoPtr info)
+{
+   /* Query the card type; the result arrives through &disp->card_type.  -1 on error. */
+   drm_radeon_getparam_t query;
+   int err;
+
+   query.param = RADEON_PARAM_CARD_TYPE;
+   query.value = &disp->card_type;
+   err = drmCommandWriteRead(disp->drmFD, DRM_RADEON_GETPARAM,
+                             &query, sizeof(query));
+   if (!err)
+      return disp->card_type;
+
+   fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_CARD_TYPE) : %d\n", err);
+   return -1;
+}
+
+/**
+ * Called at the start of each server generation.
+ *
+ * \param disp display handle.
+ * \param info driver private data.
+ * \param pRADEONDRI structure filled in for radeon_dri.so.
+ *
+ * \return non-zero on success, or zero on failure.
+ *
+ * Checks that the frame buffer is large enough, lays out the static buffers
+ * via RADEONMemoryInit(), creates a \e server context and takes the DRM lock
+ * for the initialization ioctls, then runs the kernel, vertex-buffer, IRQ and
+ * GART-heap initialization functions in this file.  Clears the SAREA private
+ * area and the front and back buffers, looks up the register map and fills
+ * in \p pRADEONDRI.  The lock is still held on successful return.
+ * (DRM_RADEON_CP_START is issued by the caller, not here.)
+ */
+static int
+RADEONScreenInit( driDisplay *disp, RADEONInfoPtr info,
+                  RADEONDRIPtr pRADEONDRI)
+{
+   int i, err;
+
+   /* XXX this probably isn't needed here */
+   {
+      int  width_bytes = (disp->virtualWidth * disp->cpp);
+      int  maxy        = disp->fbSize / width_bytes;   /* assumes width_bytes != 0 - TODO confirm */
+      /* Require room for more than three screens' worth of scanlines. */
+      if (maxy <= disp->virtualHeight * 3) {
+         _eglLog(_EGL_WARNING,
+                 "Static buffer allocation failed -- "
+                 "need at least %d kB video memory (have %d kB)\n",
+                 (disp->virtualWidth * disp->virtualHeight *
+                  disp->cpp * 3 + 1023) / 1024,
+                 disp->fbSize / 1024);
+         return 0;
+      }
+   }
+
+   /* Memory manager setup */
+   if (!RADEONMemoryInit(disp, info)) {
+      return 0;
+   }
+
+   /* Create a 'server' context so we can grab the lock for
+    * initialization ioctls.
+    */
+   if ((err = drmCreateContext(disp->drmFD, &disp->serverContext)) != 0) {
+      _eglLog(_EGL_WARNING, "%s: drmCreateContext failed %d\n",
+              __FUNCTION__, err);
+      return 0;
+   }
+   /* NOTE(review): the failure paths below unlock but do not destroy this context - confirm. */
+   DRM_LOCK(disp->drmFD, disp->pSAREA, disp->serverContext, 0);
+
+   /* Initialize the kernel data structures */
+   if (!RADEONDRIKernelInit(disp, info)) {
+      _eglLog(_EGL_WARNING, "RADEONDRIKernelInit failed\n");
+      DRM_UNLOCK(disp->drmFD, disp->pSAREA, disp->serverContext);
+      return 0;
+   }
+
+   /* Initialize the vertex buffers list */
+   if (!RADEONDRIBufInit(disp, info)) {
+      fprintf(stderr, "RADEONDRIBufInit failed\n");
+      DRM_UNLOCK(disp->drmFD, disp->pSAREA, disp->serverContext);
+      return 0;
+   }
+
+   /* Initialize IRQ */
+   RADEONDRIIrqInit(disp, info);
+
+   /* Initialize kernel gart memory manager */
+   RADEONDRIAgpHeapInit(disp, info);
+
+   /* Initialize the SAREA private data structure */
+   {
+      drm_radeon_sarea_t *pSAREAPriv;
+      pSAREAPriv = (drm_radeon_sarea_t *)(((char*)disp->pSAREA) +
+                                        sizeof(drm_sarea_t));
+      memset(pSAREAPriv, 0, sizeof(*pSAREAPriv));
+      pSAREAPriv->pfState = info->page_flip_enable;
+   }
+   /* Scan the DRM maps for the first DRM_REGISTERS entry. */
+   for ( i = 0;; i++ ) {
+      drmMapType type;
+      drmMapFlags flags;
+      drm_handle_t handle, offset;
+      drmSize size;
+      int rc, mtrr;
+
+      if ( ( rc = drmGetMap( disp->drmFD, i, &offset, &size, &type, &flags, &handle, &mtrr ) ) != 0 )
+         break;   /* drmGetMap failed: stop scanning, registerHandle/Size are not written */
+      if ( type == DRM_REGISTERS ) {
+         pRADEONDRI->registerHandle = offset;
+         pRADEONDRI->registerSize = size;
+         break;
+      }
+   }
+   /* Quick hack to clear the front & back buffers.  Could also use
+    * the clear ioctl to do this, but would need to setup hw state
+    * first.
+    */
+   drimemsetio((char *)disp->pFB + info->frontOffset,
+          0xEE,
+          info->frontPitch * disp->cpp * disp->virtualHeight );
+
+   drimemsetio((char *)disp->pFB + info->backOffset,
+          0x30,
+          info->backPitch * disp->cpp * disp->virtualHeight );
+
+
+   /* This is the struct passed to radeon_dri.so for its initialization */
+   pRADEONDRI->deviceID          = info->Chipset;
+   pRADEONDRI->width             = disp->virtualWidth;
+   pRADEONDRI->height            = disp->virtualHeight;
+   pRADEONDRI->depth             = disp->bpp; /* XXX: depth */
+   pRADEONDRI->bpp               = disp->bpp;
+   pRADEONDRI->IsPCI             = (disp->card_type != RADEON_CARD_AGP);;
+   pRADEONDRI->frontOffset       = info->frontOffset;
+   pRADEONDRI->frontPitch        = info->frontPitch;
+   pRADEONDRI->backOffset        = info->backOffset;
+   pRADEONDRI->backPitch         = info->backPitch;
+   pRADEONDRI->depthOffset       = info->depthOffset;
+   pRADEONDRI->depthPitch        = info->depthPitch;
+   pRADEONDRI->textureOffset     = info->textureOffset;
+   pRADEONDRI->textureSize       = info->textureSize;
+   pRADEONDRI->log2TexGran       = info->log2TexGran;
+   pRADEONDRI->statusHandle      = info->ringReadPtrHandle;
+   pRADEONDRI->statusSize        = info->ringReadMapSize;
+   pRADEONDRI->gartTexHandle      = info->gartTexHandle;
+   pRADEONDRI->gartTexMapSize     = info->gartTexMapSize;
+   pRADEONDRI->log2GARTTexGran    = info->log2GARTTexGran;
+   pRADEONDRI->gartTexOffset      = info->gartTexStart;
+   pRADEONDRI->sarea_priv_offset = sizeof(drm_sarea_t);
+
+   /* Don't release the lock now - let the VT switch handler do it. */
+
+   return 1;
+}
+
+
+/**
+ * \brief Get Radeon chip family from chipset number.
+ *
+ * \param info driver private data.
+ *
+ * \return always one: no path in this function reports failure.
+ *
+ * Called by radeonInitFBDev() to set RADEONInfoRec::ChipFamily
+ * according to the value of RADEONInfoRec::Chipset.  A chipset that
+ * matches none of the listed IDs is not rejected; it is classified as
+ * CHIP_FAMILY_RADEON by the default case.
+ */
+static int get_chipfamily_from_chipset( RADEONInfoPtr info )
+{
+    switch (info->Chipset) {
+    case PCI_CHIP_RADEON_LY:
+    case PCI_CHIP_RADEON_LZ:
+        info->ChipFamily = CHIP_FAMILY_M6;
+        break;
+
+    case PCI_CHIP_RADEON_QY:
+    case PCI_CHIP_RADEON_QZ:
+        info->ChipFamily = CHIP_FAMILY_VE;
+        break;
+
+    case PCI_CHIP_R200_QL:
+    case PCI_CHIP_R200_QN:
+    case PCI_CHIP_R200_QO:
+    case PCI_CHIP_R200_Ql:
+    case PCI_CHIP_R200_BB:
+        info->ChipFamily = CHIP_FAMILY_R200;
+        break;
+
+    case PCI_CHIP_RV200_QW: /* RV200 desktop */
+    case PCI_CHIP_RV200_QX:
+        info->ChipFamily = CHIP_FAMILY_RV200;
+        break;
+
+    case PCI_CHIP_RADEON_LW:
+    case PCI_CHIP_RADEON_LX:
+        info->ChipFamily = CHIP_FAMILY_M7;
+        break;
+
+    case PCI_CHIP_RV250_Id:
+    case PCI_CHIP_RV250_Ie:
+    case PCI_CHIP_RV250_If:
+    case PCI_CHIP_RV250_Ig:
+        info->ChipFamily = CHIP_FAMILY_RV250;
+        break;
+
+    case PCI_CHIP_RV250_Ld:
+    case PCI_CHIP_RV250_Le:
+    case PCI_CHIP_RV250_Lf:
+    case PCI_CHIP_RV250_Lg:
+        info->ChipFamily = CHIP_FAMILY_M9;
+        break;
+
+    case PCI_CHIP_RV280_Y_:
+    case PCI_CHIP_RV280_Ya:
+    case PCI_CHIP_RV280_Yb:
+    case PCI_CHIP_RV280_Yc:
+        info->ChipFamily = CHIP_FAMILY_RV280;
+        break;
+
+    case PCI_CHIP_R300_ND:
+    case PCI_CHIP_R300_NE:
+    case PCI_CHIP_R300_NF:
+    case PCI_CHIP_R300_NG:
+        info->ChipFamily = CHIP_FAMILY_R300;
+        break;
+
+    case PCI_CHIP_RV370_5460:   /* NOTE(review): RV370 ID mapped to the RV380 family - confirm */
+        info->ChipFamily = CHIP_FAMILY_RV380;
+	break;
+
+    default:
+        /* Original Radeon/7200 */
+        info->ChipFamily = CHIP_FAMILY_RADEON;
+    }
+
+    return 1;
+}
+
+
+/**
+ * \brief Initialize the framebuffer device mode
+ *
+ * \param disp display handle.
+ * \param pRADEONDRI DRI record handed through to RADEONScreenInit().
+ *
+ * \return one on success, or zero on failure.
+ *
+ * Allocates \p info (kept in disp->driverPrivate), fills it in with default
+ * values and the chipset from \p disp, sets up PCI or AGP according to the
+ * card type, calls RADEONScreenInit() and finally starts the CP.
+ */
+static int radeonInitFBDev( driDisplay *disp, RADEONDRIPtr pRADEONDRI )
+{
+   int err;
+   RADEONInfoPtr info = calloc(1, sizeof(*info));
+
+   if (!info) {   /* calloc() failed: nothing below may touch info */
+      fprintf(stderr, "%s: out of memory\n", __FUNCTION__);
+      return 0;
+   }
+
+   disp->driverPrivate = (void *)info;
+
+   info->gartFastWrite  = RADEON_DEFAULT_AGP_FAST_WRITE;
+   info->gartSize       = RADEON_DEFAULT_AGP_SIZE;
+   info->gartTexSize    = RADEON_DEFAULT_AGP_TEX_SIZE;
+   info->bufSize       = RADEON_DEFAULT_BUFFER_SIZE;
+   info->ringSize      = RADEON_DEFAULT_RING_SIZE;
+   info->page_flip_enable = RADEON_DEFAULT_PAGE_FLIP;
+
+   fprintf(stderr, "Using %d MB AGP aperture\n", info->gartSize);
+   fprintf(stderr, "Using %d MB for the ring buffer\n", info->ringSize);
+   fprintf(stderr, "Using %d MB for vertex/indirect buffers\n", info->bufSize);
+   fprintf(stderr, "Using %d MB for AGP textures\n", info->gartTexSize);
+   fprintf(stderr, "page flipping %sabled\n", info->page_flip_enable?"en":"dis");
+
+   info->Chipset = disp->chipset;
+
+   if (!get_chipfamily_from_chipset( info )) {
+      fprintf(stderr, "Unknown or non-radeon chipset -- cannot continue\n");
+      fprintf(stderr, "==> Verify PCI BusID is correct in miniglx.conf\n");
+      return 0;
+   }
+#if 0
+   if (info->ChipFamily >= CHIP_FAMILY_R300) {
+      fprintf(stderr,
+              "Direct rendering not yet supported on "
+              "Radeon 9700 and newer cards\n");
+      return 0;
+   }
+#endif
+
+#if 00
+   /* don't seem to need this here */
+   info->frontPitch = disp->virtualWidth;
+#endif
+
+   /* Check the radeon DRM version */
+   if (!RADEONCheckDRMVersion(disp, info)) {
+      return 0;
+   }
+
+   if (RADEONGetCardType(disp, info)<0)
+      return 0;
+
+   if (disp->card_type!=RADEON_CARD_AGP) {
+      /* Initialize PCI */
+      if (!RADEONDRIPciInit(disp, info))
+         return 0;
+   }
+   else {
+      /* Initialize AGP */
+      if (!RADEONDRIAgpInit(disp, info))
+         return 0;
+   }
+
+   if (!RADEONScreenInit( disp, info, pRADEONDRI))
+      return 0;
+
+   /* Initialize and start the CP if required */
+   if ((err = drmCommandNone(disp->drmFD, DRM_RADEON_CP_START)) != 0) {
+      fprintf(stderr, "%s: CP start %d\n", __FUNCTION__, err);
+      return 0;
+   }
+
+   return 1;
+}
+
+
+/**
+ * Create list of all supported surface configs, attach list to the display.
+ *
+ * \return EGL_TRUE on success, EGL_FALSE if the temporary config array
+ * cannot be allocated or _eglFillInConfigs() fails.
+ */
+static EGLBoolean
+radeonFillInConfigs(_EGLDisplay *disp, unsigned pixel_bits,
+                    unsigned depth_bits,
+                    unsigned stencil_bits, GLboolean have_back_buffer)
+{
+   _EGLConfig *configs, *c;
+   unsigned int i, num_configs;
+   unsigned int depth_buffer_factor, back_buffer_factor;
+   GLenum fb_format, fb_type;
+
+   /* Right now GLX_SWAP_COPY_OML isn't supported, but it would be easy
+   * enough to add support.  Basically, if a context is created with an
+   * fbconfig where the swap method is GLX_SWAP_COPY_OML, pageflipping
+   * will never be used.
+   */
+   static const GLenum back_buffer_modes[] = {
+      GLX_NONE, GLX_SWAP_UNDEFINED_OML /*, GLX_SWAP_COPY_OML */ };
+
+   u_int8_t depth_bits_array[2], stencil_bits_array[2];
+
+   depth_bits_array[0] = depth_bits;
+   depth_bits_array[1] = depth_bits;
+
+   /* Just like with the accumulation buffer, always provide some modes
+   * with a stencil buffer.  It will be a sw fallback, but some apps won't
+   * care about that.
+   */
+   stencil_bits_array[0] = 0;
+   stencil_bits_array[1] = (stencil_bits == 0) ? 8 : stencil_bits;
+
+   depth_buffer_factor = ((depth_bits != 0) || (stencil_bits != 0)) ? 2 : 1;
+   back_buffer_factor = (have_back_buffer) ? 2 : 1;
+
+   num_configs = depth_buffer_factor * back_buffer_factor * 2;
+
+   if (pixel_bits == 16) {
+      fb_format = GL_RGB;
+      fb_type = GL_UNSIGNED_SHORT_5_6_5;
+   } else {
+      fb_format = GL_RGBA;
+      fb_type = GL_UNSIGNED_INT_8_8_8_8_REV;
+   }
+
+   configs = calloc(num_configs, sizeof(*configs));
+   if (!configs)
+      return EGL_FALSE;
+
+   if (!_eglFillInConfigs(configs, fb_format, fb_type,
+                          depth_bits_array, stencil_bits_array,
+                          depth_buffer_factor,
+                          back_buffer_modes, back_buffer_factor,
+                          GLX_TRUE_COLOR)) {
+      fprintf(stderr, "[%s:%u] Error creating FBConfig!\n",
+               __func__, __LINE__);
+      free(configs);
+      return EGL_FALSE;
+   }
+
+   /* Mark the visual as slow if there are "fake" stencil bits. */
+   for (i = 0, c = configs; i < num_configs; i++, c++) {
+      int stencil = GET_CONFIG_ATTRIB(c, EGL_STENCIL_SIZE);
+      if ((stencil != 0)  && (stencil != stencil_bits)) {
+         SET_CONFIG_ATTRIB(c, EGL_CONFIG_CAVEAT, EGL_SLOW_CONFIG);
+      }
+   }
+
+   for (i = 0, c = configs; i < num_configs; i++, c++)
+      _eglAddConfig(disp, c);
+
+   free(configs);
+
+   return EGL_TRUE;
+}
+
+
+/**
+ * Thin wrapper that forwards to _eglDRIShowScreenSurfaceMESA() unchanged:
+ * show \p surface on \p screen, or disable the output for EGL_NO_SURFACE.
+ */
+static EGLBoolean
+radeonShowScreenSurfaceMESA(_EGLDriver *drv, EGLDisplay dpy, EGLScreenMESA screen,
+                      EGLSurface surface, EGLModeMESA m)
+{
+   return _eglDRIShowScreenSurfaceMESA(drv, dpy, screen,
+                                       surface, m);
+}
+
+
+/**
+ * Called via eglInitialize() by user.
+ */
+static EGLBoolean
+radeonInitialize(_EGLDriver *drv, EGLDisplay dpy, EGLint *major, EGLint *minor)
+{
+   __DRIframebuffer framebuffer;
+   driDisplay *display;
+
+   /* one-time init */
+   radeon_drm_page_size = getpagesize();
+
+   if (!_eglDRIInitialize(drv, dpy, major, minor))
+      return EGL_FALSE;
+
+   display = Lookup_driDisplay(dpy);
+
+   /* XXX we shouldn't hard-code values here! */
+   /* we won't know the screen surface size until the user calls
+    * eglCreateScreenSurfaceMESA().
+    */
+   display->virtualWidth = 1280;
+   display->virtualHeight = 1024;
+   display->bpp = 32;
+   display->cpp = 4;
+
+   if (!_eglDRIGetDisplayInfo(display))
+      return EGL_FALSE;
+
+   /* allocate late so the early returns above cannot leak dev_priv */
+   framebuffer.dev_priv_size = sizeof(RADEONDRIRec);
+   framebuffer.dev_priv = malloc(sizeof(RADEONDRIRec));
+   if (!framebuffer.dev_priv)
+      return EGL_FALSE;
+
+   framebuffer.base = display->pFB;
+   framebuffer.width = display->virtualWidth;
+   framebuffer.height = display->virtualHeight;
+   framebuffer.stride = display->virtualWidth;
+   framebuffer.size = display->fbSize;
+   /* stop on failure instead of passing a possibly unfilled dev_priv on */
+   if (!radeonInitFBDev( display, framebuffer.dev_priv ))
+      return EGL_FALSE;
+
+   if (!_eglDRICreateDisplay(display, &framebuffer))
+      return EGL_FALSE;
+
+   if (!_eglDRICreateScreens(display))
+      return EGL_FALSE;
+
+   /* create a variety of both 32 and 16-bit configurations */
+   radeonFillInConfigs(&display->Base, 32, 24, 8, GL_TRUE);
+   radeonFillInConfigs(&display->Base, 16, 16, 0, GL_TRUE);
+
+   drv->Initialized = EGL_TRUE;
+   return EGL_TRUE;
+}
+
+
+/**
+ * Driver entry point: allocate a zeroed radeonDriver, install the default
+ * DRI dispatch table and override the entries this driver implements.
+ */
+_EGLDriver *
+_eglMain(_EGLDisplay *dpy)
+{
+   radeonDriver *driver = (radeonDriver *) calloc(1, sizeof(radeonDriver));
+   _EGLDriver *base;
+
+   if (driver == NULL)
+      return NULL;
+   base = &driver->Base;
+
+   /* start from the generic DRI fallbacks for every API entry */
+   _eglDRIInitDriverFallbacks(base);
+
+   /* and replace the two entries that are radeon-specific */
+   base->API.Initialize = radeonInitialize;
+   base->API.ShowScreenSurfaceMESA = radeonShowScreenSurfaceMESA;
+
+   return base;
+}
diff --git a/radeon/server/radeon_macros.h b/radeon/server/radeon_macros.h
new file mode 100644
index 0000000..60f0fa2
--- /dev/null
+++ b/radeon/server/radeon_macros.h
@@ -0,0 +1,129 @@
+/**
+ * \file server/radeon_macros.h
+ * \brief Macros for Radeon MMIO operation.
+ *
+ * \authors Kevin E. Martin <martin@xfree86.org>
+ * \authors Rickard E. Faith <faith@valinux.com>
+ * \authors Alan Hourihane <alanh@fairlite.demon.co.uk>
+ */
+
+/*
+ * Copyright 2000 ATI Technologies Inc., Markham, Ontario, and
+ *                VA Linux Systems Inc., Fremont, California.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation on the rights to use, copy, modify, merge,
+ * publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT.  IN NO EVENT SHALL ATI, VA LINUX SYSTEMS AND/OR
+ * THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* $XFree86: xc/programs/Xserver/hw/xfree86/drivers/ati/radeon_reg.h,v 1.20 2002/10/12 01:38:07 martin Exp $ */
+
+#ifndef _RADEON_MACROS_H_
+#define _RADEON_MACROS_H_
+
+#include <mmio.h>
+
+#  define MMIO_IN8(base, offset) \
+	(*(volatile unsigned char *)(((unsigned char*)(base)) + (offset)))
+#  define MMIO_IN32(base, offset) \
+	read_MMIO_LE32(base, offset)
+#  define MMIO_OUT8(base, offset, val) \
+	(*(volatile unsigned char *)(((unsigned char*)(base)) + (offset)) = (val))
+#  define MMIO_OUT32(base, offset, val) \
+	(*(volatile unsigned int *)(void *)(((unsigned char*)(base)) + (offset)) = CPU_TO_LE32(val))
+
+/* Memory mapped register access macros.  All of them expand to a use of
+ * RADEONMMIO, which this header does not define (needed at the use site). */
+#define INREG8(addr)        MMIO_IN8(RADEONMMIO, addr)
+#define INREG(addr)         MMIO_IN32(RADEONMMIO, addr)
+#define OUTREG8(addr, val)  MMIO_OUT8(RADEONMMIO, addr, val)
+#define OUTREG(addr, val)   MMIO_OUT32(RADEONMMIO, addr, val)
+
+#define ADDRREG(addr)       ((volatile GLuint *)(pointer)(RADEONMMIO + (addr)))
+
+
+/* OUTREGP/OUTPLLP are read-modify-write: new = (old & mask) | val. */
+#define OUTREGP(addr, val, mask)					\
+do {									\
+    GLuint outregp_tmp_ = INREG(addr);					\
+    outregp_tmp_ &= (mask);						\
+    outregp_tmp_ |= (val);						\
+    OUTREG(addr, outregp_tmp_);						\
+} while (0)
+
+#define INPLL(dpy, addr) RADEONINPLL(dpy, addr)
+
+#define OUTPLL(addr, val)						\
+do {									\
+    OUTREG8(RADEON_CLOCK_CNTL_INDEX, (((addr) & 0x3f) | RADEON_PLL_WR_EN)); \
+    OUTREG(RADEON_CLOCK_CNTL_DATA, val);				\
+} while (0)
+
+#define OUTPLLP(dpy, addr, val, mask)					\
+do {									\
+    GLuint outpllp_tmp_ = INPLL(dpy, addr);				\
+    outpllp_tmp_ &= (mask);						\
+    outpllp_tmp_ |= (val);						\
+    OUTPLL(addr, outpllp_tmp_);						\
+} while (0)
+
+#define OUTPAL_START(idx)						\
+do {									\
+    OUTREG8(RADEON_PALETTE_INDEX, (idx));				\
+} while (0)
+
+#define OUTPAL_NEXT(r, g, b)						\
+do {									\
+    OUTREG(RADEON_PALETTE_DATA, ((r) << 16) | ((g) << 8) | (b));	\
+} while (0)
+
+#define OUTPAL_NEXT_CARD32(v)						\
+do {									\
+    OUTREG(RADEON_PALETTE_DATA, ((v) & 0x00ffffff));			\
+} while (0)
+
+#define OUTPAL(idx, r, g, b)						\
+do {									\
+    OUTPAL_START((idx));						\
+    OUTPAL_NEXT((r), (g), (b));						\
+} while (0)
+
+#define INPAL_START(idx)						\
+do {									\
+    OUTREG(RADEON_PALETTE_INDEX, (idx) << 16);				\
+} while (0)
+
+#define INPAL_NEXT() INREG(RADEON_PALETTE_DATA)
+
+/* idx == 0 clears RADEON_DAC2_PALETTE_ACC_CTL in DAC_CNTL2, else sets it. */
+#define PAL_SELECT(idx)							\
+do {									\
+    if (!(idx)) {							\
+	OUTREG(RADEON_DAC_CNTL2, INREG(RADEON_DAC_CNTL2) &		\
+	       (GLuint)~RADEON_DAC2_PALETTE_ACC_CTL);			\
+    } else {								\
+	OUTREG(RADEON_DAC_CNTL2, INREG(RADEON_DAC_CNTL2) | RADEON_DAC2_PALETTE_ACC_CTL); \
+    }									\
+} while (0)
+
+
+#endif
diff --git a/radeon/server/radeon_reg.h b/radeon/server/radeon_reg.h
new file mode 100644
index 0000000..4dcce63
--- /dev/null
+++ b/radeon/server/radeon_reg.h
@@ -0,0 +1,2144 @@
+/* $XFree86: xc/programs/Xserver/hw/xfree86/drivers/ati/radeon_reg.h,v 1.30 2003/10/07 22:47:12 martin Exp $ */
+/*
+ * Copyright 2000 ATI Technologies Inc., Markham, Ontario, and
+ *                VA Linux Systems Inc., Fremont, California.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation on the rights to use, copy, modify, merge,
+ * publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT.  IN NO EVENT SHALL ATI, VA LINUX SYSTEMS AND/OR
+ * THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Kevin E. Martin <martin@xfree86.org>
+ *   Rickard E. Faith <faith@valinux.com>
+ *   Alan Hourihane <alanh@fairlite.demon.co.uk>
+ *
+ * References:
+ *
+ * !!!! FIXME !!!!
+ *   RAGE 128 VR/ RAGE 128 GL Register Reference Manual (Technical
+ *   Reference Manual P/N RRG-G04100-C Rev. 0.04), ATI Technologies: April
+ *   1999.
+ *
+ * !!!! FIXME !!!!
+ *   RAGE 128 Software Development Manual (Technical Reference Manual P/N
+ *   SDK-G04000 Rev. 0.01), ATI Technologies: June 1999.
+ *
+ */
+
+/* !!!! FIXME !!!!  NOTE: THIS FILE HAS BEEN CONVERTED FROM r128_reg.h
+ * AND CONTAINS REGISTERS AND REGISTER DEFINITIONS THAT ARE NOT CORRECT
+ * ON THE RADEON.  A FULL AUDIT OF THIS CODE IS NEEDED!  */
+
+#ifndef _RADEON_REG_H_
+#define _RADEON_REG_H_
+
+				/* Registers for 2D/Video/Overlay */
+#define RADEON_ADAPTER_ID                   0x0f2c /* PCI */
+#define RADEON_AGP_BASE                     0x0170
+#define RADEON_AGP_CNTL                     0x0174
+#       define RADEON_AGP_APER_SIZE_256MB   (0x00 << 0)
+#       define RADEON_AGP_APER_SIZE_128MB   (0x20 << 0)
+#       define RADEON_AGP_APER_SIZE_64MB    (0x30 << 0)
+#       define RADEON_AGP_APER_SIZE_32MB    (0x38 << 0)
+#       define RADEON_AGP_APER_SIZE_16MB    (0x3c << 0)
+#       define RADEON_AGP_APER_SIZE_8MB     (0x3e << 0)
+#       define RADEON_AGP_APER_SIZE_4MB     (0x3f << 0)
+#       define RADEON_AGP_APER_SIZE_MASK    (0x3f << 0)
+#define RADEON_AGP_COMMAND                  0x0f60 /* PCI */
+#define RADEON_AGP_COMMAND_PCI_CONFIG       0x0060 /* offset in PCI config*/
+#       define RADEON_AGP_ENABLE            (1<<8)
+#define RADEON_AGP_PLL_CNTL                 0x000b /* PLL */
+#define RADEON_AGP_STATUS                   0x0f5c /* PCI */
+#       define RADEON_AGP_1X_MODE           0x01
+#       define RADEON_AGP_2X_MODE           0x02
+#       define RADEON_AGP_4X_MODE           0x04
+#       define RADEON_AGP_FW_MODE           0x10
+#       define RADEON_AGP_MODE_MASK         0x17
+#define RADEON_ATTRDR                       0x03c1 /* VGA */
+#define RADEON_ATTRDW                       0x03c0 /* VGA */
+#define RADEON_ATTRX                        0x03c0 /* VGA */
+#define RADEON_AUX_SC_CNTL                  0x1660
+#       define RADEON_AUX1_SC_EN            (1 << 0)
+#       define RADEON_AUX1_SC_MODE_OR       (0 << 1)
+#       define RADEON_AUX1_SC_MODE_NAND     (1 << 1)
+#       define RADEON_AUX2_SC_EN            (1 << 2)
+#       define RADEON_AUX2_SC_MODE_OR       (0 << 3)
+#       define RADEON_AUX2_SC_MODE_NAND     (1 << 3)
+#       define RADEON_AUX3_SC_EN            (1 << 4)
+#       define RADEON_AUX3_SC_MODE_OR       (0 << 5)
+#       define RADEON_AUX3_SC_MODE_NAND     (1 << 5)
+#define RADEON_AUX1_SC_BOTTOM               0x1670
+#define RADEON_AUX1_SC_LEFT                 0x1664
+#define RADEON_AUX1_SC_RIGHT                0x1668
+#define RADEON_AUX1_SC_TOP                  0x166c
+#define RADEON_AUX2_SC_BOTTOM               0x1680
+#define RADEON_AUX2_SC_LEFT                 0x1674
+#define RADEON_AUX2_SC_RIGHT                0x1678
+#define RADEON_AUX2_SC_TOP                  0x167c
+#define RADEON_AUX3_SC_BOTTOM               0x1690
+#define RADEON_AUX3_SC_LEFT                 0x1684
+#define RADEON_AUX3_SC_RIGHT                0x1688
+#define RADEON_AUX3_SC_TOP                  0x168c
+#define RADEON_AUX_WINDOW_HORZ_CNTL         0x02d8
+#define RADEON_AUX_WINDOW_VERT_CNTL         0x02dc
+
+#define RADEON_BASE_CODE                    0x0f0b
+#define RADEON_BIOS_0_SCRATCH               0x0010
+#define RADEON_BIOS_1_SCRATCH               0x0014
+#define RADEON_BIOS_2_SCRATCH               0x0018
+#define RADEON_BIOS_3_SCRATCH               0x001c
+#define RADEON_BIOS_4_SCRATCH               0x0020
+#define RADEON_BIOS_5_SCRATCH               0x0024
+#define RADEON_BIOS_6_SCRATCH               0x0028
+#define RADEON_BIOS_7_SCRATCH               0x002c
+#define RADEON_BIOS_ROM                     0x0f30 /* PCI */
+#define RADEON_BIST                         0x0f0f /* PCI */
+#define RADEON_BRUSH_DATA0                  0x1480
+#define RADEON_BRUSH_DATA1                  0x1484
+#define RADEON_BRUSH_DATA10                 0x14a8
+#define RADEON_BRUSH_DATA11                 0x14ac
+#define RADEON_BRUSH_DATA12                 0x14b0
+#define RADEON_BRUSH_DATA13                 0x14b4
+#define RADEON_BRUSH_DATA14                 0x14b8
+#define RADEON_BRUSH_DATA15                 0x14bc
+#define RADEON_BRUSH_DATA16                 0x14c0
+#define RADEON_BRUSH_DATA17                 0x14c4
+#define RADEON_BRUSH_DATA18                 0x14c8
+#define RADEON_BRUSH_DATA19                 0x14cc
+#define RADEON_BRUSH_DATA2                  0x1488
+#define RADEON_BRUSH_DATA20                 0x14d0
+#define RADEON_BRUSH_DATA21                 0x14d4
+#define RADEON_BRUSH_DATA22                 0x14d8
+#define RADEON_BRUSH_DATA23                 0x14dc
+#define RADEON_BRUSH_DATA24                 0x14e0
+#define RADEON_BRUSH_DATA25                 0x14e4
+#define RADEON_BRUSH_DATA26                 0x14e8
+#define RADEON_BRUSH_DATA27                 0x14ec
+#define RADEON_BRUSH_DATA28                 0x14f0
+#define RADEON_BRUSH_DATA29                 0x14f4
+#define RADEON_BRUSH_DATA3                  0x148c
+#define RADEON_BRUSH_DATA30                 0x14f8
+#define RADEON_BRUSH_DATA31                 0x14fc
+#define RADEON_BRUSH_DATA32                 0x1500
+#define RADEON_BRUSH_DATA33                 0x1504
+#define RADEON_BRUSH_DATA34                 0x1508
+#define RADEON_BRUSH_DATA35                 0x150c
+#define RADEON_BRUSH_DATA36                 0x1510
+#define RADEON_BRUSH_DATA37                 0x1514
+#define RADEON_BRUSH_DATA38                 0x1518
+#define RADEON_BRUSH_DATA39                 0x151c
+#define RADEON_BRUSH_DATA4                  0x1490
+#define RADEON_BRUSH_DATA40                 0x1520
+#define RADEON_BRUSH_DATA41                 0x1524
+#define RADEON_BRUSH_DATA42                 0x1528
+#define RADEON_BRUSH_DATA43                 0x152c
+#define RADEON_BRUSH_DATA44                 0x1530
+#define RADEON_BRUSH_DATA45                 0x1534
+#define RADEON_BRUSH_DATA46                 0x1538
+#define RADEON_BRUSH_DATA47                 0x153c
+#define RADEON_BRUSH_DATA48                 0x1540
+#define RADEON_BRUSH_DATA49                 0x1544
+#define RADEON_BRUSH_DATA5                  0x1494
+#define RADEON_BRUSH_DATA50                 0x1548
+#define RADEON_BRUSH_DATA51                 0x154c
+#define RADEON_BRUSH_DATA52                 0x1550
+#define RADEON_BRUSH_DATA53                 0x1554
+#define RADEON_BRUSH_DATA54                 0x1558
+#define RADEON_BRUSH_DATA55                 0x155c
+#define RADEON_BRUSH_DATA56                 0x1560
+#define RADEON_BRUSH_DATA57                 0x1564
+#define RADEON_BRUSH_DATA58                 0x1568
+#define RADEON_BRUSH_DATA59                 0x156c
+#define RADEON_BRUSH_DATA6                  0x1498
+#define RADEON_BRUSH_DATA60                 0x1570
+#define RADEON_BRUSH_DATA61                 0x1574
+#define RADEON_BRUSH_DATA62                 0x1578
+#define RADEON_BRUSH_DATA63                 0x157c
+#define RADEON_BRUSH_DATA7                  0x149c
+#define RADEON_BRUSH_DATA8                  0x14a0
+#define RADEON_BRUSH_DATA9                  0x14a4
+#define RADEON_BRUSH_SCALE                  0x1470
+#define RADEON_BRUSH_Y_X                    0x1474
+#define RADEON_BUS_CNTL                     0x0030
+#       define RADEON_BUS_MASTER_DIS         (1 << 6)
+#       define RADEON_BUS_RD_DISCARD_EN      (1 << 24)
+#       define RADEON_BUS_RD_ABORT_EN        (1 << 25)
+#       define RADEON_BUS_MSTR_DISCONNECT_EN (1 << 28)
+#       define RADEON_BUS_WRT_BURST          (1 << 29)
+#       define RADEON_BUS_READ_BURST         (1 << 30)
+#define RADEON_BUS_CNTL1                    0x0034
+#       define RADEON_BUS_WAIT_ON_LOCK_EN    (1 << 4)
+
+#define RADEON_CACHE_CNTL                   0x1724
+#define RADEON_CACHE_LINE                   0x0f0c /* PCI */
+#define RADEON_CAP0_TRIG_CNTL               0x0950 /* ? */
+#define RADEON_CAP1_TRIG_CNTL               0x09c0 /* ? */
+#define RADEON_CAPABILITIES_ID              0x0f50 /* PCI */
+#define RADEON_CAPABILITIES_PTR             0x0f34 /* PCI */
+#define RADEON_CLK_PIN_CNTL                 0x0001 /* PLL */
+#define RADEON_CLOCK_CNTL_DATA              0x000c
+#define RADEON_CLOCK_CNTL_INDEX             0x0008
+#       define RADEON_PLL_WR_EN             (1 << 7)
+#       define RADEON_PLL_DIV_SEL           (3 << 8)
+#       define RADEON_PLL2_DIV_SEL_MASK     ~(3 << 8)
+#define RADEON_CLR_CMP_CLR_3D               0x1a24
+#define RADEON_CLR_CMP_CLR_DST              0x15c8
+#define RADEON_CLR_CMP_CLR_SRC              0x15c4
+#define RADEON_CLR_CMP_CNTL                 0x15c0
+#       define RADEON_SRC_CMP_EQ_COLOR      (4 <<  0)
+#       define RADEON_SRC_CMP_NEQ_COLOR     (5 <<  0)
+#       define RADEON_CLR_CMP_SRC_SOURCE    (1 << 24)
+#define RADEON_CLR_CMP_MASK                 0x15cc
+#       define RADEON_CLR_CMP_MSK           0xffffffff
+#define RADEON_CLR_CMP_MASK_3D              0x1A28
+#define RADEON_COMMAND                      0x0f04 /* PCI */
+#define RADEON_COMPOSITE_SHADOW_ID          0x1a0c
+#define RADEON_CONFIG_APER_0_BASE           0x0100
+#define RADEON_CONFIG_APER_1_BASE           0x0104
+#define RADEON_CONFIG_APER_SIZE             0x0108
+#define RADEON_CONFIG_BONDS                 0x00e8
+#define RADEON_CONFIG_CNTL                  0x00e0
+#       define RADEON_CFG_ATI_REV_A11       (0   << 16)
+#       define RADEON_CFG_ATI_REV_A12       (1   << 16)
+#       define RADEON_CFG_ATI_REV_A13       (2   << 16)
+#       define RADEON_CFG_ATI_REV_ID_MASK   (0xf << 16)
+#define RADEON_CONFIG_MEMSIZE               0x00f8
+#define RADEON_CONFIG_MEMSIZE_EMBEDDED      0x0114
+#define RADEON_CONFIG_REG_1_BASE            0x010c
+#define RADEON_CONFIG_REG_APER_SIZE         0x0110
+#define RADEON_CONFIG_XSTRAP                0x00e4
+#define RADEON_CONSTANT_COLOR_C             0x1d34
+#       define RADEON_CONSTANT_COLOR_MASK   0x00ffffff
+#       define RADEON_CONSTANT_COLOR_ONE    0x00ffffff
+#       define RADEON_CONSTANT_COLOR_ZERO   0x00000000
+#define RADEON_CRC_CMDFIFO_ADDR             0x0740
+#define RADEON_CRC_CMDFIFO_DOUT             0x0744
+#define RADEON_GRPH_BUFFER_CNTL             0x02f0
+#       define RADEON_GRPH_START_REQ_MASK          (0x7f)
+#       define RADEON_GRPH_START_REQ_SHIFT         0
+#       define RADEON_GRPH_STOP_REQ_MASK           (0x7f<<8)
+#       define RADEON_GRPH_STOP_REQ_SHIFT          8
+#       define RADEON_GRPH_CRITICAL_POINT_MASK     (0x7f<<16)
+#       define RADEON_GRPH_CRITICAL_POINT_SHIFT    16
+#       define RADEON_GRPH_CRITICAL_CNTL           (1<<28)
+#       define RADEON_GRPH_BUFFER_SIZE             (1<<29)
+#       define RADEON_GRPH_CRITICAL_AT_SOF         (1<<30)
+#       define RADEON_GRPH_STOP_CNTL               (1U<<31) /* 1<<31 overflows int */
+#define RADEON_GRPH2_BUFFER_CNTL            0x03f0
+#       define RADEON_GRPH2_START_REQ_MASK         (0x7f)
+#       define RADEON_GRPH2_START_REQ_SHIFT         0
+#       define RADEON_GRPH2_STOP_REQ_MASK          (0x7f<<8)
+#       define RADEON_GRPH2_STOP_REQ_SHIFT         8
+#       define RADEON_GRPH2_CRITICAL_POINT_MASK    (0x7f<<16)
+#       define RADEON_GRPH2_CRITICAL_POINT_SHIFT   16
+#       define RADEON_GRPH2_CRITICAL_CNTL          (1<<28)
+#       define RADEON_GRPH2_BUFFER_SIZE            (1<<29)
+#       define RADEON_GRPH2_CRITICAL_AT_SOF        (1<<30)
+#       define RADEON_GRPH2_STOP_CNTL              (1U<<31) /* 1<<31 overflows int */
+#define RADEON_CRTC_CRNT_FRAME              0x0214
+#define RADEON_CRTC_EXT_CNTL                0x0054
+#       define RADEON_CRTC_VGA_XOVERSCAN    (1 <<  0)
+#       define RADEON_VGA_ATI_LINEAR        (1 <<  3)
+#       define RADEON_XCRT_CNT_EN           (1 <<  6)
+#       define RADEON_CRTC_HSYNC_DIS        (1 <<  8)
+#       define RADEON_CRTC_VSYNC_DIS        (1 <<  9)
+#       define RADEON_CRTC_DISPLAY_DIS      (1 << 10)
+#       define RADEON_CRTC_SYNC_TRISTAT     (1 << 11)
+#       define RADEON_CRTC_CRT_ON           (1 << 15)
+#define RADEON_CRTC_EXT_CNTL_DPMS_BYTE      0x0055
+#       define RADEON_CRTC_HSYNC_DIS_BYTE   (1 <<  0)
+#       define RADEON_CRTC_VSYNC_DIS_BYTE   (1 <<  1)
+#       define RADEON_CRTC_DISPLAY_DIS_BYTE (1 <<  2)
+#define RADEON_CRTC_GEN_CNTL                0x0050
+#       define RADEON_CRTC_DBL_SCAN_EN      (1 <<  0)
+#       define RADEON_CRTC_INTERLACE_EN     (1 <<  1)
+#       define RADEON_CRTC_CSYNC_EN         (1 <<  4)
+#       define RADEON_CRTC_CUR_EN           (1 << 16)
+#       define RADEON_CRTC_CUR_MODE_MASK    (7 << 17)
+#       define RADEON_CRTC_ICON_EN          (1 << 20)
+#       define RADEON_CRTC_EXT_DISP_EN      (1 << 24)
+#       define RADEON_CRTC_EN               (1 << 25)
+#       define RADEON_CRTC_DISP_REQ_EN_B    (1 << 26)
+#define RADEON_CRTC2_GEN_CNTL               0x03f8
+#       define RADEON_CRTC2_DBL_SCAN_EN     (1 <<  0)
+#       define RADEON_CRTC2_INTERLACE_EN    (1 <<  1)
+#       define RADEON_CRTC2_SYNC_TRISTAT    (1 <<  4)
+#       define RADEON_CRTC2_HSYNC_TRISTAT   (1 <<  5)
+#       define RADEON_CRTC2_VSYNC_TRISTAT   (1 <<  6)
+#       define RADEON_CRTC2_CRT2_ON         (1 <<  7)
+#       define RADEON_CRTC2_ICON_EN         (1 << 15)
+#       define RADEON_CRTC2_CUR_EN          (1 << 16)
+#       define RADEON_CRTC2_CUR_MODE_MASK   (7 << 20)
+#       define RADEON_CRTC2_DISP_DIS        (1 << 23)
+#       define RADEON_CRTC2_EN              (1 << 25)
+#       define RADEON_CRTC2_DISP_REQ_EN_B   (1 << 26)
+#       define RADEON_CRTC2_CSYNC_EN        (1 << 27)
+#       define RADEON_CRTC2_HSYNC_DIS       (1 << 28)
+#       define RADEON_CRTC2_VSYNC_DIS       (1 << 29)
+#define RADEON_CRTC_MORE_CNTL               0x27c
+#       define RADEON_CRTC_H_CUTOFF_ACTIVE_EN (1<<4)   
+#       define RADEON_CRTC_V_CUTOFF_ACTIVE_EN (1<<5)   
+#define RADEON_CRTC_GUI_TRIG_VLINE          0x0218
+#define RADEON_CRTC_H_SYNC_STRT_WID         0x0204
+#       define RADEON_CRTC_H_SYNC_STRT_PIX        (0x07  <<  0)
+#       define RADEON_CRTC_H_SYNC_STRT_CHAR       (0x3ff <<  3)
+#       define RADEON_CRTC_H_SYNC_STRT_CHAR_SHIFT 3
+#       define RADEON_CRTC_H_SYNC_WID             (0x3f  << 16)
+#       define RADEON_CRTC_H_SYNC_WID_SHIFT       16
+#       define RADEON_CRTC_H_SYNC_POL             (1     << 23)
+#define RADEON_CRTC2_H_SYNC_STRT_WID        0x0304
+#       define RADEON_CRTC2_H_SYNC_STRT_PIX        (0x07  <<  0)
+#       define RADEON_CRTC2_H_SYNC_STRT_CHAR       (0x3ff <<  3)
+#       define RADEON_CRTC2_H_SYNC_STRT_CHAR_SHIFT 3
+#       define RADEON_CRTC2_H_SYNC_WID             (0x3f  << 16)
+#       define RADEON_CRTC2_H_SYNC_WID_SHIFT       16
+#       define RADEON_CRTC2_H_SYNC_POL             (1     << 23)
+#define RADEON_CRTC_H_TOTAL_DISP            0x0200
+#       define RADEON_CRTC_H_TOTAL          (0x03ff << 0)
+#       define RADEON_CRTC_H_TOTAL_SHIFT    0
+#       define RADEON_CRTC_H_DISP           (0x01ff << 16)
+#       define RADEON_CRTC_H_DISP_SHIFT     16
+#define RADEON_CRTC2_H_TOTAL_DISP           0x0300
+#       define RADEON_CRTC2_H_TOTAL         (0x03ff << 0)
+#       define RADEON_CRTC2_H_TOTAL_SHIFT   0
+#       define RADEON_CRTC2_H_DISP          (0x01ff << 16)
+#       define RADEON_CRTC2_H_DISP_SHIFT    16
+#define RADEON_CRTC_OFFSET                  0x0224
+#define RADEON_CRTC2_OFFSET                 0x0324
+#define RADEON_CRTC_OFFSET_CNTL             0x0228
+#       define RADEON_CRTC_TILE_EN          (1 << 15)
+#define RADEON_CRTC2_OFFSET_CNTL            0x0328
+#       define RADEON_CRTC2_TILE_EN         (1 << 15)
+#define RADEON_CRTC_PITCH                   0x022c
+#define RADEON_CRTC2_PITCH                  0x032c
+#define RADEON_CRTC_STATUS                  0x005c
+#       define RADEON_CRTC_VBLANK_SAVE      (1 <<  1)
+#       define RADEON_CRTC_VBLANK_SAVE_CLEAR  (1 <<  1)
+#define RADEON_CRTC2_STATUS                  0x03fc
+#       define RADEON_CRTC2_VBLANK_SAVE      (1 <<  1)
+#       define RADEON_CRTC2_VBLANK_SAVE_CLEAR  (1 <<  1)
+#define RADEON_CRTC_V_SYNC_STRT_WID         0x020c
+#       define RADEON_CRTC_V_SYNC_STRT        (0x7ff <<  0)
+#       define RADEON_CRTC_V_SYNC_STRT_SHIFT  0
+#       define RADEON_CRTC_V_SYNC_WID         (0x1f  << 16)
+#       define RADEON_CRTC_V_SYNC_WID_SHIFT   16
+#       define RADEON_CRTC_V_SYNC_POL         (1     << 23)
+#define RADEON_CRTC2_V_SYNC_STRT_WID        0x030c
+#       define RADEON_CRTC2_V_SYNC_STRT       (0x7ff <<  0)
+#       define RADEON_CRTC2_V_SYNC_STRT_SHIFT 0
+#       define RADEON_CRTC2_V_SYNC_WID        (0x1f  << 16)
+#       define RADEON_CRTC2_V_SYNC_WID_SHIFT  16
+#       define RADEON_CRTC2_V_SYNC_POL        (1     << 23)
+#define RADEON_CRTC_V_TOTAL_DISP            0x0208
+#       define RADEON_CRTC_V_TOTAL          (0x07ff << 0)
+#       define RADEON_CRTC_V_TOTAL_SHIFT    0
+#       define RADEON_CRTC_V_DISP           (0x07ff << 16)
+#       define RADEON_CRTC_V_DISP_SHIFT     16
+#define RADEON_CRTC2_V_TOTAL_DISP           0x0308
+#       define RADEON_CRTC2_V_TOTAL         (0x07ff << 0)
+#       define RADEON_CRTC2_V_TOTAL_SHIFT   0
+#       define RADEON_CRTC2_V_DISP          (0x07ff << 16)
+#       define RADEON_CRTC2_V_DISP_SHIFT    16
+#define RADEON_CRTC_VLINE_CRNT_VLINE        0x0210
+#       define RADEON_CRTC_CRNT_VLINE_MASK  (0x7ff << 16)
+#define RADEON_CRTC2_CRNT_FRAME             0x0314
+#define RADEON_CRTC2_GUI_TRIG_VLINE         0x0318
+/* RADEON_CRTC2_STATUS (0x03fc) is already defined above, with its bits */
+#define RADEON_CRTC2_VLINE_CRNT_VLINE       0x0310
+#define RADEON_CRTC8_DATA                   0x03d5 /* VGA, 0x3b5 */
+#define RADEON_CRTC8_IDX                    0x03d4 /* VGA, 0x3b4 */
+#define RADEON_CUR_CLR0                     0x026c
+#define RADEON_CUR_CLR1                     0x0270
+#define RADEON_CUR_HORZ_VERT_OFF            0x0268
+#define RADEON_CUR_HORZ_VERT_POSN           0x0264
+#define RADEON_CUR_OFFSET                   0x0260
+#       define RADEON_CUR_LOCK              (1U << 31) /* 1 << 31 overflows int */
+#define RADEON_CUR2_CLR0                    0x036c
+#define RADEON_CUR2_CLR1                    0x0370
+#define RADEON_CUR2_HORZ_VERT_OFF           0x0368
+#define RADEON_CUR2_HORZ_VERT_POSN          0x0364
+#define RADEON_CUR2_OFFSET                  0x0360
+#       define RADEON_CUR2_LOCK             (1U << 31) /* 1 << 31 overflows int */
+
+#define RADEON_DAC_CNTL                     0x0058
+#       define RADEON_DAC_RANGE_CNTL        (3 <<  0)
+#       define RADEON_DAC_RANGE_CNTL_MASK   0x03
+#       define RADEON_DAC_BLANKING          (1 <<  2)
+#       define RADEON_DAC_CMP_EN            (1 <<  3)
+#       define RADEON_DAC_CMP_OUTPUT        (1 <<  7)
+#       define RADEON_DAC_8BIT_EN           (1 <<  8)
+#       define RADEON_DAC_VGA_ADR_EN        (1 << 13)
+#       define RADEON_DAC_PDWN              (1 << 15)
+#       define RADEON_DAC_MASK_ALL          (0xffU << 24) /* 0xff << 24 overflows int */
+#define RADEON_DAC_CNTL2                    0x007c
+#       define RADEON_DAC2_DAC_CLK_SEL      (1 <<  0)
+#       define RADEON_DAC2_DAC2_CLK_SEL     (1 <<  1)
+#       define RADEON_DAC2_PALETTE_ACC_CTL  (1 <<  5)
+#define RADEON_DAC_EXT_CNTL                 0x0280
+#       define RADEON_DAC_FORCE_BLANK_OFF_EN (1 << 4)
+#       define RADEON_DAC_FORCE_DATA_EN      (1 << 5)
+#       define RADEON_DAC_FORCE_DATA_SEL_MASK (3 << 6)
+#       define RADEON_DAC_FORCE_DATA_MASK   0x0003ff00
+#       define RADEON_DAC_FORCE_DATA_SHIFT  8
+#define RADEON_TV_DAC_CNTL                  0x088c
+#       define RADEON_TV_DAC_STD_MASK       0x0300
+#       define RADEON_TV_DAC_RDACPD         (1 <<  24)
+#       define RADEON_TV_DAC_GDACPD         (1 <<  25)
+#       define RADEON_TV_DAC_BDACPD         (1 <<  26)
+#define RADEON_DISP_HW_DEBUG                0x0d14
+#       define RADEON_CRT2_DISP1_SEL        (1 <<  5)
+#define RADEON_DISP_OUTPUT_CNTL             0x0d64
+#       define RADEON_DISP_DAC_SOURCE_MASK  0x03
+#       define RADEON_DISP_DAC2_SOURCE_MASK  0x0c
+#       define RADEON_DISP_DAC_SOURCE_CRTC2 0x01
+#       define RADEON_DISP_DAC2_SOURCE_CRTC2 0x04
+#define RADEON_DAC_CRC_SIG                  0x02cc
+#define RADEON_DAC_DATA                     0x03c9 /* VGA */
+#define RADEON_DAC_MASK                     0x03c6 /* VGA */
+#define RADEON_DAC_R_INDEX                  0x03c7 /* VGA */
+#define RADEON_DAC_W_INDEX                  0x03c8 /* VGA */
+#define RADEON_DDA_CONFIG                   0x02e0
+#define RADEON_DDA_ON_OFF                   0x02e4
+#define RADEON_DEFAULT_OFFSET               0x16e0
+#define RADEON_DEFAULT_PITCH                0x16e4
+#define RADEON_DEFAULT_SC_BOTTOM_RIGHT      0x16e8
+#       define RADEON_DEFAULT_SC_RIGHT_MAX  (0x1fff <<  0)
+#       define RADEON_DEFAULT_SC_BOTTOM_MAX (0x1fff << 16)
+#define RADEON_DESTINATION_3D_CLR_CMP_VAL   0x1820
+#define RADEON_DESTINATION_3D_CLR_CMP_MSK   0x1824
+#define RADEON_DEVICE_ID                    0x0f02 /* PCI */
+#define RADEON_DISP_MISC_CNTL               0x0d00
+#       define RADEON_SOFT_RESET_GRPH_PP    (1 << 0)
+#define RADEON_DISP_MERGE_CNTL              0x0d60 /* register offset; bit fields follow */
+#       define RADEON_DISP_ALPHA_MODE_MASK  0x03 /* bits 0-1; the ALPHA_MODE_* values below fit in this mask */
+#       define RADEON_DISP_ALPHA_MODE_KEY   0
+#       define RADEON_DISP_ALPHA_MODE_PER_PIXEL 1
+#       define RADEON_DISP_ALPHA_MODE_GLOBAL 2
+#       define RADEON_DISP_RGB_OFFSET_EN    (1 << 8)
+#       define RADEON_DISP_GRPH_ALPHA_MASK  (0xff << 16) /* bits 16-23 */
+#       define RADEON_DISP_OV0_ALPHA_MASK   (0xffU << 24) /* bits 24-31; unsigned so the shift cannot overflow int */
+#       define RADEON_DISP_LIN_TRANS_BYPASS (0x01 << 9)
+#define RADEON_DISP2_MERGE_CNTL	            0x0d68
+#       define RADEON_DISP2_RGB_OFFSET_EN   (1<<8)
+#define RADEON_DISP_LIN_TRANS_GRPH_A        0x0d80
+#define RADEON_DISP_LIN_TRANS_GRPH_B        0x0d84
+#define RADEON_DISP_LIN_TRANS_GRPH_C        0x0d88
+#define RADEON_DISP_LIN_TRANS_GRPH_D        0x0d8c
+#define RADEON_DISP_LIN_TRANS_GRPH_E        0x0d90
+#define RADEON_DISP_LIN_TRANS_GRPH_F        0x0d98
+#define RADEON_DP_BRUSH_BKGD_CLR            0x1478
+#define RADEON_DP_BRUSH_FRGD_CLR            0x147c
+#define RADEON_DP_CNTL                      0x16c0
+#       define RADEON_DST_X_LEFT_TO_RIGHT   (1 <<  0)
+#       define RADEON_DST_Y_TOP_TO_BOTTOM   (1 <<  1)
+#define RADEON_DP_CNTL_XDIR_YDIR_YMAJOR     0x16d0
+#       define RADEON_DST_Y_MAJOR             (1 <<  2)
+#       define RADEON_DST_Y_DIR_TOP_TO_BOTTOM (1 << 15)
+#       define RADEON_DST_X_DIR_LEFT_TO_RIGHT (1U << 31) /* bit 31; unsigned literal, since 1 << 31 overflows a 32-bit int */
+#define RADEON_DP_DATATYPE                  0x16c4
+#       define RADEON_HOST_BIG_ENDIAN_EN    (1 << 29)
+#define RADEON_DP_GUI_MASTER_CNTL           0x146c /* register offset; bit fields follow */
+#       define RADEON_GMC_SRC_PITCH_OFFSET_CNTL   (1    <<  0)
+#       define RADEON_GMC_DST_PITCH_OFFSET_CNTL   (1    <<  1)
+#       define RADEON_GMC_SRC_CLIPPING            (1    <<  2)
+#       define RADEON_GMC_DST_CLIPPING            (1    <<  3)
+#       define RADEON_GMC_BRUSH_DATATYPE_MASK     (0x0f <<  4) /* bits 4-7; the GMC_BRUSH_* values below fit in this mask */
+#       define RADEON_GMC_BRUSH_8X8_MONO_FG_BG    (0    <<  4)
+#       define RADEON_GMC_BRUSH_8X8_MONO_FG_LA    (1    <<  4)
+#       define RADEON_GMC_BRUSH_1X8_MONO_FG_BG    (4    <<  4)
+#       define RADEON_GMC_BRUSH_1X8_MONO_FG_LA    (5    <<  4)
+#       define RADEON_GMC_BRUSH_32x1_MONO_FG_BG   (6    <<  4)
+#       define RADEON_GMC_BRUSH_32x1_MONO_FG_LA   (7    <<  4)
+#       define RADEON_GMC_BRUSH_32x32_MONO_FG_BG  (8    <<  4)
+#       define RADEON_GMC_BRUSH_32x32_MONO_FG_LA  (9    <<  4)
+#       define RADEON_GMC_BRUSH_8x8_COLOR         (10   <<  4)
+#       define RADEON_GMC_BRUSH_1X8_COLOR         (12   <<  4)
+#       define RADEON_GMC_BRUSH_SOLID_COLOR       (13   <<  4)
+#       define RADEON_GMC_BRUSH_NONE              (15   <<  4)
+#       define RADEON_GMC_DST_8BPP_CI             (2    <<  8)
+#       define RADEON_GMC_DST_15BPP               (3    <<  8)
+#       define RADEON_GMC_DST_16BPP               (4    <<  8)
+#       define RADEON_GMC_DST_24BPP               (5    <<  8)
+#       define RADEON_GMC_DST_32BPP               (6    <<  8)
+#       define RADEON_GMC_DST_8BPP_RGB            (7    <<  8)
+#       define RADEON_GMC_DST_Y8                  (8    <<  8)
+#       define RADEON_GMC_DST_RGB8                (9    <<  8)
+#       define RADEON_GMC_DST_VYUY                (11   <<  8)
+#       define RADEON_GMC_DST_YVYU                (12   <<  8)
+#       define RADEON_GMC_DST_AYUV444             (14   <<  8)
+#       define RADEON_GMC_DST_ARGB4444            (15   <<  8)
+#       define RADEON_GMC_DST_DATATYPE_MASK       (0x0f <<  8) /* bits 8-11; covers the GMC_DST_* values above */
+#       define RADEON_GMC_DST_DATATYPE_SHIFT      8
+#       define RADEON_GMC_SRC_DATATYPE_MASK       (3    << 12) /* bits 12-13 */
+#       define RADEON_GMC_SRC_DATATYPE_MONO_FG_BG (0    << 12)
+#       define RADEON_GMC_SRC_DATATYPE_MONO_FG_LA (1    << 12)
+#       define RADEON_GMC_SRC_DATATYPE_COLOR      (3    << 12)
+#       define RADEON_GMC_BYTE_PIX_ORDER          (1    << 14) /* same value as RADEON_GMC_BYTE_LSB_TO_MSB */
+#       define RADEON_GMC_BYTE_MSB_TO_LSB         (0    << 14)
+#       define RADEON_GMC_BYTE_LSB_TO_MSB         (1    << 14)
+#       define RADEON_GMC_CONVERSION_TEMP         (1    << 15) /* same value as RADEON_GMC_CONVERSION_TEMP_9300 */
+#       define RADEON_GMC_CONVERSION_TEMP_6500    (0    << 15)
+#       define RADEON_GMC_CONVERSION_TEMP_9300    (1    << 15)
+#       define RADEON_GMC_ROP3_MASK               (0xff << 16) /* bits 16-23; the RADEON_ROP3_* values below are already positioned here */
+#       define RADEON_DP_SRC_SOURCE_MASK          (7    << 24) /* bits 24-26 */
+#       define RADEON_DP_SRC_SOURCE_MEMORY        (2    << 24)
+#       define RADEON_DP_SRC_SOURCE_HOST_DATA     (3    << 24)
+#       define RADEON_GMC_3D_FCN_EN               (1    << 27)
+#       define RADEON_GMC_CLR_CMP_CNTL_DIS        (1    << 28)
+#       define RADEON_GMC_AUX_CLIP_DIS            (1    << 29)
+#       define RADEON_GMC_WR_MSK_DIS              (1    << 30)
+#       define RADEON_GMC_LD_BRUSH_Y_X            (1U   << 31) /* bit 31; unsigned literal, since 1 << 31 overflows a 32-bit int */
+#       define RADEON_ROP3_ZERO             0x00000000
+#       define RADEON_ROP3_DSa              0x00880000
+#       define RADEON_ROP3_SDna             0x00440000
+#       define RADEON_ROP3_S                0x00cc0000
+#       define RADEON_ROP3_DSna             0x00220000
+#       define RADEON_ROP3_D                0x00aa0000
+#       define RADEON_ROP3_DSx              0x00660000
+#       define RADEON_ROP3_DSo              0x00ee0000
+#       define RADEON_ROP3_DSon             0x00110000
+#       define RADEON_ROP3_DSxn             0x00990000
+#       define RADEON_ROP3_Dn               0x00550000
+#       define RADEON_ROP3_SDno             0x00dd0000
+#       define RADEON_ROP3_Sn               0x00330000
+#       define RADEON_ROP3_DSno             0x00bb0000
+#       define RADEON_ROP3_DSan             0x00770000
+#       define RADEON_ROP3_ONE              0x00ff0000
+#       define RADEON_ROP3_DPa              0x00a00000
+#       define RADEON_ROP3_PDna             0x00500000
+#       define RADEON_ROP3_P                0x00f00000
+#       define RADEON_ROP3_DPna             0x000a0000
+#       define RADEON_ROP3_D                0x00aa0000 /* repeats the RADEON_ROP3_D definition earlier in this list, same value */
+#       define RADEON_ROP3_DPx              0x005a0000
+#       define RADEON_ROP3_DPo              0x00fa0000
+#       define RADEON_ROP3_DPon             0x00050000
+#       define RADEON_ROP3_PDxn             0x00a50000
+#       define RADEON_ROP3_PDno             0x00f50000
+#       define RADEON_ROP3_Pn               0x000f0000
+#       define RADEON_ROP3_DPno             0x00af0000
+#       define RADEON_ROP3_DPan             0x005f0000
+#define RADEON_DP_GUI_MASTER_CNTL_C         0x1c84
+#define RADEON_DP_MIX                       0x16c8
+#define RADEON_DP_SRC_BKGD_CLR              0x15dc
+#define RADEON_DP_SRC_FRGD_CLR              0x15d8
+#define RADEON_DP_WRITE_MASK                0x16cc
+#define RADEON_DST_BRES_DEC                 0x1630
+#define RADEON_DST_BRES_ERR                 0x1628
+#define RADEON_DST_BRES_INC                 0x162c
+#define RADEON_DST_BRES_LNTH                0x1634
+#define RADEON_DST_BRES_LNTH_SUB            0x1638
+#define RADEON_DST_HEIGHT                   0x1410
+#define RADEON_DST_HEIGHT_WIDTH             0x143c
+#define RADEON_DST_HEIGHT_WIDTH_8           0x158c
+#define RADEON_DST_HEIGHT_WIDTH_BW          0x15b4
+#define RADEON_DST_HEIGHT_Y                 0x15a0
+#define RADEON_DST_LINE_START               0x1600
+#define RADEON_DST_LINE_END                 0x1604
+#define RADEON_DST_LINE_PATCOUNT            0x1608
+#       define RADEON_BRES_CNTL_SHIFT       8
+#define RADEON_DST_OFFSET                   0x1404
+#define RADEON_DST_PITCH                    0x1408
+#define RADEON_DST_PITCH_OFFSET             0x142c
+#define RADEON_DST_PITCH_OFFSET_C           0x1c80
+#       define RADEON_PITCH_SHIFT           21
+#       define RADEON_DST_TILE_LINEAR       (0U << 30) /* two-bit value at bits 30-31; unsigned literals so 2 << 30 and 3 << 30 cannot overflow int */
+#       define RADEON_DST_TILE_MACRO        (1U << 30)
+#       define RADEON_DST_TILE_MICRO        (2U << 30)
+#       define RADEON_DST_TILE_BOTH         (3U << 30) /* equals RADEON_DST_TILE_MACRO | RADEON_DST_TILE_MICRO */
+#define RADEON_DST_WIDTH                    0x140c
+#define RADEON_DST_WIDTH_HEIGHT             0x1598
+#define RADEON_DST_WIDTH_X                  0x1588
+#define RADEON_DST_WIDTH_X_INCY             0x159c
+#define RADEON_DST_X                        0x141c
+#define RADEON_DST_X_SUB                    0x15a4
+#define RADEON_DST_X_Y                      0x1594
+#define RADEON_DST_Y                        0x1420
+#define RADEON_DST_Y_SUB                    0x15a8
+#define RADEON_DST_Y_X                      0x1438
+
+#define RADEON_FCP_CNTL                     0x0910
+#      define RADEON_FCP0_SRC_PCICLK             0
+#      define RADEON_FCP0_SRC_PCLK               1
+#      define RADEON_FCP0_SRC_PCLKb              2
+#      define RADEON_FCP0_SRC_HREF               3
+#      define RADEON_FCP0_SRC_GND                4
+#      define RADEON_FCP0_SRC_HREFb              5
+#define RADEON_FLUSH_1                      0x1704
+#define RADEON_FLUSH_2                      0x1708
+#define RADEON_FLUSH_3                      0x170c
+#define RADEON_FLUSH_4                      0x1710
+#define RADEON_FLUSH_5                      0x1714
+#define RADEON_FLUSH_6                      0x1718
+#define RADEON_FLUSH_7                      0x171c
+#define RADEON_FOG_3D_TABLE_START           0x1810
+#define RADEON_FOG_3D_TABLE_END             0x1814
+#define RADEON_FOG_3D_TABLE_DENSITY         0x181c
+#define RADEON_FOG_TABLE_INDEX              0x1a14
+#define RADEON_FOG_TABLE_DATA               0x1a18
+#define RADEON_FP_CRTC_H_TOTAL_DISP         0x0250
+#define RADEON_FP_CRTC_V_TOTAL_DISP         0x0254
+#define RADEON_FP_CRTC2_H_TOTAL_DISP        0x0350
+#define RADEON_FP_CRTC2_V_TOTAL_DISP        0x0354
+#       define RADEON_FP_CRTC_H_TOTAL_MASK      0x000003ff
+#       define RADEON_FP_CRTC_H_DISP_MASK       0x01ff0000
+#       define RADEON_FP_CRTC_V_TOTAL_MASK      0x00000fff
+#       define RADEON_FP_CRTC_V_DISP_MASK       0x0fff0000
+#       define RADEON_FP_H_SYNC_STRT_CHAR_MASK  0x00001ff8
+#       define RADEON_FP_H_SYNC_WID_MASK        0x003f0000
+#       define RADEON_FP_V_SYNC_STRT_MASK       0x00000fff
+#       define RADEON_FP_V_SYNC_WID_MASK        0x001f0000
+#       define RADEON_FP_CRTC_H_TOTAL_SHIFT     0x00000000
+#       define RADEON_FP_CRTC_H_DISP_SHIFT      0x00000010
+#       define RADEON_FP_CRTC_V_TOTAL_SHIFT     0x00000000
+#       define RADEON_FP_CRTC_V_DISP_SHIFT      0x00000010
+#       define RADEON_FP_H_SYNC_STRT_CHAR_SHIFT 0x00000003
+#       define RADEON_FP_H_SYNC_WID_SHIFT       0x00000010
+#       define RADEON_FP_V_SYNC_STRT_SHIFT      0x00000000
+#       define RADEON_FP_V_SYNC_WID_SHIFT       0x00000010
+#define RADEON_FP_GEN_CNTL                  0x0284
+#       define RADEON_FP_FPON                  (1 <<  0)
+#       define RADEON_FP_TMDS_EN               (1 <<  2)
+#       define RADEON_FP_PANEL_FORMAT          (1 <<  3)
+#       define RADEON_FP_EN_TMDS               (1 <<  7)
+#       define RADEON_FP_DETECT_SENSE          (1 <<  8)
+#       define RADEON_FP_SEL_CRTC2             (1 << 13)
+#       define RADEON_FP_CRTC_DONT_SHADOW_HPAR (1 << 15)
+#       define RADEON_FP_CRTC_DONT_SHADOW_VPAR (1 << 16)
+#       define RADEON_FP_CRTC_DONT_SHADOW_HEND (1 << 17)
+#       define RADEON_FP_CRTC_USE_SHADOW_VEND  (1 << 18)
+#       define RADEON_FP_RMX_HVSYNC_CONTROL_EN (1 << 20)
+#       define RADEON_FP_DFP_SYNC_SEL          (1 << 21)
+#       define RADEON_FP_CRTC_LOCK_8DOT        (1 << 22)
+#       define RADEON_FP_CRT_SYNC_SEL          (1 << 23)
+#       define RADEON_FP_USE_SHADOW_EN         (1 << 24)
+#       define RADEON_FP_CRT_SYNC_ALT          (1 << 26)
+#define RADEON_FP2_GEN_CNTL                 0x0288
+#       define RADEON_FP2_BLANK_EN             (1 <<  1)
+#       define RADEON_FP2_ON                   (1 <<  2)
+#       define RADEON_FP2_PANEL_FORMAT         (1 <<  3)
+#       define RADEON_FP2_SOURCE_SEL_MASK      (3 << 10)
+#       define RADEON_FP2_SOURCE_SEL_CRTC2     (1 << 10)
+#       define RADEON_FP2_SRC_SEL_MASK         (3 << 13)
+#       define RADEON_FP2_SRC_SEL_CRTC2        (1 << 13)
+#       define RADEON_FP2_FP_POL               (1 << 16)
+#       define RADEON_FP2_LP_POL               (1 << 17)
+#       define RADEON_FP2_SCK_POL              (1 << 18)
+#       define RADEON_FP2_LCD_CNTL_MASK        (7 << 19)
+#       define RADEON_FP2_PAD_FLOP_EN          (1 << 22)
+#       define RADEON_FP2_CRC_EN               (1 << 23)
+#       define RADEON_FP2_CRC_READ_EN          (1 << 24)
+#       define RADEON_FP2_DV0_EN               (1 << 25)
+#       define RADEON_FP2_DV0_RATE_SEL_SDR     (1 << 26)
+#define RADEON_FP_H_SYNC_STRT_WID           0x02c4
+#define RADEON_FP_H2_SYNC_STRT_WID          0x03c4
+#define RADEON_FP_HORZ_STRETCH              0x028c
+#define RADEON_FP_HORZ2_STRETCH             0x038c
+#       define RADEON_HORZ_STRETCH_RATIO_MASK 0xffff /* bits 0-15 */
+#       define RADEON_HORZ_STRETCH_RATIO_MAX  4096
+#       define RADEON_HORZ_PANEL_SIZE         (0x1ff   << 16) /* bits 16-24 */
+#       define RADEON_HORZ_PANEL_SHIFT        16
+#       define RADEON_HORZ_STRETCH_PIXREP     (0      << 25) /* value 0; NOTE(review): uses shift 25 like _ENABLE while _BLEND is bit 26 -- confirm intended shift */
+#       define RADEON_HORZ_STRETCH_BLEND      (1      << 26)
+#       define RADEON_HORZ_STRETCH_ENABLE     (1      << 25)
+#       define RADEON_HORZ_AUTO_RATIO         (1      << 27)
+#       define RADEON_HORZ_FP_LOOP_STRETCH    (0x7    << 28) /* bits 28-30 */
+#       define RADEON_HORZ_AUTO_RATIO_INC     (1U     << 31) /* bit 31; unsigned literal, since 1 << 31 overflows a 32-bit int */
+#define RADEON_FP_V_SYNC_STRT_WID           0x02c8
+#define RADEON_FP_VERT_STRETCH              0x0290
+#define RADEON_FP_V2_SYNC_STRT_WID          0x03c8
+#define RADEON_FP_VERT2_STRETCH             0x0390
+#       define RADEON_VERT_PANEL_SIZE          (0xfff << 12)
+#       define RADEON_VERT_PANEL_SHIFT         12
+#       define RADEON_VERT_STRETCH_RATIO_MASK  0xfff
+#       define RADEON_VERT_STRETCH_RATIO_SHIFT 0
+#       define RADEON_VERT_STRETCH_RATIO_MAX   4096
+#       define RADEON_VERT_STRETCH_ENABLE      (1     << 25)
+#       define RADEON_VERT_STRETCH_LINEREP     (0     << 26)
+#       define RADEON_VERT_STRETCH_BLEND       (1     << 26)
+#       define RADEON_VERT_AUTO_RATIO_EN       (1     << 27)
+#       define RADEON_VERT_STRETCH_RESERVED    0xf1000000
+
+#define RADEON_GEN_INT_CNTL                 0x0040
+#define RADEON_GEN_INT_STATUS               0x0044
+#       define RADEON_VSYNC_INT_AK          (1 <<  2) /* same bit as RADEON_VSYNC_INT; NOTE(review): presumably the acknowledge name for that bit -- confirm */
+#       define RADEON_VSYNC_INT             (1 <<  2)
+#       define RADEON_VSYNC2_INT_AK         (1 <<  6) /* same bit as RADEON_VSYNC2_INT */
+#       define RADEON_VSYNC2_INT            (1 <<  6)
+#define RADEON_GENENB                       0x03c3 /* VGA */
+#define RADEON_GENFC_RD                     0x03ca /* VGA */
+#define RADEON_GENFC_WT                     0x03da /* VGA, 0x03ba */
+#define RADEON_GENMO_RD                     0x03cc /* VGA */
+#define RADEON_GENMO_WT                     0x03c2 /* VGA */
+#define RADEON_GENS0                        0x03c2 /* VGA */
+#define RADEON_GENS1                        0x03da /* VGA, 0x03ba */
+#define RADEON_GPIO_MONID                   0x0068 /* DDC interface via I2C */
+#define RADEON_GPIO_MONIDB                  0x006c /* same offset value as RADEON_GPIO_CRT2_DDC */
+#define RADEON_GPIO_CRT2_DDC                0x006c
+#define RADEON_GPIO_DVI_DDC                 0x0064
+#define RADEON_GPIO_VGA_DDC                 0x0060
+#       define RADEON_GPIO_A_0              (1 <<  0)
+#       define RADEON_GPIO_A_1              (1 <<  1)
+#       define RADEON_GPIO_Y_0              (1 <<  8)
+#       define RADEON_GPIO_Y_1              (1 <<  9)
+#       define RADEON_GPIO_Y_SHIFT_0        8 /* bit position of RADEON_GPIO_Y_0 */
+#       define RADEON_GPIO_Y_SHIFT_1        9 /* bit position of RADEON_GPIO_Y_1 */
+#       define RADEON_GPIO_EN_0             (1 << 16)
+#       define RADEON_GPIO_EN_1             (1 << 17)
+#       define RADEON_GPIO_MASK_0           (1 << 24) /* ?? -- marked as uncertain by the original author */
+#       define RADEON_GPIO_MASK_1           (1 << 25) /* ?? -- marked as uncertain by the original author */
+#define RADEON_GRPH8_DATA                   0x03cf /* VGA */
+#define RADEON_GRPH8_IDX                    0x03ce /* VGA */
+#define RADEON_GUI_SCRATCH_REG0             0x15e0
+#define RADEON_GUI_SCRATCH_REG1             0x15e4
+#define RADEON_GUI_SCRATCH_REG2             0x15e8
+#define RADEON_GUI_SCRATCH_REG3             0x15ec
+#define RADEON_GUI_SCRATCH_REG4             0x15f0
+#define RADEON_GUI_SCRATCH_REG5             0x15f4
+
+#define RADEON_HEADER                       0x0f0e /* PCI */
+#define RADEON_HOST_DATA0                   0x17c0
+#define RADEON_HOST_DATA1                   0x17c4
+#define RADEON_HOST_DATA2                   0x17c8
+#define RADEON_HOST_DATA3                   0x17cc
+#define RADEON_HOST_DATA4                   0x17d0
+#define RADEON_HOST_DATA5                   0x17d4
+#define RADEON_HOST_DATA6                   0x17d8
+#define RADEON_HOST_DATA7                   0x17dc
+#define RADEON_HOST_DATA_LAST               0x17e0
+#define RADEON_HOST_PATH_CNTL               0x0130
+#       define RADEON_HDP_SOFT_RESET        (1 << 26)
+#define RADEON_HTOTAL_CNTL                  0x0009 /* PLL */
+#define RADEON_HTOTAL2_CNTL                 0x002e /* PLL */
+
+#define RADEON_I2C_CNTL_1                   0x0094 /* ? */
+#define RADEON_DVI_I2C_CNTL_1               0x02e4 /* ? */
+#define RADEON_INTERRUPT_LINE               0x0f3c /* PCI */
+#define RADEON_INTERRUPT_PIN                0x0f3d /* PCI */
+#define RADEON_IO_BASE                      0x0f14 /* PCI */
+
+#define RADEON_LATENCY                      0x0f0d /* PCI */
+#define RADEON_LEAD_BRES_DEC                0x1608
+#define RADEON_LEAD_BRES_LNTH               0x161c
+#define RADEON_LEAD_BRES_LNTH_SUB           0x1624
+#define RADEON_LVDS_GEN_CNTL                0x02d0
+#       define RADEON_LVDS_ON               (1   <<  0)
+#       define RADEON_LVDS_DISPLAY_DIS      (1   <<  1)
+#       define RADEON_LVDS_PANEL_TYPE       (1   <<  2)
+#       define RADEON_LVDS_PANEL_FORMAT     (1   <<  3)
+#       define RADEON_LVDS_EN               (1   <<  7)
+#       define RADEON_LVDS_DIGON            (1   << 18)
+#       define RADEON_LVDS_BLON             (1   << 19)
+#       define RADEON_LVDS_SEL_CRTC2        (1   << 23)
+#define RADEON_LVDS_PLL_CNTL                0x02d4
+#       define RADEON_HSYNC_DELAY_SHIFT     28
+#       define RADEON_HSYNC_DELAY_MASK      (0xfU << 28) /* bits 28-31; unsigned so the shift cannot overflow int */
+
+#define RADEON_MAX_LATENCY                  0x0f3f /* PCI */
+#define RADEON_MC_AGP_LOCATION              0x014c /* same offset value as RADEON_MEM_INTF_CNTL */
+#define RADEON_MC_FB_LOCATION               0x0148 /* same offset value as RADEON_MEM_ADDR_CONFIG */
+#define RADEON_DISPLAY_BASE_ADDR            0x023c
+#define RADEON_DISPLAY2_BASE_ADDR           0x033c
+#define RADEON_OV0_BASE_ADDR                0x043c
+#define RADEON_NB_TOM                       0x015c
+#define RADEON_MCLK_CNTL                    0x0012 /* PLL */
+#       define RADEON_FORCEON_MCLKA         (1 << 16)
+#       define RADEON_FORCEON_MCLKB         (1 << 17)
+#       define RADEON_FORCEON_YCLKA         (1 << 18)
+#       define RADEON_FORCEON_YCLKB         (1 << 19)
+#       define RADEON_FORCEON_MC            (1 << 20)
+#       define RADEON_FORCEON_AIC           (1 << 21)
+#define RADEON_MDGPIO_A_REG                 0x01ac
+#define RADEON_MDGPIO_EN_REG                0x01b0
+#define RADEON_MDGPIO_MASK                  0x0198
+#define RADEON_MDGPIO_Y_REG                 0x01b4
+#define RADEON_MEM_ADDR_CONFIG              0x0148
+#define RADEON_MEM_BASE                     0x0f10 /* PCI */
+#define RADEON_MEM_CNTL                     0x0140
+#       define RADEON_MEM_NUM_CHANNELS_MASK 0x01
+#       define RADEON_MEM_USE_B_CH_ONLY     (1<<1)
+#       define RV100_HALF_MODE              (1<<3)
+#       define R300_MEM_NUM_CHANNELS_MASK   0x03
+#       define R300_MEM_USE_CD_CH_ONLY      (1<<2)
+#define RADEON_MEM_TIMING_CNTL              0x0144 /* EXT_MEM_CNTL */
+#define RADEON_MEM_INIT_LAT_TIMER           0x0154
+#define RADEON_MEM_INTF_CNTL                0x014c
+#define RADEON_MEM_SDRAM_MODE_REG           0x0158
+#define RADEON_MEM_STR_CNTL                 0x0150
+#define RADEON_MEM_VGA_RP_SEL               0x003c
+#define RADEON_MEM_VGA_WP_SEL               0x0038
+#define RADEON_MIN_GRANT                    0x0f3e /* PCI */
+#define RADEON_MM_DATA                      0x0004
+#define RADEON_MM_INDEX                     0x0000
+#define RADEON_MPLL_CNTL                    0x000e /* PLL */
+#define RADEON_MPP_TB_CONFIG                0x01c0 /* ? */
+#define RADEON_MPP_GP_CONFIG                0x01c8 /* ? */
+#define R300_MC_IND_INDEX                   0x01f8
+#       define R300_MC_IND_ADDR_MASK        0x3f
+#define R300_MC_IND_DATA                    0x01fc
+#define R300_MC_READ_CNTL_AB                0x017c
+#       define R300_MEM_RBS_POSITION_A_MASK 0x03
+#define R300_MC_READ_CNTL_CD_mcind	    0x24
+#       define R300_MEM_RBS_POSITION_C_MASK 0x03
+
+#define RADEON_N_VIF_COUNT                  0x0248
+
+#define RADEON_OV0_AUTO_FLIP_CNTL           0x0470
+#define RADEON_OV0_COLOUR_CNTL              0x04E0
+#define RADEON_OV0_DEINTERLACE_PATTERN      0x0474
+#define RADEON_OV0_EXCLUSIVE_HORZ           0x0408
+#       define  RADEON_EXCL_HORZ_START_MASK        0x000000ff
+#       define  RADEON_EXCL_HORZ_END_MASK          0x0000ff00
+#       define  RADEON_EXCL_HORZ_BACK_PORCH_MASK   0x00ff0000
+#       define  RADEON_EXCL_HORZ_EXCLUSIVE_EN      0x80000000
+#define RADEON_OV0_EXCLUSIVE_VERT           0x040C
+#       define  RADEON_EXCL_VERT_START_MASK        0x000003ff
+#       define  RADEON_EXCL_VERT_END_MASK          0x03ff0000
+#define RADEON_OV0_FILTER_CNTL              0x04A0
+#define RADEON_OV0_FOUR_TAP_COEF_0          0x04B0
+#define RADEON_OV0_FOUR_TAP_COEF_1          0x04B4
+#define RADEON_OV0_FOUR_TAP_COEF_2          0x04B8
+#define RADEON_OV0_FOUR_TAP_COEF_3          0x04BC
+#define RADEON_OV0_FOUR_TAP_COEF_4          0x04C0
+#define RADEON_OV0_GAMMA_000_00F            0x0d40
+#define RADEON_OV0_GAMMA_010_01F            0x0d44
+#define RADEON_OV0_GAMMA_020_03F            0x0d48
+#define RADEON_OV0_GAMMA_040_07F            0x0d4c
+#define RADEON_OV0_GAMMA_080_0BF            0x0e00
+#define RADEON_OV0_GAMMA_0C0_0FF            0x0e04
+#define RADEON_OV0_GAMMA_100_13F            0x0e08
+#define RADEON_OV0_GAMMA_140_17F            0x0e0c
+#define RADEON_OV0_GAMMA_180_1BF            0x0e10
+#define RADEON_OV0_GAMMA_1C0_1FF            0x0e14
+#define RADEON_OV0_GAMMA_200_23F            0x0e18
+#define RADEON_OV0_GAMMA_240_27F            0x0e1c
+#define RADEON_OV0_GAMMA_280_2BF            0x0e20
+#define RADEON_OV0_GAMMA_2C0_2FF            0x0e24
+#define RADEON_OV0_GAMMA_300_33F            0x0e28
+#define RADEON_OV0_GAMMA_340_37F            0x0e2c
+#define RADEON_OV0_GAMMA_380_3BF            0x0d50
+#define RADEON_OV0_GAMMA_3C0_3FF            0x0d54
+#define RADEON_OV0_GRAPHICS_KEY_CLR_LOW     0x04EC
+#define RADEON_OV0_GRAPHICS_KEY_CLR_HIGH    0x04F0
+#define RADEON_OV0_H_INC                    0x0480
+#define RADEON_OV0_KEY_CNTL                 0x04F4
+#       define  RADEON_VIDEO_KEY_FN_MASK    0x00000003L
+#       define  RADEON_VIDEO_KEY_FN_FALSE   0x00000000L
+#       define  RADEON_VIDEO_KEY_FN_TRUE    0x00000001L
+#       define  RADEON_VIDEO_KEY_FN_EQ      0x00000002L
+#       define  RADEON_VIDEO_KEY_FN_NE      0x00000003L
+#       define  RADEON_GRAPHIC_KEY_FN_MASK  0x00000030L
+#       define  RADEON_GRAPHIC_KEY_FN_FALSE 0x00000000L
+#       define  RADEON_GRAPHIC_KEY_FN_TRUE  0x00000010L
+#       define  RADEON_GRAPHIC_KEY_FN_EQ    0x00000020L
+#       define  RADEON_GRAPHIC_KEY_FN_NE    0x00000030L
+#       define  RADEON_CMP_MIX_MASK         0x00000100L
+#       define  RADEON_CMP_MIX_OR           0x00000000L
+#       define  RADEON_CMP_MIX_AND          0x00000100L
+#define RADEON_OV0_LIN_TRANS_A              0x0d20
+#define RADEON_OV0_LIN_TRANS_B              0x0d24
+#define RADEON_OV0_LIN_TRANS_C              0x0d28
+#define RADEON_OV0_LIN_TRANS_D              0x0d2c
+#define RADEON_OV0_LIN_TRANS_E              0x0d30
+#define RADEON_OV0_LIN_TRANS_F              0x0d34
+#define RADEON_OV0_P1_BLANK_LINES_AT_TOP    0x0430
+#       define  RADEON_P1_BLNK_LN_AT_TOP_M1_MASK   0x00000fffL
+#       define  RADEON_P1_ACTIVE_LINES_M1          0x0fff0000L
+#define RADEON_OV0_P1_H_ACCUM_INIT          0x0488
+#define RADEON_OV0_P1_V_ACCUM_INIT          0x0428
+#       define  RADEON_OV0_P1_MAX_LN_IN_PER_LN_OUT 0x00000003L
+#       define  RADEON_OV0_P1_V_ACCUM_INIT_MASK    0x01ff8000L
+#define RADEON_OV0_P1_X_START_END           0x0494
+#define RADEON_OV0_P2_X_START_END           0x0498
+#define RADEON_OV0_P23_BLANK_LINES_AT_TOP   0x0434
+#       define  RADEON_P23_BLNK_LN_AT_TOP_M1_MASK  0x000007ffL
+#       define  RADEON_P23_ACTIVE_LINES_M1         0x07ff0000L
+#define RADEON_OV0_P23_H_ACCUM_INIT         0x048C
+#define RADEON_OV0_P23_V_ACCUM_INIT         0x042C
+#define RADEON_OV0_P3_X_START_END           0x049C
+#define RADEON_OV0_REG_LOAD_CNTL            0x0410
+#       define  RADEON_REG_LD_CTL_LOCK                 0x00000001L
+#       define  RADEON_REG_LD_CTL_VBLANK_DURING_LOCK   0x00000002L
+#       define  RADEON_REG_LD_CTL_STALL_GUI_UNTIL_FLIP 0x00000004L
+#       define  RADEON_REG_LD_CTL_LOCK_READBACK        0x00000008L
+#define RADEON_OV0_SCALE_CNTL               0x0420 /* register offset; bit fields follow */
+#       define  RADEON_SCALER_HORZ_PICK_NEAREST    0x00000004L
+#       define  RADEON_SCALER_VERT_PICK_NEAREST    0x00000008L
+#       define  RADEON_SCALER_SIGNED_UV            0x00000010L
+#       define  RADEON_SCALER_GAMMA_SEL_MASK       0x00000060L /* bits 5-6; the GAMMA_SEL_* values below fit in this mask */
+#       define  RADEON_SCALER_GAMMA_SEL_BRIGHT     0x00000000L
+#       define  RADEON_SCALER_GAMMA_SEL_G22        0x00000020L
+#       define  RADEON_SCALER_GAMMA_SEL_G18        0x00000040L
+#       define  RADEON_SCALER_GAMMA_SEL_G14        0x00000060L
+#       define  RADEON_SCALER_COMCORE_SHIFT_UP_ONE 0x00000080L
+#       define  RADEON_SCALER_SURFAC_FORMAT        0x00000f00L /* bits 8-11; the SCALER_SOURCE_* values below fit in this mask */
+#       define  RADEON_SCALER_SOURCE_15BPP         0x00000300L
+#       define  RADEON_SCALER_SOURCE_16BPP         0x00000400L
+#       define  RADEON_SCALER_SOURCE_32BPP         0x00000600L
+#       define  RADEON_SCALER_SOURCE_YUV9          0x00000900L
+#       define  RADEON_SCALER_SOURCE_YUV12         0x00000A00L
+#       define  RADEON_SCALER_SOURCE_VYUY422       0x00000B00L
+#       define  RADEON_SCALER_SOURCE_YVYU422       0x00000C00L
+#       define  RADEON_SCALER_ADAPTIVE_DEINT       0x00001000L
+#       define  RADEON_SCALER_TEMPORAL_DEINT       0x00002000L
+#       define  RADEON_SCALER_SMART_SWITCH         0x00008000L
+#       define  RADEON_SCALER_BURST_PER_PLANE      0x007F0000L
+#       define  RADEON_SCALER_DOUBLE_BUFFER        0x01000000L
+#       define  RADEON_SCALER_DIS_LIMIT            0x08000000L
+#       define  RADEON_SCALER_INT_EMU              0x20000000L
+#       define  RADEON_SCALER_ENABLE               0x40000000L
+#       define  RADEON_SCALER_SOFT_RESET           0x80000000L
+#       define  RADEON_SCALER_ADAPTIVE_DEINT       0x00001000L /* repeats the RADEON_SCALER_ADAPTIVE_DEINT definition earlier in this list, same value */
+#define RADEON_OV0_STEP_BY                  0x0484
+#define RADEON_OV0_TEST                     0x04F8
+#define RADEON_OV0_V_INC                    0x0424
+#define RADEON_OV0_VID_BUF_PITCH0_VALUE     0x0460
+#define RADEON_OV0_VID_BUF_PITCH1_VALUE     0x0464
+#define RADEON_OV0_VID_BUF0_BASE_ADRS       0x0440
+#       define  RADEON_VIF_BUF0_PITCH_SEL          0x00000001L
+#       define  RADEON_VIF_BUF0_TILE_ADRS          0x00000002L
+#       define  RADEON_VIF_BUF0_BASE_ADRS_MASK     0x03fffff0L
+#       define  RADEON_VIF_BUF0_1ST_LINE_LSBS_MASK 0x48000000L
+#define RADEON_OV0_VID_BUF1_BASE_ADRS       0x0444
+#       define  RADEON_VIF_BUF1_PITCH_SEL          0x00000001L
+#       define  RADEON_VIF_BUF1_TILE_ADRS          0x00000002L
+#       define  RADEON_VIF_BUF1_BASE_ADRS_MASK     0x03fffff0L
+#       define  RADEON_VIF_BUF1_1ST_LINE_LSBS_MASK 0x48000000L
+#define RADEON_OV0_VID_BUF2_BASE_ADRS       0x0448
+#       define  RADEON_VIF_BUF2_PITCH_SEL          0x00000001L
+#       define  RADEON_VIF_BUF2_TILE_ADRS          0x00000002L
+#       define  RADEON_VIF_BUF2_BASE_ADRS_MASK     0x03fffff0L
+#       define  RADEON_VIF_BUF2_1ST_LINE_LSBS_MASK 0x48000000L
+#define RADEON_OV0_VID_BUF3_BASE_ADRS       0x044C
+#define RADEON_OV0_VID_BUF4_BASE_ADRS       0x0450
+#define RADEON_OV0_VID_BUF5_BASE_ADRS       0x0454
+#define RADEON_OV0_VIDEO_KEY_CLR_HIGH       0x04E8
+#define RADEON_OV0_VIDEO_KEY_CLR_LOW        0x04E4
+#define RADEON_OV0_Y_X_START                0x0400
+#define RADEON_OV0_Y_X_END                  0x0404
+#define RADEON_OV1_Y_X_START                0x0600
+#define RADEON_OV1_Y_X_END                  0x0604
+#define RADEON_OVR_CLR                      0x0230
+#define RADEON_OVR_WID_LEFT_RIGHT           0x0234
+#define RADEON_OVR_WID_TOP_BOTTOM           0x0238
+
+#define RADEON_P2PLL_CNTL                   0x002a /* P2PLL */
+#       define RADEON_P2PLL_RESET                (1 <<  0)
+#       define RADEON_P2PLL_SLEEP                (1 <<  1)
+#       define RADEON_P2PLL_ATOMIC_UPDATE_EN     (1 << 16)
+#       define RADEON_P2PLL_VGA_ATOMIC_UPDATE_EN (1 << 17)
+#       define RADEON_P2PLL_ATOMIC_UPDATE_VSYNC  (1 << 18)
+#define RADEON_P2PLL_DIV_0                  0x002c
+#       define RADEON_P2PLL_FB0_DIV_MASK    0x07ff
+#       define RADEON_P2PLL_POST0_DIV_MASK  0x00070000
+#define RADEON_P2PLL_REF_DIV                0x002B /* PLL */
+#       define RADEON_P2PLL_REF_DIV_MASK    0x03ff
+#       define RADEON_P2PLL_ATOMIC_UPDATE_R (1 << 15) /* same as _W */
+#       define RADEON_P2PLL_ATOMIC_UPDATE_W (1 << 15) /* same as _R */
+#       define R300_PPLL_REF_DIV_ACC_MASK   (0x3ff << 18)
+#       define R300_PPLL_REF_DIV_ACC_SHIFT  18
+#define RADEON_PALETTE_DATA                 0x00b4
+#define RADEON_PALETTE_30_DATA              0x00b8
+#define RADEON_PALETTE_INDEX                0x00b0
+#define RADEON_PCI_GART_PAGE                0x017c
+#define RADEON_PIXCLKS_CNTL                 0x002d /* NOTE(review): no address-space tag; presumably PLL (follows P2PLL indices 0x2a-0x2c) -- confirm */
+#       define RADEON_PIX2CLK_SRC_SEL_MASK     0x03 /* bits 1:0; holds one of the four SRC_SEL values below */
+#       define RADEON_PIX2CLK_SRC_SEL_CPUCLK   0x00
+#       define RADEON_PIX2CLK_SRC_SEL_PSCANCLK 0x01
+#       define RADEON_PIX2CLK_SRC_SEL_BYTECLK  0x02
+#       define RADEON_PIX2CLK_SRC_SEL_P2PLLCLK 0x03
+#       define RADEON_PIX2CLK_ALWAYS_ONb       (1<<6)
+#       define RADEON_PIX2CLK_DAC_ALWAYS_ONb   (1<<7)
+#       define RADEON_PIXCLK_TV_SRC_SEL        (1 << 8)
+#       define RADEON_PIXCLK_LVDS_ALWAYS_ONb   (1 << 14)
+#       define RADEON_PIXCLK_TMDS_ALWAYS_ONb   (1 << 15)
+#define RADEON_PLANE_3D_MASK_C              0x1d44 /* same offset as RADEON_PP_BORDER_COLOR_1 (0x1d44) */
+#define RADEON_PLL_TEST_CNTL                0x0013 /* PLL */
+#define RADEON_PMI_CAP_ID                   0x0f5c /* PCI */
+#define RADEON_PMI_DATA                     0x0f63 /* PCI */
+#define RADEON_PMI_NXT_CAP_PTR              0x0f5d /* PCI */
+#define RADEON_PMI_PMC_REG                  0x0f5e /* PCI */
+#define RADEON_PMI_PMCSR_REG                0x0f60 /* PCI; same offset as RADEON_PWR_MNGMT_CNTL_STATUS */
+#define RADEON_PMI_REGISTER                 0x0f5c /* PCI; same offset as RADEON_PMI_CAP_ID */
+#define RADEON_PPLL_CNTL                    0x0002 /* PLL; bit assignments below match those listed for RADEON_P2PLL_CNTL */
+#       define RADEON_PPLL_RESET                (1 <<  0)
+#       define RADEON_PPLL_SLEEP                (1 <<  1)
+#       define RADEON_PPLL_ATOMIC_UPDATE_EN     (1 << 16)
+#       define RADEON_PPLL_VGA_ATOMIC_UPDATE_EN (1 << 17)
+#       define RADEON_PPLL_ATOMIC_UPDATE_VSYNC  (1 << 18)
+#define RADEON_PPLL_DIV_0                   0x0004 /* PLL */
+#define RADEON_PPLL_DIV_1                   0x0005 /* PLL */
+#define RADEON_PPLL_DIV_2                   0x0006 /* PLL */
+#define RADEON_PPLL_DIV_3                   0x0007 /* PLL */
+#       define RADEON_PPLL_FB3_DIV_MASK     0x07ff /* bits 10:0; only the _3-named field masks are defined here */
+#       define RADEON_PPLL_POST3_DIV_MASK   0x00070000 /* bits 18:16 */
+#define RADEON_PPLL_REF_DIV                 0x0003 /* PLL */
+#       define RADEON_PPLL_REF_DIV_MASK     0x03ff /* bits 9:0 */
+#       define RADEON_PPLL_ATOMIC_UPDATE_R  (1 << 15) /* same as _W */
+#       define RADEON_PPLL_ATOMIC_UPDATE_W  (1 << 15) /* same as _R */
+#define RADEON_PWR_MNGMT_CNTL_STATUS        0x0f60 /* PCI; same offset as RADEON_PMI_PMCSR_REG */
+
+#define RADEON_RBBM_GUICNTL                 0x172c
+#       define RADEON_HOST_DATA_SWAP_NONE   (0 << 0) /* the four HOST_DATA_SWAP_* values share one 2-bit field at bits 1:0 */
+#       define RADEON_HOST_DATA_SWAP_16BIT  (1 << 0)
+#       define RADEON_HOST_DATA_SWAP_32BIT  (2 << 0)
+#       define RADEON_HOST_DATA_SWAP_HDW    (3 << 0)
+#define RADEON_RBBM_SOFT_RESET              0x00f0 /* one flag per bit, bits 7:0 */
+#       define RADEON_SOFT_RESET_CP         (1 <<  0)
+#       define RADEON_SOFT_RESET_HI         (1 <<  1)
+#       define RADEON_SOFT_RESET_SE         (1 <<  2)
+#       define RADEON_SOFT_RESET_RE         (1 <<  3)
+#       define RADEON_SOFT_RESET_PP         (1 <<  4)
+#       define RADEON_SOFT_RESET_E2         (1 <<  5)
+#       define RADEON_SOFT_RESET_RB         (1 <<  6)
+#       define RADEON_SOFT_RESET_HDP        (1 <<  7)
+#define RADEON_RBBM_STATUS                  0x0e40
+#       define RADEON_RBBM_FIFOCNT_MASK     0x007f
+#       define RADEON_RBBM_ACTIVE           (1 << 31)
+#define RADEON_RB2D_DSTCACHE_CTLSTAT        0x342c
+#       define RADEON_RB2D_DC_FLUSH         (3 << 0)
+#       define RADEON_RB2D_DC_FREE          (3 << 2)
+#       define RADEON_RB2D_DC_FLUSH_ALL     0xf
+#       define RADEON_RB2D_DC_BUSY          (1 << 31)
+#define RADEON_RB2D_DSTCACHE_MODE           0x3428
+#define RADEON_REG_BASE                     0x0f18 /* PCI */
+#define RADEON_REGPROG_INF                  0x0f09 /* PCI */
+#define RADEON_REVISION_ID                  0x0f08 /* PCI */
+
+#define RADEON_SC_BOTTOM                    0x164c
+#define RADEON_SC_BOTTOM_RIGHT              0x16f0
+#define RADEON_SC_BOTTOM_RIGHT_C            0x1c8c /* same offset as RADEON_PP_TXOFFSET_2 (0x1c8c) */
+#define RADEON_SC_LEFT                      0x1640
+#define RADEON_SC_RIGHT                     0x1644
+#define RADEON_SC_TOP                       0x1648
+#define RADEON_SC_TOP_LEFT                  0x16ec
+#define RADEON_SC_TOP_LEFT_C                0x1c88 /* same offset as RADEON_PP_TXFORMAT_2 (0x1c88) */
+#       define RADEON_SC_SIGN_MASK_LO       0x8000 /* bit 15 */
+#       define RADEON_SC_SIGN_MASK_HI       0x80000000 /* bit 31 */
+#define RADEON_SCLK_CNTL                    0x000d /* PLL; same index as RADEON_XCLK_CNTL */
+#       define RADEON_DYN_STOP_LAT_MASK     0x00007ff8 /* bits 14:3 */
+#       define RADEON_CP_MAX_DYN_STOP_LAT   0x0008 /* bit 3, the lowest bit of RADEON_DYN_STOP_LAT_MASK */
+#       define RADEON_SCLK_FORCEON_MASK     0xffff8000 /* bits 31:15 */
+#define RADEON_SCLK_MORE_CNTL               0x0035 /* PLL */
+#       define RADEON_SCLK_MORE_FORCEON     0x0700 /* bits 10:8 */
+#define RADEON_SDRAM_MODE_REG               0x0158
+#define RADEON_SEQ8_DATA                    0x03c5 /* VGA */
+#define RADEON_SEQ8_IDX                     0x03c4 /* VGA */
+#define RADEON_SNAPSHOT_F_COUNT             0x0244
+#define RADEON_SNAPSHOT_VH_COUNTS           0x0240
+#define RADEON_SNAPSHOT_VIF_COUNT           0x024c
+#define RADEON_SRC_OFFSET                   0x15ac
+#define RADEON_SRC_PITCH                    0x15b0
+#define RADEON_SRC_PITCH_OFFSET             0x1428
+#define RADEON_SRC_SC_BOTTOM                0x165c
+#define RADEON_SRC_SC_BOTTOM_RIGHT          0x16f4
+#define RADEON_SRC_SC_RIGHT                 0x1654
+#define RADEON_SRC_X                        0x1414
+#define RADEON_SRC_X_Y                      0x1590
+#define RADEON_SRC_Y                        0x1418
+#define RADEON_SRC_Y_X                      0x1434
+#define RADEON_STATUS                       0x0f06 /* PCI */
+#define RADEON_SUBPIC_CNTL                  0x0540 /* ? */
+#define RADEON_SUB_CLASS                    0x0f0a /* PCI */
+#define RADEON_SURFACE_CNTL                 0x0b00
+#       define RADEON_SURF_TRANSLATION_DIS  (1 << 8)
+#       define RADEON_NONSURF_AP0_SWP_16BPP (1 << 20) /* same bit position as RADEON_SURF_AP0_SWP_16BPP */
+#       define RADEON_NONSURF_AP0_SWP_32BPP (1 << 21) /* same bit position as RADEON_SURF_AP0_SWP_32BPP */
+#define RADEON_SURFACE0_INFO                0x0b0c /* SURFACEn INFO/LOWER/UPPER registers repeat every 0x10 for n = 0..7 */
+#       define RADEON_SURF_TILE_COLOR_MACRO (0 << 16) /* RADEON_ and R200_ tile values use the same shift (16) with different numbering */
+#       define RADEON_SURF_TILE_COLOR_BOTH  (1 << 16)
+#       define RADEON_SURF_TILE_DEPTH_32BPP (2 << 16)
+#       define RADEON_SURF_TILE_DEPTH_16BPP (3 << 16)
+#       define R200_SURF_TILE_NONE          (0 << 16)
+#       define R200_SURF_TILE_COLOR_MACRO   (1 << 16)
+#       define R200_SURF_TILE_COLOR_MICRO   (2 << 16)
+#       define R200_SURF_TILE_COLOR_BOTH    (3 << 16)
+#       define R200_SURF_TILE_DEPTH_32BPP   (4 << 16)
+#       define R200_SURF_TILE_DEPTH_16BPP   (5 << 16)
+#       define RADEON_SURF_AP0_SWP_16BPP    (1 << 20)
+#       define RADEON_SURF_AP0_SWP_32BPP    (1 << 21)
+#       define RADEON_SURF_AP1_SWP_16BPP    (1 << 22)
+#       define RADEON_SURF_AP1_SWP_32BPP    (1 << 23)
+#define RADEON_SURFACE0_LOWER_BOUND         0x0b04
+#define RADEON_SURFACE0_UPPER_BOUND         0x0b08
+#define RADEON_SURFACE1_INFO                0x0b1c
+#define RADEON_SURFACE1_LOWER_BOUND         0x0b14
+#define RADEON_SURFACE1_UPPER_BOUND         0x0b18
+#define RADEON_SURFACE2_INFO                0x0b2c
+#define RADEON_SURFACE2_LOWER_BOUND         0x0b24
+#define RADEON_SURFACE2_UPPER_BOUND         0x0b28
+#define RADEON_SURFACE3_INFO                0x0b3c
+#define RADEON_SURFACE3_LOWER_BOUND         0x0b34
+#define RADEON_SURFACE3_UPPER_BOUND         0x0b38
+#define RADEON_SURFACE4_INFO                0x0b4c
+#define RADEON_SURFACE4_LOWER_BOUND         0x0b44
+#define RADEON_SURFACE4_UPPER_BOUND         0x0b48
+#define RADEON_SURFACE5_INFO                0x0b5c
+#define RADEON_SURFACE5_LOWER_BOUND         0x0b54
+#define RADEON_SURFACE5_UPPER_BOUND         0x0b58
+#define RADEON_SURFACE6_INFO                0x0b6c
+#define RADEON_SURFACE6_LOWER_BOUND         0x0b64
+#define RADEON_SURFACE6_UPPER_BOUND         0x0b68
+#define RADEON_SURFACE7_INFO                0x0b7c
+#define RADEON_SURFACE7_LOWER_BOUND         0x0b74
+#define RADEON_SURFACE7_UPPER_BOUND         0x0b78
+#define RADEON_SW_SEMAPHORE                 0x013c
+
+#define RADEON_TEST_DEBUG_CNTL              0x0120
+#define RADEON_TEST_DEBUG_MUX               0x0124
+#define RADEON_TEST_DEBUG_OUT               0x012c
+#define RADEON_TMDS_PLL_CNTL                0x02a8
+#define RADEON_TMDS_TRANSMITTER_CNTL        0x02a4
+#       define RADEON_TMDS_TRANSMITTER_PLLEN  1 /* bit 0, written as a plain value rather than a shift */
+#       define RADEON_TMDS_TRANSMITTER_PLLRST 2 /* bit 1, written as a plain value rather than a shift */
+#define RADEON_TRAIL_BRES_DEC               0x1614
+#define RADEON_TRAIL_BRES_ERR               0x160c
+#define RADEON_TRAIL_BRES_INC               0x1610
+#define RADEON_TRAIL_X                      0x1618
+#define RADEON_TRAIL_X_SUB                  0x1620
+
+#define RADEON_VCLK_ECP_CNTL                0x0008 /* PLL */
+#       define RADEON_VCLK_SRC_SEL_MASK     0x03 /* bits 1:0; holds one of the four SRC_SEL values below */
+#       define RADEON_VCLK_SRC_SEL_CPUCLK   0x00
+#       define RADEON_VCLK_SRC_SEL_PSCANCLK 0x01
+#       define RADEON_VCLK_SRC_SEL_BYTECLK  0x02
+#       define RADEON_VCLK_SRC_SEL_PPLLCLK  0x03
+#       define RADEON_PIXCLK_ALWAYS_ONb     (1<<6) /* same bit position as RADEON_PIX2CLK_ALWAYS_ONb in RADEON_PIXCLKS_CNTL */
+#       define RADEON_PIXCLK_DAC_ALWAYS_ONb (1<<7) /* same bit position as RADEON_PIX2CLK_DAC_ALWAYS_ONb in RADEON_PIXCLKS_CNTL */
+
+#define RADEON_VENDOR_ID                    0x0f00 /* PCI */
+#define RADEON_VGA_DDA_CONFIG               0x02e8
+#define RADEON_VGA_DDA_ON_OFF               0x02ec
+#define RADEON_VID_BUFFER_CONTROL           0x0900
+#define RADEON_VIDEOMUX_CNTL                0x0190
+#define RADEON_VIPH_CONTROL                 0x0c40 /* ? */
+
+#define RADEON_WAIT_UNTIL                   0x1720
+#       define RADEON_WAIT_CRTC_PFLIP       (1 << 0)
+#       define RADEON_WAIT_2D_IDLECLEAN     (1 << 16)
+#       define RADEON_WAIT_3D_IDLECLEAN     (1 << 17)
+#       define RADEON_WAIT_HOST_IDLECLEAN   (1 << 18)
+
+#define RADEON_X_MPLL_REF_FB_DIV            0x000a /* PLL */
+#define RADEON_XCLK_CNTL                    0x000d /* PLL; same index as RADEON_SCLK_CNTL */
+#define RADEON_XDLL_CNTL                    0x000c /* PLL */
+#define RADEON_XPLL_CNTL                    0x000b /* PLL */
+
+
+
+				/* Registers for 3D/TCL */
+#define RADEON_PP_BORDER_COLOR_0            0x1d40
+#define RADEON_PP_BORDER_COLOR_1            0x1d44
+#define RADEON_PP_BORDER_COLOR_2            0x1d48
+#define RADEON_PP_CNTL                      0x1c38
+#       define RADEON_STIPPLE_ENABLE        (1 <<  0)
+#       define RADEON_SCISSOR_ENABLE        (1 <<  1)
+#       define RADEON_PATTERN_ENABLE        (1 <<  2)
+#       define RADEON_SHADOW_ENABLE         (1 <<  3)
+#       define RADEON_TEX_ENABLE_MASK       (0xf << 4)
+#       define RADEON_TEX_0_ENABLE          (1 <<  4)
+#       define RADEON_TEX_1_ENABLE          (1 <<  5)
+#       define RADEON_TEX_2_ENABLE          (1 <<  6)
+#       define RADEON_TEX_3_ENABLE          (1 <<  7)
+#       define RADEON_TEX_BLEND_ENABLE_MASK (0xf << 12)
+#       define RADEON_TEX_BLEND_0_ENABLE    (1 << 12)
+#       define RADEON_TEX_BLEND_1_ENABLE    (1 << 13)
+#       define RADEON_TEX_BLEND_2_ENABLE    (1 << 14)
+#       define RADEON_TEX_BLEND_3_ENABLE    (1 << 15)
+#       define RADEON_PLANAR_YUV_ENABLE     (1 << 20)
+#       define RADEON_SPECULAR_ENABLE       (1 << 21)
+#       define RADEON_FOG_ENABLE            (1 << 22)
+#       define RADEON_ALPHA_TEST_ENABLE     (1 << 23)
+#       define RADEON_ANTI_ALIAS_NONE       (0 << 24)
+#       define RADEON_ANTI_ALIAS_LINE       (1 << 24)
+#       define RADEON_ANTI_ALIAS_POLY       (2 << 24)
+#       define RADEON_ANTI_ALIAS_LINE_POLY  (3 << 24)
+#       define RADEON_BUMP_MAP_ENABLE       (1 << 26)
+#       define RADEON_BUMPED_MAP_T0         (0 << 27)
+#       define RADEON_BUMPED_MAP_T1         (1 << 27)
+#       define RADEON_BUMPED_MAP_T2         (2 << 27)
+#       define RADEON_TEX_3D_ENABLE_0       (1 << 29)
+#       define RADEON_TEX_3D_ENABLE_1       (1 << 30)
+#       define RADEON_MC_ENABLE             (1 << 31)
+#define RADEON_PP_FOG_COLOR                 0x1c18
+#       define RADEON_FOG_COLOR_MASK        0x00ffffff /* bits 23:0 */
+#       define RADEON_FOG_VERTEX            (0 << 24)
+#       define RADEON_FOG_TABLE             (1 << 24)
+#       define RADEON_FOG_USE_DEPTH         (0 << 25) /* FOG_USE_* values are shifted to bit 25; value 1 has no name here */
+#       define RADEON_FOG_USE_DIFFUSE_ALPHA (2 << 25)
+#       define RADEON_FOG_USE_SPEC_ALPHA    (3 << 25)
+#define RADEON_PP_LUM_MATRIX                0x1d00
+#define RADEON_PP_MISC                      0x1c14
+#       define RADEON_REF_ALPHA_MASK        0x000000ff /* bits 7:0 */
+#       define RADEON_ALPHA_TEST_FAIL       (0 << 8) /* ALPHA_TEST_* ops share a 3-bit field at bits 10:8 */
+#       define RADEON_ALPHA_TEST_LESS       (1 << 8)
+#       define RADEON_ALPHA_TEST_LEQUAL     (2 << 8)
+#       define RADEON_ALPHA_TEST_EQUAL      (3 << 8)
+#       define RADEON_ALPHA_TEST_GEQUAL     (4 << 8)
+#       define RADEON_ALPHA_TEST_GREATER    (5 << 8)
+#       define RADEON_ALPHA_TEST_NEQUAL     (6 << 8)
+#       define RADEON_ALPHA_TEST_PASS       (7 << 8)
+#       define RADEON_ALPHA_TEST_OP_MASK    (7 << 8) /* field mask; numerically equal to RADEON_ALPHA_TEST_PASS */
+#       define RADEON_CHROMA_FUNC_FAIL      (0 << 16) /* CHROMA_FUNC_* values share a 2-bit field at bits 17:16 */
+#       define RADEON_CHROMA_FUNC_PASS      (1 << 16)
+#       define RADEON_CHROMA_FUNC_NEQUAL    (2 << 16)
+#       define RADEON_CHROMA_FUNC_EQUAL     (3 << 16)
+#       define RADEON_CHROMA_KEY_NEAREST    (0 << 18)
+#       define RADEON_CHROMA_KEY_ZERO       (1 << 18)
+#       define RADEON_SHADOW_ID_AUTO_INC    (1 << 20)
+#       define RADEON_SHADOW_FUNC_EQUAL     (0 << 21)
+#       define RADEON_SHADOW_FUNC_NEQUAL    (1 << 21)
+#       define RADEON_SHADOW_PASS_1         (0 << 22)
+#       define RADEON_SHADOW_PASS_2         (1 << 22)
+#       define RADEON_RIGHT_HAND_CUBE_D3D   (0 << 24)
+#       define RADEON_RIGHT_HAND_CUBE_OGL   (1 << 24)
+#define RADEON_PP_ROT_MATRIX_0              0x1d58
+#define RADEON_PP_ROT_MATRIX_1              0x1d5c
+#define RADEON_PP_TXFILTER_0                0x1c54
+#define RADEON_PP_TXFILTER_1                0x1c6c
+#define RADEON_PP_TXFILTER_2                0x1c84
+#       define RADEON_MAG_FILTER_NEAREST                   (0  <<  0)
+#       define RADEON_MAG_FILTER_LINEAR                    (1  <<  0)
+#       define RADEON_MAG_FILTER_MASK                      (1  <<  0)
+#       define RADEON_MIN_FILTER_NEAREST                   (0  <<  1)
+#       define RADEON_MIN_FILTER_LINEAR                    (1  <<  1)
+#       define RADEON_MIN_FILTER_NEAREST_MIP_NEAREST       (2  <<  1)
+#       define RADEON_MIN_FILTER_NEAREST_MIP_LINEAR        (3  <<  1)
+#       define RADEON_MIN_FILTER_LINEAR_MIP_NEAREST        (6  <<  1)
+#       define RADEON_MIN_FILTER_LINEAR_MIP_LINEAR         (7  <<  1)
+#       define RADEON_MIN_FILTER_ANISO_NEAREST             (8  <<  1)
+#       define RADEON_MIN_FILTER_ANISO_LINEAR              (9  <<  1)
+#       define RADEON_MIN_FILTER_ANISO_NEAREST_MIP_NEAREST (10 <<  1)
+#       define RADEON_MIN_FILTER_ANISO_NEAREST_MIP_LINEAR  (11 <<  1)
+#       define RADEON_MIN_FILTER_MASK                      (15 <<  1)
+#       define RADEON_MAX_ANISO_1_TO_1                     (0  <<  5)
+#       define RADEON_MAX_ANISO_2_TO_1                     (1  <<  5)
+#       define RADEON_MAX_ANISO_4_TO_1                     (2  <<  5)
+#       define RADEON_MAX_ANISO_8_TO_1                     (3  <<  5)
+#       define RADEON_MAX_ANISO_16_TO_1                    (4  <<  5)
+#       define RADEON_MAX_ANISO_MASK                       (7  <<  5)
+#       define RADEON_LOD_BIAS_MASK                        (0xff <<  8)
+#       define RADEON_LOD_BIAS_SHIFT                       8
+#       define RADEON_MAX_MIP_LEVEL_MASK                   (0x0f << 16)
+#       define RADEON_MAX_MIP_LEVEL_SHIFT                  16
+#       define RADEON_YUV_TO_RGB                           (1  << 20)
+#       define RADEON_YUV_TEMPERATURE_COOL                 (0  << 21)
+#       define RADEON_YUV_TEMPERATURE_HOT                  (1  << 21)
+#       define RADEON_YUV_TEMPERATURE_MASK                 (1  << 21)
+#       define RADEON_WRAPEN_S                             (1  << 22)
+#       define RADEON_CLAMP_S_WRAP                         (0  << 23)
+#       define RADEON_CLAMP_S_MIRROR                       (1  << 23)
+#       define RADEON_CLAMP_S_CLAMP_LAST                   (2  << 23)
+#       define RADEON_CLAMP_S_MIRROR_CLAMP_LAST            (3  << 23)
+#       define RADEON_CLAMP_S_CLAMP_BORDER                 (4  << 23)
+#       define RADEON_CLAMP_S_MIRROR_CLAMP_BORDER          (5  << 23)
+#       define RADEON_CLAMP_S_CLAMP_GL                     (6  << 23)
+#       define RADEON_CLAMP_S_MIRROR_CLAMP_GL              (7  << 23)
+#       define RADEON_CLAMP_S_MASK                         (7  << 23)
+#       define RADEON_WRAPEN_T                             (1  << 26)
+#       define RADEON_CLAMP_T_WRAP                         (0  << 27)
+#       define RADEON_CLAMP_T_MIRROR                       (1  << 27)
+#       define RADEON_CLAMP_T_CLAMP_LAST                   (2  << 27)
+#       define RADEON_CLAMP_T_MIRROR_CLAMP_LAST            (3  << 27)
+#       define RADEON_CLAMP_T_CLAMP_BORDER                 (4  << 27)
+#       define RADEON_CLAMP_T_MIRROR_CLAMP_BORDER          (5  << 27)
+#       define RADEON_CLAMP_T_CLAMP_GL                     (6  << 27)
+#       define RADEON_CLAMP_T_MIRROR_CLAMP_GL              (7  << 27)
+#       define RADEON_CLAMP_T_MASK                         (7  << 27)
+#       define RADEON_BORDER_MODE_OGL                      (0  << 31)
+#       define RADEON_BORDER_MODE_D3D                      (1  << 31)
+#define RADEON_PP_TXFORMAT_0                0x1c58
+#define RADEON_PP_TXFORMAT_1                0x1c70
+#define RADEON_PP_TXFORMAT_2                0x1c88
+#       define RADEON_TXFORMAT_I8                 (0  <<  0)
+#       define RADEON_TXFORMAT_AI88               (1  <<  0)
+#       define RADEON_TXFORMAT_RGB332             (2  <<  0)
+#       define RADEON_TXFORMAT_ARGB1555           (3  <<  0)
+#       define RADEON_TXFORMAT_RGB565             (4  <<  0)
+#       define RADEON_TXFORMAT_ARGB4444           (5  <<  0)
+#       define RADEON_TXFORMAT_ARGB8888           (6  <<  0)
+#       define RADEON_TXFORMAT_RGBA8888           (7  <<  0)
+#       define RADEON_TXFORMAT_Y8                 (8  <<  0)
+#       define RADEON_TXFORMAT_VYUY422            (10 <<  0)
+#       define RADEON_TXFORMAT_YVYU422            (11 <<  0)
+#       define RADEON_TXFORMAT_DXT1               (12 <<  0)
+#       define RADEON_TXFORMAT_DXT23              (14 <<  0)
+#       define RADEON_TXFORMAT_DXT45              (15 <<  0)
+#       define RADEON_TXFORMAT_SHADOW16           (16 <<  0)
+#       define RADEON_TXFORMAT_SHADOW32           (17 <<  0)
+#       define RADEON_TXFORMAT_DUDV88             (18 <<  0)
+#       define RADEON_TXFORMAT_LDUDV655           (19 <<  0)
+#       define RADEON_TXFORMAT_LDUDUV8888         (20 <<  0)
+#       define RADEON_TXFORMAT_FORMAT_MASK        (31 <<  0)
+#       define RADEON_TXFORMAT_FORMAT_SHIFT       0
+#       define RADEON_TXFORMAT_APPLE_YUV_MODE     (1  <<  5)
+#       define RADEON_TXFORMAT_ALPHA_IN_MAP       (1  <<  6)
+#       define RADEON_TXFORMAT_NON_POWER2         (1  <<  7)
+#       define RADEON_TXFORMAT_WIDTH_MASK         (15 <<  8)
+#       define RADEON_TXFORMAT_WIDTH_SHIFT        8
+#       define RADEON_TXFORMAT_HEIGHT_MASK        (15 << 12)
+#       define RADEON_TXFORMAT_HEIGHT_SHIFT       12
+#       define RADEON_TXFORMAT_F5_WIDTH_MASK      (15 << 16)
+#       define RADEON_TXFORMAT_F5_WIDTH_SHIFT     16
+#       define RADEON_TXFORMAT_F5_HEIGHT_MASK     (15 << 20)
+#       define RADEON_TXFORMAT_F5_HEIGHT_SHIFT    20
+#       define RADEON_TXFORMAT_ST_ROUTE_STQ0      (0  << 24)
+#       define RADEON_TXFORMAT_ST_ROUTE_MASK      (3  << 24)
+#       define RADEON_TXFORMAT_ST_ROUTE_STQ1      (1  << 24)
+#       define RADEON_TXFORMAT_ST_ROUTE_STQ2      (2  << 24)
+#       define RADEON_TXFORMAT_ENDIAN_NO_SWAP     (0  << 26)
+#       define RADEON_TXFORMAT_ENDIAN_16BPP_SWAP  (1  << 26)
+#       define RADEON_TXFORMAT_ENDIAN_32BPP_SWAP  (2  << 26)
+#       define RADEON_TXFORMAT_ENDIAN_HALFDW_SWAP (3  << 26)
+#       define RADEON_TXFORMAT_ALPHA_MASK_ENABLE  (1  << 28)
+#       define RADEON_TXFORMAT_CHROMA_KEY_ENABLE  (1  << 29)
+#       define RADEON_TXFORMAT_CUBIC_MAP_ENABLE   (1  << 30)
+#       define RADEON_TXFORMAT_PERSPECTIVE_ENABLE (1  << 31)
+#define RADEON_PP_CUBIC_FACES_0             0x1d24
+#define RADEON_PP_CUBIC_FACES_1             0x1d28
+#define RADEON_PP_CUBIC_FACES_2             0x1d2c
+#       define RADEON_FACE_WIDTH_1_SHIFT          0
+#       define RADEON_FACE_HEIGHT_1_SHIFT         4
+#       define RADEON_FACE_WIDTH_1_MASK           (0xf << 0)
+#       define RADEON_FACE_HEIGHT_1_MASK          (0xf << 4)
+#       define RADEON_FACE_WIDTH_2_SHIFT          8
+#       define RADEON_FACE_HEIGHT_2_SHIFT         12
+#       define RADEON_FACE_WIDTH_2_MASK           (0xf << 8)
+#       define RADEON_FACE_HEIGHT_2_MASK          (0xf << 12)
+#       define RADEON_FACE_WIDTH_3_SHIFT          16
+#       define RADEON_FACE_HEIGHT_3_SHIFT         20
+#       define RADEON_FACE_WIDTH_3_MASK           (0xf << 16)
+#       define RADEON_FACE_HEIGHT_3_MASK          (0xf << 20)
+#       define RADEON_FACE_WIDTH_4_SHIFT          24
+#       define RADEON_FACE_HEIGHT_4_SHIFT         28
+#       define RADEON_FACE_WIDTH_4_MASK           (0xf << 24)
+#       define RADEON_FACE_HEIGHT_4_MASK          (0xf << 28)
+
+#define RADEON_PP_TXOFFSET_0                0x1c5c /* TXOFFSET_n offsets are 0x18 apart; the values below are listed once for all three */
+#define RADEON_PP_TXOFFSET_1                0x1c74
+#define RADEON_PP_TXOFFSET_2                0x1c8c /* same offset as RADEON_SC_BOTTOM_RIGHT_C */
+#       define RADEON_TXO_ENDIAN_NO_SWAP     (0 << 0) /* TXO_ENDIAN_* values share a 2-bit field at bits 1:0 */
+#       define RADEON_TXO_ENDIAN_BYTE_SWAP   (1 << 0)
+#       define RADEON_TXO_ENDIAN_WORD_SWAP   (2 << 0)
+#       define RADEON_TXO_ENDIAN_HALFDW_SWAP (3 << 0)
+#       define RADEON_TXO_MACRO_LINEAR       (0 << 2)
+#       define RADEON_TXO_MACRO_TILE         (1 << 2)
+#       define RADEON_TXO_MICRO_LINEAR       (0 << 3) /* TXO_MICRO_* values are shifted to bit 3 (bits 4:3) */
+#       define RADEON_TXO_MICRO_TILE_X2      (1 << 3)
+#       define RADEON_TXO_MICRO_TILE_OPT     (2 << 3)
+#       define RADEON_TXO_OFFSET_MASK        0xffffffe0 /* bits 31:5, consistent with RADEON_TXO_OFFSET_SHIFT */
+#       define RADEON_TXO_OFFSET_SHIFT       5
+
+#define RADEON_PP_CUBIC_OFFSET_T0_0         0x1dd0  /* bits [31:5] */
+#define RADEON_PP_CUBIC_OFFSET_T0_1         0x1dd4
+#define RADEON_PP_CUBIC_OFFSET_T0_2         0x1dd8
+#define RADEON_PP_CUBIC_OFFSET_T0_3         0x1ddc
+#define RADEON_PP_CUBIC_OFFSET_T0_4         0x1de0
+#define RADEON_PP_CUBIC_OFFSET_T1_0         0x1e00 /* T1 block is not contiguous with T0 (which ends at 0x1de0) */
+#define RADEON_PP_CUBIC_OFFSET_T1_1         0x1e04
+#define RADEON_PP_CUBIC_OFFSET_T1_2         0x1e08
+#define RADEON_PP_CUBIC_OFFSET_T1_3         0x1e0c
+#define RADEON_PP_CUBIC_OFFSET_T1_4         0x1e10
+#define RADEON_PP_CUBIC_OFFSET_T2_0         0x1e14 /* T2 block follows T1 directly */
+#define RADEON_PP_CUBIC_OFFSET_T2_1         0x1e18
+#define RADEON_PP_CUBIC_OFFSET_T2_2         0x1e1c
+#define RADEON_PP_CUBIC_OFFSET_T2_3         0x1e20
+#define RADEON_PP_CUBIC_OFFSET_T2_4         0x1e24
+
+#define RADEON_PP_TEX_SIZE_0                0x1d04  /* NPOT */
+#define RADEON_PP_TEX_SIZE_1                0x1d0c
+#define RADEON_PP_TEX_SIZE_2                0x1d14
+#       define RADEON_TEX_USIZE_MASK        (0x7ff << 0)
+#       define RADEON_TEX_USIZE_SHIFT       0
+#       define RADEON_TEX_VSIZE_MASK        (0x7ff << 16)
+#       define RADEON_TEX_VSIZE_SHIFT       16
+#       define RADEON_SIGNED_RGB_MASK       (1 << 30)
+#       define RADEON_SIGNED_RGB_SHIFT      30
+#       define RADEON_SIGNED_ALPHA_MASK     (1 << 31)
+#       define RADEON_SIGNED_ALPHA_SHIFT    31
+#define RADEON_PP_TEX_PITCH_0               0x1d08  /* NPOT */
+#define RADEON_PP_TEX_PITCH_1               0x1d10  /* NPOT */
+#define RADEON_PP_TEX_PITCH_2               0x1d18  /* NPOT */
+/* note: bits 13-5: 32 byte aligned stride of texture map */
+
+#define RADEON_PP_TXCBLEND_0                0x1c60 /* TXCBLEND_n offsets are 0x18 apart; the values below are listed once for all three */
+#define RADEON_PP_TXCBLEND_1                0x1c78
+#define RADEON_PP_TXCBLEND_2                0x1c90
+#       define RADEON_COLOR_ARG_A_SHIFT          0
+#       define RADEON_COLOR_ARG_A_MASK           (0x1f << 0) /* bits 4:0 */
+#       define RADEON_COLOR_ARG_A_ZERO           (0    << 0) /* selector value 1 has no name in any of the A/B/C lists */
+#       define RADEON_COLOR_ARG_A_CURRENT_COLOR  (2    << 0)
+#       define RADEON_COLOR_ARG_A_CURRENT_ALPHA  (3    << 0)
+#       define RADEON_COLOR_ARG_A_DIFFUSE_COLOR  (4    << 0)
+#       define RADEON_COLOR_ARG_A_DIFFUSE_ALPHA  (5    << 0)
+#       define RADEON_COLOR_ARG_A_SPECULAR_COLOR (6    << 0)
+#       define RADEON_COLOR_ARG_A_SPECULAR_ALPHA (7    << 0)
+#       define RADEON_COLOR_ARG_A_TFACTOR_COLOR  (8    << 0)
+#       define RADEON_COLOR_ARG_A_TFACTOR_ALPHA  (9    << 0)
+#       define RADEON_COLOR_ARG_A_T0_COLOR       (10   << 0)
+#       define RADEON_COLOR_ARG_A_T0_ALPHA       (11   << 0)
+#       define RADEON_COLOR_ARG_A_T1_COLOR       (12   << 0)
+#       define RADEON_COLOR_ARG_A_T1_ALPHA       (13   << 0)
+#       define RADEON_COLOR_ARG_A_T2_COLOR       (14   << 0)
+#       define RADEON_COLOR_ARG_A_T2_ALPHA       (15   << 0)
+#       define RADEON_COLOR_ARG_A_T3_COLOR       (16   << 0)
+#       define RADEON_COLOR_ARG_A_T3_ALPHA       (17   << 0)
+#       define RADEON_COLOR_ARG_B_SHIFT          5
+#       define RADEON_COLOR_ARG_B_MASK           (0x1f << 5) /* bits 9:5; same selector values as ARG_A */
+#       define RADEON_COLOR_ARG_B_ZERO           (0    << 5)
+#       define RADEON_COLOR_ARG_B_CURRENT_COLOR  (2    << 5)
+#       define RADEON_COLOR_ARG_B_CURRENT_ALPHA  (3    << 5)
+#       define RADEON_COLOR_ARG_B_DIFFUSE_COLOR  (4    << 5)
+#       define RADEON_COLOR_ARG_B_DIFFUSE_ALPHA  (5    << 5)
+#       define RADEON_COLOR_ARG_B_SPECULAR_COLOR (6    << 5)
+#       define RADEON_COLOR_ARG_B_SPECULAR_ALPHA (7    << 5)
+#       define RADEON_COLOR_ARG_B_TFACTOR_COLOR  (8    << 5)
+#       define RADEON_COLOR_ARG_B_TFACTOR_ALPHA  (9    << 5)
+#       define RADEON_COLOR_ARG_B_T0_COLOR       (10   << 5)
+#       define RADEON_COLOR_ARG_B_T0_ALPHA       (11   << 5)
+#       define RADEON_COLOR_ARG_B_T1_COLOR       (12   << 5)
+#       define RADEON_COLOR_ARG_B_T1_ALPHA       (13   << 5)
+#       define RADEON_COLOR_ARG_B_T2_COLOR       (14   << 5)
+#       define RADEON_COLOR_ARG_B_T2_ALPHA       (15   << 5)
+#       define RADEON_COLOR_ARG_B_T3_COLOR       (16   << 5)
+#       define RADEON_COLOR_ARG_B_T3_ALPHA       (17   << 5)
+#       define RADEON_COLOR_ARG_C_SHIFT          10
+#       define RADEON_COLOR_ARG_C_MASK           (0x1f << 10) /* bits 14:10; same selector values as ARG_A */
+#       define RADEON_COLOR_ARG_C_ZERO           (0    << 10)
+#       define RADEON_COLOR_ARG_C_CURRENT_COLOR  (2    << 10)
+#       define RADEON_COLOR_ARG_C_CURRENT_ALPHA  (3    << 10)
+#       define RADEON_COLOR_ARG_C_DIFFUSE_COLOR  (4    << 10)
+#       define RADEON_COLOR_ARG_C_DIFFUSE_ALPHA  (5    << 10)
+#       define RADEON_COLOR_ARG_C_SPECULAR_COLOR (6    << 10)
+#       define RADEON_COLOR_ARG_C_SPECULAR_ALPHA (7    << 10)
+#       define RADEON_COLOR_ARG_C_TFACTOR_COLOR  (8    << 10)
+#       define RADEON_COLOR_ARG_C_TFACTOR_ALPHA  (9    << 10)
+#       define RADEON_COLOR_ARG_C_T0_COLOR       (10   << 10)
+#       define RADEON_COLOR_ARG_C_T0_ALPHA       (11   << 10)
+#       define RADEON_COLOR_ARG_C_T1_COLOR       (12   << 10)
+#       define RADEON_COLOR_ARG_C_T1_ALPHA       (13   << 10)
+#       define RADEON_COLOR_ARG_C_T2_COLOR       (14   << 10)
+#       define RADEON_COLOR_ARG_C_T2_ALPHA       (15   << 10)
+#       define RADEON_COLOR_ARG_C_T3_COLOR       (16   << 10)
+#       define RADEON_COLOR_ARG_C_T3_ALPHA       (17   << 10)
+#       define RADEON_COMP_ARG_A                 (1 << 15)
+#       define RADEON_COMP_ARG_A_SHIFT           15
+#       define RADEON_COMP_ARG_B                 (1 << 16)
+#       define RADEON_COMP_ARG_B_SHIFT           16
+#       define RADEON_COMP_ARG_C                 (1 << 17)
+#       define RADEON_COMP_ARG_C_SHIFT           17
+#       define RADEON_BLEND_CTL_MASK             (7 << 18) /* bits 20:18 */
+#       define RADEON_BLEND_CTL_ADD              (0 << 18)
+#       define RADEON_BLEND_CTL_SUBTRACT         (1 << 18)
+#       define RADEON_BLEND_CTL_ADDSIGNED        (2 << 18)
+#       define RADEON_BLEND_CTL_BLEND            (3 << 18)
+#       define RADEON_BLEND_CTL_DOT3             (4 << 18)
+#       define RADEON_SCALE_SHIFT                21
+#       define RADEON_SCALE_MASK                 (3 << 21) /* bits 22:21 */
+#       define RADEON_SCALE_1X                   (0 << 21)
+#       define RADEON_SCALE_2X                   (1 << 21)
+#       define RADEON_SCALE_4X                   (2 << 21)
+#       define RADEON_CLAMP_TX                   (1 << 23)
+#       define RADEON_T0_EQ_TCUR                 (1 << 24)
+#       define RADEON_T1_EQ_TCUR                 (1 << 25)
+#       define RADEON_T2_EQ_TCUR                 (1 << 26)
+#       define RADEON_T3_EQ_TCUR                 (1 << 27)
+#       define RADEON_COLOR_ARG_MASK             0x1f /* unshifted 5-bit selector mask */
+#       define RADEON_COMP_ARG_SHIFT             15 /* equals RADEON_COMP_ARG_A_SHIFT */
+#define RADEON_PP_TXABLEND_0                0x1c64 /* TXABLEND_n offsets are 0x18 apart; the values below are listed once for all three */
+#define RADEON_PP_TXABLEND_1                0x1c7c
+#define RADEON_PP_TXABLEND_2                0x1c94
+#       define RADEON_ALPHA_ARG_A_SHIFT          0
+#       define RADEON_ALPHA_ARG_A_MASK           (0xf << 0) /* bits 3:0 */
+#       define RADEON_ALPHA_ARG_A_ZERO           (0   << 0)
+#       define RADEON_ALPHA_ARG_A_CURRENT_ALPHA  (1   << 0)
+#       define RADEON_ALPHA_ARG_A_DIFFUSE_ALPHA  (2   << 0)
+#       define RADEON_ALPHA_ARG_A_SPECULAR_ALPHA (3   << 0)
+#       define RADEON_ALPHA_ARG_A_TFACTOR_ALPHA  (4   << 0)
+#       define RADEON_ALPHA_ARG_A_T0_ALPHA       (5   << 0)
+#       define RADEON_ALPHA_ARG_A_T1_ALPHA       (6   << 0)
+#       define RADEON_ALPHA_ARG_A_T2_ALPHA       (7   << 0)
+#       define RADEON_ALPHA_ARG_A_T3_ALPHA       (8   << 0)
+#       define RADEON_ALPHA_ARG_B_SHIFT          4
+#       define RADEON_ALPHA_ARG_B_MASK           (0xf << 4) /* bits 7:4; same selector values as ARG_A */
+#       define RADEON_ALPHA_ARG_B_ZERO           (0   << 4)
+#       define RADEON_ALPHA_ARG_B_CURRENT_ALPHA  (1   << 4)
+#       define RADEON_ALPHA_ARG_B_DIFFUSE_ALPHA  (2   << 4)
+#       define RADEON_ALPHA_ARG_B_SPECULAR_ALPHA (3   << 4)
+#       define RADEON_ALPHA_ARG_B_TFACTOR_ALPHA  (4   << 4)
+#       define RADEON_ALPHA_ARG_B_T0_ALPHA       (5   << 4)
+#       define RADEON_ALPHA_ARG_B_T1_ALPHA       (6   << 4)
+#       define RADEON_ALPHA_ARG_B_T2_ALPHA       (7   << 4)
+#       define RADEON_ALPHA_ARG_B_T3_ALPHA       (8   << 4)
+#       define RADEON_ALPHA_ARG_C_SHIFT          8
+#       define RADEON_ALPHA_ARG_C_MASK           (0xf << 8) /* bits 11:8; same selector values as ARG_A */
+#       define RADEON_ALPHA_ARG_C_ZERO           (0   << 8)
+#       define RADEON_ALPHA_ARG_C_CURRENT_ALPHA  (1   << 8)
+#       define RADEON_ALPHA_ARG_C_DIFFUSE_ALPHA  (2   << 8)
+#       define RADEON_ALPHA_ARG_C_SPECULAR_ALPHA (3   << 8)
+#       define RADEON_ALPHA_ARG_C_TFACTOR_ALPHA  (4   << 8)
+#       define RADEON_ALPHA_ARG_C_T0_ALPHA       (5   << 8)
+#       define RADEON_ALPHA_ARG_C_T1_ALPHA       (6   << 8)
+#       define RADEON_ALPHA_ARG_C_T2_ALPHA       (7   << 8)
+#       define RADEON_ALPHA_ARG_C_T3_ALPHA       (8   << 8)
+#       define RADEON_DOT_ALPHA_DONT_REPLICATE   (1   << 9) /* NOTE(review): bit 9 lies inside RADEON_ALPHA_ARG_C_MASK (bits 11:8) and equals RADEON_ALPHA_ARG_C_DIFFUSE_ALPHA -- confirm against the register spec */
+#       define RADEON_ALPHA_ARG_MASK             0xf /* unshifted 4-bit selector mask */
+
+#define RADEON_PP_TFACTOR_0                 0x1c68 /* TFACTOR_n offsets are 0x18 apart */
+#define RADEON_PP_TFACTOR_1                 0x1c80
+#define RADEON_PP_TFACTOR_2                 0x1c98
+
+#define RADEON_RB3D_BLENDCNTL               0x1c20
+#       define RADEON_COMB_FCN_MASK                    (3  << 12) /* bits 13:12 */
+#       define RADEON_COMB_FCN_ADD_CLAMP               (0  << 12)
+#       define RADEON_COMB_FCN_ADD_NOCLAMP             (1  << 12)
+#       define RADEON_COMB_FCN_SUB_CLAMP               (2  << 12)
+#       define RADEON_COMB_FCN_SUB_NOCLAMP             (3  << 12)
+#       define RADEON_SRC_BLEND_GL_ZERO                (32 << 16) /* blend factor codes start at 32 in both SRC and DST lists */
+#       define RADEON_SRC_BLEND_GL_ONE                 (33 << 16)
+#       define RADEON_SRC_BLEND_GL_SRC_COLOR           (34 << 16)
+#       define RADEON_SRC_BLEND_GL_ONE_MINUS_SRC_COLOR (35 << 16)
+#       define RADEON_SRC_BLEND_GL_DST_COLOR           (36 << 16)
+#       define RADEON_SRC_BLEND_GL_ONE_MINUS_DST_COLOR (37 << 16)
+#       define RADEON_SRC_BLEND_GL_SRC_ALPHA           (38 << 16)
+#       define RADEON_SRC_BLEND_GL_ONE_MINUS_SRC_ALPHA (39 << 16)
+#       define RADEON_SRC_BLEND_GL_DST_ALPHA           (40 << 16)
+#       define RADEON_SRC_BLEND_GL_ONE_MINUS_DST_ALPHA (41 << 16)
+#       define RADEON_SRC_BLEND_GL_SRC_ALPHA_SATURATE  (42 << 16) /* SRC only: the DST list stops at code 41 */
+#       define RADEON_SRC_BLEND_MASK                   (63 << 16) /* bits 21:16 */
+#       define RADEON_DST_BLEND_GL_ZERO                (32 << 24)
+#       define RADEON_DST_BLEND_GL_ONE                 (33 << 24)
+#       define RADEON_DST_BLEND_GL_SRC_COLOR           (34 << 24)
+#       define RADEON_DST_BLEND_GL_ONE_MINUS_SRC_COLOR (35 << 24)
+#       define RADEON_DST_BLEND_GL_DST_COLOR           (36 << 24)
+#       define RADEON_DST_BLEND_GL_ONE_MINUS_DST_COLOR (37 << 24)
+#       define RADEON_DST_BLEND_GL_SRC_ALPHA           (38 << 24)
+#       define RADEON_DST_BLEND_GL_ONE_MINUS_SRC_ALPHA (39 << 24)
+#       define RADEON_DST_BLEND_GL_DST_ALPHA           (40 << 24)
+#       define RADEON_DST_BLEND_GL_ONE_MINUS_DST_ALPHA (41 << 24)
+#       define RADEON_DST_BLEND_MASK                   (63 << 24) /* bits 29:24 */
+#define RADEON_RB3D_CNTL                    0x1c3c
+#       define RADEON_ALPHA_BLEND_ENABLE       (1  <<  0)
+#       define RADEON_PLANE_MASK_ENABLE        (1  <<  1)
+#       define RADEON_DITHER_ENABLE            (1  <<  2)
+#       define RADEON_ROUND_ENABLE             (1  <<  3)
+#       define RADEON_SCALE_DITHER_ENABLE      (1  <<  4)
+#       define RADEON_DITHER_INIT              (1  <<  5)
+#       define RADEON_ROP_ENABLE               (1  <<  6)
+#       define RADEON_STENCIL_ENABLE           (1  <<  7)
+#       define RADEON_Z_ENABLE                 (1  <<  8)
+#       define RADEON_DEPTH_XZ_OFFEST_ENABLE   (1  <<  9)
+#       define RADEON_COLOR_FORMAT_ARGB1555    (3  << 10)
+#       define RADEON_COLOR_FORMAT_RGB565      (4  << 10)
+#       define RADEON_COLOR_FORMAT_ARGB8888    (6  << 10)
+#       define RADEON_COLOR_FORMAT_RGB332      (7  << 10)
+#       define RADEON_COLOR_FORMAT_Y8          (8  << 10)
+#       define RADEON_COLOR_FORMAT_RGB8        (9  << 10)
+#       define RADEON_COLOR_FORMAT_YUV422_VYUY (11 << 10)
+#       define RADEON_COLOR_FORMAT_YUV422_YVYU (12 << 10)
+#       define RADEON_COLOR_FORMAT_aYUV444     (14 << 10)
+#       define RADEON_COLOR_FORMAT_ARGB4444    (15 << 10)
+#       define RADEON_CLRCMP_FLIP_ENABLE       (1  << 14)
+#       define RADEON_ZBLOCK16                 (1  << 15)
+#define RADEON_RB3D_COLOROFFSET             0x1c40
+#       define RADEON_COLOROFFSET_MASK      0xfffffff0
+#define RADEON_RB3D_COLORPITCH              0x1c48
+#       define RADEON_COLORPITCH_MASK         0x00001ff8 /* was written with 9 hex digits; value unchanged (0x1ff8) */
+#       define RADEON_COLOR_TILE_ENABLE       (1 << 16)
+#       define RADEON_COLOR_MICROTILE_ENABLE  (1 << 17)
+#       define RADEON_COLOR_ENDIAN_NO_SWAP    (0 << 18)
+#       define RADEON_COLOR_ENDIAN_WORD_SWAP  (1 << 18)
+#       define RADEON_COLOR_ENDIAN_DWORD_SWAP (2 << 18)
+#define RADEON_RB3D_DEPTHOFFSET             0x1c24
+#define RADEON_RB3D_DEPTHPITCH              0x1c28
+#       define RADEON_DEPTHPITCH_MASK         0x00001ff8
+#       define RADEON_DEPTH_HYPERZ            (3 << 16)
+#       define RADEON_DEPTH_ENDIAN_NO_SWAP    (0 << 18)
+#       define RADEON_DEPTH_ENDIAN_WORD_SWAP  (1 << 18)
+#       define RADEON_DEPTH_ENDIAN_DWORD_SWAP (2 << 18)
+#define RADEON_RB3D_PLANEMASK               0x1d84
+#define RADEON_RB3D_ROPCNTL                 0x1d80
+#       define RADEON_ROP_MASK              (15 << 8)
+#       define RADEON_ROP_CLEAR             (0  << 8)
+#       define RADEON_ROP_NOR               (1  << 8)
+#       define RADEON_ROP_AND_INVERTED      (2  << 8)
+#       define RADEON_ROP_COPY_INVERTED     (3  << 8)
+#       define RADEON_ROP_AND_REVERSE       (4  << 8)
+#       define RADEON_ROP_INVERT            (5  << 8)
+#       define RADEON_ROP_XOR               (6  << 8)
+#       define RADEON_ROP_NAND              (7  << 8)
+#       define RADEON_ROP_AND               (8  << 8)
+#       define RADEON_ROP_EQUIV             (9  << 8)
+#       define RADEON_ROP_NOOP              (10 << 8)
+#       define RADEON_ROP_OR_INVERTED       (11 << 8)
+#       define RADEON_ROP_COPY              (12 << 8)
+#       define RADEON_ROP_OR_REVERSE        (13 << 8)
+#       define RADEON_ROP_OR                (14 << 8)
+#       define RADEON_ROP_SET               (15 << 8)
+#define RADEON_RB3D_STENCILREFMASK          0x1d7c
+#       define RADEON_STENCIL_REF_SHIFT       0
+#       define RADEON_STENCIL_REF_MASK        (0xff << 0)
+#       define RADEON_STENCIL_MASK_SHIFT      16
+#       define RADEON_STENCIL_VALUE_MASK      (0xff << 16)
+#       define RADEON_STENCIL_WRITEMASK_SHIFT 24
+#       define RADEON_STENCIL_WRITE_MASK      (0xffu << 24) /* unsigned: 0xff << 24 overflows a 32-bit signed int */
+#define RADEON_RB3D_ZSTENCILCNTL            0x1c2c
+#       define RADEON_DEPTH_FORMAT_MASK          (0xf << 0)
+#       define RADEON_DEPTH_FORMAT_16BIT_INT_Z   (0  <<  0)
+#       define RADEON_DEPTH_FORMAT_24BIT_INT_Z   (2  <<  0)
+#       define RADEON_DEPTH_FORMAT_24BIT_FLOAT_Z (3  <<  0)
+#       define RADEON_DEPTH_FORMAT_32BIT_INT_Z   (4  <<  0)
+#       define RADEON_DEPTH_FORMAT_32BIT_FLOAT_Z (5  <<  0)
+#       define RADEON_DEPTH_FORMAT_16BIT_FLOAT_W (7  <<  0)
+#       define RADEON_DEPTH_FORMAT_24BIT_FLOAT_W (9  <<  0)
+#       define RADEON_DEPTH_FORMAT_32BIT_FLOAT_W (11 <<  0)
+#       define RADEON_Z_TEST_NEVER               (0  <<  4)
+#       define RADEON_Z_TEST_LESS                (1  <<  4)
+#       define RADEON_Z_TEST_LEQUAL              (2  <<  4)
+#       define RADEON_Z_TEST_EQUAL               (3  <<  4)
+#       define RADEON_Z_TEST_GEQUAL              (4  <<  4)
+#       define RADEON_Z_TEST_GREATER             (5  <<  4)
+#       define RADEON_Z_TEST_NEQUAL              (6  <<  4)
+#       define RADEON_Z_TEST_ALWAYS              (7  <<  4)
+#       define RADEON_Z_TEST_MASK                (7  <<  4)
+#       define RADEON_Z_HIERARCHY_ENABLE         (1  <<  8)
+#       define RADEON_STENCIL_TEST_NEVER         (0  << 12)
+#       define RADEON_STENCIL_TEST_LESS          (1  << 12)
+#       define RADEON_STENCIL_TEST_LEQUAL        (2  << 12)
+#       define RADEON_STENCIL_TEST_EQUAL         (3  << 12)
+#       define RADEON_STENCIL_TEST_GEQUAL        (4  << 12)
+#       define RADEON_STENCIL_TEST_GREATER       (5  << 12)
+#       define RADEON_STENCIL_TEST_NEQUAL        (6  << 12)
+#       define RADEON_STENCIL_TEST_ALWAYS        (7  << 12)
+#       define RADEON_STENCIL_TEST_MASK          (0x7 << 12)
+#       define RADEON_STENCIL_FAIL_KEEP          (0  << 16)
+#       define RADEON_STENCIL_FAIL_ZERO          (1  << 16)
+#       define RADEON_STENCIL_FAIL_REPLACE       (2  << 16)
+#       define RADEON_STENCIL_FAIL_INC           (3  << 16)
+#       define RADEON_STENCIL_FAIL_DEC           (4  << 16)
+#       define RADEON_STENCIL_FAIL_INVERT        (5  << 16)
+#       define RADEON_STENCIL_FAIL_INC_WRAP      (6  << 16)
+#       define RADEON_STENCIL_FAIL_DEC_WRAP      (7  << 16)
+#       define RADEON_STENCIL_FAIL_MASK          (0x7 << 16)
+#       define RADEON_STENCIL_ZPASS_KEEP         (0  << 20)
+#       define RADEON_STENCIL_ZPASS_ZERO         (1  << 20)
+#       define RADEON_STENCIL_ZPASS_REPLACE      (2  << 20)
+#       define RADEON_STENCIL_ZPASS_INC          (3  << 20)
+#       define RADEON_STENCIL_ZPASS_DEC          (4  << 20)
+#       define RADEON_STENCIL_ZPASS_INVERT       (5  << 20)
+#       define RADEON_STENCIL_ZPASS_INC_WRAP     (6  << 20)
+#       define RADEON_STENCIL_ZPASS_DEC_WRAP     (7  << 20)
+#       define RADEON_STENCIL_ZPASS_MASK         (0x7 << 20)
+#       define RADEON_STENCIL_ZFAIL_KEEP         (0  << 24)
+#       define RADEON_STENCIL_ZFAIL_ZERO         (1  << 24)
+#       define RADEON_STENCIL_ZFAIL_REPLACE      (2  << 24)
+#       define RADEON_STENCIL_ZFAIL_INC          (3  << 24)
+#       define RADEON_STENCIL_ZFAIL_DEC          (4  << 24)
+#       define RADEON_STENCIL_ZFAIL_INVERT       (5  << 24)
+#       define RADEON_STENCIL_ZFAIL_INC_WRAP     (6  << 24)
+#       define RADEON_STENCIL_ZFAIL_DEC_WRAP     (7  << 24)
+#       define RADEON_STENCIL_ZFAIL_MASK         (0x7 << 24)
+#       define RADEON_Z_COMPRESSION_ENABLE       (1  << 28)
+#       define RADEON_FORCE_Z_DIRTY              (1  << 29)
+#       define RADEON_Z_WRITE_ENABLE             (1  << 30)
+#       define RADEON_Z_DECOMPRESSION_ENABLE     (1u << 31) /* unsigned: 1 << 31 is undefined for a 32-bit signed int */
+#define RADEON_RE_LINE_PATTERN              0x1cd0
+#       define RADEON_LINE_PATTERN_MASK             0x0000ffff
+#       define RADEON_LINE_REPEAT_COUNT_SHIFT       16
+#       define RADEON_LINE_PATTERN_START_SHIFT      24
+#       define RADEON_LINE_PATTERN_LITTLE_BIT_ORDER (0 << 28)
+#       define RADEON_LINE_PATTERN_BIG_BIT_ORDER    (1 << 28)
+#       define RADEON_LINE_PATTERN_AUTO_RESET       (1 << 29)
+#define RADEON_RE_LINE_STATE                0x1cd4
+#       define RADEON_LINE_CURRENT_PTR_SHIFT   0
+#       define RADEON_LINE_CURRENT_COUNT_SHIFT 8
+#define RADEON_RE_MISC                      0x26c4
+#       define RADEON_STIPPLE_COORD_MASK       0x1f
+#       define RADEON_STIPPLE_X_OFFSET_SHIFT   0
+#       define RADEON_STIPPLE_X_OFFSET_MASK    (0x1f << 0)
+#       define RADEON_STIPPLE_Y_OFFSET_SHIFT   8
+#       define RADEON_STIPPLE_Y_OFFSET_MASK    (0x1f << 8)
+#       define RADEON_STIPPLE_LITTLE_BIT_ORDER (0 << 16)
+#       define RADEON_STIPPLE_BIG_BIT_ORDER    (1 << 16)
+#define RADEON_RE_SOLID_COLOR               0x1c1c
+#define RADEON_RE_TOP_LEFT                  0x26c0
+#       define RADEON_RE_LEFT_SHIFT         0
+#       define RADEON_RE_TOP_SHIFT          16
+#define RADEON_RE_WIDTH_HEIGHT              0x1c44
+#       define RADEON_RE_WIDTH_SHIFT        0
+#       define RADEON_RE_HEIGHT_SHIFT       16
+
+#define RADEON_SE_CNTL                      0x1c4c
+#       define RADEON_FFACE_CULL_CW          (0 <<  0)
+#       define RADEON_FFACE_CULL_CCW         (1 <<  0)
+#       define RADEON_FFACE_CULL_DIR_MASK    (1 <<  0)
+#       define RADEON_BFACE_CULL             (0 <<  1)
+#       define RADEON_BFACE_SOLID            (3 <<  1)
+#       define RADEON_FFACE_CULL             (0 <<  3)
+#       define RADEON_FFACE_SOLID            (3 <<  3)
+#       define RADEON_FFACE_CULL_MASK        (3 <<  3)
+#       define RADEON_BADVTX_CULL_DISABLE    (1 <<  5)
+#       define RADEON_FLAT_SHADE_VTX_0       (0 <<  6)
+#       define RADEON_FLAT_SHADE_VTX_1       (1 <<  6)
+#       define RADEON_FLAT_SHADE_VTX_2       (2 <<  6)
+#       define RADEON_FLAT_SHADE_VTX_LAST    (3 <<  6)
+#       define RADEON_DIFFUSE_SHADE_SOLID    (0 <<  8)
+#       define RADEON_DIFFUSE_SHADE_FLAT     (1 <<  8)
+#       define RADEON_DIFFUSE_SHADE_GOURAUD  (2 <<  8)
+#       define RADEON_DIFFUSE_SHADE_MASK     (3 <<  8)
+#       define RADEON_ALPHA_SHADE_SOLID      (0 << 10)
+#       define RADEON_ALPHA_SHADE_FLAT       (1 << 10)
+#       define RADEON_ALPHA_SHADE_GOURAUD    (2 << 10)
+#       define RADEON_ALPHA_SHADE_MASK       (3 << 10)
+#       define RADEON_SPECULAR_SHADE_SOLID   (0 << 12)
+#       define RADEON_SPECULAR_SHADE_FLAT    (1 << 12)
+#       define RADEON_SPECULAR_SHADE_GOURAUD (2 << 12)
+#       define RADEON_SPECULAR_SHADE_MASK    (3 << 12)
+#       define RADEON_FOG_SHADE_SOLID        (0 << 14)
+#       define RADEON_FOG_SHADE_FLAT         (1 << 14)
+#       define RADEON_FOG_SHADE_GOURAUD      (2 << 14)
+#       define RADEON_FOG_SHADE_MASK         (3 << 14)
+#       define RADEON_ZBIAS_ENABLE_POINT     (1 << 16)
+#       define RADEON_ZBIAS_ENABLE_LINE      (1 << 17)
+#       define RADEON_ZBIAS_ENABLE_TRI       (1 << 18)
+#       define RADEON_WIDELINE_ENABLE        (1 << 20)
+#       define RADEON_VPORT_XY_XFORM_ENABLE  (1 << 24)
+#       define RADEON_VPORT_Z_XFORM_ENABLE   (1 << 25)
+#       define RADEON_VTX_PIX_CENTER_D3D     (0 << 27)
+#       define RADEON_VTX_PIX_CENTER_OGL     (1 << 27)
+#       define RADEON_ROUND_MODE_TRUNC       (0 << 28)
+#       define RADEON_ROUND_MODE_ROUND       (1 << 28)
+#       define RADEON_ROUND_MODE_ROUND_EVEN  (2 << 28)
+#       define RADEON_ROUND_MODE_ROUND_ODD   (3 << 28)
+#       define RADEON_ROUND_PREC_16TH_PIX    (0 << 30)
+#       define RADEON_ROUND_PREC_8TH_PIX     (1 << 30)
+#       define RADEON_ROUND_PREC_4TH_PIX     (2u << 30) /* unsigned: result sets bit 31, undefined for a 32-bit signed int */
+#       define RADEON_ROUND_PREC_HALF_PIX    (3u << 30) /* unsigned: result sets bit 31, undefined for a 32-bit signed int */
+#define RADEON_SE_CNTL_STATUS               0x2140
+#       define RADEON_VC_NO_SWAP            (0 << 0)
+#       define RADEON_VC_16BIT_SWAP         (1 << 0)
+#       define RADEON_VC_32BIT_SWAP         (2 << 0)
+#       define RADEON_VC_HALF_DWORD_SWAP    (3 << 0)
+#       define RADEON_TCL_BYPASS            (1 << 8)
+#define RADEON_SE_COORD_FMT                 0x1c50
+#       define RADEON_VTX_XY_PRE_MULT_1_OVER_W0  (1 <<  0)
+#       define RADEON_VTX_Z_PRE_MULT_1_OVER_W0   (1 <<  1)
+#       define RADEON_VTX_ST0_NONPARAMETRIC      (1 <<  8)
+#       define RADEON_VTX_ST1_NONPARAMETRIC      (1 <<  9)
+#       define RADEON_VTX_ST2_NONPARAMETRIC      (1 << 10)
+#       define RADEON_VTX_ST3_NONPARAMETRIC      (1 << 11)
+#       define RADEON_VTX_W0_NORMALIZE           (1 << 12)
+#       define RADEON_VTX_W0_IS_NOT_1_OVER_W0    (1 << 16)
+#       define RADEON_VTX_ST0_PRE_MULT_1_OVER_W0 (1 << 17)
+#       define RADEON_VTX_ST1_PRE_MULT_1_OVER_W0 (1 << 19)
+#       define RADEON_VTX_ST2_PRE_MULT_1_OVER_W0 (1 << 21)
+#       define RADEON_VTX_ST3_PRE_MULT_1_OVER_W0 (1 << 23)
+#       define RADEON_TEX1_W_ROUTING_USE_W0      (0 << 26)
+#       define RADEON_TEX1_W_ROUTING_USE_Q1      (1 << 26)
+#define RADEON_SE_LINE_WIDTH                0x1db8
+#define RADEON_SE_TCL_LIGHT_MODEL_CTL       0x226c
+#       define RADEON_LIGHTING_ENABLE              (1 << 0)
+#       define RADEON_LIGHT_IN_MODELSPACE          (1 << 1)
+#       define RADEON_LOCAL_VIEWER                 (1 << 2)
+#       define RADEON_NORMALIZE_NORMALS            (1 << 3)
+#       define RADEON_RESCALE_NORMALS              (1 << 4)
+#       define RADEON_SPECULAR_LIGHTS              (1 << 5)
+#       define RADEON_DIFFUSE_SPECULAR_COMBINE     (1 << 6)
+#       define RADEON_LIGHT_ALPHA                  (1 << 7)
+#       define RADEON_LOCAL_LIGHT_VEC_GL           (1 << 8)
+#       define RADEON_LIGHT_NO_NORMAL_AMBIENT_ONLY (1 << 9)
+#       define RADEON_LM_SOURCE_STATE_PREMULT      0
+#       define RADEON_LM_SOURCE_STATE_MULT         1
+#       define RADEON_LM_SOURCE_VERTEX_DIFFUSE     2
+#       define RADEON_LM_SOURCE_VERTEX_SPECULAR    3
+#       define RADEON_EMISSIVE_SOURCE_SHIFT        16
+#       define RADEON_AMBIENT_SOURCE_SHIFT         18
+#       define RADEON_DIFFUSE_SOURCE_SHIFT         20
+#       define RADEON_SPECULAR_SOURCE_SHIFT        22
+#define RADEON_SE_TCL_MATERIAL_AMBIENT_RED     0x2220
+#define RADEON_SE_TCL_MATERIAL_AMBIENT_GREEN   0x2224
+#define RADEON_SE_TCL_MATERIAL_AMBIENT_BLUE    0x2228
+#define RADEON_SE_TCL_MATERIAL_AMBIENT_ALPHA   0x222c
+#define RADEON_SE_TCL_MATERIAL_DIFFUSE_RED     0x2230
+#define RADEON_SE_TCL_MATERIAL_DIFFUSE_GREEN   0x2234
+#define RADEON_SE_TCL_MATERIAL_DIFFUSE_BLUE    0x2238
+#define RADEON_SE_TCL_MATERIAL_DIFFUSE_ALPHA   0x223c
+#define RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED   0x2210
+#define RADEON_SE_TCL_MATERIAL_EMMISSIVE_GREEN 0x2214
+#define RADEON_SE_TCL_MATERIAL_EMMISSIVE_BLUE  0x2218
+#define RADEON_SE_TCL_MATERIAL_EMMISSIVE_ALPHA 0x221c
+#define RADEON_SE_TCL_MATERIAL_SPECULAR_RED    0x2240
+#define RADEON_SE_TCL_MATERIAL_SPECULAR_GREEN  0x2244
+#define RADEON_SE_TCL_MATERIAL_SPECULAR_BLUE   0x2248
+#define RADEON_SE_TCL_MATERIAL_SPECULAR_ALPHA  0x224c
+#define RADEON_SE_TCL_MATRIX_SELECT_0       0x225c
+#       define RADEON_MODELVIEW_0_SHIFT        0
+#       define RADEON_MODELVIEW_1_SHIFT        4
+#       define RADEON_MODELVIEW_2_SHIFT        8
+#       define RADEON_MODELVIEW_3_SHIFT        12
+#       define RADEON_IT_MODELVIEW_0_SHIFT     16
+#       define RADEON_IT_MODELVIEW_1_SHIFT     20
+#       define RADEON_IT_MODELVIEW_2_SHIFT     24
+#       define RADEON_IT_MODELVIEW_3_SHIFT     28
+#define RADEON_SE_TCL_MATRIX_SELECT_1       0x2260
+#       define RADEON_MODELPROJECT_0_SHIFT     0
+#       define RADEON_MODELPROJECT_1_SHIFT     4
+#       define RADEON_MODELPROJECT_2_SHIFT     8
+#       define RADEON_MODELPROJECT_3_SHIFT     12
+#       define RADEON_TEXMAT_0_SHIFT           16
+#       define RADEON_TEXMAT_1_SHIFT           20
+#       define RADEON_TEXMAT_2_SHIFT           24
+#       define RADEON_TEXMAT_3_SHIFT           28
+
+
+#define RADEON_SE_TCL_OUTPUT_VTX_FMT        0x2254
+#       define RADEON_TCL_VTX_W0                 (1 <<  0)
+#       define RADEON_TCL_VTX_FP_DIFFUSE         (1 <<  1)
+#       define RADEON_TCL_VTX_FP_ALPHA           (1 <<  2)
+#       define RADEON_TCL_VTX_PK_DIFFUSE         (1 <<  3)
+#       define RADEON_TCL_VTX_FP_SPEC            (1 <<  4)
+#       define RADEON_TCL_VTX_FP_FOG             (1 <<  5)
+#       define RADEON_TCL_VTX_PK_SPEC            (1 <<  6)
+#       define RADEON_TCL_VTX_ST0                (1 <<  7)
+#       define RADEON_TCL_VTX_ST1                (1 <<  8)
+#       define RADEON_TCL_VTX_Q1                 (1 <<  9)
+#       define RADEON_TCL_VTX_ST2                (1 << 10)
+#       define RADEON_TCL_VTX_Q2                 (1 << 11)
+#       define RADEON_TCL_VTX_ST3                (1 << 12)
+#       define RADEON_TCL_VTX_Q3                 (1 << 13)
+#       define RADEON_TCL_VTX_Q0                 (1 << 14)
+#       define RADEON_TCL_VTX_WEIGHT_COUNT_SHIFT 15
+#       define RADEON_TCL_VTX_NORM0              (1 << 18)
+#       define RADEON_TCL_VTX_XY1                (1 << 27)
+#       define RADEON_TCL_VTX_Z1                 (1 << 28)
+#       define RADEON_TCL_VTX_W1                 (1 << 29)
+#       define RADEON_TCL_VTX_NORM1              (1 << 30)
+#       define RADEON_TCL_VTX_Z0                 (1u << 31) /* unsigned: 1 << 31 is undefined for a 32-bit signed int */
+
+#define RADEON_SE_TCL_OUTPUT_VTX_SEL        0x2258
+#       define RADEON_TCL_COMPUTE_XYZW           (1 << 0)
+#       define RADEON_TCL_COMPUTE_DIFFUSE        (1 << 1)
+#       define RADEON_TCL_COMPUTE_SPECULAR       (1 << 2)
+#       define RADEON_TCL_FORCE_NAN_IF_COLOR_NAN (1 << 3)
+#       define RADEON_TCL_FORCE_INORDER_PROC     (1 << 4)
+#       define RADEON_TCL_TEX_INPUT_TEX_0        0
+#       define RADEON_TCL_TEX_INPUT_TEX_1        1
+#       define RADEON_TCL_TEX_INPUT_TEX_2        2
+#       define RADEON_TCL_TEX_INPUT_TEX_3        3
+#       define RADEON_TCL_TEX_COMPUTED_TEX_0     8
+#       define RADEON_TCL_TEX_COMPUTED_TEX_1     9
+#       define RADEON_TCL_TEX_COMPUTED_TEX_2     10
+#       define RADEON_TCL_TEX_COMPUTED_TEX_3     11
+#       define RADEON_TCL_TEX_0_OUTPUT_SHIFT     16
+#       define RADEON_TCL_TEX_1_OUTPUT_SHIFT     20
+#       define RADEON_TCL_TEX_2_OUTPUT_SHIFT     24
+#       define RADEON_TCL_TEX_3_OUTPUT_SHIFT     28
+
+#define RADEON_SE_TCL_PER_LIGHT_CTL_0       0x2270
+#       define RADEON_LIGHT_0_ENABLE               (1 <<  0)
+#       define RADEON_LIGHT_0_ENABLE_AMBIENT       (1 <<  1)
+#       define RADEON_LIGHT_0_ENABLE_SPECULAR      (1 <<  2)
+#       define RADEON_LIGHT_0_IS_LOCAL             (1 <<  3)
+#       define RADEON_LIGHT_0_IS_SPOT              (1 <<  4)
+#       define RADEON_LIGHT_0_DUAL_CONE            (1 <<  5)
+#       define RADEON_LIGHT_0_ENABLE_RANGE_ATTEN   (1 <<  6)
+#       define RADEON_LIGHT_0_CONSTANT_RANGE_ATTEN (1 <<  7)
+#       define RADEON_LIGHT_0_SHIFT                0
+#       define RADEON_LIGHT_1_ENABLE               (1 << 16)
+#       define RADEON_LIGHT_1_ENABLE_AMBIENT       (1 << 17)
+#       define RADEON_LIGHT_1_ENABLE_SPECULAR      (1 << 18)
+#       define RADEON_LIGHT_1_IS_LOCAL             (1 << 19)
+#       define RADEON_LIGHT_1_IS_SPOT              (1 << 20)
+#       define RADEON_LIGHT_1_DUAL_CONE            (1 << 21)
+#       define RADEON_LIGHT_1_ENABLE_RANGE_ATTEN   (1 << 22)
+#       define RADEON_LIGHT_1_CONSTANT_RANGE_ATTEN (1 << 23)
+#       define RADEON_LIGHT_1_SHIFT                16
+#define RADEON_SE_TCL_PER_LIGHT_CTL_1       0x2274
+#       define RADEON_LIGHT_2_SHIFT            0
+#       define RADEON_LIGHT_3_SHIFT            16
+#define RADEON_SE_TCL_PER_LIGHT_CTL_2       0x2278
+#       define RADEON_LIGHT_4_SHIFT            0
+#       define RADEON_LIGHT_5_SHIFT            16
+#define RADEON_SE_TCL_PER_LIGHT_CTL_3       0x227c
+#       define RADEON_LIGHT_6_SHIFT            0
+#       define RADEON_LIGHT_7_SHIFT            16
+
+#define RADEON_SE_TCL_STATE_FLUSH           0x2284
+
+#define RADEON_SE_TCL_SHININESS             0x2250
+
+#define RADEON_SE_TCL_TEXTURE_PROC_CTL      0x2268
+#       define RADEON_TEXGEN_TEXMAT_0_ENABLE      (1 << 0)
+#       define RADEON_TEXGEN_TEXMAT_1_ENABLE      (1 << 1)
+#       define RADEON_TEXGEN_TEXMAT_2_ENABLE      (1 << 2)
+#       define RADEON_TEXGEN_TEXMAT_3_ENABLE      (1 << 3)
+#       define RADEON_TEXMAT_0_ENABLE             (1 << 4)
+#       define RADEON_TEXMAT_1_ENABLE             (1 << 5)
+#       define RADEON_TEXMAT_2_ENABLE             (1 << 6)
+#       define RADEON_TEXMAT_3_ENABLE             (1 << 7)
+#       define RADEON_TEXGEN_INPUT_MASK           0xf
+#       define RADEON_TEXGEN_INPUT_TEXCOORD_0     0
+#       define RADEON_TEXGEN_INPUT_TEXCOORD_1     1
+#       define RADEON_TEXGEN_INPUT_TEXCOORD_2     2
+#       define RADEON_TEXGEN_INPUT_TEXCOORD_3     3
+#       define RADEON_TEXGEN_INPUT_OBJ            4
+#       define RADEON_TEXGEN_INPUT_EYE            5
+#       define RADEON_TEXGEN_INPUT_EYE_NORMAL     6
+#       define RADEON_TEXGEN_INPUT_EYE_REFLECT    7
+#       define RADEON_TEXGEN_INPUT_EYE_NORMALIZED 8
+#       define RADEON_TEXGEN_0_INPUT_SHIFT        16
+#       define RADEON_TEXGEN_1_INPUT_SHIFT        20
+#       define RADEON_TEXGEN_2_INPUT_SHIFT        24
+#       define RADEON_TEXGEN_3_INPUT_SHIFT        28
+
+#define RADEON_SE_TCL_UCP_VERT_BLEND_CTL    0x2264
+#       define RADEON_UCP_IN_CLIP_SPACE            (1 <<  0)
+#       define RADEON_UCP_IN_MODEL_SPACE           (1 <<  1)
+#       define RADEON_UCP_ENABLE_0                 (1 <<  2)
+#       define RADEON_UCP_ENABLE_1                 (1 <<  3)
+#       define RADEON_UCP_ENABLE_2                 (1 <<  4)
+#       define RADEON_UCP_ENABLE_3                 (1 <<  5)
+#       define RADEON_UCP_ENABLE_4                 (1 <<  6)
+#       define RADEON_UCP_ENABLE_5                 (1 <<  7)
+#       define RADEON_TCL_FOG_MASK                 (3 <<  8)
+#       define RADEON_TCL_FOG_DISABLE              (0 <<  8)
+#       define RADEON_TCL_FOG_EXP                  (1 <<  8)
+#       define RADEON_TCL_FOG_EXP2                 (2 <<  8)
+#       define RADEON_TCL_FOG_LINEAR               (3 <<  8)
+#       define RADEON_RNG_BASED_FOG                (1 << 10)
+#       define RADEON_LIGHT_TWOSIDE                (1 << 11)
+#       define RADEON_BLEND_OP_COUNT_MASK          (7 << 12)
+#       define RADEON_BLEND_OP_COUNT_SHIFT         12
+#       define RADEON_POSITION_BLEND_OP_ENABLE     (1 << 16)
+#       define RADEON_NORMAL_BLEND_OP_ENABLE       (1 << 17)
+#       define RADEON_VERTEX_BLEND_SRC_0_PRIMARY   (0 << 18)
+#       define RADEON_VERTEX_BLEND_SRC_0_SECONDARY (1 << 18)
+#       define RADEON_VERTEX_BLEND_SRC_1_PRIMARY   (0 << 19)
+#       define RADEON_VERTEX_BLEND_SRC_1_SECONDARY (1 << 19)
+#       define RADEON_VERTEX_BLEND_SRC_2_PRIMARY   (0 << 20)
+#       define RADEON_VERTEX_BLEND_SRC_2_SECONDARY (1 << 20)
+#       define RADEON_VERTEX_BLEND_SRC_3_PRIMARY   (0 << 21)
+#       define RADEON_VERTEX_BLEND_SRC_3_SECONDARY (1 << 21)
+#       define RADEON_VERTEX_BLEND_WGT_MINUS_ONE   (1 << 22)
+#       define RADEON_CULL_FRONT_IS_CW             (0 << 28)
+#       define RADEON_CULL_FRONT_IS_CCW            (1 << 28)
+#       define RADEON_CULL_FRONT                   (1 << 29)
+#       define RADEON_CULL_BACK                    (1 << 30)
+#       define RADEON_FORCE_W_TO_ONE               (1u << 31) /* unsigned: 1 << 31 is undefined for a 32-bit signed int */
+
+#define RADEON_SE_VPORT_XSCALE              0x1d98
+#define RADEON_SE_VPORT_XOFFSET             0x1d9c
+#define RADEON_SE_VPORT_YSCALE              0x1da0
+#define RADEON_SE_VPORT_YOFFSET             0x1da4
+#define RADEON_SE_VPORT_ZSCALE              0x1da8
+#define RADEON_SE_VPORT_ZOFFSET             0x1dac
+#define RADEON_SE_ZBIAS_FACTOR              0x1db0
+#define RADEON_SE_ZBIAS_CONSTANT            0x1db4
+
+
+
+				/* Registers for CP and Microcode Engine */
+#define RADEON_CP_ME_RAM_ADDR               0x07d4
+#define RADEON_CP_ME_RAM_RADDR              0x07d8
+#define RADEON_CP_ME_RAM_DATAH              0x07dc
+#define RADEON_CP_ME_RAM_DATAL              0x07e0
+
+#define RADEON_CP_RB_BASE                   0x0700
+#define RADEON_CP_RB_CNTL                   0x0704
+#define RADEON_CP_RB_RPTR_ADDR              0x070c
+#define RADEON_CP_RB_RPTR                   0x0710
+#define RADEON_CP_RB_WPTR                   0x0714
+
+#define RADEON_CP_IB_BASE                   0x0738
+#define RADEON_CP_IB_BUFSZ                  0x073c
+
+#define RADEON_CP_CSQ_CNTL                  0x0740
+#       define RADEON_CSQ_CNT_PRIMARY_MASK     (0xff << 0)
+#       define RADEON_CSQ_PRIDIS_INDDIS        (0    << 28)
+#       define RADEON_CSQ_PRIPIO_INDDIS        (1    << 28)
+#       define RADEON_CSQ_PRIBM_INDDIS         (2    << 28)
+#       define RADEON_CSQ_PRIPIO_INDBM         (3    << 28)
+#       define RADEON_CSQ_PRIBM_INDBM          (4    << 28)
+#       define RADEON_CSQ_PRIPIO_INDPIO        (15u  << 28) /* unsigned: 15 << 28 overflows a 32-bit signed int */
+#define RADEON_CP_CSQ_STAT                  0x07f8
+#       define RADEON_CSQ_RPTR_PRIMARY_MASK    (0xff <<  0)
+#       define RADEON_CSQ_WPTR_PRIMARY_MASK    (0xff <<  8)
+#       define RADEON_CSQ_RPTR_INDIRECT_MASK   (0xff << 16)
+#       define RADEON_CSQ_WPTR_INDIRECT_MASK   (0xffu << 24) /* unsigned: 0xff << 24 overflows a 32-bit signed int */
+#define RADEON_CP_CSQ_ADDR                  0x07f0
+#define RADEON_CP_CSQ_DATA                  0x07f4
+#define RADEON_CP_CSQ_APER_PRIMARY          0x1000
+#define RADEON_CP_CSQ_APER_INDIRECT         0x1300
+
+#define RADEON_CP_RB_WPTR_DELAY             0x0718
+#       define RADEON_PRE_WRITE_TIMER_SHIFT    0
+#       define RADEON_PRE_WRITE_LIMIT_SHIFT    23
+
+#define RADEON_AIC_CNTL                     0x01d0
+#       define RADEON_PCIGART_TRANSLATE_EN     (1 << 0)
+#define RADEON_AIC_LO_ADDR                  0x01dc
+
+
+
+				/* Constants */
+#define RADEON_LAST_FRAME_REG               RADEON_GUI_SCRATCH_REG0
+#define RADEON_LAST_CLEAR_REG               RADEON_GUI_SCRATCH_REG2
+
+
+
+				/* CP packet types */
+#define RADEON_CP_PACKET0                           0x00000000
+#define RADEON_CP_PACKET1                           0x40000000
+#define RADEON_CP_PACKET2                           0x80000000
+#define RADEON_CP_PACKET3                           0xC0000000
+#       define RADEON_CP_PACKET_MASK                0xC0000000
+#       define RADEON_CP_PACKET_COUNT_MASK          0x3fff0000
+#       define RADEON_CP_PACKET_MAX_DWORDS          (1 << 12)
+#       define RADEON_CP_PACKET0_REG_MASK           0x000007ff
+#       define RADEON_CP_PACKET1_REG0_MASK          0x000007ff
+#       define RADEON_CP_PACKET1_REG1_MASK          0x003ff800
+
+#define RADEON_CP_PACKET0_ONE_REG_WR                0x00008000
+
+#define RADEON_CP_PACKET3_NOP                       0xC0001000
+#define RADEON_CP_PACKET3_NEXT_CHAR                 0xC0001900
+#define RADEON_CP_PACKET3_PLY_NEXTSCAN              0xC0001D00
+#define RADEON_CP_PACKET3_SET_SCISSORS              0xC0001E00
+#define RADEON_CP_PACKET3_3D_RNDR_GEN_INDX_PRIM     0xC0002300
+#define RADEON_CP_PACKET3_LOAD_MICROCODE            0xC0002400
+#define RADEON_CP_PACKET3_WAIT_FOR_IDLE             0xC0002600
+#define RADEON_CP_PACKET3_3D_DRAW_VBUF              0xC0002800
+#define RADEON_CP_PACKET3_3D_DRAW_IMMD              0xC0002900
+#define RADEON_CP_PACKET3_3D_DRAW_INDX              0xC0002A00
+#define RADEON_CP_PACKET3_LOAD_PALETTE              0xC0002C00
+#define RADEON_CP_PACKET3_3D_LOAD_VBPNTR            0xC0002F00
+#define RADEON_CP_PACKET3_CNTL_PAINT                0xC0009100
+#define RADEON_CP_PACKET3_CNTL_BITBLT               0xC0009200
+#define RADEON_CP_PACKET3_CNTL_SMALLTEXT            0xC0009300
+#define RADEON_CP_PACKET3_CNTL_HOSTDATA_BLT         0xC0009400
+#define RADEON_CP_PACKET3_CNTL_POLYLINE             0xC0009500
+#define RADEON_CP_PACKET3_CNTL_POLYSCANLINES        0xC0009800
+#define RADEON_CP_PACKET3_CNTL_PAINT_MULTI          0xC0009A00
+#define RADEON_CP_PACKET3_CNTL_BITBLT_MULTI         0xC0009B00
+#define RADEON_CP_PACKET3_CNTL_TRANS_BITBLT         0xC0009C00
+
+
+#define RADEON_CP_VC_FRMT_XY                        0x00000000
+#define RADEON_CP_VC_FRMT_W0                        0x00000001
+#define RADEON_CP_VC_FRMT_FPCOLOR                   0x00000002
+#define RADEON_CP_VC_FRMT_FPALPHA                   0x00000004
+#define RADEON_CP_VC_FRMT_PKCOLOR                   0x00000008
+#define RADEON_CP_VC_FRMT_FPSPEC                    0x00000010
+#define RADEON_CP_VC_FRMT_FPFOG                     0x00000020
+#define RADEON_CP_VC_FRMT_PKSPEC                    0x00000040
+#define RADEON_CP_VC_FRMT_ST0                       0x00000080
+#define RADEON_CP_VC_FRMT_ST1                       0x00000100
+#define RADEON_CP_VC_FRMT_Q1                        0x00000200
+#define RADEON_CP_VC_FRMT_ST2                       0x00000400
+#define RADEON_CP_VC_FRMT_Q2                        0x00000800
+#define RADEON_CP_VC_FRMT_ST3                       0x00001000
+#define RADEON_CP_VC_FRMT_Q3                        0x00002000
+#define RADEON_CP_VC_FRMT_Q0                        0x00004000
+#define RADEON_CP_VC_FRMT_BLND_WEIGHT_CNT_MASK      0x00038000
+#define RADEON_CP_VC_FRMT_N0                        0x00040000
+#define RADEON_CP_VC_FRMT_XY1                       0x08000000
+#define RADEON_CP_VC_FRMT_Z1                        0x10000000
+#define RADEON_CP_VC_FRMT_W1                        0x20000000
+#define RADEON_CP_VC_FRMT_N1                        0x40000000
+#define RADEON_CP_VC_FRMT_Z                         0x80000000
+
+#define RADEON_CP_VC_CNTL_PRIM_TYPE_NONE            0x00000000
+#define RADEON_CP_VC_CNTL_PRIM_TYPE_POINT           0x00000001
+#define RADEON_CP_VC_CNTL_PRIM_TYPE_LINE            0x00000002
+#define RADEON_CP_VC_CNTL_PRIM_TYPE_LINE_STRIP      0x00000003
+#define RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_LIST        0x00000004
+#define RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_FAN         0x00000005
+#define RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_STRIP       0x00000006
+#define RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_TYPE_2      0x00000007
+#define RADEON_CP_VC_CNTL_PRIM_TYPE_RECT_LIST       0x00000008
+#define RADEON_CP_VC_CNTL_PRIM_TYPE_3VRT_POINT_LIST 0x00000009
+#define RADEON_CP_VC_CNTL_PRIM_TYPE_3VRT_LINE_LIST  0x0000000a
+#define RADEON_CP_VC_CNTL_PRIM_WALK_IND             0x00000010
+#define RADEON_CP_VC_CNTL_PRIM_WALK_LIST            0x00000020
+#define RADEON_CP_VC_CNTL_PRIM_WALK_RING            0x00000030
+#define RADEON_CP_VC_CNTL_COLOR_ORDER_BGRA          0x00000000
+#define RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA          0x00000040
+#define RADEON_CP_VC_CNTL_MAOS_ENABLE               0x00000080
+#define RADEON_CP_VC_CNTL_VTX_FMT_NON_RADEON_MODE   0x00000000
+#define RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE       0x00000100
+#define RADEON_CP_VC_CNTL_TCL_DISABLE               0x00000000
+#define RADEON_CP_VC_CNTL_TCL_ENABLE                0x00000200
+#define RADEON_CP_VC_CNTL_NUM_SHIFT                 16
+
+#define RADEON_VS_MATRIX_0_ADDR                   0
+#define RADEON_VS_MATRIX_1_ADDR                   4
+#define RADEON_VS_MATRIX_2_ADDR                   8
+#define RADEON_VS_MATRIX_3_ADDR                  12
+#define RADEON_VS_MATRIX_4_ADDR                  16
+#define RADEON_VS_MATRIX_5_ADDR                  20
+#define RADEON_VS_MATRIX_6_ADDR                  24
+#define RADEON_VS_MATRIX_7_ADDR                  28
+#define RADEON_VS_MATRIX_8_ADDR                  32
+#define RADEON_VS_MATRIX_9_ADDR                  36
+#define RADEON_VS_MATRIX_10_ADDR                 40
+#define RADEON_VS_MATRIX_11_ADDR                 44
+#define RADEON_VS_MATRIX_12_ADDR                 48
+#define RADEON_VS_MATRIX_13_ADDR                 52
+#define RADEON_VS_MATRIX_14_ADDR                 56
+#define RADEON_VS_MATRIX_15_ADDR                 60
+#define RADEON_VS_LIGHT_AMBIENT_ADDR             64
+#define RADEON_VS_LIGHT_DIFFUSE_ADDR             72
+#define RADEON_VS_LIGHT_SPECULAR_ADDR            80
+#define RADEON_VS_LIGHT_DIRPOS_ADDR              88
+#define RADEON_VS_LIGHT_HWVSPOT_ADDR             96
+#define RADEON_VS_LIGHT_ATTENUATION_ADDR        104
+#define RADEON_VS_MATRIX_EYE2CLIP_ADDR          112
+#define RADEON_VS_UCP_ADDR                      116
+#define RADEON_VS_GLOBAL_AMBIENT_ADDR           122
+#define RADEON_VS_FOG_PARAM_ADDR                123
+#define RADEON_VS_EYE_VECTOR_ADDR               124
+
+#define RADEON_SS_LIGHT_DCD_ADDR                  0
+#define RADEON_SS_LIGHT_SPOT_EXPONENT_ADDR        8
+#define RADEON_SS_LIGHT_SPOT_CUTOFF_ADDR         16
+#define RADEON_SS_LIGHT_SPECULAR_THRESH_ADDR     24
+#define RADEON_SS_LIGHT_RANGE_CUTOFF_ADDR        32
+#define RADEON_SS_VERT_GUARD_CLIP_ADJ_ADDR       48
+#define RADEON_SS_VERT_GUARD_DISCARD_ADJ_ADDR    49
+#define RADEON_SS_HORZ_GUARD_CLIP_ADJ_ADDR       50
+#define RADEON_SS_HORZ_GUARD_DISCARD_ADJ_ADDR    51
+#define RADEON_SS_SHININESS                      60
+
+#define RADEON_TV_MASTER_CNTL                    0x0800
+#       define RADEON_TVCLK_ALWAYS_ONb           (1 << 30)
+#define RADEON_TV_DAC_CNTL                       0x088c
+#       define RADEON_TV_DAC_CMPOUT              (1 << 5)
+#define RADEON_TV_PRE_DAC_MUX_CNTL               0x0888
+#       define RADEON_Y_RED_EN                   (1 << 0)
+#       define RADEON_C_GRN_EN                   (1 << 1)
+#       define RADEON_CMP_BLU_EN                 (1 << 2)
+#       define RADEON_RED_MX_FORCE_DAC_DATA      (6 << 4)
+#       define RADEON_GRN_MX_FORCE_DAC_DATA      (6 << 8)
+#       define RADEON_BLU_MX_FORCE_DAC_DATA      (6 << 12)
+#       define RADEON_TV_FORCE_DAC_DATA_SHIFT    16
+#endif
diff --git a/src/Makefile.am b/src/Makefile.am
deleted file mode 100644
index aa854c5..0000000
--- a/src/Makefile.am
+++ /dev/null
@@ -1,7 +0,0 @@
-AM_CFLAGS = -DIN_DRI_DRIVER -DGLX_DIRECT_RENDERING -DGLX_INDIRECT_RENDERING
-
-xxx_dri_la_LTLIBRARIES = xxx_dri.la
-xxx_dri_la_CFLAGS = $(AM_CFLAGS) $(DRM_CFLAGS) $(DRI_CFLAGS) -Iserver
-xxx_dri_la_LDFLAGS = -module -noprefix -lm -ldl $(DRM_LIBS) $(DRI_LIBS)
-xxx_dri_ladir = @libdir@/dri
-xxx_dri_la_SOURCES = \
author	Luc Verhaegen <libv@skynet.be>	2010-03-14 07:04:46 +0100
committer	Luc Verhaegen <libv@skynet.be>	2010-03-14 07:04:46 +0100
commit	50d4922305e925896a71e705c438ededbaedb80f (patch)
tree	d9a44227dcdda1de61337280b20170d0deb6211d
parent	5dee9b7b19c1aa3a13618b08bc24f00677b5364b (diff)