35 files changed, 17367 insertions, 0 deletions
diff --git a/r300/Lindent b/r300/Lindent
new file mode 100755
index 0000000..7d8d889
--- /dev/null
+++ b/r300/Lindent
@@ -0,0 +1,2 @@
+#!/bin/sh
+indent -npro -kr -i8 -ts8 -sob -l80 -ss -ncs "$@"	# K&R style, 8-column tab indentation, 80-column lines; .indent.pro is ignored
diff --git a/r300/Makefile.am b/r300/Makefile.am
new file mode 100644
index 0000000..0992115
--- /dev/null
+++ b/r300/Makefile.am
@@ -0,0 +1,30 @@
+AM_CFLAGS = -DIN_DRI_DRIVER -DGLX_DIRECT_RENDERING -DGLX_INDIRECT_RENDERING
+# Defines and include paths for building against the shared ../radeon sources.
+R300_CFLAGS = -DCOMPILE_R300 -DR200_MERGED=0 -DRADEON_COMMON=1 -DRADEON_COMMON_FOR_R300
+R300_CFLAGS += -I../radeon -I../radeon/server
+# r300_dri.la is built as a libtool module and installed into @libdir@/dri.
+r300_dri_la_LTLIBRARIES = r300_dri.la
+r300_dri_la_CFLAGS = $(AM_CFLAGS) $(DRM_CFLAGS) $(DRI_CFLAGS) $(R300_CFLAGS)
+r300_dri_la_LDFLAGS = -module -noprefix -avoid-version -lm -ldl \
+		$(DRM_LIBS) $(DRI_LIBS)
+r300_dri_ladir = @libdir@/dri
+r300_dri_la_SOURCES = \
+	../radeon/radeon_screen.c \
+	radeon_context.c \
+	radeon_ioctl.c \
+	radeon_lock.c \
+	radeon_span.c \
+	radeon_state.c \
+	r300_mem.c \
+	r300_context.c \
+	r300_ioctl.c \
+	r300_cmdbuf.c \
+	r300_state.c \
+	r300_render.c \
+	r300_texmem.c \
+	r300_tex.c \
+	r300_texstate.c \
+	r300_vertprog.c \
+	r300_fragprog.c \
+	r300_shader.c \
+	r300_emit.c
diff --git a/r300/r300_cmdbuf.c b/r300/r300_cmdbuf.c
new file mode 100644
index 0000000..3befa58
--- /dev/null
+++ b/r300/r300_cmdbuf.c
@@ -0,0 +1,590 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#include "glheader.h"
+#include "state.h"
+#include "imports.h"
+#include "macros.h"
+#include "context.h"
+#include "swrast/swrast.h"
+#include "simple_list.h"
+
+#include "drm.h"
+#include "radeon_drm.h"
+
+#include "radeon_ioctl.h"
+#include "r300_context.h"
+#include "r300_ioctl.h"
+#include "radeon_reg.h"
+#include "r300_reg.h"
+#include "r300_cmdbuf.h"
+#include "r300_emit.h"
+#include "r300_state.h"
+
+// Set this to 1 for extremely verbose debugging of command buffers
+#define DEBUG_CMDBUF		0
+
+/**
+ * Send the current command buffer via ioctl to the hardware, then reset it.
+ * \return the result of drmCommandWrite(). */
+int r300FlushCmdBufLocked(r300ContextPtr r300, const char *caller)
+{
+	int ret;
+	int i;
+	drm_radeon_cmd_buffer_t cmd;
+	int start;
+	/* Skip the re-emitted state at the buffer start unless the context was lost. */
+	if (r300->radeon.lost_context) {
+		start = 0;
+		r300->radeon.lost_context = GL_FALSE;
+	} else
+		start = r300->cmdbuf.count_reemit;
+
+	if (RADEON_DEBUG & DEBUG_IOCTL) {
+		fprintf(stderr, "%s from %s - %i cliprects\n",
+			__FUNCTION__, caller, r300->radeon.numClipRects);
+
+		if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_VERBOSE)
+			for (i = start; i < r300->cmdbuf.count_used; ++i)
+				fprintf(stderr, "%d: %08x\n", i,
+					r300->cmdbuf.cmd_buf[i]);
+	}
+	/* cmd.bufsz is in bytes; the buffer counters are in dwords. */
+	cmd.buf = (char *)(r300->cmdbuf.cmd_buf + start);
+	cmd.bufsz = (r300->cmdbuf.count_used - start) * 4;
+	/* Use the scissor cliprects when scissoring is enabled. */
+	if (r300->radeon.state.scissor.enabled) {
+		cmd.nbox = r300->radeon.state.scissor.numClipRects;
+		cmd.boxes =
+		    (drm_clip_rect_t *) r300->radeon.state.scissor.pClipRects;
+	} else {
+		cmd.nbox = r300->radeon.numClipRects;
+		cmd.boxes = (drm_clip_rect_t *) r300->radeon.pClipRects;
+	}
+
+	ret = drmCommandWrite(r300->radeon.dri.fd,
+			      DRM_RADEON_CMDBUF, &cmd, sizeof(cmd));
+
+	if (RADEON_DEBUG & DEBUG_SYNC) {
+		fprintf(stderr, "Syncing in %s (from %s)\n\n",
+			__FUNCTION__, caller);
+		radeonWaitForIdleLocked(&r300->radeon);
+	}
+	/* The buffer is reset even when the ioctl reported an error. */
+	r300->dma.nr_released_bufs = 0;
+	r300->cmdbuf.count_used = 0;
+	r300->cmdbuf.count_reemit = 0;
+
+	return ret;
+}
+
+int r300FlushCmdBuf(r300ContextPtr r300, const char *caller)
+{
+	int status;
+
+	/* Run the locked flush variant with the hardware lock held. */
+	LOCK_HARDWARE(&r300->radeon);
+	status = r300FlushCmdBufLocked(r300, caller);
+	UNLOCK_HARDWARE(&r300->radeon);
+
+	/* A failed submission is reported and handed to _mesa_exit(). */
+	if (status != 0) {
+		fprintf(stderr, "drmRadeonCmdBuffer: %d\n", status);
+		_mesa_exit(status);
+	}
+
+	return status;
+}
+
+static void r300PrintStateAtom(r300ContextPtr r300, struct r300_state_atom *state)
+{
+	/* Number of dwords the atom's check callback currently reports. */
+	const int count = state->check(r300, state);
+	int n;
+
+	fprintf(stderr, "  emit %s/%d/%d\n", state->name, count, state->cmd_size);
+	if (!(RADEON_DEBUG & DEBUG_VERBOSE))
+		return;
+	/* Verbose mode: dump each of those dwords as well. */
+	for (n = 0; n < count; n++)
+		fprintf(stderr, "      %s[%d]: %08X\n", state->name, n, state->cmd[n]);
+}
+
+/**
+ * Emit all atoms whose dirty flag (forced on by hw.all_dirty) equals dirty.
+ * Each call first emits a 4-dword wait / texture cache flush / pacify preamble.
+ * The caller must have ensured that there is enough space in the command
+ * buffer.
+ */
+static __inline__ void r300EmitAtoms(r300ContextPtr r300, GLboolean dirty)
+{
+	struct r300_state_atom *atom;
+	uint32_t *dest;
+
+	dest = r300->cmdbuf.cmd_buf + r300->cmdbuf.count_used;
+	/* Debug trace only: report which atoms the emit loop below writes or skips. */
+	if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_STATE) {
+		foreach(atom, &r300->hw.atomlist) {
+			if ((atom->dirty || r300->hw.all_dirty) == dirty) {
+				int dwords = (*atom->check) (r300, atom);
+
+				if (dwords)
+					r300PrintStateAtom(r300, atom);
+				else
+					fprintf(stderr,
+						"  skip state %s\n",
+						atom->name);
+			}
+		}
+	}
+
+	/* Emit WAIT */
+	*dest = cmdwait(R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+	dest++;
+	r300->cmdbuf.count_used++;
+
+	/* Emit cache flush */
+	*dest = cmdpacket0(R300_TX_CNTL, 1);
+	dest++;
+	r300->cmdbuf.count_used++;
+
+	*dest = R300_TX_FLUSH;
+	dest++;
+	r300->cmdbuf.count_used++;
+
+	/* Emit END3D */
+	*dest = cmdpacify();
+	dest++;
+	r300->cmdbuf.count_used++;
+
+	/* Emit actual atoms */
+	/* check() yields the dword count to copy; atoms reporting 0 are left as they are. */
+	foreach(atom, &r300->hw.atomlist) {
+		if ((atom->dirty || r300->hw.all_dirty) == dirty) {
+			int dwords = (*atom->check) (r300, atom);
+
+			if (dwords) {
+				memcpy(dest, atom->cmd, dwords * 4);
+				dest += dwords;
+				r300->cmdbuf.count_used += dwords;
+				atom->dirty = GL_FALSE;
+			}
+		}
+	}
+}
+
+/**
+ * Copy dirty hardware state atoms into the command buffer.
+ *
+ * We also copy out clean state if we're at the start of a buffer. That makes
+ * it easy to recover from lost contexts.
+ */
+void r300EmitState(r300ContextPtr r300)
+{
+	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_PRIMS))
+		fprintf(stderr, "%s\n", __FUNCTION__);
+	/* Nothing to do unless the buffer is empty or some state is dirty. */
+	if (r300->cmdbuf.count_used && !r300->hw.is_dirty
+	    && !r300->hw.all_dirty)
+		return;
+
+	/* To avoid going across the entire set of states multiple times, just check
+	 * for enough space for the case of emitting all state, and inline the
+	 * r300AllocCmdBuf code here without all the checks.
+	 */
+	r300EnsureCmdBufSpace(r300, r300->hw.max_state_size, __FUNCTION__);
+	/* Empty buffer: emit the clean atoms first; count_reemit marks where they end. */
+	if (!r300->cmdbuf.count_used) {
+		if (RADEON_DEBUG & DEBUG_STATE)
+			fprintf(stderr, "Begin reemit state\n");
+
+		r300EmitAtoms(r300, GL_FALSE);
+		r300->cmdbuf.count_reemit = r300->cmdbuf.count_used;
+	}
+
+	if (RADEON_DEBUG & DEBUG_STATE)
+		fprintf(stderr, "Begin dirty state\n");
+
+	r300EmitAtoms(r300, GL_TRUE);
+
+	assert(r300->cmdbuf.count_used < r300->cmdbuf.size);
+	/* Both context-wide dirty flags are cleared once the atoms are written. */
+	r300->hw.is_dirty = GL_FALSE;
+	r300->hw.all_dirty = GL_FALSE;
+}
+
+#define CHECK( NM, COUNT )				\
+static int check_##NM( r300ContextPtr r300, 		\
+			struct r300_state_atom* atom )	\
+{							\
+   (void) atom;	(void) r300;				\
+   return (COUNT);					\
+}
+/* Readers for the count field of packet0 and vpu command headers. */
+#define packet0_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->packet0.count)
+#define vpu_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->vpu.count)
+/* check_always, check_variable, check_vpu: dwords to emit for an atom (0 = skip). */
+CHECK(always, atom->cmd_size)
+    CHECK(variable, packet0_count(atom->cmd) ? (1 + packet0_count(atom->cmd)) : 0)
+    CHECK(vpu, vpu_count(atom->cmd) ? (1 + vpu_count(atom->cmd) * 4) : 0)
+#undef packet0_count
+#undef vpu_count
+#define ALLOC_STATE( ATOM, CHK, SZ, IDX )				\
+   do {	/* allocate SZ dwords, mark clean, append to hw.atomlist */	\
+      r300->hw.ATOM.cmd_size = (SZ);					\
+      r300->hw.ATOM.cmd = (uint32_t*)CALLOC((SZ) * sizeof(uint32_t));	\
+      r300->hw.ATOM.name = #ATOM;					\
+      r300->hw.ATOM.idx = (IDX);					\
+      r300->hw.ATOM.check = check_##CHK;				\
+      r300->hw.ATOM.dirty = GL_FALSE;					\
+      r300->hw.max_state_size += (SZ);	/* running total of atom sizes */	\
+      insert_at_tail(&r300->hw.atomlist, &r300->hw.ATOM);		\
+   } while (0)
+/**
+ * Allocate memory for the command buffer and initialize the state atom
+ * list. Note that the initial hardware state is set by r300InitState().
+ */
+void r300InitCmdBuf(r300ContextPtr r300)
+{
+	int size, mtu;
+	int has_tcl = 1;
+
+	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+		has_tcl = 0;
+
+	r300->hw.max_state_size = 2 + 2;	/* reserve extra space for WAIT_IDLE and tex cache flush */
+
+	mtu = r300->radeon.glCtx->Const.MaxTextureUnits;
+	if (RADEON_DEBUG & DEBUG_TEXTURE) {
+		fprintf(stderr, "Using %d maximum texture units..\n", mtu);
+	}
+
+	/* Setup the atom linked list */
+	make_empty_list(&r300->hw.atomlist);
+	r300->hw.atomlist.name = "atom-list";
+
+	/* Initialize state atoms: each gets its storage and its header dword(s) */
+	ALLOC_STATE(vpt, always, R300_VPT_CMDSIZE, 0);
+	r300->hw.vpt.cmd[R300_VPT_CMD_0] = cmdpacket0(R300_SE_VPORT_XSCALE, 6);
+	ALLOC_STATE(vap_cntl, always, 2, 0);
+	r300->hw.vap_cntl.cmd[0] = cmdpacket0(R300_VAP_CNTL, 1);
+	ALLOC_STATE(vte, always, 3, 0);
+	r300->hw.vte.cmd[0] = cmdpacket0(R300_SE_VTE_CNTL, 2);
+	ALLOC_STATE(unk2134, always, 3, 0);
+	r300->hw.unk2134.cmd[0] = cmdpacket0(0x2134, 2);
+	ALLOC_STATE(vap_cntl_status, always, 2, 0);
+	r300->hw.vap_cntl_status.cmd[0] = cmdpacket0(R300_VAP_CNTL_STATUS, 1);
+	ALLOC_STATE(vir[0], variable, R300_VIR_CMDSIZE, 0);
+	r300->hw.vir[0].cmd[R300_VIR_CMD_0] =
+	    cmdpacket0(R300_VAP_INPUT_ROUTE_0_0, 1);
+	ALLOC_STATE(vir[1], variable, R300_VIR_CMDSIZE, 1);
+	r300->hw.vir[1].cmd[R300_VIR_CMD_0] =
+	    cmdpacket0(R300_VAP_INPUT_ROUTE_1_0, 1);
+	ALLOC_STATE(vic, always, R300_VIC_CMDSIZE, 0);
+	r300->hw.vic.cmd[R300_VIC_CMD_0] = cmdpacket0(R300_VAP_INPUT_CNTL_0, 2);
+	ALLOC_STATE(unk21DC, always, 2, 0);
+	r300->hw.unk21DC.cmd[0] = cmdpacket0(0x21DC, 1);
+	ALLOC_STATE(vap_clip_cntl, always, 2, 0);
+	r300->hw.vap_clip_cntl.cmd[0] = cmdpacket0(R300_VAP_CLIP_CNTL, 1);
+	ALLOC_STATE(unk2220, always, 5, 0);
+	r300->hw.unk2220.cmd[0] = cmdpacket0(0x2220, 4);
+	ALLOC_STATE(unk2288, always, 2, 0);
+	r300->hw.unk2288.cmd[0] = cmdpacket0(R300_VAP_UNKNOWN_2288, 1);
+	ALLOC_STATE(vof, always, R300_VOF_CMDSIZE, 0);
+	r300->hw.vof.cmd[R300_VOF_CMD_0] =
+	    cmdpacket0(R300_VAP_OUTPUT_VTX_FMT_0, 2);
+
+	if (has_tcl) {
+		ALLOC_STATE(pvs, always, R300_PVS_CMDSIZE, 0);
+		r300->hw.pvs.cmd[R300_PVS_CMD_0] =
+		    cmdpacket0(R300_VAP_PVS_CNTL_1, 3);
+	}
+
+	ALLOC_STATE(gb_enable, always, 2, 0);
+	r300->hw.gb_enable.cmd[0] = cmdpacket0(R300_GB_ENABLE, 1);
+	ALLOC_STATE(gb_misc, always, R300_GB_MISC_CMDSIZE, 0);
+	r300->hw.gb_misc.cmd[0] = cmdpacket0(R300_GB_MSPOS0, 5);
+	ALLOC_STATE(txe, always, R300_TXE_CMDSIZE, 0);
+	r300->hw.txe.cmd[R300_TXE_CMD_0] = cmdpacket0(R300_TX_ENABLE, 1);
+	ALLOC_STATE(unk4200, always, 5, 0);
+	r300->hw.unk4200.cmd[0] = cmdpacket0(0x4200, 4);
+	ALLOC_STATE(unk4214, always, 2, 0);
+	r300->hw.unk4214.cmd[0] = cmdpacket0(0x4214, 1);
+	ALLOC_STATE(ps, always, R300_PS_CMDSIZE, 0);
+	r300->hw.ps.cmd[0] = cmdpacket0(R300_RE_POINTSIZE, 1);
+	ALLOC_STATE(unk4230, always, 4, 0);
+	r300->hw.unk4230.cmd[0] = cmdpacket0(0x4230, 3);
+	ALLOC_STATE(lcntl, always, 2, 0);
+	r300->hw.lcntl.cmd[0] = cmdpacket0(R300_RE_LINE_CNT, 1);
+	ALLOC_STATE(unk4260, always, 4, 0);
+	r300->hw.unk4260.cmd[0] = cmdpacket0(0x4260, 3);
+	ALLOC_STATE(shade, always, 5, 0);
+	r300->hw.shade.cmd[0] = cmdpacket0(R300_RE_SHADE, 4);
+	ALLOC_STATE(polygon_mode, always, 4, 0);
+	r300->hw.polygon_mode.cmd[0] = cmdpacket0(R300_RE_POLYGON_MODE, 3);
+	ALLOC_STATE(fogp, always, 3, 0);
+	r300->hw.fogp.cmd[0] = cmdpacket0(R300_RE_FOG_SCALE, 2);
+	ALLOC_STATE(zbias_cntl, always, 2, 0);
+	r300->hw.zbias_cntl.cmd[0] = cmdpacket0(R300_RE_ZBIAS_CNTL, 1);
+	ALLOC_STATE(zbs, always, R300_ZBS_CMDSIZE, 0);
+	r300->hw.zbs.cmd[R300_ZBS_CMD_0] =
+	    cmdpacket0(R300_RE_ZBIAS_T_FACTOR, 4);
+	ALLOC_STATE(occlusion_cntl, always, 2, 0);
+	r300->hw.occlusion_cntl.cmd[0] = cmdpacket0(R300_RE_OCCLUSION_CNTL, 1);
+	ALLOC_STATE(cul, always, R300_CUL_CMDSIZE, 0);
+	r300->hw.cul.cmd[R300_CUL_CMD_0] = cmdpacket0(R300_RE_CULL_CNTL, 1);
+	ALLOC_STATE(unk42C0, always, 3, 0);
+	r300->hw.unk42C0.cmd[0] = cmdpacket0(0x42C0, 2);
+	ALLOC_STATE(rc, always, R300_RC_CMDSIZE, 0);
+	r300->hw.rc.cmd[R300_RC_CMD_0] = cmdpacket0(R300_RS_CNTL_0, 2);
+	ALLOC_STATE(ri, always, R300_RI_CMDSIZE, 0);
+	r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(R300_RS_INTERP_0, 8);
+	ALLOC_STATE(rr, variable, R300_RR_CMDSIZE, 0);
+	r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R300_RS_ROUTE_0, 1);
+	ALLOC_STATE(unk43A4, always, 3, 0);
+	r300->hw.unk43A4.cmd[0] = cmdpacket0(0x43A4, 2);
+	ALLOC_STATE(unk43E8, always, 2, 0);
+	r300->hw.unk43E8.cmd[0] = cmdpacket0(0x43E8, 1);
+	ALLOC_STATE(fp, always, R300_FP_CMDSIZE, 0);
+	r300->hw.fp.cmd[R300_FP_CMD_0] = cmdpacket0(R300_PFS_CNTL_0, 3);
+	r300->hw.fp.cmd[R300_FP_CMD_1] = cmdpacket0(R300_PFS_NODE_0, 4);
+	ALLOC_STATE(fpt, variable, R300_FPT_CMDSIZE, 0);
+	r300->hw.fpt.cmd[R300_FPT_CMD_0] = cmdpacket0(R300_PFS_TEXI_0, 0);
+	ALLOC_STATE(unk46A4, always, 6, 0);
+	r300->hw.unk46A4.cmd[0] = cmdpacket0(0x46A4, 5);
+	ALLOC_STATE(fpi[0], variable, R300_FPI_CMDSIZE, 0);
+	r300->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_PFS_INSTR0_0, 1);
+	ALLOC_STATE(fpi[1], variable, R300_FPI_CMDSIZE, 1);
+	r300->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_PFS_INSTR1_0, 1);
+	ALLOC_STATE(fpi[2], variable, R300_FPI_CMDSIZE, 2);
+	r300->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_PFS_INSTR2_0, 1);
+	ALLOC_STATE(fpi[3], variable, R300_FPI_CMDSIZE, 3);
+	r300->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_PFS_INSTR3_0, 1);
+	ALLOC_STATE(fogs, always, R300_FOGS_CMDSIZE, 0);
+	r300->hw.fogs.cmd[R300_FOGS_CMD_0] = cmdpacket0(R300_RE_FOG_STATE, 1);
+	ALLOC_STATE(fogc, always, R300_FOGC_CMDSIZE, 0);
+	r300->hw.fogc.cmd[R300_FOGC_CMD_0] = cmdpacket0(R300_FOG_COLOR_R, 3);
+	ALLOC_STATE(at, always, R300_AT_CMDSIZE, 0);
+	r300->hw.at.cmd[R300_AT_CMD_0] = cmdpacket0(R300_PP_ALPHA_TEST, 2);
+	ALLOC_STATE(unk4BD8, always, 2, 0);
+	r300->hw.unk4BD8.cmd[0] = cmdpacket0(0x4BD8, 1);
+	ALLOC_STATE(fpp, variable, R300_FPP_CMDSIZE, 0);
+	r300->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, 0);
+	ALLOC_STATE(unk4E00, always, 2, 0);
+	r300->hw.unk4E00.cmd[0] = cmdpacket0(0x4E00, 1);
+	ALLOC_STATE(bld, always, R300_BLD_CMDSIZE, 0);
+	r300->hw.bld.cmd[R300_BLD_CMD_0] = cmdpacket0(R300_RB3D_CBLEND, 2);
+	ALLOC_STATE(cmk, always, R300_CMK_CMDSIZE, 0);
+	r300->hw.cmk.cmd[R300_CMK_CMD_0] = cmdpacket0(R300_RB3D_COLORMASK, 1);
+	ALLOC_STATE(blend_color, always, 4, 0);
+	r300->hw.blend_color.cmd[0] = cmdpacket0(R300_RB3D_BLEND_COLOR, 3);
+	ALLOC_STATE(cb, always, R300_CB_CMDSIZE, 0);
+	r300->hw.cb.cmd[R300_CB_CMD_0] = cmdpacket0(R300_RB3D_COLOROFFSET0, 1);
+	r300->hw.cb.cmd[R300_CB_CMD_1] = cmdpacket0(R300_RB3D_COLORPITCH0, 1);
+	ALLOC_STATE(unk4E50, always, 10, 0);
+	r300->hw.unk4E50.cmd[0] = cmdpacket0(0x4E50, 9);
+	ALLOC_STATE(unk4E88, always, 2, 0);
+	r300->hw.unk4E88.cmd[0] = cmdpacket0(0x4E88, 1);
+	ALLOC_STATE(unk4EA0, always, 3, 0);
+	r300->hw.unk4EA0.cmd[0] = cmdpacket0(0x4EA0, 2);
+	ALLOC_STATE(zs, always, R300_ZS_CMDSIZE, 0);
+	r300->hw.zs.cmd[R300_ZS_CMD_0] =
+	    cmdpacket0(R300_RB3D_ZSTENCIL_CNTL_0, 3);
+	ALLOC_STATE(zstencil_format, always, 5, 0);
+	r300->hw.zstencil_format.cmd[0] =
+	    cmdpacket0(R300_RB3D_ZSTENCIL_FORMAT, 4);
+	ALLOC_STATE(zb, always, R300_ZB_CMDSIZE, 0);
+	r300->hw.zb.cmd[R300_ZB_CMD_0] = cmdpacket0(R300_RB3D_DEPTHOFFSET, 2);
+	ALLOC_STATE(unk4F28, always, 2, 0);
+	r300->hw.unk4F28.cmd[0] = cmdpacket0(0x4F28, 1);
+	ALLOC_STATE(unk4F30, always, 3, 0);
+	r300->hw.unk4F30.cmd[0] = cmdpacket0(0x4F30, 2);
+	ALLOC_STATE(unk4F44, always, 2, 0);
+	r300->hw.unk4F44.cmd[0] = cmdpacket0(0x4F44, 1);
+	ALLOC_STATE(unk4F54, always, 2, 0);
+	r300->hw.unk4F54.cmd[0] = cmdpacket0(0x4F54, 1);
+
+	/* VPU only on TCL */
+	if (has_tcl) {
+		int i;
+		ALLOC_STATE(vpi, vpu, R300_VPI_CMDSIZE, 0);
+		r300->hw.vpi.cmd[R300_VPI_CMD_0] =
+		    cmdvpu(R300_PVS_UPLOAD_PROGRAM, 0);
+
+		ALLOC_STATE(vpp, vpu, R300_VPP_CMDSIZE, 0);
+		r300->hw.vpp.cmd[R300_VPP_CMD_0] =
+		    cmdvpu(R300_PVS_UPLOAD_PARAMETERS, 0);
+
+		ALLOC_STATE(vps, vpu, R300_VPS_CMDSIZE, 0);
+		r300->hw.vps.cmd[R300_VPS_CMD_0] =
+		    cmdvpu(R300_PVS_UPLOAD_POINTSIZE, 1);
+
+		for (i = 0; i < 6; i++) {
+			ALLOC_STATE(vpucp[i], vpu, R300_VPUCP_CMDSIZE, 0);
+			r300->hw.vpucp[i].cmd[R300_VPUCP_CMD_0] =
+			    cmdvpu(R300_PVS_UPLOAD_CLIP_PLANE0 + i, 1);
+		}
+	}
+
+	/* Textures: one header dword plus one dword per texture unit */
+	ALLOC_STATE(tex.filter, variable, mtu + 1, 0);
+	r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_FILTER_0, 0);
+
+	ALLOC_STATE(tex.filter_1, variable, mtu + 1, 0);
+	r300->hw.tex.filter_1.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_FILTER1_0, 0);
+
+	ALLOC_STATE(tex.size, variable, mtu + 1, 0);
+	r300->hw.tex.size.cmd[R300_TEX_CMD_0] = cmdpacket0(R300_TX_SIZE_0, 0);
+
+	ALLOC_STATE(tex.format, variable, mtu + 1, 0);
+	r300->hw.tex.format.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_FORMAT_0, 0);
+
+	ALLOC_STATE(tex.pitch, variable, mtu + 1, 0);
+	r300->hw.tex.pitch.cmd[R300_TEX_CMD_0] = cmdpacket0(R300_TX_PITCH_0, 0);
+
+	ALLOC_STATE(tex.offset, variable, mtu + 1, 0);
+	r300->hw.tex.offset.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_OFFSET_0, 0);
+
+	ALLOC_STATE(tex.chroma_key, variable, mtu + 1, 0);
+	r300->hw.tex.chroma_key.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_CHROMA_KEY_0, 0);
+
+	ALLOC_STATE(tex.border_color, variable, mtu + 1, 0);
+	r300->hw.tex.border_color.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_BORDER_COLOR_0, 0);
+	/* all_dirty makes every atom count as dirty for the first emit. */
+	r300->hw.is_dirty = GL_TRUE;
+	r300->hw.all_dirty = GL_TRUE;
+
+	/* Initialize command buffer; size is in dwords, the option in units of 256 dwords */
+	size =
+	    256 * driQueryOptioni(&r300->radeon.optionCache,
+				  "command_buffer_size");
+	if (size < 2 * r300->hw.max_state_size) {
+		size = 2 * r300->hw.max_state_size + 65535;
+	}
+	if (size > 64 * 256)	/* hard upper limit of 16384 dwords */
+		size = 64 * 256;
+
+	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA)) {
+		fprintf(stderr, "sizeof(drm_r300_cmd_header_t)=%zu\n",
+			sizeof(drm_r300_cmd_header_t));
+		fprintf(stderr, "sizeof(drm_radeon_cmd_buffer_t)=%zu\n",
+			sizeof(drm_radeon_cmd_buffer_t));
+		fprintf(stderr,
+			"Allocating %d bytes command buffer (max state is %d bytes)\n",
+			size * 4, r300->hw.max_state_size * 4);
+	}
+
+	r300->cmdbuf.size = size;
+	r300->cmdbuf.cmd_buf = (uint32_t *) CALLOC(size * 4);
+	r300->cmdbuf.count_used = 0;
+	r300->cmdbuf.count_reemit = 0;
+}
+
+/**
+ * Release the command buffer storage and every atom's command array.
+ */
+void r300DestroyCmdBuf(r300ContextPtr r300)
+{
+	struct r300_state_atom *entry;
+
+	FREE(r300->cmdbuf.cmd_buf);
+	/* Only the cmd arrays are freed here, not the atoms themselves. */
+	foreach(entry, &r300->hw.atomlist) {
+		FREE(entry->cmd);
+	}
+}
+
+void r300EmitBlit(r300ContextPtr rmesa,
+		  GLuint color_fmt,
+		  GLuint src_pitch,
+		  GLuint src_offset,
+		  GLuint dst_pitch,
+		  GLuint dst_offset,
+		  GLint srcx, GLint srcy,
+		  GLint dstx, GLint dsty, GLuint w, GLuint h)
+{
+	drm_r300_cmd_header_t *cmd;
+	/* Queue a raw BITBLT_MULTI packet3 (8 dwords) in the command buffer. */
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr,
+			"%s src %x/%x %d,%d dst: %x/%x %d,%d sz: %dx%d\n",
+			__FUNCTION__, src_pitch, src_offset, srcx, srcy,
+			dst_pitch, dst_offset, dstx, dsty, w, h);
+	/* Pitches are encoded in units of 64, offsets in units of 1024. */
+	assert((src_pitch & 63) == 0);
+	assert((dst_pitch & 63) == 0);
+	assert((src_offset & 1023) == 0);
+	assert((dst_offset & 1023) == 0);
+	assert(w < (1 << 16));
+	assert(h < (1 << 16));
+
+	cmd = (drm_r300_cmd_header_t *) r300AllocCmdBuf(rmesa, 8, __FUNCTION__);
+	cmd[0].u = 0;	/* clear stale bytes in the header dword, as r300EmitWait() does */
+	cmd[0].header.cmd_type = R300_CMD_PACKET3;
+	cmd[0].header.pad0 = R300_CMD_PACKET3_RAW;
+	cmd[1].u = R300_CP_CMD_BITBLT_MULTI | (5 << 16);
+	cmd[2].u = (RADEON_GMC_SRC_PITCH_OFFSET_CNTL |
+		    RADEON_GMC_DST_PITCH_OFFSET_CNTL |
+		    RADEON_GMC_BRUSH_NONE |
+		    (color_fmt << 8) |
+		    RADEON_GMC_SRC_DATATYPE_COLOR |
+		    RADEON_ROP3_S |
+		    RADEON_DP_SRC_SOURCE_MEMORY |
+		    RADEON_GMC_CLR_CMP_CNTL_DIS | RADEON_GMC_WR_MSK_DIS);
+
+	cmd[3].u = ((src_pitch / 64) << 22) | (src_offset >> 10);
+	cmd[4].u = ((dst_pitch / 64) << 22) | (dst_offset >> 10);
+	cmd[5].u = (srcx << 16) | srcy;
+	cmd[6].u = (dstx << 16) | dsty;	/* dst */
+	cmd[7].u = (w << 16) | h;
+}
+
+void r300EmitWait(r300ContextPtr rmesa, GLuint flags)
+{
+	drm_r300_cmd_header_t *hdr;
+
+	/* Only the 2D and 3D wait bits may be requested. */
+	assert(!(flags & ~(R300_WAIT_2D | R300_WAIT_3D)));
+	hdr = (drm_r300_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
+	hdr->u = 0;	/* start from an all-zero header dword */
+	hdr->wait.cmd_type = R300_CMD_WAIT;
+	hdr->wait.flags = flags;
+}
diff --git a/r300/r300_cmdbuf.h b/r300/r300_cmdbuf.h
new file mode 100644
index 0000000..bfb2eda
--- /dev/null
+++ b/r300/r300_cmdbuf.h
@@ -0,0 +1,116 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __R300_CMDBUF_H__
+#define __R300_CMDBUF_H__
+
+#include "r300_context.h"
+
+extern int r300FlushCmdBufLocked(r300ContextPtr r300, const char *caller);
+extern int r300FlushCmdBuf(r300ContextPtr r300, const char *caller);
+/* Copies dirty hardware state atoms into the command buffer. */
+extern void r300EmitState(r300ContextPtr r300);
+/* Command buffer and state atom list setup / teardown. */
+extern void r300InitCmdBuf(r300ContextPtr r300);
+extern void r300DestroyCmdBuf(r300ContextPtr r300);
+
+/**
+ * Guarantee room for \p dwords more dwords in the command buffer; the
+ * buffer is flushed when they do not fit behind what is already queued.
+ *
+ * \param dwords The number of dwords we need to be free on the command buffer
+ */
+static __inline__ void r300EnsureCmdBufSpace(r300ContextPtr r300,
+					     int dwords, const char *caller)
+{
+	assert(dwords < r300->cmdbuf.size);
+	if (r300->cmdbuf.count_used + dwords <= r300->cmdbuf.size)
+		return;
+	r300FlushCmdBuf(r300, caller);
+}
+
+/**
+ * Reserve \p dwords dwords at the end of the command buffer and return a
+ * pointer to the first of them, flushing beforehand when they do not fit.
+ *
+ * This raw variant never emits state; r300AllocCmdBuf() re-emits it after
+ * a flush, as needed for correct hardware state after an unlock.
+ */
+static __inline__ uint32_t *r300RawAllocCmdBuf(r300ContextPtr r300,
+					       int dwords, const char *caller)
+{
+	uint32_t *start;
+
+	r300EnsureCmdBufSpace(r300, dwords, caller);
+	/* Hand out the tail of the buffer, then advance the fill level. */
+	start = r300->cmdbuf.cmd_buf + r300->cmdbuf.count_used;
+	r300->cmdbuf.count_used += dwords;
+	return start;
+}
+
+static __inline__ uint32_t *r300AllocCmdBuf(r300ContextPtr r300,
+					    int dwords, const char *caller)
+{
+	uint32_t *start;
+
+	r300EnsureCmdBufSpace(r300, dwords, caller);
+	/* An empty buffer gets the hardware state emitted first. */
+	if (r300->cmdbuf.count_used == 0) {
+		if (RADEON_DEBUG & DEBUG_IOCTL)
+			fprintf(stderr, "Reemit state after flush (from %s)\n",
+				caller);
+		r300EmitState(r300);
+	}
+	/* The reservation starts behind whatever was emitted above. */
+	start = r300->cmdbuf.cmd_buf + r300->cmdbuf.count_used;
+	r300->cmdbuf.count_used += dwords;
+	return start;
+}
+
+extern void r300EmitBlit(r300ContextPtr rmesa,
+			 GLuint color_fmt,
+			 GLuint src_pitch,
+			 GLuint src_offset,
+			 GLuint dst_pitch,
+			 GLuint dst_offset,
+			 GLint srcx, GLint srcy,
+			 GLint dstx, GLint dsty, GLuint w, GLuint h);
+/* r300EmitBlit() and r300EmitWait() live in r300_cmdbuf.c; the remaining three do not. */
+extern void r300EmitWait(r300ContextPtr rmesa, GLuint flags);
+extern void r300EmitLOAD_VBPNTR(r300ContextPtr rmesa, int start);
+extern void r300EmitVertexShader(r300ContextPtr rmesa);
+extern void r300EmitPixelShader(r300ContextPtr rmesa);
+
+#endif				/* __R300_CMDBUF_H__ */
diff --git a/r300/r300_context.c b/r300/r300_context.c
new file mode 100644
index 0000000..9ea14ab
--- /dev/null
+++ b/r300/r300_context.c
@@ -0,0 +1,532 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ *
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#include "glheader.h"
+#include "api_arrayelt.h"
+#include "context.h"
+#include "simple_list.h"
+#include "imports.h"
+#include "matrix.h"
+#include "extensions.h"
+#include "state.h"
+#include "bufferobj.h"
+
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+#include "vbo/vbo.h"
+
+#include "tnl/tnl.h"
+#include "tnl/t_pipeline.h"
+#include "tnl/t_vp_build.h"
+
+#include "drivers/common/driverfuncs.h"
+
+#include "radeon_ioctl.h"
+#include "radeon_span.h"
+#include "r300_context.h"
+#include "r300_cmdbuf.h"
+#include "r300_state.h"
+#include "r300_ioctl.h"
+#include "r300_tex.h"
+#include "r300_emit.h"
+
+#ifdef USER_BUFFERS
+#include "r300_mem.h"
+#endif
+
+#include "vblank.h"
+#include "utils.h"
+#include "xmlpool.h"		/* for symbolic values of enum-type options */
+
+/* hw_tcl_on derives from future_hw_tcl_on when it's safe to change it. */
+int future_hw_tcl_on = 1;
+int hw_tcl_on = 1;
+
+#define need_GL_EXT_stencil_two_side
+#define need_GL_ARB_multisample
+#define need_GL_ARB_texture_compression
+#define need_GL_ARB_vertex_buffer_object
+#define need_GL_ARB_vertex_program
+#define need_GL_EXT_blend_minmax
+//#define need_GL_EXT_fog_coord
+#define need_GL_EXT_secondary_color
+#define need_GL_EXT_blend_equation_separate
+#define need_GL_EXT_blend_func_separate
+#define need_GL_EXT_gpu_program_parameters
+#define need_GL_NV_vertex_program
+#include "extension_helper.h"
+
+const struct dri_extension card_extensions[] = {
+  /* *INDENT-OFF* */
+  {"GL_ARB_multisample",		GL_ARB_multisample_functions},
+  {"GL_ARB_multitexture",		NULL},
+  {"GL_ARB_texture_border_clamp",	NULL},
+  {"GL_ARB_texture_compression",	GL_ARB_texture_compression_functions},
+  {"GL_ARB_texture_cube_map",		NULL},
+  {"GL_ARB_texture_env_add",		NULL},
+  {"GL_ARB_texture_env_combine",	NULL},
+  {"GL_ARB_texture_env_crossbar",	NULL},
+  {"GL_ARB_texture_env_dot3",		NULL},
+  {"GL_ARB_texture_mirrored_repeat",	NULL},
+  {"GL_ARB_vertex_buffer_object",	GL_ARB_vertex_buffer_object_functions},
+  {"GL_ARB_vertex_program",		GL_ARB_vertex_program_functions},
+  {"GL_ARB_fragment_program",		NULL},
+  {"GL_EXT_blend_equation_separate",	GL_EXT_blend_equation_separate_functions},
+  {"GL_EXT_blend_func_separate",	GL_EXT_blend_func_separate_functions},
+  {"GL_EXT_blend_minmax",		GL_EXT_blend_minmax_functions},
+  {"GL_EXT_blend_subtract",		NULL},
+//  {"GL_EXT_fog_coord",			GL_EXT_fog_coord_functions },
+  {"GL_EXT_gpu_program_parameters",     GL_EXT_gpu_program_parameters_functions},
+  {"GL_EXT_secondary_color", 		GL_EXT_secondary_color_functions},
+  {"GL_EXT_stencil_two_side",		GL_EXT_stencil_two_side_functions},
+  {"GL_EXT_stencil_wrap",		NULL},
+  {"GL_EXT_texture_edge_clamp",		NULL},
+  {"GL_EXT_texture_env_combine", 	NULL},
+  {"GL_EXT_texture_env_dot3", 		NULL},
+  {"GL_EXT_texture_filter_anisotropic",	NULL},
+  {"GL_EXT_texture_lod_bias",		NULL},
+  {"GL_EXT_texture_mirror_clamp",	NULL},
+  {"GL_EXT_texture_rectangle",		NULL},
+  {"GL_ATI_texture_env_combine3",	NULL},
+  {"GL_ATI_texture_mirror_once",	NULL},
+  {"GL_MESA_pack_invert",		NULL},
+  {"GL_MESA_ycbcr_texture",		NULL},
+  {"GL_MESAX_texture_float",		NULL},
+  {"GL_NV_blend_square",		NULL},
+  {"GL_NV_vertex_program",		GL_NV_vertex_program_functions},
+  {"GL_SGIS_generate_mipmap",		NULL},
+  {NULL,				NULL}
+  /* *INDENT-ON* */
+};
+
+extern struct tnl_pipeline_stage _r300_render_stage;
+extern const struct tnl_pipeline_stage _r300_tcl_stage;
+
+static const struct tnl_pipeline_stage *r300_pipeline[] = {
+
+	/* Try and go straight to t&l
+	 */
+	&_r300_tcl_stage,
+
+	/* Catch any t&l fallbacks
+	 */
+	&_tnl_vertex_transform_stage,
+	&_tnl_normal_transform_stage,
+	&_tnl_lighting_stage,
+	&_tnl_fog_coordinate_stage,
+	&_tnl_texgen_stage,
+	&_tnl_texture_transform_stage,
+	&_tnl_vertex_program_stage,
+
+	/* Try again to go to tcl?
+	 *     - no good for asymmetric-twoside (do with multipass)
+	 *     - no good for asymmetric-unfilled (do with multipass)
+	 *     - good for material
+	 *     - good for texgen
+	 *     - need to manipulate a bit of state
+	 *
+	 * - worth it/not worth it?
+	 */
+
+	/* Else do them here.
+	 */
+	&_r300_render_stage,
+	&_tnl_render_stage,	/* FALLBACK  */
+	0,
+};
+
+/* Create the device specific rendering context.  Returns GL_TRUE on success,
+ * GL_FALSE if the context could not be allocated or initialised. */
+GLboolean r300CreateContext(const __GLcontextModes * glVisual,
+			    __DRIcontextPrivate * driContextPriv,
+			    void *sharedContextPrivate)
+{
+	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
+	struct dd_function_table functions;
+	r300ContextPtr r300;
+	GLcontext *ctx;
+	int tcl_mode, i;
+
+	assert(glVisual);
+	assert(driContextPriv);
+	assert(screen);
+
+	/* Allocate the R300 context */
+	r300 = (r300ContextPtr) CALLOC(sizeof(*r300));
+	if (!r300)
+		return GL_FALSE;
+
+	if (!(screen->chip_flags & RADEON_CHIPSET_TCL))
+		hw_tcl_on = future_hw_tcl_on = 0;
+
+	/* Parse configuration files.
+	 * Do this here so that initialMaxAnisotropy is set before we create
+	 * the default textures.
+	 */
+	driParseConfigFiles(&r300->radeon.optionCache, &screen->optionCache,
+			    screen->driScreen->myNum, "r300");
+	r300->initialMaxAnisotropy = driQueryOptionf(&r300->radeon.optionCache,
+						     "def_max_anisotropy");
+
+	/* Init default driver functions then plug in our R300-specific functions
+	 * (the texture functions are especially important)
+	 */
+	_mesa_init_driver_functions(&functions);
+	r300InitIoctlFuncs(&functions);
+	r300InitStateFuncs(&functions);
+	r300InitTextureFuncs(&functions);
+	r300InitShaderFuncs(&functions);
+
+#ifdef USER_BUFFERS
+	r300_mem_init(r300);
+#endif
+
+	if (!radeonInitContext(&r300->radeon, &functions,
+			       glVisual, driContextPriv,
+			       sharedContextPrivate)) {
+		/* Undo r300_mem_init() and driParseConfigFiles() as well. */
+#ifdef USER_BUFFERS
+		r300_mem_destroy(r300);
+#endif
+		driDestroyOptionCache(&r300->radeon.optionCache);
+		FREE(r300);
+		return GL_FALSE;
+	}
+
+	/* Init r300 context data */
+	r300->dma.buf0_address =
+	    r300->radeon.radeonScreen->buffers->list[0].address;
+
+	(void)memset(r300->texture_heaps, 0, sizeof(r300->texture_heaps));
+	make_empty_list(&r300->swapped);
+
+	r300->nr_heaps = 1 /* screen->numTexHeaps */ ;
+	assert(r300->nr_heaps < RADEON_NR_TEX_HEAPS);
+	for (i = 0; i < r300->nr_heaps; i++) {
+		/* *INDENT-OFF* */
+		r300->texture_heaps[i] =
+		    driCreateTextureHeap(i, r300,
+					 screen->texSize[i], 12,
+					 RADEON_NR_TEX_REGIONS,
+					 (drmTextureRegionPtr)
+					 r300->radeon.sarea->tex_list[i],
+					 &r300->radeon.sarea->tex_age[i],
+					 &r300->swapped, sizeof(r300TexObj),
+					 (destroy_texture_object_t *)
+					 r300DestroyTexObj);
+		/* *INDENT-ON* */
+	}
+	r300->texture_depth = driQueryOptioni(&r300->radeon.optionCache,
+					      "texture_depth");
+	if (r300->texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
+		r300->texture_depth = (screen->cpp == 4) ?
+		    DRI_CONF_TEXTURE_DEPTH_32 : DRI_CONF_TEXTURE_DEPTH_16;
+
+	/* Set the maximum texture size small enough that we can guarantee that
+	 * all texture units can bind a maximal texture and have them both in
+	 * texturable memory at once.
+	 */
+
+	ctx = r300->radeon.glCtx;
+
+	ctx->Const.MaxTextureImageUnits =
+	    driQueryOptioni(&r300->radeon.optionCache, "texture_image_units");
+	ctx->Const.MaxTextureCoordUnits =
+	    driQueryOptioni(&r300->radeon.optionCache, "texture_coord_units");
+	ctx->Const.MaxTextureUnits =
+	    MIN2(ctx->Const.MaxTextureImageUnits,
+		 ctx->Const.MaxTextureCoordUnits);
+	ctx->Const.MaxTextureMaxAnisotropy = 16.0;
+
+	ctx->Const.MinPointSize = 1.0;
+	ctx->Const.MinPointSizeAA = 1.0;
+	ctx->Const.MaxPointSize = R300_POINTSIZE_MAX;
+	ctx->Const.MaxPointSizeAA = R300_POINTSIZE_MAX;
+
+	ctx->Const.MinLineWidth = 1.0;
+	ctx->Const.MinLineWidthAA = 1.0;
+	ctx->Const.MaxLineWidth = R300_LINESIZE_MAX;
+	ctx->Const.MaxLineWidthAA = R300_LINESIZE_MAX;
+
+#ifdef USER_BUFFERS
+	/* Needs further modifications */
+#if 0
+	ctx->Const.MaxArrayLockSize =
+	    ( /*512 */ RADEON_BUFFER_SIZE * 16 * 1024) / (4 * 4);
+#endif
+#endif
+
+	/* Initialize the software rasterizer and helper modules.
+	 */
+	_swrast_CreateContext(ctx);
+	_vbo_CreateContext(ctx);
+	_tnl_CreateContext(ctx);
+	_swsetup_CreateContext(ctx);
+	_swsetup_Wakeup(ctx);
+	_ae_create_context(ctx);
+
+	/* Install the customized pipeline:
+	 */
+	_tnl_destroy_pipeline(ctx);
+	_tnl_install_pipeline(ctx, r300_pipeline);
+
+	/* Try and keep materials and vertices separate:
+	 */
+/* 	_tnl_isolate_materials(ctx, GL_TRUE); */
+
+	/* Configure swrast and TNL to match hardware characteristics:
+	 */
+	_swrast_allow_pixel_fog(ctx, GL_FALSE);
+	_swrast_allow_vertex_fog(ctx, GL_TRUE);
+	_tnl_allow_pixel_fog(ctx, GL_FALSE);
+	_tnl_allow_vertex_fog(ctx, GL_TRUE);
+
+	/* currently bogus data */
+	ctx->Const.VertexProgram.MaxInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
+	ctx->Const.VertexProgram.MaxNativeInstructions =
+	    VSF_MAX_FRAGMENT_LENGTH / 4;
+	ctx->Const.VertexProgram.MaxNativeAttribs = 16;	/* r420 */
+	ctx->Const.VertexProgram.MaxTemps = 32;
+	ctx->Const.VertexProgram.MaxNativeTemps =
+	    /*VSF_MAX_FRAGMENT_TEMPS */ 32;
+	ctx->Const.VertexProgram.MaxNativeParameters = 256;	/* r420 */
+	ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
+
+	ctx->Const.FragmentProgram.MaxNativeTemps = PFS_NUM_TEMP_REGS;
+	ctx->Const.FragmentProgram.MaxNativeAttribs = 11;	/* copy i915... */
+	ctx->Const.FragmentProgram.MaxNativeParameters = PFS_NUM_CONST_REGS;
+	ctx->Const.FragmentProgram.MaxNativeAluInstructions = PFS_MAX_ALU_INST;
+	ctx->Const.FragmentProgram.MaxNativeTexInstructions = PFS_MAX_TEX_INST;
+	ctx->Const.FragmentProgram.MaxNativeInstructions =
+	    PFS_MAX_ALU_INST + PFS_MAX_TEX_INST;
+	ctx->Const.FragmentProgram.MaxNativeTexIndirections =
+	    PFS_MAX_TEX_INDIRECT;
+	ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;	/* and these are?? */
+	_tnl_ProgramCacheInit(ctx);
+	ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
+
+	driInitExtensions(ctx, card_extensions, GL_TRUE);
+
+	if (driQueryOptionb
+	    (&r300->radeon.optionCache, "disable_stencil_two_side"))
+		_mesa_disable_extension(ctx, "GL_EXT_stencil_two_side");
+
+	if (r300->radeon.glCtx->Mesa_DXTn
+	    && !driQueryOptionb(&r300->radeon.optionCache, "disable_s3tc")) {
+		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+		_mesa_enable_extension(ctx, "GL_S3_s3tc");
+	} else
+	    if (driQueryOptionb(&r300->radeon.optionCache, "force_s3tc_enable"))
+	{
+		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+	}
+
+	r300->disable_lowimpact_fallback =
+	    driQueryOptionb(&r300->radeon.optionCache,
+			    "disable_lowimpact_fallback");
+
+	radeonInitSpanFuncs(ctx);
+	r300InitCmdBuf(r300);
+	r300InitState(r300);
+
+	TNL_CONTEXT(ctx)->Driver.RunPipeline = _tnl_run_pipeline;
+
+	tcl_mode = driQueryOptioni(&r300->radeon.optionCache, "tcl_mode");
+	if (driQueryOptionb(&r300->radeon.optionCache, "no_rast")) {
+		fprintf(stderr, "disabling 3D acceleration\n");
+#if R200_MERGED
+		FALLBACK(&r300->radeon, RADEON_FALLBACK_DISABLE, 1);
+#endif
+	}
+	if (tcl_mode == DRI_CONF_TCL_SW ||
+	    !(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
+		if (r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+			r300->radeon.radeonScreen->chip_flags &=
+			    ~RADEON_CHIPSET_TCL;
+			fprintf(stderr, "Disabling HW TCL support\n");
+		}
+		TCL_FALLBACK(r300->radeon.glCtx,
+			     RADEON_TCL_FALLBACK_TCL_DISABLE, 1);
+	}
+
+	return GL_TRUE;
+}
+
+/**
+ * Flush the command buffer if any memory-manager allocation is still
+ * pending, then wait (bounded) for each pending allocation to age out and
+ * hand it back to the kernel with DRM_RADEON_FREE.
+ * All of the work is compiled in only when USER_BUFFERS is defined.
+ */
+static void r300FreeGartAllocations(r300ContextPtr r300)
+{
+	int slot, err, spins = 0, hw_age, busy = 0;
+	drm_radeon_mem_free_t memfree;
+
+	memfree.region = RADEON_MEM_REGION_GART;
+
+#ifdef USER_BUFFERS
+	/* Pass 1: count the allocations that are still pending. */
+	for (slot = r300->rmm->u_last; slot > 0; slot--) {
+		if (r300->rmm->u_list[slot].ptr != NULL &&
+		    r300->rmm->u_list[slot].pending)
+			busy++;
+	}
+
+	/* Cannot flush/lock if no context exists. */
+	if (busy != 0)
+		r300FlushCmdBuf(r300, __FUNCTION__);
+
+	hw_age = radeonGetAge((radeonContextPtr) r300);
+
+	/* Pass 2: wait for each pending allocation to age out, then free it. */
+	for (slot = r300->rmm->u_last; slot > 0; slot--) {
+		/* skip empty slots and buffers that are not pending */
+		if (r300->rmm->u_list[slot].ptr == NULL ||
+		    !r300->rmm->u_list[slot].pending)
+			continue;
+
+		assert(r300->rmm->u_list[slot].h_pending == 0);
+		/* poll up to 1000 times, 10us apart, for the age to be reached */
+		spins = 0;
+		while (r300->rmm->u_list[slot].age > hw_age && spins++ < 1000) {
+			usleep(10);
+			hw_age = radeonGetAge((radeonContextPtr) r300);
+		}
+		if (spins >= 1000)
+			WARN_ONCE("Failed to idle region!");
+
+		/* region_offset is the offset from the start of the GART map */
+		memfree.region_offset = (char *)r300->rmm->u_list[slot].ptr -
+		    (char *)r300->radeon.radeonScreen->gartTextures.map;
+		err = drmCommandWrite(r300->radeon.radeonScreen->driScreen->fd,
+				      DRM_RADEON_FREE, &memfree,
+				      sizeof(memfree));
+		if (err) {
+			fprintf(stderr, "Failed to free at %p\nret = %s\n",
+				r300->rmm->u_list[slot].ptr, strerror(-err));
+			continue;
+		}
+
+		/* freed: lower u_last if this was the last slot, then clear it */
+		if (slot == r300->rmm->u_last)
+			r300->rmm->u_last--;
+		r300->rmm->u_list[slot].pending = 0;
+		r300->rmm->u_list[slot].ptr = NULL;
+	}
+	r300->rmm->u_head = slot;
+#endif				/* USER_BUFFERS */
+}
+
+/* Destroy the device specific context: unbind it if it is current, tear down
+ * the Mesa helper modules and release all driver-side resources. */
+void r300DestroyContext(__DRIcontextPrivate * driContextPriv)
+{
+	GET_CURRENT_CONTEXT(ctx);
+	r300ContextPtr r300 = (r300ContextPtr) driContextPriv->driverPrivate;
+	radeonContextPtr radeon = (radeonContextPtr) r300;
+	radeonContextPtr current = ctx ? RADEON_CONTEXT(ctx) : NULL;
+
+	if (RADEON_DEBUG & DEBUG_DRI) {
+		fprintf(stderr, "Destroying context !\n");
+	}
+
+	assert(r300);		/* checked before the first dereference below */
+
+	/* check if we're deleting the currently bound context */
+	if (r300 && &r300->radeon == current) {
+		radeonFlush(r300->radeon.glCtx);
+		_mesa_make_current(NULL, NULL, NULL);
+	}
+
+	/* Free r300 context resources */
+	if (r300) {
+		GLboolean release_texture_heaps;
+
+		release_texture_heaps =
+		    (r300->radeon.glCtx->Shared->RefCount == 1);
+		_swsetup_DestroyContext(r300->radeon.glCtx);
+		_tnl_ProgramCacheDestroy(r300->radeon.glCtx);
+		_tnl_DestroyContext(r300->radeon.glCtx);
+		_vbo_DestroyContext(r300->radeon.glCtx);
+		_swrast_DestroyContext(r300->radeon.glCtx);
+
+		if (r300->dma.current.buf) {
+			r300ReleaseDmaRegion(r300, &r300->dma.current,
+					     __FUNCTION__);
+#ifndef USER_BUFFERS
+			r300FlushCmdBuf(r300, __FUNCTION__);
+#endif
+		}
+		r300FreeGartAllocations(r300);
+		r300DestroyCmdBuf(r300);
+
+		if (radeon->state.scissor.pClipRects) {
+			FREE(radeon->state.scissor.pClipRects);
+			radeon->state.scissor.pClipRects = NULL;
+		}
+
+		if (release_texture_heaps) {
+			/* This share group is about to go away, free our private
+			 * texture object data.
+			 */
+			int i;
+
+			for (i = 0; i < r300->nr_heaps; i++) {
+				driDestroyTextureHeap(r300->texture_heaps[i]);
+				r300->texture_heaps[i] = NULL;
+			}
+
+			assert(is_empty_list(&r300->swapped));
+		}
+
+		radeonCleanupContext(&r300->radeon);
+
+#ifdef USER_BUFFERS
+		/* the memory manager might be accessed when Mesa frees the shared
+		 * state, so don't destroy it earlier
+		 */
+		r300_mem_destroy(r300);
+#endif
+
+		/* free the option cache */
+		driDestroyOptionCache(&r300->radeon.optionCache);
+
+		FREE(r300);
+	}
+}
diff --git a/r300/r300_context.h b/r300/r300_context.h
new file mode 100644
index 0000000..6b0a588
--- /dev/null
+++ b/r300/r300_context.h
@@ -0,0 +1,916 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __R300_CONTEXT_H__
+#define __R300_CONTEXT_H__
+
+#include "tnl/t_vertex.h"
+#include "drm.h"
+#include "radeon_drm.h"
+#include "dri_util.h"
+#include "texmem.h"
+
+#include "macros.h"
+#include "mtypes.h"
+#include "colormac.h"
+
+#define USER_BUFFERS
+
+//#define OPTIMIZE_ELTS
+
+struct r300_context;
+typedef struct r300_context r300ContextRec;
+typedef struct r300_context *r300ContextPtr;
+
+#include "radeon_lock.h"
+#include "mm.h"
+
+/* Print a banner, the call site and a printf-style message to stderr the first
+   time a given expansion is reached. Uses GNU variadic macros, see
+   http://gcc.gnu.org/onlinedocs/gcc-3.2.3/gcc/Variadic-Macros.html .
+   NOTE: "warn##__LINE__" pastes to the literal name warn__LINE__ (block-local). */
+#define WARN_ONCE(a, ...)	{ \
+	static int warn##__LINE__=1; \
+	if(warn##__LINE__){ \
+		fprintf(stderr, "*********************************WARN_ONCE*********************************\n"); \
+		fprintf(stderr, "File %s function %s line %d\n", \
+			__FILE__, __FUNCTION__, __LINE__); \
+		fprintf(stderr,  a, ## __VA_ARGS__);\
+		fprintf(stderr, "***************************************************************************\n"); \
+		warn##__LINE__=0;\
+		} \
+	}
+
+#include "r300_vertprog.h"
+#include "r300_fragprog.h"
+
+/**
+ * Return the bit pattern of a float, unchanged, as a uint32_t.
+ */
+static __inline__ uint32_t r300PackFloat32(float fl)
+{
+	union {
+		uint32_t bits;
+		float value;
+	} pun;
+
+	pun.value = fl;
+	return pun.bits;
+}
+
+/* Pack a float into 24 bits: sign in bit 23, biased exponent from bit 16
+ * upwards, the top 16 mantissa bits in bits 15..0.  Zero packs to 0.
+ * This is probably wrong for some values, and there is no range checking:
+ * NOTE(review): a biased exponent outside 0..127 would spill into the sign
+ * bit (or left-shift a negative value) - confirm the hardware format.
+ */
+static __inline__ uint32_t r300PackFloat24(float f)
+{
+	float mantissa;
+	int exponent;
+	uint32_t float24 = 0;
+
+	if (f == 0.0)
+		return 0;
+
+	mantissa = frexpf(f, &exponent);
+
+	/* Negative input: set the sign bit and carry on with |mantissa| */
+	if (mantissa < 0) {
+		float24 |= (1 << 23);
+		mantissa = mantissa * -1.0;
+	}
+	/* Bias of 63; add 62 because frexpf() yields a mantissa in [0.5, 1) */
+	exponent += 62;
+	float24 |= (exponent << 16);
+	/* Keep the top 16 of the 23 stored mantissa bits */
+	float24 |= (r300PackFloat32(mantissa) & 0x7FFFFF) >> 7;
+
+	return float24;
+}
+
+/************ DMA BUFFERS **************/
+
+/* Need refcounting on dma buffers:
+ */
+struct r300_dma_buffer {
+	int refcount;		/**< the number of retained regions in buf */
+	drmBufPtr buf;
+	int id;
+};
+#undef GET_START
+#ifdef USER_BUFFERS
+#define GET_START(rvb) (r300GartOffsetFromVirtual(rmesa, (rvb)->address+(rvb)->start))
+#else
+#define GET_START(rvb) (rmesa->radeon.radeonScreen->gart_buffer_offset +		\
+			(rvb)->address - rmesa->dma.buf0_address +	\
+			(rvb)->start)
+#endif
+/* A retained region, eg vertices for indexed vertices.
+ */
+struct r300_dma_region {
+	struct r300_dma_buffer *buf;
+	char *address;		/* == buf->address */
+	int start, end, ptr;	/* offsets from start of buf */
+
+	int aos_offset;		/* address in GART memory */
+	int aos_stride;		/* distance between elements, in dwords */
+	int aos_size;		/* number of components (1-4) */
+	int aos_reg;		/* VAP register assignment */
+};
+
+struct r300_dma {
+	/* Active dma region.  Allocations for vertices and retained
+	 * regions come from here.  Also used for emitting random vertices,
+	 * these may be flushed by calling flush_current();
+	 */
+	struct r300_dma_region current;
+
+	void (*flush) (r300ContextPtr);
+
+	char *buf0_address;	/* start of buf[0], for index calcs */
+
+	/* Number of "in-flight" DMA buffers, i.e. the number of buffers
+	 * for which a DISCARD command is currently queued in the command buffer.
+	 */
+	GLuint nr_released_bufs;
+};
+
+       /* Texture related */
+
+typedef struct r300_tex_obj r300TexObj, *r300TexObjPtr;
+
+/* Texture object in locally shared texture space.
+ */
+struct r300_tex_obj {
+	driTextureObject base;
+
+	GLuint bufAddr;		/* Offset to start of locally
+				   shared texture block */
+
+	GLuint dirty_state;	/* Flags (1 per texunit) for
+				   whether or not this texobj
+				   has dirty hardware state
+				   (pp_*) that needs to be
+				   brought into the
+				   texunit. */
+
+	drm_radeon_tex_image_t image[6][RADEON_MAX_TEXTURE_LEVELS];
+	/* Six, for the cube faces */
+
+	GLboolean image_override;	/* Image overridden by GLX_EXT_tfp */
+
+	GLuint pitch;		/* this isn't sent to hardware just used in calculations */
+	/* hardware register values */
+	/* Note that R200 has 8 registers per texture and R300 only 7 */
+	GLuint filter;
+	GLuint filter_1;
+	GLuint pitch_reg;
+	GLuint size;		/* npot only */
+	GLuint format;
+	GLuint offset;		/* Image location in the card's address space.
+				   All cube faces follow. */
+	GLuint unknown4;
+	GLuint unknown5;
+	/* end hardware registers */
+
+	/* registers computed by r200 code - keep them here to
+	   compare against what is actually written.
+
+	   to be removed later.. */
+	GLuint pp_border_color;
+	GLuint pp_cubic_faces;	/* cube face 1,2,3,4 log2 sizes */
+	GLuint format_x;
+
+	GLboolean border_fallback;
+
+	GLuint tile_bits;	/* hw texture tile bits used on this texture */
+};
+
+struct r300_texture_env_state {
+	r300TexObjPtr texobj;
+	GLenum format;
+	GLenum envMode;
+};
+
+/* The blit width for texture uploads
+ */
+#define R300_BLIT_WIDTH_BYTES 1024
+#define R300_MAX_TEXTURE_UNITS 8
+
+struct r300_texture_state {
+	struct r300_texture_env_state unit[R300_MAX_TEXTURE_UNITS];
+	int tc_count;		/* number of incoming texture coordinates from VAP */
+};
+
+/**
+ * A block of hardware state.
+ *
+ * When check returns non-zero, the returned number of dwords must be
+ * copied verbatim into the command buffer in order to update a state atom
+ * when it is dirty.
+ */
+struct r300_state_atom {
+	struct r300_state_atom *next, *prev;
+	const char *name;	/* for debug */
+	int cmd_size;		/* maximum size in dwords */
+	GLuint idx;		/* index in an array (e.g. textures) */
+	uint32_t *cmd;
+	GLboolean dirty;
+
+	int (*check) (r300ContextPtr, struct r300_state_atom * atom);
+};
+
+#define R300_VPT_CMD_0		0
+#define R300_VPT_XSCALE		1
+#define R300_VPT_XOFFSET	2
+#define R300_VPT_YSCALE		3
+#define R300_VPT_YOFFSET	4
+#define R300_VPT_ZSCALE		5
+#define R300_VPT_ZOFFSET	6
+#define R300_VPT_CMDSIZE	7
+
+#define R300_VIR_CMD_0		0	/* vir is variable size (at least 1) */
+#define R300_VIR_CNTL_0		1
+#define R300_VIR_CNTL_1		2
+#define R300_VIR_CNTL_2		3
+#define R300_VIR_CNTL_3		4
+#define R300_VIR_CNTL_4		5
+#define R300_VIR_CNTL_5		6
+#define R300_VIR_CNTL_6		7
+#define R300_VIR_CNTL_7		8
+#define R300_VIR_CMDSIZE	9
+
+#define R300_VIC_CMD_0		0
+#define R300_VIC_CNTL_0		1
+#define R300_VIC_CNTL_1		2
+#define R300_VIC_CMDSIZE	3
+
+#define R300_VOF_CMD_0		0
+#define R300_VOF_CNTL_0		1
+#define R300_VOF_CNTL_1		2
+#define R300_VOF_CMDSIZE	3
+
+#define R300_PVS_CMD_0		0
+#define R300_PVS_CNTL_1		1
+#define R300_PVS_CNTL_2		2
+#define R300_PVS_CNTL_3		3
+#define R300_PVS_CMDSIZE	4
+
+#define R300_GB_MISC_CMD_0		0
+#define R300_GB_MISC_MSPOS_0		1
+#define R300_GB_MISC_MSPOS_1		2
+#define R300_GB_MISC_TILE_CONFIG	3
+#define R300_GB_MISC_SELECT		4
+#define R300_GB_MISC_AA_CONFIG		5
+#define R300_GB_MISC_CMDSIZE		6
+
+#define R300_TXE_CMD_0		0
+#define R300_TXE_ENABLE		1
+#define R300_TXE_CMDSIZE	2
+
+#define R300_PS_CMD_0		0
+#define R300_PS_POINTSIZE	1
+#define R300_PS_CMDSIZE		2
+
+#define R300_ZBS_CMD_0		0
+#define R300_ZBS_T_FACTOR	1
+#define R300_ZBS_T_CONSTANT	2
+#define R300_ZBS_W_FACTOR	3
+#define R300_ZBS_W_CONSTANT	4
+#define R300_ZBS_CMDSIZE	5
+
+#define R300_CUL_CMD_0		0
+#define R300_CUL_CULL		1
+#define R300_CUL_CMDSIZE	2
+
+#define R300_RC_CMD_0		0
+#define R300_RC_CNTL_0		1
+#define R300_RC_CNTL_1		2
+#define R300_RC_CMDSIZE		3
+
+#define R300_RI_CMD_0		0
+#define R300_RI_INTERP_0	1
+#define R300_RI_INTERP_1	2
+#define R300_RI_INTERP_2	3
+#define R300_RI_INTERP_3	4
+#define R300_RI_INTERP_4	5
+#define R300_RI_INTERP_5	6
+#define R300_RI_INTERP_6	7
+#define R300_RI_INTERP_7	8
+#define R300_RI_CMDSIZE		9
+
+#define R300_RR_CMD_0		0	/* rr is variable size (at least 1) */
+#define R300_RR_ROUTE_0		1
+#define R300_RR_ROUTE_1		2
+#define R300_RR_ROUTE_2		3
+#define R300_RR_ROUTE_3		4
+#define R300_RR_ROUTE_4		5
+#define R300_RR_ROUTE_5		6
+#define R300_RR_ROUTE_6		7
+#define R300_RR_ROUTE_7		8
+#define R300_RR_CMDSIZE		9
+
+#define R300_FP_CMD_0		0
+#define R300_FP_CNTL0		1
+#define R300_FP_CNTL1		2
+#define R300_FP_CNTL2		3
+#define R300_FP_CMD_1		4
+#define R300_FP_NODE0		5
+#define R300_FP_NODE1		6
+#define R300_FP_NODE2		7
+#define R300_FP_NODE3		8
+#define R300_FP_CMDSIZE		9
+
+#define R300_FPT_CMD_0		0
+#define R300_FPT_INSTR_0	1
+#define R300_FPT_CMDSIZE	65
+
+#define R300_FPI_CMD_0		0
+#define R300_FPI_INSTR_0	1
+#define R300_FPI_CMDSIZE	65
+
+#define R300_FPP_CMD_0		0
+#define R300_FPP_PARAM_0	1
+#define R300_FPP_CMDSIZE	(32*4+1)
+
+#define R300_FOGS_CMD_0		0
+#define R300_FOGS_STATE		1
+#define R300_FOGS_CMDSIZE	2
+
+#define R300_FOGC_CMD_0		0
+#define R300_FOGC_R		1
+#define R300_FOGC_G		2
+#define R300_FOGC_B		3
+#define R300_FOGC_CMDSIZE	4
+
+#define R300_FOGP_CMD_0		0
+#define R300_FOGP_SCALE		1
+#define R300_FOGP_START		2
+#define R300_FOGP_CMDSIZE	3
+
+#define R300_AT_CMD_0		0
+#define R300_AT_ALPHA_TEST	1
+#define R300_AT_UNKNOWN		2
+#define R300_AT_CMDSIZE		3
+
+#define R300_BLD_CMD_0		0
+#define R300_BLD_CBLEND		1
+#define R300_BLD_ABLEND		2
+#define R300_BLD_CMDSIZE	3
+
+#define R300_CMK_CMD_0		0
+#define R300_CMK_COLORMASK	1
+#define R300_CMK_CMDSIZE	2
+
+#define R300_CB_CMD_0		0
+#define R300_CB_OFFSET		1
+#define R300_CB_CMD_1		2
+#define R300_CB_PITCH		3
+#define R300_CB_CMDSIZE		4
+
+#define R300_ZS_CMD_0		0
+#define R300_ZS_CNTL_0		1
+#define R300_ZS_CNTL_1		2
+#define R300_ZS_CNTL_2		3
+#define R300_ZS_CMDSIZE		4
+
+#define R300_ZB_CMD_0		0
+#define R300_ZB_OFFSET		1
+#define R300_ZB_PITCH		2
+#define R300_ZB_CMDSIZE		3
+
+#define R300_VPI_CMD_0		0
+#define R300_VPI_INSTR_0	1
+#define R300_VPI_CMDSIZE	1025	/* 256 16 byte instructions */
+
+#define R300_VPP_CMD_0		0
+#define R300_VPP_PARAM_0	1
+#define R300_VPP_CMDSIZE	1025	/* 256 4-component parameters */
+
+#define R300_VPUCP_CMD_0		0
+#define R300_VPUCP_X            1
+#define R300_VPUCP_Y            2
+#define R300_VPUCP_Z            3
+#define R300_VPUCP_W            4
+#define R300_VPUCP_CMDSIZE	5	/* cmd dword + one x, y, z, w entry */
+
+#define R300_VPS_CMD_0		0
+#define R300_VPS_ZERO_0		1
+#define R300_VPS_ZERO_1		2
+#define R300_VPS_POINTSIZE	3
+#define R300_VPS_ZERO_3		4
+#define R300_VPS_CMDSIZE	5
+
+	/* the layout is common for all fields inside tex */
+#define R300_TEX_CMD_0		0
+#define R300_TEX_VALUE_0	1
+/* We don't really use this, instead specify mtu+1 dynamically
+#define R300_TEX_CMDSIZE	(MAX_TEXTURE_UNITS+1)
+*/
+
+/**
+ * Cache for hardware register state.
+ */
+struct r300_hw_state {
+	struct r300_state_atom atomlist;
+
+	GLboolean is_dirty;
+	GLboolean all_dirty;
+	int max_state_size;	/* in dwords */
+
+	struct r300_state_atom vpt;	/* viewport (1D98) */
+	struct r300_state_atom vap_cntl;
+	struct r300_state_atom vof;	/* VAP output format register 0x2090 */
+	struct r300_state_atom vte;	/* (20B0) */
+	struct r300_state_atom unk2134;	/* (2134) */
+	struct r300_state_atom vap_cntl_status;
+	struct r300_state_atom vir[2];	/* vap input route (2150/21E0) */
+	struct r300_state_atom vic;	/* vap input control (2180) */
+	struct r300_state_atom unk21DC;	/* (21DC) */
+	struct r300_state_atom vap_clip_cntl;
+	struct r300_state_atom unk2220;	/* (2220) */
+	struct r300_state_atom unk2288;	/* (2288) */
+	struct r300_state_atom pvs;	/* pvs_cntl (22D0) */
+	struct r300_state_atom gb_enable;	/* (4008) */
+	struct r300_state_atom gb_misc;	/* Multisampling position shifts ? (4010) */
+	struct r300_state_atom unk4200;	/* (4200) */
+	struct r300_state_atom unk4214;	/* (4214) */
+	struct r300_state_atom ps;	/* pointsize (421C) */
+	struct r300_state_atom unk4230;	/* (4230) */
+	struct r300_state_atom lcntl;	/* line control */
+	struct r300_state_atom unk4260;	/* (4260) */
+	struct r300_state_atom shade;
+	struct r300_state_atom polygon_mode;
+	struct r300_state_atom fogp;	/* fog parameters (4294) */
+	struct r300_state_atom unk429C;	/* (429C) */
+	struct r300_state_atom zbias_cntl;
+	struct r300_state_atom zbs;	/* zbias (42A4) */
+	struct r300_state_atom occlusion_cntl;
+	struct r300_state_atom cul;	/* cull cntl (42B8) */
+	struct r300_state_atom unk42C0;	/* (42C0) */
+	struct r300_state_atom rc;	/* rs control (4300) */
+	struct r300_state_atom ri;	/* rs interpolators (4310) */
+	struct r300_state_atom rr;	/* rs route (4330) */
+	struct r300_state_atom unk43A4;	/* (43A4) */
+	struct r300_state_atom unk43E8;	/* (43E8) */
+	struct r300_state_atom fp;	/* fragment program cntl + nodes (4600) */
+	struct r300_state_atom fpt;	/* texi - (4620) */
+	struct r300_state_atom unk46A4;	/* (46A4) */
+	struct r300_state_atom fpi[4];	/* fp instructions (46C0/47C0/48C0/49C0) */
+	struct r300_state_atom fogs;	/* fog state (4BC0) */
+	struct r300_state_atom fogc;	/* fog color (4BC8) */
+	struct r300_state_atom at;	/* alpha test (4BD4) */
+	struct r300_state_atom unk4BD8;	/* (4BD8) */
+	struct r300_state_atom fpp;	/* 0x4C00 and following */
+	struct r300_state_atom unk4E00;	/* (4E00) */
+	struct r300_state_atom bld;	/* blending (4E04) */
+	struct r300_state_atom cmk;	/* colormask (4E0C) */
+	struct r300_state_atom blend_color;	/* constant blend color */
+	struct r300_state_atom cb;	/* colorbuffer (4E28) */
+	struct r300_state_atom unk4E50;	/* (4E50) */
+	struct r300_state_atom unk4E88;	/* (4E88) */
+	struct r300_state_atom unk4EA0;	/* (4EA0, going by the name) I saw it only written on RV350 hardware..  */
+	struct r300_state_atom zs;	/* zstencil control (4F00) */
+	struct r300_state_atom zstencil_format;
+	struct r300_state_atom zb;	/* z buffer (4F20) */
+	struct r300_state_atom unk4F28;	/* (4F28) */
+	struct r300_state_atom unk4F30;	/* (4F30) */
+	struct r300_state_atom unk4F44;	/* (4F44) */
+	struct r300_state_atom unk4F54;	/* (4F54) */
+
+	struct r300_state_atom vpi;	/* vp instructions */
+	struct r300_state_atom vpp;	/* vp parameters */
+	struct r300_state_atom vps;	/* vertex point size (?) */
+	struct r300_state_atom vpucp[6];	/* vp user clip plane - 6 */
+	/* 8 texture units */
+	/* the state is grouped by function and not by
+	   texture unit. This makes single unit updates
+	   really awkward - we are much better off
+	   updating the whole thing at once */
+	struct {
+		struct r300_state_atom filter;
+		struct r300_state_atom filter_1;
+		struct r300_state_atom size;
+		struct r300_state_atom format;
+		struct r300_state_atom pitch;
+		struct r300_state_atom offset;
+		struct r300_state_atom chroma_key;
+		struct r300_state_atom border_color;
+	} tex;
+	struct r300_state_atom txe;	/* tex enable (4104) */
+};
+
+/**
+ * This structure holds the command buffer while it is being constructed.
+ *
+ * The first batch of commands in the buffer is always the state that needs
+ * to be re-emitted when the context is lost. This batch can be skipped
+ * otherwise.
+ */
+struct r300_cmdbuf {
+	int size;		/* DWORDs allocated for buffer */
+	uint32_t *cmd_buf;	/* buffer storage; capacity is `size` DWORDs */
+	int count_used;		/* DWORDs filled so far */
+	int count_reemit;	/* size of re-emission batch (presumably in DWORDs -- TODO confirm) */
+};
+
+/**
+ * State cache
+ */
+
+struct r300_depthbuffer_state {
+	GLfloat scale;		/* NOTE(review): presumably a depth scale factor -- confirm at the use sites */
+};
+
+struct r300_stencilbuffer_state {
+	GLuint clear;		/* presumably the stencil clear value -- TODO confirm */
+	GLboolean hw_stencil;	/* presumably set when a hardware stencil buffer is usable -- TODO confirm */
+
+};
+
+/* Vertex shader state */
+
+/* Perhaps more if we store programs in vmem? */
+/* drm_r300_cmd_header_t->vpu->count is unsigned char */
+#define VSF_MAX_FRAGMENT_LENGTH (255*4)
+
+/* Can be tested with colormat currently. */
+#define VSF_MAX_FRAGMENT_TEMPS (14)
+
+#define STATE_R300_WINDOW_DIMENSION (STATE_INTERNAL_DRIVER+0)
+#define STATE_R300_TEXRECT_FACTOR (STATE_INTERNAL_DRIVER+1)
+
+struct r300_vertex_shader_fragment {
+	int length;		/* presumably the number of valid DWORDs in body -- TODO confirm */
+	union {			/* three views of the same storage */
+		GLuint d[VSF_MAX_FRAGMENT_LENGTH];	/* as raw DWORDs */
+		float f[VSF_MAX_FRAGMENT_LENGTH];	/* as floats */
+		VERTEX_SHADER_INSTRUCTION i[VSF_MAX_FRAGMENT_LENGTH / 4];	/* as instruction records */
+	} body;
+};
+
+#define VSF_DEST_PROGRAM	0x0
+#define VSF_DEST_MATRIX0	0x200
+#define VSF_DEST_MATRIX1	0x204
+#define VSF_DEST_MATRIX2	0x208
+#define VSF_DEST_VECTOR0	0x20c
+#define VSF_DEST_VECTOR1	0x20d
+#define VSF_DEST_UNKNOWN1	0x400
+#define VSF_DEST_UNKNOWN2	0x406
+
+struct r300_vertex_shader_state {
+	struct r300_vertex_shader_fragment program;	/* program body */
+
+	struct r300_vertex_shader_fragment unknown1;	/* purpose not documented */
+	struct r300_vertex_shader_fragment unknown2;	/* purpose not documented */
+
+	int program_start;	/* presumably an offset within program space -- TODO confirm */
+	int unknown_ptr1;	/* pointer within program space */
+	int program_end;	/* presumably an offset within program space -- TODO confirm */
+
+	int param_offset;	/* presumably where the parameters start -- TODO confirm */
+	int param_count;	/* presumably how many parameters are in use -- TODO confirm */
+
+	int unknown_ptr2;	/* pointer within program space */
+	int unknown_ptr3;	/* pointer within program space */
+};
+
+extern int hw_tcl_on;
+
+//#define CURRENT_VERTEX_SHADER(ctx) (ctx->VertexProgram._Current)
+#define CURRENT_VERTEX_SHADER(ctx) (R300_CONTEXT(ctx)->selected_vp)
+
+/* Should work, but doesn't */
+//#define CURRENT_VERTEX_SHADER(ctx) (R300_CONTEXT(ctx)->curr_vp)
+
+/* r300_vertex_shader_state and r300_vertex_program should probably be merged together someday.
+ * Keeping them separate for now should ensure the fixed pipeline keeps functioning properly.
+ */
+
+struct r300_vertex_program_key {
+	GLuint InputsRead;	/* bitmask of (1 << VERT_ATTRIB_*) inputs */
+	GLuint OutputsWritten;	/* bitmask of (1 << VERT_RESULT_*) outputs */
+};
+
+struct r300_vertex_program {
+	struct r300_vertex_program *next;	/* list link; presumably chained from r300_vertex_program_cont.progs */
+	struct r300_vertex_program_key key;	/* input/output masks; read by r300EmitArrays */
+	int translated;		/* presumably nonzero once translated -- TODO confirm */
+
+	struct r300_vertex_shader_fragment program;	/* program body */
+
+	int pos_end;
+	int num_temporaries;	/* Number of temp vars used by program */
+	int wpos_idx;
+	int inputs[VERT_ATTRIB_MAX];	/* per-attribute value packed into the input route words by r300EmitArrays */
+	int outputs[VERT_RESULT_MAX];	/* indexed by VERT_RESULT_* -- TODO confirm meaning */
+	int native;
+	int ref_count;
+	int use_ref_count;
+};
+
+struct r300_vertex_program_cont {
+	struct gl_vertex_program mesa_program;	/* Must be first */
+	struct r300_vertex_shader_fragment params;	/* parameter storage -- NOTE(review): confirm who fills it */
+	struct r300_vertex_program *progs;	/* presumably head of the list linked through ->next -- confirm */
+};
+
+#define PFS_MAX_ALU_INST	64
+#define PFS_MAX_TEX_INST	64
+#define PFS_MAX_TEX_INDIRECT 4
+#define PFS_NUM_TEMP_REGS	32
+#define PFS_NUM_CONST_REGS	16
+
+/* Mapping Mesa registers to R300 temporaries (one entry per Mesa register) */
+struct reg_acc {
+	int reg;		/* Assigned hw temp */
+	unsigned int refcount;	/* Number of uses by mesa program */
+};
+
+/**
+ * Describe the current lifetime information for an R300 temporary
+ */
+struct reg_lifetime {
+	/* Index of the first slot where this register is free in the sense
+	   that it can be used as a new destination register.
+	   This is -1 if the register has been assigned to a Mesa register
+	   and the last access to the register has not yet been emitted */
+	int free;
+
+	/* Index of the first slot where this register is currently reserved.
+	   This is used to stop e.g. a scalar operation from being moved
+	   before the allocation time of a register that was first allocated
+	   for a vector operation. */
+	int reserved;
+
+	/* Index of the first slot in which the register can be used as a
+	   source without losing the value that is written by the last
+	   emitted instruction that writes to the register */
+	int vector_valid;	/* kept for vector use (cf. SLOT_*_VECTOR) */
+	int scalar_valid;	/* kept separately for scalar use (cf. SLOT_*_SCALAR) */
+
+	/* Index to the slot where the register was last read.
+	   This is also the first slot in which the register may be written again */
+	int vector_lastread;	/* kept for vector use */
+	int scalar_lastread;	/* kept separately for scalar use */
+};
+
+/**
+ * Store usage information about an ALU instruction slot during the
+ * compilation of a fragment program.
+ */
+#define SLOT_SRC_VECTOR  (1<<0)
+#define SLOT_SRC_SCALAR  (1<<3)
+#define SLOT_SRC_BOTH    (SLOT_SRC_VECTOR | SLOT_SRC_SCALAR)
+#define SLOT_OP_VECTOR   (1<<16)
+#define SLOT_OP_SCALAR   (1<<17)
+#define SLOT_OP_BOTH     (SLOT_OP_VECTOR | SLOT_OP_SCALAR)
+
+struct r300_pfs_compile_slot {
+	/* Bitmask indicating which parts of the slot are used, using SLOT_ constants
+	   defined above */
+	unsigned int used;
+
+	/* Selected sources */
+	int vsrc[3];		/* presumably for the vector part of the slot -- TODO confirm */
+	int ssrc[3];		/* presumably for the scalar part of the slot -- TODO confirm */
+};
+
+/**
+ * Store information during compilation of fragment programs.
+ */
+struct r300_pfs_compile_state {
+	int nrslots;		/* number of ALU slots used so far */
+
+	/* Track which (parts of) slots are already filled with instructions */
+	struct r300_pfs_compile_slot slot[PFS_MAX_ALU_INST];
+
+	/* Track the validity of R300 temporaries */
+	struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS];
+
+	/* Used to map Mesa's inputs/temps onto hardware temps */
+	int temp_in_use;	/* NOTE(review): looks like a usage mask or counter -- confirm */
+	struct reg_acc temps[PFS_NUM_TEMP_REGS];	/* one entry per Mesa temporary */
+	struct reg_acc inputs[32];	/* don't actually need 32... */
+
+	/* Track usage of hardware temps, for register allocation,
+	 * indirection detection, etc. */
+	GLuint used_in_node;	/* presumably a bitmask of hw temps read in the current node -- confirm */
+	GLuint dest_in_node;	/* presumably a bitmask of hw temps written in the current node -- confirm */
+};
+
+/**
+ * Store everything about a fragment program that is needed
+ * to render with that program.
+ */
+struct r300_fragment_program {
+	struct gl_fragment_program mesa_program;	/* first member */
+
+	GLcontext *ctx;
+	GLboolean translated;	/* presumably set once translation has been attempted -- TODO confirm */
+	GLboolean error;	/* error flag, of the same type as `translated` */
+	struct r300_pfs_compile_state *cs;	/* compile-time bookkeeping */
+
+	struct {		/* texture instructions */
+		int length;
+		GLuint inst[PFS_MAX_TEX_INST];
+	} tex;
+
+	struct {		/* ALU instructions, four dwords each */
+		struct {
+			GLuint inst0;
+			GLuint inst1;
+			GLuint inst2;
+			GLuint inst3;
+		} inst[PFS_MAX_ALU_INST];
+	} alu;
+
+	struct {		/* per-node instruction ranges */
+		int tex_offset;
+		int tex_end;
+		int alu_offset;
+		int alu_end;
+		int flags;
+	} node[4];
+	int cur_node;		/* presumably an index into node[] -- TODO confirm */
+	int first_node_has_tex;
+
+	int alu_offset;
+	int alu_end;
+	int tex_offset;
+	int tex_end;
+
+	/* Hardware constants.
+	 * Contains a pointer to the value. The destination of the pointer
+	 * is supposed to be updated when GL state changes.
+	 * Typically, this is either a pointer into
+	 * gl_program_parameter_list::ParameterValues, or a pointer to a
+	 * global constant (e.g. for sin/cos-approximation)
+	 */
+	const GLfloat *constant[PFS_NUM_CONST_REGS];
+	int const_nr;		/* presumably the number of entries of constant[] in use -- confirm */
+
+	int max_temp_idx;
+
+	GLuint optimization;
+};
+
+#define R300_MAX_AOS_ARRAYS		16
+
+#define AOS_FORMAT_USHORT	0
+#define AOS_FORMAT_FLOAT	1
+#define AOS_FORMAT_UBYTE	2
+#define AOS_FORMAT_FLOAT_COLOR	3
+
+#define REG_COORDS	0
+#define REG_COLOR0	1
+#define REG_TEX0	2
+
+struct dt {
+	GLint size;		/* number of components; r300EmitArrays sets up one swizzle entry per component */
+	GLenum type;		/* GL component type (GL_FLOAT, GL_SHORT, GL_UNSIGNED_BYTE are handled) */
+	GLsizei stride;		/* distance between elements in bytes; 0 is emitted as a single element */
+	void *data;		/* first element */
+};
+
+struct radeon_vertex_buffer {
+	int Count;		/* element count passed to r300EmitVec for each array */
+	void *Elts;
+	int elt_size;		/* presumably the size of one entry of Elts -- TODO confirm */
+	int elt_min, elt_max;	/* debug */
+
+	struct dt AttribPtr[VERT_ATTRIB_MAX];	/* one array description per VERT_ATTRIB_* */
+
+	const struct _mesa_prim *Primitive;
+	GLuint PrimitiveCount;	/* presumably the number of entries in Primitive -- TODO confirm */
+	GLint LockFirst;
+	GLsizei LockCount;
+	int lock_uptodate;
+};
+
+struct r300_state {
+	struct r300_depthbuffer_state depth;
+	struct r300_texture_state texture;	/* includes tc_count, recomputed by t_vic() */
+	int sw_tcl_inputs[VERT_ATTRIB_MAX];	/* per-attribute input slots, filled by r300EmitArrays when hw_tcl_on is off */
+	struct r300_vertex_shader_state vertex_shader;
+	struct r300_pfs_compile_state pfs_compile;
+	struct r300_dma_region aos[R300_MAX_AOS_ARRAYS];	/* vertex arrays set up by r300EmitArrays */
+	int aos_count;		/* number of entries of aos[] in use */
+	struct radeon_vertex_buffer VB;
+
+	GLuint *Elts;
+	struct r300_dma_region elt_dma;	/* released together with aos[] by r300ReleaseArrays */
+
+	 DECLARE_RENDERINPUTS(render_inputs_bitset);	/* actual render inputs that R300 was configured for.
+							   They are the same as tnl->render_inputs for fixed pipeline */
+
+	struct {
+		int transform_offset;	/* Transform matrix offset, -1 if none */
+	} vap_param;		/* vertex processor parameter allocation - tells where to write parameters */
+
+	struct r300_stencilbuffer_state stencil;
+
+};
+
+#define R300_FALLBACK_NONE 0
+#define R300_FALLBACK_TCL 1
+#define R300_FALLBACK_RAST 2
+
+/**
+ * \brief R300 context structure.
+ */
+struct r300_context {
+	struct radeon_context radeon;	/* parent class, must be first */
+
+	struct r300_hw_state hw;	/* hardware state atoms */
+	struct r300_cmdbuf cmdbuf;	/* command buffer under construction */
+	struct r300_state state;	/* state cache */
+	struct gl_vertex_program *curr_vp;
+	struct r300_vertex_program *selected_vp;	/* what CURRENT_VERTEX_SHADER() yields */
+
+	/* Vertex buffers
+	 */
+	struct r300_dma dma;
+	GLboolean save_on_next_unlock;
+	GLuint NewGLState;
+
+	/* Texture object bookkeeping
+	 */
+	unsigned nr_heaps;	/* presumably the number of texture_heaps[] entries in use -- confirm */
+	driTexHeap *texture_heaps[RADEON_NR_TEX_HEAPS];
+	driTextureObject swapped;
+	int texture_depth;
+	float initialMaxAnisotropy;
+
+	/* Clientdata textures;
+	 */
+	GLuint prefer_gart_client_texturing;
+
+#ifdef USER_BUFFERS
+	struct r300_memory_manager *rmm;	/* only present with USER_BUFFERS */
+#endif
+
+	GLvector4f dummy_attrib[_TNL_ATTRIB_MAX];
+	GLvector4f *temp_attrib[_TNL_ATTRIB_MAX];
+
+	GLboolean disable_lowimpact_fallback;
+};
+
+struct r300_buffer_object {
+	struct gl_buffer_object mesa_obj;	/* first member */
+	int id;			/* NOTE(review): presumably a memory-manager buffer id -- confirm */
+};
+
+#define R300_CONTEXT(ctx)		((r300ContextPtr)((ctx)->DriverCtx))	/* driver context stored in a GLcontext */
+
+extern void r300DestroyContext(__DRIcontextPrivate * driContextPriv);
+extern GLboolean r300CreateContext(const __GLcontextModes * glVisual,
+				   __DRIcontextPrivate * driContextPriv,
+				   void *sharedContextPrivate);
+
+extern void r300SelectVertexShader(r300ContextPtr r300);
+extern void r300InitShaderFuncs(struct dd_function_table *functions);
+extern int r300VertexProgUpdateParams(GLcontext * ctx,
+				      struct r300_vertex_program_cont *vp,
+				      float *dst);
+
+#define RADEON_D_CAPTURE 0
+#define RADEON_D_PLAYBACK 1
+#define RADEON_D_PLAYBACK_RAW 2
+#define RADEON_D_T 3
+
+#endif				/* __R300_CONTEXT_H__ */
diff --git a/r300/r300_emit.c b/r300/r300_emit.c
new file mode 100644
index 0000000..2c26069
--- /dev/null
+++ b/r300/r300_emit.c
@@ -0,0 +1,627 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "mtypes.h"
+#include "colormac.h"
+#include "imports.h"
+#include "macros.h"
+#include "image.h"
+
+#include "swrast_setup/swrast_setup.h"
+#include "math/m_translate.h"
+#include "tnl/tnl.h"
+#include "tnl/t_context.h"
+
+#include "r300_context.h"
+#include "radeon_ioctl.h"
+#include "r300_state.h"
+#include "r300_emit.h"
+#include "r300_ioctl.h"
+
+#ifdef USER_BUFFERS
+#include "r300_mem.h"
+#endif
+
+#if SWIZZLE_X != R300_INPUT_ROUTE_SELECT_X || \
+    SWIZZLE_Y != R300_INPUT_ROUTE_SELECT_Y || \
+    SWIZZLE_Z != R300_INPUT_ROUTE_SELECT_Z || \
+    SWIZZLE_W != R300_INPUT_ROUTE_SELECT_W || \
+    SWIZZLE_ZERO != R300_INPUT_ROUTE_SELECT_ZERO || \
+    SWIZZLE_ONE != R300_INPUT_ROUTE_SELECT_ONE
+#error Cannot change these!
+#endif
+
+#define DEBUG_ALL DEBUG_VERTS
+
+#if defined(USE_X86_ASM)	/* copy nr dwords from src to dst; dst is advanced */
+#define COPY_DWORDS( dst, src, nr )					\
+do {									\
+	int __tmp;							\
+	__asm__ __volatile__( "rep ; movsl"				\
+			      : "=%c" (__tmp), "=D" (dst), "=S" (__tmp)	\
+			      : "0" (nr),				\
+			        "D" ((long)(dst)),			\
+			        "S" ((long)(src)) );			\
+} while (0)
+#else	/* portable fallback; dst must be a modifiable int pointer, nr is evaluated repeatedly */
+#define COPY_DWORDS( dst, src, nr )		\
+do {						\
+   int j;					\
+   for ( j = 0 ; j < (nr) ; j++ )		\
+      (dst)[j] = ((int *)(src))[j];		\
+   (dst) += (nr);				\
+} while (0)
+#endif	/* USE_X86_ASM */
+
+static void r300EmitVec4(GLcontext * ctx,
+			 struct r300_dma_region *rvb,
+			 GLvoid * data, int stride, int count)
+{
+	int i;
+	int *out = (int *)(rvb->address + rvb->start);
+	/* Copy `count` one-dword elements, `stride` bytes apart, to `out`. */
+	if (RADEON_DEBUG & DEBUG_VERTS)
+		fprintf(stderr, "%s count %d stride %d\n",
+			__FUNCTION__, count, stride);
+
+	if (stride == 4)	/* tightly packed: one bulk copy */
+		COPY_DWORDS(out, data, count);
+	else
+		for (i = 0; i < count; i++) {
+			out[0] = *(int *)data;
+			out++;
+			data = (char *)data + stride;	/* byte step without void * arithmetic */
+		}
+}
+
+static void r300EmitVec8(GLcontext * ctx,
+			 struct r300_dma_region *rvb,
+			 GLvoid * data, int stride, int count)
+{
+	int i;
+	int *out = (int *)(rvb->address + rvb->start);
+	/* Copy `count` two-dword elements, `stride` bytes apart, to `out`. */
+	if (RADEON_DEBUG & DEBUG_VERTS)
+		fprintf(stderr, "%s count %d stride %d\n",
+			__FUNCTION__, count, stride);
+
+	if (stride == 8)	/* tightly packed: one bulk copy */
+		COPY_DWORDS(out, data, count * 2);
+	else
+		for (i = 0; i < count; i++) {
+			out[0] = *(int *)data;
+			out[1] = *(int *)((char *)data + 4);
+			out += 2;
+			data = (char *)data + stride;	/* byte step without void * arithmetic */
+		}
+}
+
+static void r300EmitVec12(GLcontext * ctx,
+			  struct r300_dma_region *rvb,
+			  GLvoid * data, int stride, int count)
+{
+	int i;
+	int *out = (int *)(rvb->address + rvb->start);
+	/* Copy `count` three-dword elements, `stride` bytes apart, to `out`. */
+	if (RADEON_DEBUG & DEBUG_VERTS)
+		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
+			__FUNCTION__, count, stride, (void *)out, (void *)data);
+
+	if (stride == 12)	/* tightly packed: one bulk copy */
+		COPY_DWORDS(out, data, count * 3);
+	else
+		for (i = 0; i < count; i++) {
+			out[0] = *(int *)data;
+			out[1] = *(int *)((char *)data + 4);
+			out[2] = *(int *)((char *)data + 8);
+			out += 3;
+			data = (char *)data + stride;	/* byte step without void * arithmetic */
+		}
+}
+
+static void r300EmitVec16(GLcontext * ctx,
+			  struct r300_dma_region *rvb,
+			  GLvoid * data, int stride, int count)
+{
+	int i;
+	int *out = (int *)(rvb->address + rvb->start);
+	/* Copy `count` four-dword elements, `stride` bytes apart, to `out`. */
+	if (RADEON_DEBUG & DEBUG_VERTS)
+		fprintf(stderr, "%s count %d stride %d\n",
+			__FUNCTION__, count, stride);
+
+	if (stride == 16)	/* tightly packed: one bulk copy */
+		COPY_DWORDS(out, data, count * 4);
+	else
+		for (i = 0; i < count; i++) {
+			out[0] = *(int *)data;
+			out[1] = *(int *)((char *)data + 4);
+			out[2] = *(int *)((char *)data + 8);
+			out[3] = *(int *)((char *)data + 12);
+			out += 4;
+			data = (char *)data + stride;	/* byte step without void * arithmetic */
+		}
+}
+
+static void r300EmitVec(GLcontext * ctx,
+			struct r300_dma_region *rvb,
+			GLvoid * data, int size, int stride, int count)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	int packed_stride = size;	/* value recorded in rvb->aos_stride */
+
+	if (RADEON_DEBUG & DEBUG_VERTS)
+		fprintf(stderr, "%s count %d size %d stride %d\n",
+			__FUNCTION__, count, size, stride);
+
+	/* Gets triggered when playing with future_hw_tcl_on ... */
+	//assert(!rvb->buf);
+
+	/* A source stride of zero is emitted as one single element and is
+	 * recorded with an aos_stride of zero.
+	 */
+	if (stride == 0) {
+		count = 1;
+		packed_stride = 0;
+	}
+
+	r300AllocDmaRegion(rmesa, rvb, size * count * 4, 4);	/* alignment? */
+	rvb->aos_offset = GET_START(rvb);
+	rvb->aos_stride = packed_stride;
+
+	/* Emit the data; `size` is the element size in dwords. */
+	switch (size) {
+	case 1:
+		r300EmitVec4(ctx, rvb, data, stride, count);
+		break;
+	case 2:
+		r300EmitVec8(ctx, rvb, data, stride, count);
+		break;
+	case 3:
+		r300EmitVec12(ctx, rvb, data, stride, count);
+		break;
+	case 4:
+		r300EmitVec16(ctx, rvb, data, stride, count);
+		break;
+	default:
+		assert(0);
+		_mesa_exit(-1);
+		break;
+	}
+}
+
+static GLuint t_type(struct dt *dt)
+{
+	/* Translate the GL component type into an AOS_FORMAT_* code. */
+	const GLenum gl_type = dt->type;
+
+	if (gl_type == GL_UNSIGNED_BYTE)
+		return AOS_FORMAT_UBYTE;
+	if (gl_type == GL_SHORT)
+		return AOS_FORMAT_USHORT;
+	if (gl_type == GL_FLOAT)
+		return AOS_FORMAT_FLOAT;
+
+	/* Unhandled type: trap when asserts are enabled, else use float. */
+	assert(0);
+	return AOS_FORMAT_FLOAT;
+}
+
+static GLuint t_vir0_size(struct dt *dt)
+{
+	/* Size field of an input route entry for this array's type. */
+	const GLenum gl_type = dt->type;
+
+	if (gl_type == GL_UNSIGNED_BYTE)
+		return 4;
+	if (gl_type == GL_SHORT)
+		return 7;
+	if (gl_type == GL_FLOAT)
+		return dt->size - 1;
+
+	/* Unhandled type: trap when asserts are enabled, else report 0. */
+	assert(0);
+	return 0;
+}
+
+static GLuint t_aos_size(struct dt *dt)
+{
+	/* Per-element size that r300EmitArrays records in aos_size. */
+	const GLenum gl_type = dt->type;
+
+	if (gl_type == GL_UNSIGNED_BYTE)
+		return 1;
+	if (gl_type == GL_SHORT)
+		return 2;
+	if (gl_type == GL_FLOAT)
+		return dt->size;
+
+	/* Unhandled type: trap when asserts are enabled, else report 0. */
+	assert(0);
+	return 0;
+}
+
+static GLuint t_vir0(uint32_t * dst, struct dt *dt, int *inputs,
+		     GLint * tab, GLuint nr)
+{
+	GLuint i, dw;
+	/* Two 16-bit entries per dword: size | input << 8 | type << 14. */
+	for (i = 0; i + 1 < nr; i += 2) {
+		dw = t_vir0_size(&dt[tab[i]]) | (inputs[tab[i]] << 8) |
+		    (t_type(&dt[tab[i]]) << 14);
+		dw |=
+		    (t_vir0_size(&dt[tab[i + 1]]) |
+		     (inputs[tab[i + 1]] << 8) | (t_type(&dt[tab[i + 1]])
+						  << 14)) << 16;
+
+		if (i + 2 == nr) {	/* final pair: set bit 13 of the high entry */
+			dw |= (1 << (13 + 16));
+		}
+		dst[i >> 1] = dw;
+	}
+	/* Odd count: the final entry sits alone in the low half, bit 13 set. */
+	if (nr & 1) {
+		dw = t_vir0_size(&dt[tab[nr - 1]]) | (inputs[tab[nr - 1]]
+						      << 8) |
+		    (t_type(&dt[tab[nr - 1]]) << 14);
+		dw |= 1 << 13;
+
+		dst[nr >> 1] = dw;
+	}
+
+	return (nr + 1) >> 1;	/* number of dwords written to dst */
+}
+
+static GLuint t_swizzle(int swizzle[4])
+{
+	GLuint route = swizzle[0] << R300_INPUT_ROUTE_X_SHIFT;	/* pack the X, Y, Z, W selectors */
+	route |= swizzle[1] << R300_INPUT_ROUTE_Y_SHIFT;
+	route |= swizzle[2] << R300_INPUT_ROUTE_Z_SHIFT;
+	return route | (swizzle[3] << R300_INPUT_ROUTE_W_SHIFT);
+}
+
+static GLuint t_vir1(uint32_t * dst, int swizzle[][4], GLuint nr)
+{
+	const GLuint pairs = nr >> 1;	/* dwords that hold two entries */
+	GLuint w;
+
+	/* Even entry in the low half-word, odd entry in the high half-word. */
+	for (w = 0; w < pairs; w++) {
+		uint32_t lo = t_swizzle(swizzle[2 * w]) | R300_INPUT_ROUTE_ENABLE;
+		uint32_t hi = t_swizzle(swizzle[2 * w + 1]) | R300_INPUT_ROUTE_ENABLE;
+
+		dst[w] = lo | (hi << 16);
+	}
+	if (nr & 1)	/* odd count: the last entry sits alone in the low half */
+		dst[pairs] = t_swizzle(swizzle[nr - 1]) | R300_INPUT_ROUTE_ENABLE;
+
+	return (nr + 1) >> 1;
+}
+
+static GLuint t_emit_size(struct dt *dt)
+{
+	return dt->size;	/* component count; r300EmitArrays passes it to r300EmitVec as the element size */
+}
+
+static GLuint t_vic(GLcontext * ctx, GLuint InputsRead)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	GLuint unit, cntl = 0;
+
+	/* Position, normal and primary colour each have a fixed enable bit. */
+	if (InputsRead & (1 << VERT_ATTRIB_POS))
+		cntl |= R300_INPUT_CNTL_POS;
+	if (InputsRead & (1 << VERT_ATTRIB_NORMAL))
+		cntl |= R300_INPUT_CNTL_NORMAL;
+	if (InputsRead & (1 << VERT_ATTRIB_COLOR0))
+		cntl |= R300_INPUT_CNTL_COLOR;
+
+	/* Texture coordinates: one bit per unit; tc_count is recounted too. */
+	r300->state.texture.tc_count = 0;
+	for (unit = 0; unit < ctx->Const.MaxTextureUnits; unit++) {
+		if (!(InputsRead & (1 << (VERT_ATTRIB_TEX0 + unit))))
+			continue;
+		r300->state.texture.tc_count++;
+		cntl |= R300_INPUT_CNTL_TC0 << unit;
+	}
+	return cntl;
+}
+
+/* Emit vertex data to GART memory and route inputs to the vertex processor.
+ * Returns R300_FALLBACK_NONE on success, R300_FALLBACK_TCL if an array cannot
+ * be handled; this should never be returned when using software tcl.
+ */
+
+int r300EmitArrays(GLcontext * ctx)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	r300ContextPtr r300 = rmesa;
+	struct radeon_vertex_buffer *VB = &rmesa->state.VB;
+	GLuint nr;
+	GLuint count = VB->Count;
+	GLuint i;
+	GLuint InputsRead = 0, OutputsWritten = 0;
+	int *inputs = NULL;
+	int vir_inputs[VERT_ATTRIB_MAX];
+	GLint tab[VERT_ATTRIB_MAX];	/* indices of the attributes that are read, in order */
+	int swizzle[VERT_ATTRIB_MAX][4];
+
+	if (hw_tcl_on) {
+		struct r300_vertex_program *prog =
+		    (struct r300_vertex_program *)
+		    CURRENT_VERTEX_SHADER(ctx);
+		inputs = prog->inputs;
+		InputsRead = CURRENT_VERTEX_SHADER(ctx)->key.InputsRead;
+		OutputsWritten = CURRENT_VERTEX_SHADER(ctx)->key.OutputsWritten;
+	} else {
+		DECLARE_RENDERINPUTS(inputs_bitset);
+		inputs = r300->state.sw_tcl_inputs;
+
+		RENDERINPUTS_COPY(inputs_bitset,
+				  TNL_CONTEXT(ctx)->render_inputs_bitset);
+
+		assert(RENDERINPUTS_TEST(inputs_bitset, _TNL_ATTRIB_POS));
+		InputsRead |= 1 << VERT_ATTRIB_POS;
+		OutputsWritten |= 1 << VERT_RESULT_HPOS;
+
+		assert(RENDERINPUTS_TEST(inputs_bitset, _TNL_ATTRIB_NORMAL)
+		       == 0);
+
+		assert(RENDERINPUTS_TEST(inputs_bitset, _TNL_ATTRIB_COLOR0));
+		InputsRead |= 1 << VERT_ATTRIB_COLOR0;
+		OutputsWritten |= 1 << VERT_RESULT_COL0;
+
+		if (RENDERINPUTS_TEST(inputs_bitset, _TNL_ATTRIB_COLOR1)) {
+			InputsRead |= 1 << VERT_ATTRIB_COLOR1;
+			OutputsWritten |= 1 << VERT_RESULT_COL1;
+		}
+
+		for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
+			if (RENDERINPUTS_TEST
+			    (inputs_bitset, _TNL_ATTRIB_TEX(i))) {
+				InputsRead |= 1 << (VERT_ATTRIB_TEX0 + i);
+				OutputsWritten |= 1 << (VERT_RESULT_TEX0 + i);
+			}
+
+		for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++)
+			if (InputsRead & (1 << i))
+				inputs[i] = nr++;
+			else
+				inputs[i] = -1;
+
+		if (!
+		    (r300->radeon.radeonScreen->
+		     chip_flags & RADEON_CHIPSET_TCL)) {
+			/* Fixed, apply to vir0 only */
+			memcpy(vir_inputs, inputs,
+			       VERT_ATTRIB_MAX * sizeof(int));
+			inputs = vir_inputs;
+
+			if (InputsRead & (1 << VERT_ATTRIB_POS))	/* test the bit, like every other check here */
+				inputs[VERT_ATTRIB_POS] = 0;
+
+			if (InputsRead & (1 << VERT_ATTRIB_COLOR0))
+				inputs[VERT_ATTRIB_COLOR0] = 2;
+
+			if (InputsRead & (1 << VERT_ATTRIB_COLOR1))
+				inputs[VERT_ATTRIB_COLOR1] = 3;
+
+			for (i = VERT_ATTRIB_TEX0; i <= VERT_ATTRIB_TEX7; i++)
+				if (InputsRead & (1 << i))
+					inputs[i] = 6 + (i - VERT_ATTRIB_TEX0);
+		}
+
+		RENDERINPUTS_COPY(rmesa->state.render_inputs_bitset,
+				  inputs_bitset);
+	}
+	assert(InputsRead);
+	assert(OutputsWritten);
+
+	for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++)
+		if (InputsRead & (1 << i))
+			tab[nr++] = i;
+
+	if (nr > R300_MAX_AOS_ARRAYS)
+		return R300_FALLBACK_TCL;
+
+	for (i = 0; i < nr; i++) {
+		int ci;
+		int comp_size, fix, found = 0;
+
+		swizzle[i][0] = SWIZZLE_ZERO;
+		swizzle[i][1] = SWIZZLE_ZERO;
+		swizzle[i][2] = SWIZZLE_ZERO;
+		swizzle[i][3] = SWIZZLE_ONE;
+
+		for (ci = 0; ci < VB->AttribPtr[tab[i]].size; ci++)
+			swizzle[i][ci] = ci;
+
+#if MESA_BIG_ENDIAN
+#define SWAP_INT(a, b) do { \
+	int __temp; \
+	__temp = a;\
+	a = b; \
+	b = __temp; \
+} while (0)
+
+		if (VB->AttribPtr[tab[i]].type == GL_UNSIGNED_BYTE) {
+			SWAP_INT(swizzle[i][0], swizzle[i][3]);
+			SWAP_INT(swizzle[i][1], swizzle[i][2]);
+		}
+#endif				/* MESA_BIG_ENDIAN */
+
+		if (r300IsGartMemory(rmesa, VB->AttribPtr[tab[i]].data,
+				     /*(count-1)*stride */ 4)) {
+			if (VB->AttribPtr[tab[i]].stride % 4)
+				return R300_FALLBACK_TCL;
+
+			rmesa->state.aos[i].address =
+			    VB->AttribPtr[tab[i]].data;
+			rmesa->state.aos[i].start = 0;
+			rmesa->state.aos[i].aos_offset =
+			    r300GartOffsetFromVirtual(rmesa,
+						      VB->
+						      AttribPtr[tab[i]].data);
+			rmesa->state.aos[i].aos_stride =
+			    VB->AttribPtr[tab[i]].stride / 4;
+
+			rmesa->state.aos[i].aos_size =
+			    t_emit_size(&VB->AttribPtr[tab[i]]);
+		} else {
+			/* TODO: r300EmitVec can only handle 4 byte vectors */
+			if (VB->AttribPtr[tab[i]].type != GL_FLOAT)
+				return R300_FALLBACK_TCL;
+
+			r300EmitVec(ctx, &rmesa->state.aos[i],
+				    VB->AttribPtr[tab[i]].data,
+				    t_emit_size(&VB->AttribPtr[tab[i]]),
+				    VB->AttribPtr[tab[i]].stride, count);
+		}
+
+		rmesa->state.aos[i].aos_size =
+		    t_aos_size(&VB->AttribPtr[tab[i]]);
+
+		comp_size = _mesa_sizeof_type(VB->AttribPtr[tab[i]].type);
+
+		for (fix = 0; fix <= 4 - VB->AttribPtr[tab[i]].size; fix++) {
+			if ((rmesa->state.aos[i].aos_offset -
+			     comp_size * fix) % 4)
+				continue;
+
+			found = 1;
+			break;
+		}
+
+		if (found) {
+			if (fix > 0) {
+				WARN_ONCE("Feeling lucky?\n");
+			}
+
+			rmesa->state.aos[i].aos_offset -= comp_size * fix;
+
+			for (ci = 0; ci < VB->AttribPtr[tab[i]].size; ci++)
+				swizzle[i][ci] += fix;
+		} else {
+			WARN_ONCE
+			    ("Cannot handle offset %x with stride %d, comp %d\n",
+			     rmesa->state.aos[i].aos_offset,
+			     rmesa->state.aos[i].aos_stride,
+			     VB->AttribPtr[tab[i]].size);
+			return R300_FALLBACK_TCL;
+		}
+	}
+
+	/* setup INPUT_ROUTE */
+	R300_STATECHANGE(r300, vir[0]);
+	((drm_r300_cmd_header_t *) r300->hw.vir[0].cmd)->packet0.count =
+	    t_vir0(&r300->hw.vir[0].cmd[R300_VIR_CNTL_0], VB->AttribPtr,
+		   inputs, tab, nr);
+
+	R300_STATECHANGE(r300, vir[1]);
+	((drm_r300_cmd_header_t *) r300->hw.vir[1].cmd)->packet0.count =
+	    t_vir1(&r300->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle, nr);
+
+	/* Set up input_cntl */
+	/* I don't think this is needed for vertex buffers, but it doesn't hurt anything */
+	R300_STATECHANGE(r300, vic);
+	r300->hw.vic.cmd[R300_VIC_CNTL_0] = 0x5555;	/* Hard coded value, no idea what it means */
+	r300->hw.vic.cmd[R300_VIC_CNTL_1] = t_vic(ctx, InputsRead);
+
+	/* Stage 3: VAP output */
+
+	R300_STATECHANGE(r300, vof);
+
+	r300->hw.vof.cmd[R300_VOF_CNTL_0] = 0;
+	r300->hw.vof.cmd[R300_VOF_CNTL_1] = 0;
+
+	if (OutputsWritten & (1 << VERT_RESULT_HPOS))
+		r300->hw.vof.cmd[R300_VOF_CNTL_0] |=
+		    R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT;
+
+	if (OutputsWritten & (1 << VERT_RESULT_COL0))
+		r300->hw.vof.cmd[R300_VOF_CNTL_0] |=
+		    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_PRESENT;
+
+	if (OutputsWritten & (1 << VERT_RESULT_COL1))
+		r300->hw.vof.cmd[R300_VOF_CNTL_0] |=
+		    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT;
+
+	/*if(OutputsWritten & (1 << VERT_RESULT_BFC0))
+	   r300->hw.vof.cmd[R300_VOF_CNTL_0] |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_2_PRESENT;
+
+	   if(OutputsWritten & (1 << VERT_RESULT_BFC1))
+	   r300->hw.vof.cmd[R300_VOF_CNTL_0] |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_3_PRESENT; */
+	//if(OutputsWritten & (1 << VERT_RESULT_FOGC))
+
+	if (OutputsWritten & (1 << VERT_RESULT_PSIZ))
+		r300->hw.vof.cmd[R300_VOF_CNTL_0] |=
+		    R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT;
+
+	for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
+		if (OutputsWritten & (1 << (VERT_RESULT_TEX0 + i)))
+			r300->hw.vof.cmd[R300_VOF_CNTL_1] |= (4 << (3 * i));
+
+	rmesa->state.aos_count = nr;	/* r300ReleaseArrays/r300UseArrays walk this many regions */
+
+	return R300_FALLBACK_NONE;
+}
+
+#ifdef USER_BUFFERS
+void r300UseArrays(GLcontext * ctx)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct r300_dma_region *region = &rmesa->state.elt_dma;	/* element buffer first */
+	int idx;
+
+	if (region->buf)
+		r300_mem_use(rmesa, region->buf->id);
+	for (idx = 0; idx < rmesa->state.aos_count; idx++) {
+		region = &rmesa->state.aos[idx];	/* then every array in use */
+		if (region->buf)
+			r300_mem_use(rmesa, region->buf->id);
+	}
+}
+#endif
+
+void r300ReleaseArrays(GLcontext * ctx)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct r300_dma_region *regions = rmesa->state.aos;	/* array regions, released in index order */
+	int idx;
+
+	r300ReleaseDmaRegion(rmesa, &rmesa->state.elt_dma, __FUNCTION__);	/* element buffer first */
+	for (idx = 0; idx < rmesa->state.aos_count; idx++)
+		r300ReleaseDmaRegion(rmesa, &regions[idx], __FUNCTION__);
+}
diff --git a/r300/r300_emit.h b/r300/r300_emit.h
new file mode 100644
index 0000000..7be098f
--- /dev/null
+++ b/r300/r300_emit.h
@@ -0,0 +1,238 @@
+/*
+ * Copyright (C) 2005 Vladimir Dergachev.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/*
+ * Authors:
+ *   Vladimir Dergachev <volodya@mindspring.com>
+ *   Nicolai Haehnle <prefect_@gmx.net>
+ *   Aapo Tahkola <aet@rasterburn.org>
+ *   Ben Skeggs <darktama@iinet.net.au>
+ *   Jerome Glisse <j.glisse@gmail.com>
+ */
+
+/* This file defines functions for accessing R300 hardware.
+ */
+#ifndef __R300_EMIT_H__
+#define __R300_EMIT_H__
+
+#include "glheader.h"
+#include "r300_context.h"
+#include "r300_cmdbuf.h"
+#include "radeon_reg.h"
+
+/*
+ * CP type-3 packets
+ */
+#define RADEON_CP_PACKET3_UNK1B                     0xC0001B00
+#define RADEON_CP_PACKET3_INDX_BUFFER               0xC0003300
+#define RADEON_CP_PACKET3_3D_DRAW_VBUF_2            0xC0003400
+#define RADEON_CP_PACKET3_3D_DRAW_IMMD_2            0xC0003500
+#define RADEON_CP_PACKET3_3D_DRAW_INDX_2            0xC0003600
+#define RADEON_CP_PACKET3_3D_LOAD_VBPNTR            0xC0002F00
+#define RADEON_CP_PACKET3_3D_CLEAR_ZMASK            0xC0003202
+#define RADEON_CP_PACKET3_3D_CLEAR_CMASK            0xC0003802
+#define RADEON_CP_PACKET3_3D_CLEAR_HIZ              0xC0003702
+
+#define CP_PACKET0(reg, n)	(RADEON_CP_PACKET0 | ((n)<<16) | ((reg)>>2))
+
+static __inline__ uint32_t cmdpacket0(int reg, int count)
+{
+	drm_r300_cmd_header_t hdr;	/* packet0 header: count plus the low 16 bits of reg */
+	const unsigned int addr = (unsigned int)reg;
+	hdr.packet0.cmd_type = R300_CMD_PACKET0;
+	hdr.packet0.count = count;
+	hdr.packet0.reghi = (addr >> 8) & 0xFF;
+	hdr.packet0.reglo = addr & 0xFF;
+
+	return hdr.u;
+}
+
+static __inline__ uint32_t cmdvpu(int addr, int count)
+{
+	drm_r300_cmd_header_t hdr;	/* vpu header: count plus the low 16 bits of addr */
+	const unsigned int dest = (unsigned int)addr;
+	hdr.vpu.cmd_type = R300_CMD_VPU;
+	hdr.vpu.count = count;
+	hdr.vpu.adrhi = (dest >> 8) & 0xFF;
+	hdr.vpu.adrlo = dest & 0xFF;
+
+	return hdr.u;
+}
+
+static __inline__ uint32_t cmdpacket3(int packet)
+{
+	drm_r300_cmd_header_t cmd;
+	cmd.u = 0;	/* header bytes not covered by the fields below would otherwise be indeterminate */
+	cmd.packet3.cmd_type = R300_CMD_PACKET3;
+	cmd.packet3.packet = packet;
+
+	return cmd.u;	/* the header as one dword */
+}
+
+static __inline__ uint32_t cmdcpdelay(unsigned short count)
+{
+	drm_r300_cmd_header_t cmd;
+	cmd.u = 0;	/* header bytes not covered by the fields below would otherwise be indeterminate */
+	cmd.delay.cmd_type = R300_CMD_CP_DELAY;
+	cmd.delay.count = count;
+
+	return cmd.u;	/* the header as one dword */
+}
+
+static __inline__ uint32_t cmdwait(unsigned char flags)
+{
+	drm_r300_cmd_header_t cmd;
+	cmd.u = 0;	/* header bytes not covered by the fields below would otherwise be indeterminate */
+	cmd.wait.cmd_type = R300_CMD_WAIT;
+	cmd.wait.flags = flags;
+
+	return cmd.u;	/* the header as one dword */
+}
+
+static __inline__ uint32_t cmdpacify(void)
+{
+	drm_r300_cmd_header_t cmd;
+	cmd.u = 0;	/* only cmd_type is assigned below; the rest would otherwise be indeterminate */
+	cmd.header.cmd_type = R300_CMD_END3D;
+
+	return cmd.u;	/* the header as one dword */
+}
+
+/**
+ * Start a packet0 write of num_extra+1 values to the registers at reg,
+ * reg+4, reg+8, ...: allocates num_extra+2 dwords, stores the header and
+ * uses the caller's rmesa/cmd/cmd_reserved/cmd_written; fill with e32().
+ */
+#define reg_start(reg, num_extra)					\
+	do {								\
+		int _n;							\
+		_n=(num_extra);						\
+		cmd = (drm_radeon_cmd_header_t*)			\
+			r300AllocCmdBuf(rmesa,				\
+					(_n+2),				\
+					__FUNCTION__);			\
+		cmd_reserved=_n+2;					\
+		cmd_written=1;						\
+		cmd[0].i=cmdpacket0((reg), _n+1);			\
+	} while (0);
+
+/**
+ * Store one dword at cmd[cmd_written]; exits if the reservation is used up.
+ */
+#define e32(dword)							\
+	do {								\
+		if(cmd_written<cmd_reserved) {				\
+			cmd[cmd_written].i=(dword);			\
+			cmd_written++;					\
+		} else {						\
+			fprintf(stderr,					\
+				"e32 but no previous packet "		\
+				"declaration.\n"			\
+				"Aborting! in %s::%s at line %d, "	\
+				"cmd_written=%d cmd_reserved=%d\n",	\
+				__FILE__, __FUNCTION__, __LINE__,	\
+				cmd_written, cmd_reserved);		\
+			_mesa_exit(-1);					\
+		}							\
+	} while(0)
+
+#define	efloat(f) e32(r300PackFloat32(f))	/* e32() of the value packed by r300PackFloat32() */
+
+#define vsf_start_fragment(dest, length)				\
+	do {								\
+		int _n;							\
+		_n = (length);						\
+		cmd = (drm_radeon_cmd_header_t*)			\
+			r300AllocCmdBuf(rmesa,				\
+					(_n+1),				\
+					__FUNCTION__);			\
+		cmd_reserved = _n+1; /* matches the allocation */	\
+		cmd_written =1;						\
+		cmd[0].i = cmdvpu((dest), _n/4);			\
+	} while (0);
+
+#define start_packet3(packet, count)					\
+	{								\
+		int _n;							\
+		GLuint _p;						\
+		_n = (count);						\
+		_p = (packet);						\
+		cmd = (drm_radeon_cmd_header_t*)			\
+			r300AllocCmdBuf(rmesa,				\
+					(_n+3),				\
+					__FUNCTION__);			\
+		cmd_reserved = _n+3;					\
+		cmd_written = 2; /* two header dwords, set below */	\
+		if(_n > 0x3fff) { /* count field is 14 bits wide */	\
+			fprintf(stderr,"Too big packet3 %08x: cannot "	\
+				"store %d dwords\n",			\
+				_p, _n);				\
+			_mesa_exit(-1);					\
+		}							\
+		cmd[0].i = cmdpacket3(R300_CMD_PACKET3_RAW);		\
+		cmd[1].i = _p | ((_n & 0x3fff)<<16);			\
+	}
+
+/**
+ * Append an END3D command; must be sent to switch to 2d commands.
+ */
+static inline void end_3d(r300ContextPtr rmesa)
+{
+	/* One dword is reserved; only its cmd_type field is written here. */
+	drm_radeon_cmd_header_t *slot =
+	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
+
+	slot->header.cmd_type = R300_CMD_END3D;
+}
+
+static inline void cp_delay(r300ContextPtr rmesa, unsigned short count)
+{
+	/* Reserve one dword and store the CP_DELAY header in it. */
+	drm_radeon_cmd_header_t *slot =
+	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
+
+	slot->i = cmdcpdelay(count);
+}
+
+static inline void cp_wait(r300ContextPtr rmesa, unsigned char flags)
+{
+	/* Reserve one dword and store the WAIT header in it. */
+	drm_radeon_cmd_header_t *slot =
+	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
+
+	slot->i = cmdwait(flags);
+}
+
+extern int r300EmitArrays(GLcontext * ctx);
+
+#ifdef USER_BUFFERS
+void r300UseArrays(GLcontext * ctx);
+#endif
+
+extern void r300ReleaseArrays(GLcontext * ctx);
+
+#endif
diff --git a/r300/r300_fragprog.c b/r300/r300_fragprog.c
new file mode 100644
index 0000000..cce8e68
--- /dev/null
+++ b/r300/r300_fragprog.c
@@ -0,0 +1,2472 @@
+/*
+ * Copyright (C) 2005 Ben Skeggs.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * \file
+ *
+ * \author Ben Skeggs <darktama@iinet.net.au>
+ *
+ * \author Jerome Glisse <j.glisse@gmail.com>
+ *
+ * \todo Depth write, WPOS/FOGC inputs
+ *
+ * \todo FogOption
+ *
+ * \todo Verify results of opcodes for accuracy, I've only checked them in
+ * specific cases.
+ */
+
+#include "glheader.h"
+#include "macros.h"
+#include "enums.h"
+#include "shader/prog_instruction.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
+
+#include "r300_context.h"
+#include "r300_fragprog.h"
+#include "r300_reg.h"
+#include "r300_state.h"
+
+/*
+ * Useful macros and values.  ERROR() reports to stderr and sets fp->error,
+ * so a variable named fp must be in scope; it appends the newline itself. */
+#define ERROR(fmt, args...) do {			\
+		fprintf(stderr, "%s::%s(): " fmt "\n",	\
+			__FILE__, __FUNCTION__, ##args);	\
+		fp->error = GL_TRUE;			\
+	} while(0)
+
+#define PFS_INVAL 0xFFFFFFFF	/* "no value" marker; also ends lookup tables */
+#define COMPILE_STATE struct r300_pfs_compile_state *cs = fp->cs	/* declares cs */
+/* Vector swizzle codes kept in REG_VSWZ; the order matches the rows of v_swiz[]. */
+#define SWIZZLE_XYZ		0
+#define SWIZZLE_XXX		1
+#define SWIZZLE_YYY		2
+#define SWIZZLE_ZZZ		3
+#define SWIZZLE_WWW		4
+#define SWIZZLE_YZX		5
+#define SWIZZLE_ZXY		6
+#define SWIZZLE_WZY		7
+#define SWIZZLE_111		8
+#define SWIZZLE_000		9
+#define SWIZZLE_HHH		10
+/* Swizzle r by components x,y,z,w (3 bits each) via do_swizzle(), no negation; needs fp. */
+#define swizzle(r, x, y, z, w) do_swizzle(fp, r,		\
+					  ((SWIZZLE_##x<<0)|	\
+					   (SWIZZLE_##y<<3)|	\
+					   (SWIZZLE_##z<<6)|	\
+					   (SWIZZLE_##w<<9)),	\
+					  0)
+
+#define REG_TYPE_INPUT		0
+#define REG_TYPE_OUTPUT		1
+#define REG_TYPE_TEMP		2
+#define REG_TYPE_CONST		3
+/* Bit position of each field inside a packed register word. */
+#define REG_TYPE_SHIFT		0
+#define REG_INDEX_SHIFT		2
+#define REG_VSWZ_SHIFT		8
+#define REG_SSWZ_SHIFT		13
+#define REG_NEGV_SHIFT		18
+#define REG_NEGS_SHIFT		19
+#define REG_ABS_SHIFT		20
+#define REG_NO_USE_SHIFT	21	// Hack for refcounting
+#define REG_VALID_SHIFT		22	// Does the register contain a defined value?
+#define REG_BUILTIN_SHIFT   23	// Is it a builtin (like all zero/all one)?
+/* Mask of each field, already shifted into place. */
+#define REG_TYPE_MASK		(0x03 << REG_TYPE_SHIFT)
+#define REG_INDEX_MASK		(0x3F << REG_INDEX_SHIFT)
+#define REG_VSWZ_MASK		(0x1F << REG_VSWZ_SHIFT)
+#define REG_SSWZ_MASK		(0x1F << REG_SSWZ_SHIFT)
+#define REG_NEGV_MASK		(0x01 << REG_NEGV_SHIFT)
+#define REG_NEGS_MASK		(0x01 << REG_NEGS_SHIFT)
+#define REG_ABS_MASK		(0x01 << REG_ABS_SHIFT)
+#define REG_NO_USE_MASK		(0x01 << REG_NO_USE_SHIFT)
+#define REG_VALID_MASK		(0x01 << REG_VALID_SHIFT)
+#define REG_BUILTIN_MASK	(0x01 << REG_BUILTIN_SHIFT)
+/* Build/get/set helpers; arguments are parenthesized so expressions are safe. */
+#define REG(type, index, vswz, sswz, nouse, valid, builtin)	\
+	((((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK) |			\
+	 (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK) |		\
+	 (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |		\
+	 (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK) |		\
+	 (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |	\
+	 (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |			\
+	 (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
+#define REG_GET_TYPE(reg)						\
+	(((reg) & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
+#define REG_GET_INDEX(reg)						\
+	(((reg) & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
+#define REG_GET_VSWZ(reg)						\
+	(((reg) & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
+#define REG_GET_SSWZ(reg)						\
+	(((reg) & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
+#define REG_GET_NO_USE(reg)						\
+	(((reg) & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
+#define REG_GET_VALID(reg)						\
+	(((reg) & REG_VALID_MASK) >> REG_VALID_SHIFT)
+#define REG_GET_BUILTIN(reg)						\
+	(((reg) & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)
+#define REG_SET_TYPE(reg, type)						\
+	(reg) = (((reg) & ~REG_TYPE_MASK) |				\
+		 (((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK))
+#define REG_SET_INDEX(reg, index)					\
+	(reg) = (((reg) & ~REG_INDEX_MASK) |				\
+		 (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK))
+#define REG_SET_VSWZ(reg, vswz)						\
+	(reg) = (((reg) & ~REG_VSWZ_MASK) |				\
+		 (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK))
+#define REG_SET_SSWZ(reg, sswz)						\
+	(reg) = (((reg) & ~REG_SSWZ_MASK) |				\
+		 (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
+#define REG_SET_NO_USE(reg, nouse)					\
+	(reg) = (((reg) & ~REG_NO_USE_MASK) |				\
+		 (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK))
+#define REG_SET_VALID(reg, valid)					\
+	(reg) = (((reg) & ~REG_VALID_MASK) |				\
+		 (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK))
+#define REG_SET_BUILTIN(reg, builtin)					\
+	(reg) = (((reg) & ~REG_BUILTIN_MASK) |				\
+		 (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK))
+#define REG_ABS(reg)							\
+	(reg) = ((reg) | REG_ABS_MASK)
+#define REG_NEGV(reg)							\
+	(reg) = ((reg) | REG_NEGV_MASK)
+#define REG_NEGS(reg)							\
+	(reg) = ((reg) | REG_NEGS_MASK)
+
+/*
+ * Data structures for fragment program generation
+ */
+
+/* description of r300 native hw instructions (presumably indexed by PFS_OP_* - confirm) */
+static const struct {
+	const char *name;	/* mnemonic */
+	int argc;		/* number of source arguments */
+	int v_op;		/* R300_FPI0_OUTC_* opcode value */
+	int s_op;		/* R300_FPI2_OUTA_* opcode value, or PFS_INVAL */
+} r300_fpop[] = {
+	/* *INDENT-OFF* */
+	{"MAD", 3, R300_FPI0_OUTC_MAD, R300_FPI2_OUTA_MAD},
+	{"DP3", 2, R300_FPI0_OUTC_DP3, R300_FPI2_OUTA_DP4},	/* NOTE(review): OUTA_DP4 paired with DP3 - confirm intended */
+	{"DP4", 2, R300_FPI0_OUTC_DP4, R300_FPI2_OUTA_DP4},
+	{"MIN", 2, R300_FPI0_OUTC_MIN, R300_FPI2_OUTA_MIN},
+	{"MAX", 2, R300_FPI0_OUTC_MAX, R300_FPI2_OUTA_MAX},
+	{"CMP", 3, R300_FPI0_OUTC_CMP, R300_FPI2_OUTA_CMP},
+	{"FRC", 1, R300_FPI0_OUTC_FRC, R300_FPI2_OUTA_FRC},
+	{"EX2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_EX2},
+	{"LG2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_LG2},
+	{"RCP", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RCP},
+	{"RSQ", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RSQ},
+	{"REPL_ALPHA", 1, R300_FPI0_OUTC_REPL_ALPHA, PFS_INVAL},
+	{"CMPH", 3, R300_FPI0_OUTC_CMPH, PFS_INVAL},
+	/* *INDENT-ON* */
+};
+
+/* vector swizzles r300 can support natively, with a couple of
+ * cases we handle specially
+ *
+ * REG_VSWZ is an index into v_swiz[], REG_SSWZ an index into s_swiz[]
+ */
+
+/* scalar selector for 0.5; it is the index of the HALF row of s_swiz[] */
+#define SWIZZLE_HALF 6
+
+#define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
+					  SWIZZLE_##y, \
+					  SWIZZLE_##z, \
+					  SWIZZLE_ZERO))
+/* native swizzles; the row order matches the SWIZZLE_XYZ..SWIZZLE_HHH codes */
+static const struct r300_pfs_swizzle {
+	GLuint hash;		/* swizzle value this matches */
+	GLuint base;		/* base value for hw swizzle */
+	GLuint stride;		/* difference in base between arg0/1/2 */
+	GLuint flags;		/* SLOT_SRC_* bits: source parts this swizzle reads */
+} v_swiz[] = {
+	/* *INDENT-OFF* */
+	{MAKE_SWZ3(X, Y, Z), R300_FPI0_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR},
+	{MAKE_SWZ3(X, X, X), R300_FPI0_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR},
+	{MAKE_SWZ3(Y, Y, Y), R300_FPI0_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR},
+	{MAKE_SWZ3(Z, Z, Z), R300_FPI0_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR},
+	{MAKE_SWZ3(W, W, W), R300_FPI0_ARGC_SRC0A, 1, SLOT_SRC_SCALAR},
+	{MAKE_SWZ3(Y, Z, X), R300_FPI0_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR},
+	{MAKE_SWZ3(Z, X, Y), R300_FPI0_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR},
+	{MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH},
+	{MAKE_SWZ3(ONE, ONE, ONE), R300_FPI0_ARGC_ONE, 0, 0},
+	{MAKE_SWZ3(ZERO, ZERO, ZERO), R300_FPI0_ARGC_ZERO, 0, 0},
+	{MAKE_SWZ3(HALF, HALF, HALF), R300_FPI0_ARGC_HALF, 0, 0},
+	{PFS_INVAL, 0, 0, 0},	/* end-of-table sentinel */
+	/* *INDENT-ON* */
+};
+
+/* used during matching of non-native swizzles: 3-bit field of each component */
+#define SWZ_X_MASK (7 << 0)
+#define SWZ_Y_MASK (7 << 3)
+#define SWZ_Z_MASK (7 << 6)
+#define SWZ_W_MASK (7 << 9)
+static const struct {
+	GLuint hash;		/* used to mask matching swizzle components */
+	int mask;		/* actual outmask */
+	int count;		/* count of components matched */
+} s_mask[] = {
+	/* *INDENT-OFF* */
+	{SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK, 1 | 2 | 4, 3},	/* full XYZ match comes first */
+	{SWZ_X_MASK | SWZ_Y_MASK, 1 | 2, 2},
+	{SWZ_X_MASK | SWZ_Z_MASK, 1 | 4, 2},
+	{SWZ_Y_MASK | SWZ_Z_MASK, 2 | 4, 2},
+	{SWZ_X_MASK, 1, 1},
+	{SWZ_Y_MASK, 2, 1},
+	{SWZ_Z_MASK, 4, 1},
+	{PFS_INVAL, PFS_INVAL, PFS_INVAL}	/* end-of-table sentinel */
+	/* *INDENT-ON* */
+};
+
+static const struct {		/* scalar swizzles, indexed by REG_SSWZ */
+	int base;		/* hw value of swizzle */
+	int stride;		/* difference between SRC0/1/2 */
+	GLuint flags;		/* SLOT_SRC_* bits: source parts this swizzle reads */
+} s_swiz[] = {
+	/* *INDENT-OFF* */
+	{R300_FPI2_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR},
+	{R300_FPI2_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR},
+	{R300_FPI2_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR},
+	{R300_FPI2_ARGA_SRC0A, 1, SLOT_SRC_SCALAR},
+	{R300_FPI2_ARGA_ZERO, 0, 0},
+	{R300_FPI2_ARGA_ONE, 0, 0},
+	{R300_FPI2_ARGA_HALF, 0, 0}	/* row 6, selected by SWIZZLE_HALF */
+	/* *INDENT-ON* */
+};
+
+/* boiler-plate reg, for convenience: temp 0, XYZ/W swizzle, not marked valid */
+static const GLuint undef = REG(REG_TYPE_TEMP,
+				0,
+				SWIZZLE_XYZ,
+				SWIZZLE_W,
+				GL_FALSE,
+				GL_FALSE,
+				GL_FALSE);
+
+/* constant one source (marked valid and builtin) */
+static const GLuint pfs_one = REG(REG_TYPE_CONST,
+				  0,
+				  SWIZZLE_111,
+				  SWIZZLE_ONE,
+				  GL_FALSE,
+				  GL_TRUE,
+				  GL_TRUE);
+
+/* constant half source (marked valid and builtin) */
+static const GLuint pfs_half = REG(REG_TYPE_CONST,
+				   0,
+				   SWIZZLE_HHH,
+				   SWIZZLE_HALF,
+				   GL_FALSE,
+				   GL_TRUE,
+				   GL_TRUE);
+
+/* constant zero source (marked valid and builtin) */
+static const GLuint pfs_zero = REG(REG_TYPE_CONST,
+				   0,
+				   SWIZZLE_000,
+				   SWIZZLE_ZERO,
+				   GL_FALSE,
+				   GL_TRUE,
+				   GL_TRUE);
+
+/*
+ * Common function prototypes
+ */
+static void dump_program(struct r300_fragment_program *fp);
+static void emit_arith(struct r300_fragment_program *fp, int op,
+		       GLuint dest, int mask,
+		       GLuint src0, GLuint src1, GLuint src2, int flags);
+
+/**
+ * Allocate an R300 temporary that may be written from slot \p slot onwards.
+ * Flags an error on \p fp and returns 0 when no temporary is available.
+ */
+static int get_hw_temp(struct r300_fragment_program *fp, int slot)
+{
+	COMPILE_STATE;
+	int hw = 0;
+
+	/* Take the lowest temporary whose free slot lies within [0, slot]. */
+	while (hw < PFS_NUM_TEMP_REGS &&
+	       (cs->hwtemps[hw].free < 0 || cs->hwtemps[hw].free > slot))
+		hw++;
+
+	if (hw >= PFS_NUM_TEMP_REGS) {
+		ERROR("Out of hardware temps\n");
+		return 0;
+	}
+
+	/* Keep the slot at which the register became free as "reserved".
+	 * This avoids the following scenario:
+	 *  R300 temporary X is first assigned to Mesa temporary Y during
+	 *  vector ops, then to Mesa temporary Z for further vector ops;
+	 *  scalar ops on Z are emitted, move back in time and overwrite
+	 *  the value of Y.
+	 */
+	cs->hwtemps[hw].reserved = cs->hwtemps[hw].free;
+	cs->hwtemps[hw].free = -1;
+
+	/* Harmless defaults in case the program reads the temporary before
+	 * writing it; normally the first emit that writes it sets these. */
+	cs->hwtemps[hw].scalar_valid = 0;
+	cs->hwtemps[hw].vector_valid = 0;
+
+	if (fp->max_temp_idx < hw)
+		fp->max_temp_idx = hw;
+	return hw;
+}
+
+/**
+ * Allocate an R300 temporary to serve as a TEX destination register.
+ * Falls back to get_hw_temp(fp, 0) (which will cause an indirection) when
+ * no suitable temporary is left.
+ */
+static int get_hw_temp_tex(struct r300_fragment_program *fp)
+{
+	COMPILE_STATE;
+	int hw;
+
+	/* Note: Be very careful here.  Only a temporary that is untouched in
+	 * the current node and whose free slot is 0 qualifies. */
+	for (hw = 0; hw < PFS_NUM_TEMP_REGS; hw++) {
+		if (!(cs->used_in_node & (1 << hw)) &&
+		    cs->hwtemps[hw].free == 0)
+			break;
+	}
+
+	if (hw >= PFS_NUM_TEMP_REGS)
+		return get_hw_temp(fp, 0);
+
+	cs->hwtemps[hw].reserved = cs->hwtemps[hw].free;
+	cs->hwtemps[hw].free = -1;
+
+	/* Defaults for a temporary that is read before anything was written
+	 * to it: the validity slots start at the current number of slots.
+	 * The first emit that writes the register normally replaces these. */
+	cs->hwtemps[hw].scalar_valid = cs->nrslots;
+	cs->hwtemps[hw].vector_valid = cs->nrslots;
+
+	if (fp->max_temp_idx < hw)
+		fp->max_temp_idx = hw;
+
+	return hw;
+}
+
+/**
+ * Mark the given hardware register as free.
+ */
+static void free_hw_temp(struct r300_fragment_program *fp, int idx)
+{
+	COMPILE_STATE;
+
+	/* Be very careful here: the register is only marked free from slot
+	 * nrslots + 1 on.  Consider sequences like
+	 *  MAD r0, r1,r2,r3
+	 *  TEX r4, ...
+	 * The TEX may be moved in front of the MAD due to the way nodes work,
+	 * and r1 must not be aliased with r4 then.  The allocation could be
+	 * sanitized further, but what can happen inside emit_tex and
+	 * emit_arith makes that tricky. */
+	const int reusable_from = cs->nrslots + 1;
+	cs->hwtemps[idx].free = reusable_from;
+}
+
+/**
+ * Create a new Mesa temporary register; no hardware temporary is bound yet.
+ * Flags an error and returns the (invalid) undef register if none is left.
+ */
+static GLuint get_temp_reg(struct r300_fragment_program *fp)
+{
+	COMPILE_STATE;
+	GLuint result = undef;
+	GLuint slot = ffs(~cs->temp_in_use);	/* 1-based; 0 = none free */
+
+	if (slot == 0) {
+		ERROR("Out of program temps\n");
+		return result;
+	}
+	slot--;
+	cs->temp_in_use |= (1 << slot);
+	cs->temps[slot].reg = -1;
+	cs->temps[slot].refcount = 0xFFFFFFFF;
+
+	REG_SET_VALID(result, GL_TRUE);
+	REG_SET_INDEX(result, slot);
+	REG_SET_TYPE(result, REG_TYPE_TEMP);
+	return result;
+}
+
+/**
+ * Create a new Mesa temporary register that will act as the destination
+ * register for a texture read; its hardware temporary is bound right away.
+ */
+static GLuint get_temp_reg_tex(struct r300_fragment_program *fp)
+{
+	COMPILE_STATE;
+	GLuint result = undef;
+	GLuint slot = ffs(~cs->temp_in_use);	/* 1-based; 0 = none free */
+
+	if (slot == 0) {
+		ERROR("Out of program temps\n");
+		return result;
+	}
+	slot--;
+
+	cs->temp_in_use |= (1 << slot);
+	cs->temps[slot].refcount = 0xFFFFFFFF;
+	cs->temps[slot].reg = get_hw_temp_tex(fp);
+
+	REG_SET_VALID(result, GL_TRUE);
+	REG_SET_INDEX(result, slot);
+	REG_SET_TYPE(result, REG_TYPE_TEMP);
+	return result;
+}
+
+/**
+ * Free a Mesa temporary and the R300 temporary bound to it, if there is one.
+ */
+static void free_temp(struct r300_fragment_program *fp, GLuint r)
+{
+	COMPILE_STATE;
+	GLuint index = REG_GET_INDEX(r);
+	if (!(cs->temp_in_use & (1 << index)))
+		return;
+	if (REG_GET_TYPE(r) == REG_TYPE_TEMP) {
+		if (cs->temps[index].reg != -1)	/* -1: nothing bound, would index hwtemps[-1] */
+			free_hw_temp(fp, cs->temps[index].reg);
+		cs->temps[index].reg = -1;
+		cs->temp_in_use &= ~(1 << index);
+	} else if (REG_GET_TYPE(r) == REG_TYPE_INPUT) {
+		if (cs->inputs[index].reg != -1)	/* same guard for inputs */
+			free_hw_temp(fp, cs->inputs[index].reg);
+		cs->inputs[index].reg = -1;
+	}
+}
+
+/**
+ * Emit a hardware constant/parameter.
+ *
+ * \p cp Stable pointer to an array of 4 floats.  "Stable" means it must stay
+ *  valid and keep holding the constant/parameter for the lifetime of the
+ *  fragment program (actually, until the program is translated again).
+ *
+ * Constants are identified by pointer, so passing the same \p cp again
+ * yields the slot that was assigned before.
+ */
+static GLuint emit_const4fv(struct r300_fragment_program *fp,
+			    const GLfloat * cp)
+{
+	GLuint reg = undef;
+	int slot = 0;
+
+	/* Look for an existing slot holding this pointer. */
+	while (slot < fp->const_nr && fp->constant[slot] != cp)
+		slot++;
+
+	if (slot >= fp->const_nr) {
+		/* Not known yet: append it, provided there is room left. */
+		if (slot >= PFS_NUM_CONST_REGS) {
+			ERROR("Out of hw constants!\n");
+			return reg;
+		}
+		fp->constant[slot] = cp;
+		fp->const_nr++;
+	}
+
+	REG_SET_VALID(reg, GL_TRUE);
+	REG_SET_INDEX(reg, slot);
+	REG_SET_TYPE(reg, REG_TYPE_CONST);
+	return reg;
+}
+
+static inline GLuint negate(GLuint r)
+{
+	/* Set the negate flag of the scalar and of the vector part. */
+	r |= REG_NEGS_MASK | REG_NEGV_MASK;
+	return r;
+}
+
+/* Hack: flag \p r "no use" so that reading it drops no reference; prevents
+ * clobbering sources used multiple times when emulating non-native insns */
+static inline GLuint keep(GLuint r)
+{
+	GLuint kept = r;
+	REG_SET_NO_USE(kept, GL_TRUE);
+	return kept;
+}
+
+static inline GLuint absolute(GLuint r)
+{
+	/* Set the absolute-value flag of the register. */
+	return r | REG_ABS_MASK;
+}
+
+static int swz_native(struct r300_fragment_program *fp,
+		      GLuint src, GLuint * r, GLuint arbneg)
+{
+	/* Native swizzle, handle negation: arbneg bit 3 drives the scalar negate flag */
+	src = (src & ~REG_NEGS_MASK) | (((arbneg >> 3) & 1) << REG_NEGS_SHIFT);
+	/* Bits 0-2 of arbneg: none or all set maps onto the NEGV flag, a mix needs two MADs */
+	if ((arbneg & 0x7) == 0x0) {
+		src = src & ~REG_NEGV_MASK;
+		*r = src;
+	} else if ((arbneg & 0x7) == 0x7) {
+		src |= REG_NEGV_MASK;
+		*r = src;
+	} else {
+		if (!REG_GET_VALID(*r))
+			*r = get_temp_reg(fp);
+		src |= REG_NEGV_MASK;	/* first MAD: negated copy of the flagged components */
+		emit_arith(fp,
+			   PFS_OP_MAD,
+			   *r, arbneg & 0x7, keep(src), pfs_one, pfs_zero, 0);
+		src = src & ~REG_NEGV_MASK;	/* second MAD: remaining components plus W */
+		emit_arith(fp,
+			   PFS_OP_MAD,
+			   *r,
+			   (arbneg ^ 0x7) | WRITEMASK_W,
+			   src, pfs_one, pfs_zero, 0);
+	}
+	/* the caller adds this to its count of matched vector components */
+	return 3;
+}
+
+/**
+ * Handle a partial swizzle match: copy the components named by
+ * s_mask[mask] from \p src into the temporary \p *r, which is allocated
+ * here if it is not valid yet.
+ *
+ * \param src     source register to copy from
+ * \param r       in/out: destination temporary
+ * \param mask    index into s_mask[]
+ * \param mc      number of vector components matched by earlier calls
+ * \param arbneg  negate mask; bits 0-2 select vector components, bit 3 the
+ *                scalar part
+ * \return number of components covered by s_mask[mask]
+ */
+static int swz_emit_partial(struct r300_fragment_program *fp,
+			    GLuint src,
+			    GLuint * r, int mask, int mc, GLuint arbneg)
+{
+	const int comps = s_mask[mask].mask;
+	const GLuint negated = arbneg & comps;	/* written with negation */
+	const GLuint plain = negated ^ comps;	/* written unchanged */
+	GLuint wmask = 0;
+
+	if (!REG_GET_VALID(*r))
+		*r = get_temp_reg(fp);
+
+	/* A partial match, VSWZ/mask define what parts of the
+	 * desired swizzle we match.  The call that completes the XYZ part
+	 * also writes W, with the scalar negate bit taken from arbneg.
+	 */
+	if (mc + s_mask[mask].count == 3) {
+		wmask = WRITEMASK_W;
+		src |= ((arbneg >> 3) & 1) << REG_NEGS_SHIFT;
+	}
+
+	/* Only the MAD that also writes W may drop the reference on src;
+	 * every other read is flagged "no use".  keep() below forces the
+	 * flag for the first of two MADs regardless of this setting.
+	 */
+	if (wmask) {
+		REG_SET_NO_USE(src, GL_FALSE);
+	} else {
+		REG_SET_NO_USE(src, GL_TRUE);
+	}
+
+	if (!negated) {
+		/* Nothing to negate: one plain copy. */
+		emit_arith(fp, PFS_OP_MAD, *r, comps | wmask,
+			   src, pfs_one, pfs_zero, 0);
+	} else if (!plain) {
+		/* Everything negated: one negated copy. */
+		emit_arith(fp, PFS_OP_MAD, *r, negated | wmask,
+			   src | REG_NEGV_MASK, pfs_one, pfs_zero, 0);
+	} else {
+		/* Mixed: negated components first, the rest afterwards. */
+		emit_arith(fp, PFS_OP_MAD, *r, negated,
+			   keep(src) | REG_NEGV_MASK, pfs_one, pfs_zero, 0);
+		emit_arith(fp, PFS_OP_MAD, *r, plain | wmask,
+			   src, pfs_one, pfs_zero, 0);
+	}
+
+	return s_mask[mask].count;
+}
+
+/**
+ * Return \p src with swizzle \p arbswz and negate mask \p arbneg applied,
+ * emitting MADs into a temporary where no single native swizzle matches.
+ */
+static GLuint do_swizzle(struct r300_fragment_program *fp,
+			 GLuint src, GLuint arbswz, GLuint arbneg)
+{
+	GLuint r = undef;
+	GLuint vswz;
+	int c_mask = 0;
+	int v_match = 0;
+
+	/* If swizzling from something without an XYZW native swizzle, emit
+	 * result to a temp, and do new swizzle from the temp (disabled). */
+#if 0
+	if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
+		GLuint temp = get_temp_reg(fp);
+		emit_arith(fp,
+			   PFS_OP_MAD,
+			   temp, WRITEMASK_XYZW, src, pfs_one, pfs_zero, 0);
+		src = temp;
+	}
+#endif
+
+	if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
+		GLuint vsrcswz =
+		    (v_swiz[REG_GET_VSWZ(src)].
+		     hash & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK)) |
+		    REG_GET_SSWZ(src) << 9;
+		GLint i;
+
+		GLuint newswz = 0;
+		GLuint offset;
+		for (i = 0; i < 4; ++i) {
+			offset = GET_SWZ(arbswz, i);
+
+			newswz |=
+			    (offset <= 3) ? GET_SWZ(vsrcswz,
+						    offset) << i *
+			    3 : offset << i * 3;
+		}
+
+		arbswz = newswz & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK);
+		REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
+	} else {
+		/* set scalar swizzling */
+		REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
+	}
+	do {
+		vswz = REG_GET_VSWZ(src);
+		do {
+			int chash;
+
+			REG_SET_VSWZ(src, vswz);
+			chash = v_swiz[REG_GET_VSWZ(src)].hash &
+			    s_mask[c_mask].hash;
+
+			if (chash == (arbswz & s_mask[c_mask].hash)) {
+				if (s_mask[c_mask].count == 3) {
+					v_match += swz_native(fp,
+							      src, &r, arbneg);
+				} else {
+					v_match += swz_emit_partial(fp,
+								    src,
+								    &r,
+								    c_mask,
+								    v_match,
+								    arbneg);
+				}
+
+				if (v_match == 3)
+					return r;
+
+				/* Mark the matched fields invalid. All 0's was wrong
+				 * before (matched SWIZZLE_X), so use all 1's for now */
+				arbswz |= (PFS_INVAL & s_mask[c_mask].hash);
+			}
+		} while (v_swiz[++vswz].hash != PFS_INVAL);
+		REG_SET_VSWZ(src, SWIZZLE_XYZ);
+	} while (s_mask[++c_mask].hash != PFS_INVAL);
+
+	ERROR("should NEVER get here\n");
+	return r;
+}
+
+/**
+ * Translate a Mesa source register (with swizzle/negate) to the internal form.
+ */
+static GLuint t_src(struct r300_fragment_program *fp,
+		    struct prog_src_register fpsrc)
+{
+	const int file = fpsrc.File;
+	const int idx = fpsrc.Index;
+	GLuint r = undef;
+
+	switch (file) {
+	case PROGRAM_TEMPORARY:
+		REG_SET_TYPE(r, REG_TYPE_TEMP);
+		REG_SET_INDEX(r, idx);
+		REG_SET_VALID(r, GL_TRUE);
+		break;
+	case PROGRAM_INPUT:
+		REG_SET_TYPE(r, REG_TYPE_INPUT);
+		REG_SET_INDEX(r, idx);
+		REG_SET_VALID(r, GL_TRUE);
+		break;
+	case PROGRAM_LOCAL_PARAM:
+		r = emit_const4fv(fp, fp->mesa_program.Base.LocalParams[idx]);
+		break;
+	case PROGRAM_ENV_PARAM:
+		r = emit_const4fv(fp, fp->ctx->FragmentProgram.Parameters[idx]);
+		break;
+	case PROGRAM_STATE_VAR:
+	case PROGRAM_NAMED_PARAM:
+		r = emit_const4fv(fp,
+				  fp->mesa_program.Base.Parameters->
+				  ParameterValues[idx]);
+		break;
+	default:
+		ERROR("unknown SrcReg->File %x\n", fpsrc.File);
+		return r;
+	}
+	/* ONE/ZERO/HALF constants are not worth swizzling; anything else is. */
+	if (REG_GET_VSWZ(r) >= SWIZZLE_111 && REG_GET_SSWZ(r) >= SWIZZLE_ZERO)
+		return r;
+	return do_swizzle(fp, r, fpsrc.Swizzle, fpsrc.NegateBase);
+}
+
+static GLuint t_scalar_src(struct r300_fragment_program *fp,
+			   struct prog_src_register fpsrc)
+{
+	/* Replicate the X selector of the source swizzle to all four slots. */
+	const int sel = GET_SWZ(fpsrc.Swizzle, 0);
+
+	fpsrc.Swizzle = ((sel << 0) | (sel << 3) | (sel << 6) | (sel << 9));
+
+	return t_src(fp, fpsrc);
+}
+
+static GLuint t_dst(struct r300_fragment_program *fp,
+		    struct prog_dst_register dest)
+{
+	GLuint r = undef;
+
+	/* Temporaries keep their Mesa index. */
+	if (dest.File == PROGRAM_TEMPORARY) {
+		REG_SET_TYPE(r, REG_TYPE_TEMP);
+		REG_SET_INDEX(r, dest.Index);
+		REG_SET_VALID(r, GL_TRUE);
+		return r;
+	}
+	if (dest.File != PROGRAM_OUTPUT) {
+		ERROR("Bad DstReg->File 0x%x\n", dest.File);
+		return r;
+	}
+
+	/* Outputs: only the colour and depth results are accepted.  On error
+	 * the register returned has the OUTPUT type but is not marked valid. */
+	REG_SET_TYPE(r, REG_TYPE_OUTPUT);
+	if (dest.Index != FRAG_RESULT_COLR && dest.Index != FRAG_RESULT_DEPR) {
+		ERROR("Bad DstReg->Index 0x%x\n", dest.Index);
+		return r;
+	}
+	REG_SET_INDEX(r, dest.Index);
+	REG_SET_VALID(r, GL_TRUE);
+	return r;
+}
+
+/**
+ * Resolve a source to its hardware register (constants: index | SRC_CONST).
+ * Unless flagged "no use", the read drops one reference on the register.
+ */
+static int t_hw_src(struct r300_fragment_program *fp, GLuint src, GLboolean tex)
+{
+	COMPILE_STATE;
+	const int index = REG_GET_INDEX(src);
+	const GLboolean consume = !REG_GET_NO_USE(src);
+	int hw;
+
+	switch (REG_GET_TYPE(src)) {
+	case REG_TYPE_CONST:
+		return (index | SRC_CONST);
+	case REG_TYPE_TEMP:
+		/* NOTE: if reg==-1 here, a source is being read that
+		 *       hasn't been written to. Undefined results. */
+		if (cs->temps[index].reg == -1)
+			cs->temps[index].reg = get_hw_temp(fp, cs->nrslots);
+		hw = cs->temps[index].reg;
+		if (consume && --cs->temps[index].refcount == 0)
+			free_temp(fp, src);
+		break;
+	case REG_TYPE_INPUT:
+		hw = cs->inputs[index].reg;
+		if (consume && --cs->inputs[index].refcount == 0)
+			free_hw_temp(fp, hw);
+		break;
+	default:
+		ERROR("Invalid type for source reg\n");
+		return (0 | SRC_CONST);
+	}
+	/* Only non-texture reads mark the register as used in this node. */
+	if (!tex)
+		cs->used_in_node |= (1 << hw);
+	return hw;
+}
+
+/**
+ * Resolve a destination register to a hardware register number, binding a
+ * hardware temporary (writable from \p slot, or TEX-capable when \p tex is
+ * set) to Mesa temporaries that have none yet.  For outputs the matching
+ * R300_PFS_NODE_OUTPUT_* flag is set on the current node.
+ */
+static int t_hw_dst(struct r300_fragment_program *fp,
+		    GLuint dest, GLboolean tex, int slot)
+{
+	COMPILE_STATE;
+	const GLuint index = REG_GET_INDEX(dest);
+	int hw;
+
+	assert(REG_GET_VALID(dest));
+
+	switch (REG_GET_TYPE(dest)) {
+	case REG_TYPE_OUTPUT:
+		if (index == FRAG_RESULT_COLR)
+			fp->node[fp->cur_node].flags |=
+			    R300_PFS_NODE_OUTPUT_COLOR;
+		else if (index == FRAG_RESULT_DEPR)
+			fp->node[fp->cur_node].flags |=
+			    R300_PFS_NODE_OUTPUT_DEPTH;
+		return index;
+	case REG_TYPE_TEMP:
+		break;
+	default:
+		ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
+		return 0;
+	}
+
+	if (cs->temps[index].reg == -1)
+		cs->temps[index].reg =
+		    tex ? get_hw_temp_tex(fp) : get_hw_temp(fp, slot);
+	hw = cs->temps[index].reg;
+
+	/* The write drops one reference too, unless flagged "no use". */
+	if (!REG_GET_NO_USE(dest) && (--cs->temps[index].refcount == 0))
+		free_temp(fp, dest);
+
+	cs->dest_in_node |= (1 << hw);
+	cs->used_in_node |= (1 << hw);
+
+	return hw;
+}
+
+static void emit_nop(struct r300_fragment_program *fp)
+{
+	COMPILE_STATE;
+	const int slot = cs->nrslots;
+	if (slot >= PFS_MAX_ALU_INST) {
+		ERROR("Out of ALU instruction slots\n");
+		return;
+	}
+	/* Append one all-NOP ALU instruction at the end of the program. */
+	fp->alu.inst[slot].inst0 = NOP_INST0;
+	fp->alu.inst[slot].inst1 = NOP_INST1;
+	fp->alu.inst[slot].inst2 = NOP_INST2;
+	fp->alu.inst[slot].inst3 = NOP_INST3;
+	cs->nrslots = slot + 1;
+}
+
+/**
+ * Emit a texture instruction (or KIL) for \p fpi, starting a new node first when an indirection is needed.
+ */
+static void emit_tex(struct r300_fragment_program *fp,
+		     struct prog_instruction *fpi, int opcode)
+{
+	COMPILE_STATE;
+	GLuint coord = t_src(fp, fpi->SrcReg[0]);
+	GLuint dest = undef, rdest = undef;
+	GLuint din, uin;
+	int unit = fpi->TexSrcUnit;
+	int hwsrc, hwdest;
+	GLuint tempreg = 0;
+
+	uin = cs->used_in_node;
+	din = cs->dest_in_node;
+
+	/* Resolve source/dest to hardware registers */
+	if (opcode != R300_FPITX_OP_KIL) {
+		if (fpi->TexSrcTarget == TEXTURE_RECT_INDEX) {
+			/**
+			 * Hardware uses [0..1]x[0..1] range for rectangle textures
+			 * instead of [0..Width]x[0..Height]. Add a scaling instruction.
+			 * \todo Refactor this once we have proper rewriting/optimization
+			 * support for programs.
+			 */
+			gl_state_index tokens[STATE_LENGTH] = {
+				STATE_INTERNAL, STATE_R300_TEXRECT_FACTOR, 0, 0,
+				0
+			};
+			int factor_index;
+			GLuint factorreg;
+
+			tokens[2] = unit;
+			factor_index =
+			    _mesa_add_state_reference(fp->mesa_program.Base.
+						      Parameters, tokens);
+			factorreg =
+			    emit_const4fv(fp,
+					  fp->mesa_program.Base.Parameters->
+					  ParameterValues[factor_index]);
+			tempreg = keep(get_temp_reg(fp));
+
+			emit_arith(fp, PFS_OP_MAD, tempreg, WRITEMASK_XYZW,
+				   coord, factorreg, pfs_zero, 0);
+
+			/* Ensure correct node indirection */
+			uin = cs->used_in_node;
+			din = cs->dest_in_node;
+
+			hwsrc = t_hw_src(fp, tempreg, GL_TRUE);
+		} else {
+			hwsrc = t_hw_src(fp, coord, GL_TRUE);
+		}
+
+		dest = t_dst(fp, fpi->DstReg);
+
+		/* r300 doesn't seem to be able to do TEX->output reg */
+		if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
+			rdest = dest;
+			dest = get_temp_reg_tex(fp);
+		}
+		hwdest =
+		    t_hw_dst(fp, dest, GL_TRUE,
+			     fp->node[fp->cur_node].alu_offset);
+
+		/* Use a temp that hasn't been used in this node, rather
+		 * than causing an indirection
+		 */
+		if (uin & (1 << hwdest)) {
+			free_hw_temp(fp, hwdest);
+			hwdest = get_hw_temp_tex(fp);
+			cs->temps[REG_GET_INDEX(dest)].reg = hwdest;
+		}
+	} else {
+		hwdest = 0;
+		unit = 0;
+		hwsrc = t_hw_src(fp, coord, GL_TRUE);
+	}
+
+	/* Indirection if source has been written in this node, or if the
+	 * dest has been read/written in this node
+	 */
+	if ((REG_GET_TYPE(coord) != REG_TYPE_CONST &&
+	     (din & (1 << hwsrc))) || (uin & (1 << hwdest))) {
+		/* Finish off current node */
+		if (fp->node[fp->cur_node].alu_offset == cs->nrslots)
+			emit_nop(fp);
+
+		fp->node[fp->cur_node].alu_end =
+		    cs->nrslots - fp->node[fp->cur_node].alu_offset - 1;
+		assert(fp->node[fp->cur_node].alu_end >= 0);
+
+		if (++fp->cur_node >= PFS_MAX_TEX_INDIRECT) {
+			ERROR("too many levels of texture indirection\n");
+			return;
+		}
+
+		/* Start new node */
+		fp->node[fp->cur_node].tex_offset = fp->tex.length;
+		fp->node[fp->cur_node].alu_offset = cs->nrslots;
+		fp->node[fp->cur_node].tex_end = -1;
+		fp->node[fp->cur_node].alu_end = -1;
+		fp->node[fp->cur_node].flags = 0;
+		cs->used_in_node = 0;
+		cs->dest_in_node = 0;
+	}
+
+	if (fp->cur_node == 0)
+		fp->first_node_has_tex = 1;
+	/* NOTE(review): fp->tex.length is not range-checked before the store below - confirm */
+	fp->tex.inst[fp->tex.length++] = 0 | (hwsrc << R300_FPITX_SRC_SHIFT)
+	    | (hwdest << R300_FPITX_DST_SHIFT)
+	    | (unit << R300_FPITX_IMAGE_SHIFT)
+	    /* not entirely sure about this */
+	    | (opcode << R300_FPITX_OPCODE_SHIFT);
+
+	cs->dest_in_node |= (1 << hwdest);
+	if (REG_GET_TYPE(coord) != REG_TYPE_CONST)
+		cs->used_in_node |= (1 << hwsrc);
+
+	fp->node[fp->cur_node].tex_end++;
+
+	/* Copy from temp to output if needed */
+	if (REG_GET_VALID(rdest)) {
+		emit_arith(fp, PFS_OP_MAD, rdest, WRITEMASK_XYZW, dest,
+			   pfs_one, pfs_zero, 0);
+		free_temp(fp, dest);
+	}
+
+	/* Free temp register */
+	if (tempreg != 0)
+		free_temp(fp, tempreg);
+}
+
+/**
+ * Returns the first slot where we could possibly allow writing to dest,
+ * according to register allocation.  Only the parts selected by \p mask
+ * (vector for XYZ, scalar for W) are taken into account.
+ */
+static int get_earliest_allowed_write(struct r300_fragment_program *fp,
+				      GLuint dest, int mask)
+{
+	COMPILE_STATE;
+	const GLuint index = REG_GET_INDEX(dest);
+	int hw;
+	int earliest;
+
+	assert(REG_GET_VALID(dest));
+
+	if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT)
+		return 0;
+	if (REG_GET_TYPE(dest) != REG_TYPE_TEMP) {
+		ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
+		return 0;
+	}
+
+	/* A temporary without hardware register has no constraints yet. */
+	hw = cs->temps[index].reg;
+	if (hw == -1)
+		return 0;
+
+	/* Not before the slot recorded as "reserved", nor before the last
+	 * read of a part that is about to be overwritten. */
+	earliest = cs->hwtemps[hw].reserved;
+	if ((mask & WRITEMASK_XYZ) &&
+	    cs->hwtemps[hw].vector_lastread > earliest)
+		earliest = cs->hwtemps[hw].vector_lastread;
+	if ((mask & WRITEMASK_W) &&
+	    cs->hwtemps[hw].scalar_lastread > earliest)
+		earliest = cs->hwtemps[hw].scalar_lastread;
+
+	return earliest;
+}
+
+/**
+ * Allocates a slot for an ALU instruction that can consist of
+ * a vector part or a scalar part or both.
+ *
+ * Sources from src (src[0] to src[argc-1]) are added to the slot in the
+ * appropriate position (vector and/or scalar), and their positions are
+ * recorded in the srcpos array.
+ *
+ * This function emits instruction code for the source fetch and the
+ * argument selection. It does not emit instruction code for the
+ * opcode or the destination selection.
+ *
+ * @return the index of the slot
+ */
+static int find_and_prepare_slot(struct r300_fragment_program *fp,
+				 GLboolean emit_vop,
+				 GLboolean emit_sop,
+				 int argc, GLuint * src, GLuint dest, int mask)
+{
+	COMPILE_STATE;
+	int hwsrc[3];
+	int srcpos[3];
+	unsigned int used;
+	int tempused;
+	int tempvsrc[3];
+	int tempssrc[3];
+	int pos;
+	int regnr;
+	int i, j;
+
+	// Determine instruction slots, whether sources are required on
+	// vector or scalar side, and the smallest slot number where
+	// all source registers are available
+	used = 0;
+	if (emit_vop)
+		used |= SLOT_OP_VECTOR;
+	if (emit_sop)
+		used |= SLOT_OP_SCALAR;
+
+	pos = get_earliest_allowed_write(fp, dest, mask);
+
+	if (fp->node[fp->cur_node].alu_offset > pos)
+		pos = fp->node[fp->cur_node].alu_offset;
+	for (i = 0; i < argc; ++i) {
+		if (!REG_GET_BUILTIN(src[i])) {
+			if (emit_vop)
+				used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
+			if (emit_sop)
+				used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
+		}
+
+		hwsrc[i] = t_hw_src(fp, src[i], GL_FALSE);	/* Note: sideeffects wrt refcounting! */
+		regnr = hwsrc[i] & 31;
+
+		if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
+			if (used & (SLOT_SRC_VECTOR << i)) {
+				if (cs->hwtemps[regnr].vector_valid > pos)
+					pos = cs->hwtemps[regnr].vector_valid;
+			}
+			if (used & (SLOT_SRC_SCALAR << i)) {
+				if (cs->hwtemps[regnr].scalar_valid > pos)
+					pos = cs->hwtemps[regnr].scalar_valid;
+			}
+		}
+	}
+
+	// Find a slot that fits
+	for (;; ++pos) {
+		if (cs->slot[pos].used & used & SLOT_OP_BOTH)
+			continue;
+
+		if (pos >= cs->nrslots) {
+			if (cs->nrslots >= PFS_MAX_ALU_INST) {
+				ERROR("Out of ALU instruction slots\n");
+				return -1;
+			}
+
+			fp->alu.inst[pos].inst0 = NOP_INST0;
+			fp->alu.inst[pos].inst1 = NOP_INST1;
+			fp->alu.inst[pos].inst2 = NOP_INST2;
+			fp->alu.inst[pos].inst3 = NOP_INST3;
+
+			cs->nrslots++;
+		}
+		// Note: When we need both parts (vector and scalar) of a source,
+		// we always try to put them into the same position. This makes the
+		// code easier to read, and it is optimal (i.e. one doesn't gain
+		// anything by splitting the parts).
+		// It also avoids headaches with swizzles that access both parts (i.e WXY)
+		tempused = cs->slot[pos].used;
+		for (i = 0; i < 3; ++i) {
+			tempvsrc[i] = cs->slot[pos].vsrc[i];
+			tempssrc[i] = cs->slot[pos].ssrc[i];
+		}
+
+		for (i = 0; i < argc; ++i) {
+			int flags = (used >> i) & SLOT_SRC_BOTH;
+
+			if (!flags) {
+				srcpos[i] = 0;
+				continue;
+			}
+
+			for (j = 0; j < 3; ++j) {
+				if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
+					if (tempvsrc[j] != hwsrc[i])
+						continue;
+				}
+
+				if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
+					if (tempssrc[j] != hwsrc[i])
+						continue;
+				}
+
+				break;
+			}
+
+			if (j == 3)
+				break;
+
+			srcpos[i] = j;
+			tempused |= flags << j;
+			if (flags & SLOT_SRC_VECTOR)
+				tempvsrc[j] = hwsrc[i];
+			if (flags & SLOT_SRC_SCALAR)
+				tempssrc[j] = hwsrc[i];
+		}
+
+		if (i == argc)
+			break;
+	}
+
+	// Found a slot, reserve it
+	cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
+	for (i = 0; i < 3; ++i) {
+		cs->slot[pos].vsrc[i] = tempvsrc[i];
+		cs->slot[pos].ssrc[i] = tempssrc[i];
+	}
+
+	for (i = 0; i < argc; ++i) {
+		if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
+			int regnr = hwsrc[i] & 31;
+
+			if (used & (SLOT_SRC_VECTOR << i)) {
+				if (cs->hwtemps[regnr].vector_lastread < pos)
+					cs->hwtemps[regnr].vector_lastread =
+					    pos;
+			}
+			if (used & (SLOT_SRC_SCALAR << i)) {
+				if (cs->hwtemps[regnr].scalar_lastread < pos)
+					cs->hwtemps[regnr].scalar_lastread =
+					    pos;
+			}
+		}
+	}
+
+	// Emit the source fetch code
+	fp->alu.inst[pos].inst1 &= ~R300_FPI1_SRC_MASK;
+	fp->alu.inst[pos].inst1 |=
+	    ((cs->slot[pos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) |
+	     (cs->slot[pos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) |
+	     (cs->slot[pos].vsrc[2] << R300_FPI1_SRC2C_SHIFT));
+
+	fp->alu.inst[pos].inst3 &= ~R300_FPI3_SRC_MASK;
+	fp->alu.inst[pos].inst3 |=
+	    ((cs->slot[pos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) |
+	     (cs->slot[pos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) |
+	     (cs->slot[pos].ssrc[2] << R300_FPI3_SRC2A_SHIFT));
+
+	// Emit the argument selection code
+	if (emit_vop) {
+		int swz[3];
+
+		for (i = 0; i < 3; ++i) {
+			if (i < argc) {
+				swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
+					  (srcpos[i] *
+					   v_swiz[REG_GET_VSWZ(src[i])].
+					   stride)) | ((src[i] & REG_NEGV_MASK)
+						       ? ARG_NEG : 0) | ((src[i]
+									  &
+									  REG_ABS_MASK)
+									 ?
+									 ARG_ABS
+									 : 0);
+			} else {
+				swz[i] = R300_FPI0_ARGC_ZERO;
+			}
+		}
+
+		fp->alu.inst[pos].inst0 &=
+		    ~(R300_FPI0_ARG0C_MASK | R300_FPI0_ARG1C_MASK |
+		      R300_FPI0_ARG2C_MASK);
+		fp->alu.inst[pos].inst0 |=
+		    (swz[0] << R300_FPI0_ARG0C_SHIFT) | (swz[1] <<
+							 R300_FPI0_ARG1C_SHIFT)
+		    | (swz[2] << R300_FPI0_ARG2C_SHIFT);
+	}
+
+	if (emit_sop) {
+		int swz[3];
+
+		for (i = 0; i < 3; ++i) {
+			if (i < argc) {
+				swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
+					  (srcpos[i] *
+					   s_swiz[REG_GET_SSWZ(src[i])].
+					   stride)) | ((src[i] & REG_NEGV_MASK)
+						       ? ARG_NEG : 0) | ((src[i]
+									  &
+									  REG_ABS_MASK)
+									 ?
+									 ARG_ABS
+									 : 0);
+			} else {
+				swz[i] = R300_FPI2_ARGA_ZERO;
+			}
+		}
+
+		fp->alu.inst[pos].inst2 &=
+		    ~(R300_FPI2_ARG0A_MASK | R300_FPI2_ARG1A_MASK |
+		      R300_FPI2_ARG2A_MASK);
+		fp->alu.inst[pos].inst2 |=
+		    (swz[0] << R300_FPI2_ARG0A_SHIFT) | (swz[1] <<
+							 R300_FPI2_ARG1A_SHIFT)
+		    | (swz[2] << R300_FPI2_ARG2A_SHIFT);
+	}
+
+	return pos;
+}
+
+/**
+ * Append an ALU instruction to the instruction list.
+ *
+ * Looks up the vector/scalar opcodes and operand count for \p op in
+ * r300_fpop[], reserves a slot through find_and_prepare_slot(), and then
+ * ORs the opcodes and destination fields into that slot's instruction words.
+ *
+ * \param fp    fragment program being compiled
+ * \param op    index into r300_fpop[]
+ * \param dest  destination register
+ * \param mask  WRITEMASK_* bits to write; a write to FRAG_RESULT_DEPR is
+ *              dropped unless it includes Z, and is then emitted as a W write
+ * \param src0,src1,src2 source operands; r300_fpop[op].argc of them are used
+ * \param flags PFS_FLAG_SAT to saturate the result
+ */
+static void emit_arith(struct r300_fragment_program *fp,
+		       int op,
+		       GLuint dest,
+		       int mask,
+		       GLuint src0, GLuint src1, GLuint src2, int flags)
+{
+	COMPILE_STATE;
+	GLuint operands[3] = { src0, src1, src2 };
+	const int nargs = r300_fpop[op].argc;
+	int vec_op = r300_fpop[op].v_op;
+	int scl_op = r300_fpop[op].s_op;
+	const GLboolean to_output = (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT);
+	GLboolean want_vec, want_scl;
+	int slot, hwreg;
+
+	/* Depth is written through the scalar half, so only Z matters here */
+	if (to_output && REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
+		if (!(mask & WRITEMASK_Z))
+			return;
+		mask = WRITEMASK_W;
+	}
+
+	want_vec = ((mask & WRITEMASK_XYZ) || vec_op == R300_FPI0_OUTC_DP3)
+	    ? GL_TRUE : GL_FALSE;
+	want_scl = ((mask & WRITEMASK_W) || vec_op == R300_FPI0_OUTC_REPL_ALPHA)
+	    ? GL_TRUE : GL_FALSE;
+
+	slot = find_and_prepare_slot(fp, want_vec, want_scl, nargs, operands,
+				     dest, mask);
+	if (slot < 0)
+		return;
+
+	/* Note: Side effects wrt register allocation */
+	hwreg = t_hw_dst(fp, dest, GL_FALSE, slot);
+
+	if (flags & PFS_FLAG_SAT) {
+		vec_op |= R300_FPI0_OUTC_SAT;
+		scl_op |= R300_FPI2_OUTA_SAT;
+	}
+
+	/* Vector half: opcode goes into FPI0, destination into FPI1 */
+	if (want_vec) {
+		fp->alu.inst[slot].inst0 |= vec_op;
+		fp->alu.inst[slot].inst1 |= hwreg << R300_FPI1_DSTC_SHIFT;
+		if (!to_output) {
+			fp->alu.inst[slot].inst1 |= (mask & WRITEMASK_XYZ) <<
+			    R300_FPI1_DSTC_REG_MASK_SHIFT;
+			cs->hwtemps[hwreg].vector_valid = slot + 1;
+		} else if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
+			fp->alu.inst[slot].inst1 |= (mask & WRITEMASK_XYZ) <<
+			    R300_FPI1_DSTC_OUTPUT_MASK_SHIFT;
+		} else {
+			assert(0);
+		}
+	}
+
+	/* Scalar half: opcode goes into FPI2, destination into FPI3 */
+	if (want_scl) {
+		fp->alu.inst[slot].inst2 |= scl_op;
+		if (mask & WRITEMASK_W) {
+			if (!to_output) {
+				fp->alu.inst[slot].inst3 |=
+				    (hwreg << R300_FPI3_DSTA_SHIFT) |
+				    R300_FPI3_DSTA_REG;
+				cs->hwtemps[hwreg].scalar_valid = slot + 1;
+			} else {
+				switch (REG_GET_INDEX(dest)) {
+				case FRAG_RESULT_COLR:
+					fp->alu.inst[slot].inst3 |=
+					    (hwreg << R300_FPI3_DSTA_SHIFT) |
+					    R300_FPI3_DSTA_OUTPUT;
+					break;
+				case FRAG_RESULT_DEPR:
+					fp->alu.inst[slot].inst3 |=
+					    R300_FPI3_DSTA_DEPTH;
+					break;
+				default:
+					assert(0);
+					break;
+				}
+			}
+		}
+	}
+}
+
+#if 0				/* get_attrib() is compiled out */
+static GLuint get_attrib(struct r300_fragment_program *fp, GLuint attr)
+{
+	struct gl_fragment_program *mp = &fp->mesa_program;
+	GLuint r = undef;
+	/* Refuse attributes whose bit is not set in the program's InputsRead */
+	if (!(mp->Base.InputsRead & (1 << attr))) {
+		ERROR("Attribute %d was not provided!\n", attr);
+		return undef;
+	}
+	/* Otherwise return a valid REG_TYPE_INPUT register with index attr */
+	REG_SET_TYPE(r, REG_TYPE_INPUT);
+	REG_SET_INDEX(r, attr);
+	REG_SET_VALID(r, GL_TRUE);
+	return r;
+}
+#endif
+
+/* Read-only constants for the parabolic SIN/COS/SCS code, one vec4 per row. */
+static const GLfloat SinCosConsts[2][4] = {
+	{
+	 1.273239545,		// 4/PI
+	 -0.405284735,		// -4/(PI*PI)
+	 3.141592654,		// PI
+	 0.2225			// weight
+	 },
+	{
+	 0.75, 0.0,		// 0.75: COS phase offset; 0.0: not read by COS/SIN/SCS
+	 0.159154943,		// 1/(2*PI)
+	 6.283185307		// 2*PI
+	 }
+};
+
+/**
+ * Emit a LIT instruction (implemented by emit_lit() below; LitConst holds
+ * the +/-(128-epsilon) bounds that tmp.w is clamped to).
+ * \p flags may be PFS_FLAG_SAT
+ *
+ * Definition of LIT (from ARB_fragment_program):
+ * tmp = VectorLoad(op0);
+ * if (tmp.x < 0) tmp.x = 0;
+ * if (tmp.y < 0) tmp.y = 0;
+ * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
+ * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
+ * result.x = 1.0;
+ * result.y = tmp.x;
+ * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
+ * result.w = 1.0;
+ *
+ * The longest path of computation is the one leading to result.z,
+ * consisting of 5 operations, and this implementation of LIT takes
+ * 5 slots. So unless there's some special undocumented opcode, it is
+ * potentially optimal. Unfortunately, emit_arith is a bit too conservative
+ * because it doesn't understand partial writes to the vector component.
+ */
+static const GLfloat LitConst[4] =
+    { 127.999999, 127.999999, 127.999999, -127.999999 };
+
+static void emit_lit(struct r300_fragment_program *fp,
+		     GLuint dest, int mask, GLuint src, int flags)
+{
+	COMPILE_STATE;
+	GLuint cnst;
+	int needTemporary;
+	GLuint temp;
+
+	cnst = emit_const4fv(fp, LitConst);
+	// Work in a scratch register unless dest is a fully written non-output
+	needTemporary = 0;
+	if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
+		needTemporary = 1;
+	} else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
+		// LIT is typically followed by DP3/DP4, so there's no point
+		// in creating special code for this case
+		needTemporary = 1;
+	}
+
+	if (needTemporary) {
+		temp = keep(get_temp_reg(fp));
+	} else {
+		temp = keep(dest);
+	}
+
+	// Note: The order of emit_arith inside the slots is relevant,
+	// because emit_arith only looks at scalar vs. vector when resolving
+	// dependencies, and it does not consider individual vector components,
+	// so swizzling between the two parts can create fake dependencies.
+
+	// First slot: temp.xy = max(src.xy, 0); temp.w = max(src.w, lower bound)
+	emit_arith(fp, PFS_OP_MAX, temp, WRITEMASK_XY,
+		   keep(src), pfs_zero, undef, 0);
+	emit_arith(fp, PFS_OP_MAX, temp, WRITEMASK_W, src, cnst, undef, 0);
+
+	// Second slot: temp.z = min(temp.w, upper bound); temp.w = log2(temp.y)
+	emit_arith(fp, PFS_OP_MIN, temp, WRITEMASK_Z,
+		   swizzle(temp, W, W, W, W), cnst, undef, 0);
+	emit_arith(fp, PFS_OP_LG2, temp, WRITEMASK_W,
+		   swizzle(temp, Y, Y, Y, Y), undef, undef, 0);
+
+	// Third slot: temp.w = temp.w * temp.z; temp.y = temp.x (result.y)
+	// If desired, we saturate the y result here.
+	// This does not affect the use as a condition variable in the CMP later
+	emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_W,
+		   temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0);
+	emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_Y,
+		   swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags);
+
+	// Fourth slot: temp.x = 1 (result.x); temp.w = 2^temp.w
+	emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_X,
+		   pfs_one, pfs_one, pfs_zero, 0);
+	emit_arith(fp, PFS_OP_EX2, temp, WRITEMASK_W, temp, undef, undef, 0);
+
+	// Fifth slot: temp.z = (-temp.y < 0) ? temp.w : 0; temp.w = 1 (result.w)
+	emit_arith(fp, PFS_OP_CMP, temp, WRITEMASK_Z,
+		   pfs_zero, swizzle(temp, W, W, W, W),
+		   negate(swizzle(temp, Y, Y, Y, Y)), flags);
+	emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_W, pfs_one, pfs_one,
+		   pfs_zero, 0);
+	// Copy the scratch result into dest under the caller's mask
+	if (needTemporary) {
+		emit_arith(fp, PFS_OP_MAD, dest, mask,
+			   temp, pfs_one, pfs_zero, flags);
+		free_temp(fp, temp);
+	} else {
+		// Decrease refcount of the destination
+		t_hw_dst(fp, dest, GL_FALSE, cs->nrslots);
+	}
+}
+
+/**
+ * Translate every Mesa instruction of \p fp into R300 fragment hardware code.
+ *
+ * ALU opcodes are lowered to one or more emit_arith() calls (SIN, COS and
+ * SCS use a parabolic approximation built from MAD/FRC); KIL, TEX, TXB and
+ * TXP are handed to emit_tex().
+ *
+ * \return GL_FALSE if the program is empty or fp->error is set after an
+ *         instruction has been translated, GL_TRUE otherwise.
+ */
+static GLboolean parse_program(struct r300_fragment_program *fp)
+{
+	struct gl_fragment_program *mp = &fp->mesa_program;
+	const struct prog_instruction *inst = mp->Base.Instructions;
+	struct prog_instruction *fpi;
+	GLuint src[3], dest, temp[2];
+	int flags, mask = 0;
+	int const_sin[2];
+
+	if (!inst || inst[0].Opcode == OPCODE_END) {
+		ERROR("empty program?\n");
+		return GL_FALSE;
+	}
+
+	for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
+		if (fpi->SaturateMode == SATURATE_ZERO_ONE)
+			flags = PFS_FLAG_SAT;
+		else
+			flags = 0;
+
+		if (fpi->Opcode != OPCODE_KIL) {
+			dest = t_dst(fp, fpi->DstReg);
+			mask = fpi->DstReg.WriteMask;
+		}
+
+		switch (fpi->Opcode) {
+		case OPCODE_ABS:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			emit_arith(fp, PFS_OP_MAD, dest, mask,
+				   absolute(src[0]), pfs_one, pfs_zero, flags);
+			break;
+		case OPCODE_ADD:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			emit_arith(fp, PFS_OP_MAD, dest, mask,
+				   src[0], pfs_one, src[1], flags);
+			break;
+		case OPCODE_CMP:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			src[2] = t_src(fp, fpi->SrcReg[2]);
+			/* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
+			 *    r300 - if src2.c < 0.0 ? src1.c : src0.c
+			 */
+			emit_arith(fp, PFS_OP_CMP, dest, mask,
+				   src[2], src[1], src[0], flags);
+			break;
+		case OPCODE_COS:
+			/*
+			 * cos using a parabola (see SIN):
+			 * cos(x):
+			 *   x = (x/(2*PI))+0.75
+			 *   x = frac(x)
+			 *   x = (x*2*PI)-PI
+			 *   result = sin(x)
+			 */
+			temp[0] = get_temp_reg(fp);
+			const_sin[0] = emit_const4fv(fp, SinCosConsts[0]);
+			const_sin[1] = emit_const4fv(fp, SinCosConsts[1]);
+			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
+
+			/* add 0.5*PI and do range reduction */
+
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
+				   swizzle(src[0], X, X, X, X),
+				   swizzle(const_sin[1], Z, Z, Z, Z),
+				   swizzle(const_sin[1], X, X, X, X), 0);
+
+			emit_arith(fp, PFS_OP_FRC, temp[0], WRITEMASK_X,
+				   swizzle(temp[0], X, X, X, X),
+				   undef, undef, 0);
+
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W),	//2*PI
+				   negate(swizzle(const_sin[0], Z, Z, Z, Z)),	//-PI
+				   0);
+
+			/* SIN */
+
+			emit_arith(fp, PFS_OP_MAD, temp[0],
+				   WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
+								      Z, Z, Z,
+								      Z),
+				   const_sin[0], pfs_zero, 0);
+
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
+				   swizzle(temp[0], Y, Y, Y, Y),
+				   absolute(swizzle(temp[0], Z, Z, Z, Z)),
+				   swizzle(temp[0], X, X, X, X), 0);
+
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
+				   swizzle(temp[0], X, X, X, X),
+				   absolute(swizzle(temp[0], X, X, X, X)),
+				   negate(swizzle(temp[0], X, X, X, X)), 0);
+
+			emit_arith(fp, PFS_OP_MAD, dest, mask,
+				   swizzle(temp[0], Y, Y, Y, Y),
+				   swizzle(const_sin[0], W, W, W, W),
+				   swizzle(temp[0], X, X, X, X), flags);
+
+			free_temp(fp, temp[0]);
+			break;
+		case OPCODE_DP3:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			emit_arith(fp, PFS_OP_DP3, dest, mask,
+				   src[0], src[1], undef, flags);
+			break;
+		case OPCODE_DP4:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			emit_arith(fp, PFS_OP_DP4, dest, mask,
+				   src[0], src[1], undef, flags);
+			break;
+		case OPCODE_DPH:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			/* DP4 with src0.w forced to 1.0 through the swizzle:
+			 * DP4 dest, src0.xyz1, src1
+			 */
+			emit_arith(fp, PFS_OP_DP4, dest, mask,
+				   swizzle(src[0], X, Y, Z, ONE), src[1],
+				   undef, flags);
+			break;
+		case OPCODE_DST:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			/* dest.y = src0.y * src1.y */
+			if (mask & WRITEMASK_Y)
+				emit_arith(fp, PFS_OP_MAD, dest, WRITEMASK_Y,
+					   keep(src[0]), keep(src[1]),
+					   pfs_zero, flags);
+			/* dest.z = src0.z */
+			if (mask & WRITEMASK_Z)
+				emit_arith(fp, PFS_OP_MAD, dest, WRITEMASK_Z,
+					   src[0], pfs_one, pfs_zero, flags);
+			/* result.x = 1.0
+			 * result.w = src1.w */
+			if (mask & WRITEMASK_XW) {
+				REG_SET_VSWZ(src[1], SWIZZLE_111);	/*Cheat */
+				emit_arith(fp, PFS_OP_MAD, dest,
+					   mask & WRITEMASK_XW,
+					   src[1], pfs_one, pfs_zero, flags);
+			}
+			break;
+		case OPCODE_EX2:
+			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
+			emit_arith(fp, PFS_OP_EX2, dest, mask,
+				   src[0], undef, undef, flags);
+			break;
+		case OPCODE_FLR:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			temp[0] = get_temp_reg(fp);
+			/* FRC temp, src0
+			 * MAD dest, src0, 1.0, -temp
+			 */
+			emit_arith(fp, PFS_OP_FRC, temp[0], mask,
+				   keep(src[0]), undef, undef, 0);
+			emit_arith(fp, PFS_OP_MAD, dest, mask,
+				   src[0], pfs_one, negate(temp[0]), flags);
+			free_temp(fp, temp[0]);
+			break;
+		case OPCODE_FRC:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			emit_arith(fp, PFS_OP_FRC, dest, mask,
+				   src[0], undef, undef, flags);
+			break;
+		case OPCODE_KIL:
+			emit_tex(fp, fpi, R300_FPITX_OP_KIL);
+			break;
+		case OPCODE_LG2:
+			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
+			emit_arith(fp, PFS_OP_LG2, dest, mask,
+				   src[0], undef, undef, flags);
+			break;
+		case OPCODE_LIT:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			emit_lit(fp, dest, mask, src[0], flags);
+			break;
+		case OPCODE_LRP:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			src[2] = t_src(fp, fpi->SrcReg[2]);
+			/* result = tmp0tmp1 + (1 - tmp0)tmp2
+			 *        = tmp0tmp1 + tmp2 + (-tmp0)tmp2
+			 *     MAD temp, -tmp0, tmp2, tmp2
+			 *     MAD result, tmp0, tmp1, temp
+			 */
+			temp[0] = get_temp_reg(fp);
+			emit_arith(fp, PFS_OP_MAD, temp[0], mask,
+				   negate(keep(src[0])), keep(src[2]), src[2],
+				   0);
+			emit_arith(fp, PFS_OP_MAD, dest, mask,
+				   src[0], src[1], temp[0], flags);
+			free_temp(fp, temp[0]);
+			break;
+		case OPCODE_MAD:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			src[2] = t_src(fp, fpi->SrcReg[2]);
+			emit_arith(fp, PFS_OP_MAD, dest, mask,
+				   src[0], src[1], src[2], flags);
+			break;
+		case OPCODE_MAX:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			emit_arith(fp, PFS_OP_MAX, dest, mask,
+				   src[0], src[1], undef, flags);
+			break;
+		case OPCODE_MIN:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			emit_arith(fp, PFS_OP_MIN, dest, mask,
+				   src[0], src[1], undef, flags);
+			break;
+		case OPCODE_MOV:
+		case OPCODE_SWZ:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			emit_arith(fp, PFS_OP_MAD, dest, mask,
+				   src[0], pfs_one, pfs_zero, flags);
+			break;
+		case OPCODE_MUL:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			emit_arith(fp, PFS_OP_MAD, dest, mask,
+				   src[0], src[1], pfs_zero, flags);
+			break;
+		case OPCODE_POW:
+			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
+			src[1] = t_scalar_src(fp, fpi->SrcReg[1]);
+			temp[0] = get_temp_reg(fp);
+			emit_arith(fp, PFS_OP_LG2, temp[0], WRITEMASK_W,
+				   src[0], undef, undef, 0);
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_W,
+				   temp[0], src[1], pfs_zero, 0);
+			emit_arith(fp, PFS_OP_EX2, dest, mask,
+				   temp[0], undef, undef, flags);	/* honour _SAT */
+			free_temp(fp, temp[0]);
+			break;
+		case OPCODE_RCP:
+			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
+			emit_arith(fp, PFS_OP_RCP, dest, mask,
+				   src[0], undef, undef, flags);
+			break;
+		case OPCODE_RSQ:
+			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
+			emit_arith(fp, PFS_OP_RSQ, dest, mask,
+				   absolute(src[0]), pfs_zero, pfs_zero, flags);
+			break;
+		case OPCODE_SCS:
+			/*
+			 * scs using a parabola :
+			 * scs(x):
+			 *   result.x = sin(-abs(x)+0.5*PI)  (cos)
+			 *   result.y = sin(x)               (sin)
+			 *
+			 */
+			temp[0] = get_temp_reg(fp);
+			temp[1] = get_temp_reg(fp);
+			const_sin[0] = emit_const4fv(fp, SinCosConsts[0]);
+			const_sin[1] = emit_const4fv(fp, SinCosConsts[1]);
+			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
+
+			/* x = -abs(x)+0.5*PI; uses the register helper absolute(),
+			 * not libc abs(), which would mangle the encoded source */
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Z,
+				   swizzle(const_sin[0], Z, Z, Z, Z), pfs_half,	//PI * 0.5
+				   negate(absolute(swizzle(keep(src[0]), X, X, X, X))),
+				   0);
+
+			/* C*x (sin) */
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_W,
+				   swizzle(const_sin[0], Y, Y, Y, Y),
+				   swizzle(keep(src[0]), X, X, X, X),
+				   pfs_zero, 0);
+
+			/* B*x, C*x (cos) */
+			emit_arith(fp, PFS_OP_MAD, temp[0],
+				   WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
+								      Z, Z, Z,
+								      Z),
+				   const_sin[0], pfs_zero, 0);
+
+			/* B*x (sin) */
+			emit_arith(fp, PFS_OP_MAD, temp[1], WRITEMASK_W,
+				   swizzle(const_sin[0], X, X, X, X),
+				   keep(src[0]), pfs_zero, 0);
+
+			/* y = B*x + C*x*abs(x) (sin) */
+			emit_arith(fp, PFS_OP_MAD, temp[1], WRITEMASK_Z,
+				   absolute(src[0]),
+				   swizzle(temp[0], W, W, W, W),
+				   swizzle(temp[1], W, W, W, W), 0);
+
+			/* y = B*x + C*x*abs(x) (cos) */
+			emit_arith(fp, PFS_OP_MAD, temp[1], WRITEMASK_W,
+				   swizzle(temp[0], Y, Y, Y, Y),
+				   absolute(swizzle(temp[0], Z, Z, Z, Z)),
+				   swizzle(temp[0], X, X, X, X), 0);
+
+			/* y*abs(y) - y (cos), y*abs(y) - y (sin) */
+			emit_arith(fp, PFS_OP_MAD, temp[0],
+				   WRITEMASK_X | WRITEMASK_Y, swizzle(temp[1],
+								      W, Z, Y,
+								      X),
+				   absolute(swizzle(temp[1], W, Z, Y, X)),
+				   negate(swizzle(temp[1], W, Z, Y, X)), 0);
+
+			/* dest.xy = mad(temp.xy, P, temp2.wz) */
+			emit_arith(fp, PFS_OP_MAD, dest,
+				   mask & (WRITEMASK_X | WRITEMASK_Y), temp[0],
+				   swizzle(const_sin[0], W, W, W, W),
+				   swizzle(temp[1], W, Z, Y, X), flags);
+
+			free_temp(fp, temp[0]);
+			free_temp(fp, temp[1]);
+			break;
+		case OPCODE_SGE:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			temp[0] = get_temp_reg(fp);
+			/* temp = src0 - src1
+			 * dest.c = (temp.c < 0.0) ? 0 : 1
+			 */
+			emit_arith(fp, PFS_OP_MAD, temp[0], mask,
+				   src[0], pfs_one, negate(src[1]), 0);
+			emit_arith(fp, PFS_OP_CMP, dest, mask,
+				   pfs_one, pfs_zero, temp[0], 0);
+			free_temp(fp, temp[0]);
+			break;
+		case OPCODE_SIN:
+			/*
+			 *  using a parabola:
+			 * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
+			 * extra precision is obtained by weighting against
+			 * itself squared.
+			 */
+
+			temp[0] = get_temp_reg(fp);
+			const_sin[0] = emit_const4fv(fp, SinCosConsts[0]);
+			const_sin[1] = emit_const4fv(fp, SinCosConsts[1]);
+			src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
+
+			/* do range reduction */
+
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
+				   swizzle(keep(src[0]), X, X, X, X),
+				   swizzle(const_sin[1], Z, Z, Z, Z),
+				   pfs_half, 0);
+
+			emit_arith(fp, PFS_OP_FRC, temp[0], WRITEMASK_X,
+				   swizzle(temp[0], X, X, X, X),
+				   undef, undef, 0);
+
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W),	//2*PI
+				   negate(swizzle(const_sin[0], Z, Z, Z, Z)),	//-PI
+				   0);
+
+			/* SIN */
+
+			emit_arith(fp, PFS_OP_MAD, temp[0],
+				   WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
+								      Z, Z, Z,
+								      Z),
+				   const_sin[0], pfs_zero, 0);
+
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
+				   swizzle(temp[0], Y, Y, Y, Y),
+				   absolute(swizzle(temp[0], Z, Z, Z, Z)),
+				   swizzle(temp[0], X, X, X, X), 0);
+
+			emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
+				   swizzle(temp[0], X, X, X, X),
+				   absolute(swizzle(temp[0], X, X, X, X)),
+				   negate(swizzle(temp[0], X, X, X, X)), 0);
+
+			emit_arith(fp, PFS_OP_MAD, dest, mask,
+				   swizzle(temp[0], Y, Y, Y, Y),
+				   swizzle(const_sin[0], W, W, W, W),
+				   swizzle(temp[0], X, X, X, X), flags);
+
+			free_temp(fp, temp[0]);
+			break;
+		case OPCODE_SLT:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			temp[0] = get_temp_reg(fp);
+			/* temp = src0 - src1
+			 * dest.c = (temp.c < 0.0) ? 1 : 0
+			 */
+			emit_arith(fp, PFS_OP_MAD, temp[0], mask,
+				   src[0], pfs_one, negate(src[1]), 0);
+			emit_arith(fp, PFS_OP_CMP, dest, mask,
+				   pfs_zero, pfs_one, temp[0], 0);
+			free_temp(fp, temp[0]);
+			break;
+		case OPCODE_SUB:
+			src[0] = t_src(fp, fpi->SrcReg[0]);
+			src[1] = t_src(fp, fpi->SrcReg[1]);
+			emit_arith(fp, PFS_OP_MAD, dest, mask,
+				   src[0], pfs_one, negate(src[1]), flags);
+			break;
+		case OPCODE_TEX:
+			emit_tex(fp, fpi, R300_FPITX_OP_TEX);
+			break;
+		case OPCODE_TXB:
+			emit_tex(fp, fpi, R300_FPITX_OP_TXB);
+			break;
+		case OPCODE_TXP:
+			emit_tex(fp, fpi, R300_FPITX_OP_TXP);
+			break;
+		case OPCODE_XPD:{
+				src[0] = t_src(fp, fpi->SrcReg[0]);
+				src[1] = t_src(fp, fpi->SrcReg[1]);
+				temp[0] = get_temp_reg(fp);
+				/* temp = src0.zxy * src1.yzx */
+				emit_arith(fp, PFS_OP_MAD, temp[0],
+					   WRITEMASK_XYZ, swizzle(keep(src[0]),
+								  Z, X, Y, W),
+					   swizzle(keep(src[1]), Y, Z, X, W),
+					   pfs_zero, 0);
+				/* dest.xyz = src0.yzx * src1.zxy - temp
+				 * dest.w       = undefined
+				 * */
+				emit_arith(fp, PFS_OP_MAD, dest,
+					   mask & WRITEMASK_XYZ, swizzle(src[0],
+									 Y, Z,
+									 X, W),
+					   swizzle(src[1], Z, X, Y, W),
+					   negate(temp[0]), flags);
+				/* cleanup */
+				free_temp(fp, temp[0]);
+				break;
+			}
+		default:
+			ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
+			break;
+		}
+
+		if (fp->error)
+			return GL_FALSE;
+
+	}
+
+	return GL_TRUE;
+}
+
+/*
+ * Prepend three instructions (RCP, MUL, MAD) that compute a transformed
+ * copy of FRAG_ATTRIB_WPOS in a newly added temporary, then make every
+ * original instruction that reads FRAG_ATTRIB_WPOS read that temporary
+ * instead.  The old instruction array is freed and replaced by the
+ * enlarged one.
+ */
+static void insert_wpos(struct gl_program *prog)
+{
+	static gl_state_index tokens[STATE_LENGTH] = {
+		STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
+	};
+	/* should do something else if no temps left... */
+	const GLuint wpos_temp = prog->NumTemporaries++;
+	struct prog_instruction *code, *ins;
+	GLuint window_index;
+	int s;
+
+	code = _mesa_alloc_instructions(prog->NumInstructions + 3);
+	_mesa_init_instructions(code, prog->NumInstructions + 3);
+
+	/* perspective divide, part 1: temp.w = 1 / wpos.w */
+	ins = &code[0];
+	ins->Opcode = OPCODE_RCP;
+	ins->DstReg.File = PROGRAM_TEMPORARY;
+	ins->DstReg.Index = wpos_temp;
+	ins->DstReg.WriteMask = WRITEMASK_W;
+	ins->DstReg.CondMask = COND_TR;
+
+	ins->SrcReg[0].File = PROGRAM_INPUT;
+	ins->SrcReg[0].Index = FRAG_ATTRIB_WPOS;
+	ins->SrcReg[0].Swizzle = SWIZZLE_WWWW;
+
+	/* perspective divide, part 2: temp.xyz = wpos.xyz * temp.w */
+	ins = &code[1];
+	ins->Opcode = OPCODE_MUL;
+	ins->DstReg.File = PROGRAM_TEMPORARY;
+	ins->DstReg.Index = wpos_temp;
+	ins->DstReg.WriteMask = WRITEMASK_XYZ;
+	ins->DstReg.CondMask = COND_TR;
+
+	ins->SrcReg[0].File = PROGRAM_INPUT;
+	ins->SrcReg[0].Index = FRAG_ATTRIB_WPOS;
+	ins->SrcReg[0].Swizzle = SWIZZLE_XYZW;
+
+	ins->SrcReg[1].File = PROGRAM_TEMPORARY;
+	ins->SrcReg[1].Index = wpos_temp;
+	ins->SrcReg[1].Swizzle = SWIZZLE_WWWW;
+
+	/* viewport transformation: temp.xyz = temp.xyz * window + window */
+	window_index = _mesa_add_state_reference(prog->Parameters, tokens);
+
+	ins = &code[2];
+	ins->Opcode = OPCODE_MAD;
+	ins->DstReg.File = PROGRAM_TEMPORARY;
+	ins->DstReg.Index = wpos_temp;
+	ins->DstReg.WriteMask = WRITEMASK_XYZ;
+	ins->DstReg.CondMask = COND_TR;
+
+	ins->SrcReg[0].File = PROGRAM_TEMPORARY;
+	ins->SrcReg[0].Index = wpos_temp;
+	ins->SrcReg[0].Swizzle =
+	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+
+	for (s = 1; s <= 2; s++) {	/* operands 1 and 2: the window state */
+		ins->SrcReg[s].File = PROGRAM_STATE_VAR;
+		ins->SrcReg[s].Index = window_index;
+		ins->SrcReg[s].Swizzle =
+		    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+	}
+
+	/* append the original program behind the three new instructions */
+	_mesa_copy_instructions(&code[3], prog->Instructions,
+				prog->NumInstructions);
+	free(prog->Instructions);
+	prog->Instructions = code;
+	prog->NumInstructions += 3;
+
+	assert(code[prog->NumInstructions - 1].Opcode == OPCODE_END);
+
+	/* redirect WPOS reads in the original instructions to the temporary */
+	for (ins = &code[3]; ins->Opcode != OPCODE_END; ins++) {
+		for (s = 0; s < 3; s++) {
+			if (ins->SrcReg[s].File == PROGRAM_INPUT &&
+			    ins->SrcReg[s].Index == FRAG_ATTRIB_WPOS) {
+				ins->SrcReg[s].File = PROGRAM_TEMPORARY;
+				ins->SrcReg[s].Index = wpos_temp;
+			}
+		}
+	}
+}
+
+/* - Init structures
+ * - Determine what hwregs each input corresponds to
+ * - Call insert_wpos() when the program reads FRAG_BIT_WPOS
+ * - Count the references to every input and temporary register; on a
+ *   normal return cs->temp_in_use has one bit set per temporary seen
+ *   (left at zero if the program has no instruction array)
+ */
+static void init_program(r300ContextPtr r300, struct r300_fragment_program *fp)
+{
+	struct r300_pfs_compile_state *cs = NULL;
+	struct gl_fragment_program *mp = &fp->mesa_program;
+	struct prog_instruction *fpi;
+	GLuint InputsRead = mp->Base.InputsRead;
+	GLuint temps_used = 0;	/* for fp->temps[] */
+	int i, j;
+
+	/* New compile, reset tracking data */
+	fp->optimization =
+	    driQueryOptioni(&r300->radeon.optionCache, "fp_optimization");
+	fp->translated = GL_FALSE;
+	fp->error = GL_FALSE;
+	fp->cs = cs = &(R300_CONTEXT(fp->ctx)->state.pfs_compile);
+	fp->tex.length = 0;
+	fp->cur_node = 0;
+	fp->first_node_has_tex = 0;
+	fp->const_nr = 0;
+	fp->max_temp_idx = 0;
+	fp->node[0].alu_end = -1;
+	fp->node[0].tex_end = -1;
+
+	_mesa_memset(cs, 0, sizeof(*fp->cs));
+	for (i = 0; i < PFS_MAX_ALU_INST; i++) {
+		for (j = 0; j < 3; j++) {
+			cs->slot[i].vsrc[j] = SRC_CONST;
+			cs->slot[i].ssrc[j] = SRC_CONST;
+		}
+	}
+
+	/* Work out what temps the Mesa inputs correspond to, this must match
+	 * what setup_rs_unit does, which shouldn't be a problem as rs_unit
+	 * configures itself based on the fragprog's InputsRead
+	 *
+	 * NOTE: this depends on get_hw_temp() allocating registers in order,
+	 * starting from register 0.
+	 */
+	/* Texcoords come first */
+	for (i = 0; i < fp->ctx->Const.MaxTextureUnits; i++) {
+		if (InputsRead & (FRAG_BIT_TEX0 << i)) {
+			cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0;
+			cs->inputs[FRAG_ATTRIB_TEX0 + i].reg =
+			    get_hw_temp(fp, 0);
+		}
+	}
+	InputsRead &= ~FRAG_BITS_TEX_ANY;
+
+	/* fragment position treated as a texcoord */
+	if (InputsRead & FRAG_BIT_WPOS) {
+		cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
+		cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(fp, 0);
+		insert_wpos(&mp->Base);
+	}
+	InputsRead &= ~FRAG_BIT_WPOS;
+
+	/* Then primary colour */
+	if (InputsRead & FRAG_BIT_COL0) {
+		cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
+		cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(fp, 0);
+	}
+	InputsRead &= ~FRAG_BIT_COL0;
+
+	/* Secondary color */
+	if (InputsRead & FRAG_BIT_COL1) {
+		cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
+		cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(fp, 0);
+	}
+	InputsRead &= ~FRAG_BIT_COL1;
+
+	/* Anything else */
+	if (InputsRead) {
+		WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead);
+		/* force read from hwreg 0 for now */
+		for (i = 0; i < 32; i++)
+			if (InputsRead & (1u << i))	/* 1u: i reaches 31 */
+				cs->inputs[i].reg = 0;
+	}
+
+	/* Pre-parse the mesa program, grabbing refcounts on input/temp regs.
+	 * That way, we can free up the reg when it's no longer needed
+	 */
+	if (!mp->Base.Instructions) {
+		ERROR("No instructions found in program\n");
+		return;
+	}
+	for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
+		int idx;
+		for (i = 0; i < 3; i++) {
+			idx = fpi->SrcReg[i].Index;
+			switch (fpi->SrcReg[i].File) {
+			case PROGRAM_TEMPORARY:
+				if (!(temps_used & (1u << idx))) {
+					cs->temps[idx].reg = -1;
+					cs->temps[idx].refcount = 1;
+					temps_used |= (1u << idx);
+				} else
+					cs->temps[idx].refcount++;
+				break;
+			case PROGRAM_INPUT:
+				cs->inputs[idx].refcount++;
+				break;
+			default:
+				break;
+			}
+		}
+		idx = fpi->DstReg.Index;
+		if (fpi->DstReg.File == PROGRAM_TEMPORARY) {
+			if (!(temps_used & (1u << idx))) {
+				cs->temps[idx].reg = -1;
+				cs->temps[idx].refcount = 1;
+				temps_used |= (1u << idx);
+			} else
+				cs->temps[idx].refcount++;
+		}
+	}
+	cs->temp_in_use = temps_used;
+}
+
+/* Ask Mesa nicely to fill in ParameterValues for us, if there are any. */
+static void update_params(struct r300_fragment_program *fp)
+{
+	if (!fp->mesa_program.Base.Parameters)
+		return;	/* no parameter list to load */
+	_mesa_load_state_parameters(fp->ctx,
+				    fp->mesa_program.Base.Parameters);
+}
+
+/* Translate \p fp to hardware code if not yet done (a failed parse dumps the
+ * program and returns early), then refresh its state parameters. */
+void r300TranslateFragmentShader(r300ContextPtr r300,
+				 struct r300_fragment_program *fp)
+{
+	if (!fp->translated) {
+		init_program(r300, fp);
+		if (!parse_program(fp)) {
+			dump_program(fp);
+			return;
+		}
+
+		/* Finish off */
+		fp->node[fp->cur_node].alu_end = fp->cs->nrslots -
+		    fp->node[fp->cur_node].alu_offset - 1;
+		if (fp->node[fp->cur_node].tex_end < 0)
+			fp->node[fp->cur_node].tex_end = 0;
+		fp->alu_offset = 0;
+		fp->alu_end = fp->cs->nrslots - 1;
+		fp->tex_offset = 0;
+		if (fp->tex.length)
+			fp->tex_end = fp->tex.length - 1;
+		else
+			fp->tex_end = 0;
+		assert(fp->node[fp->cur_node].alu_end >= 0);
+		assert(fp->alu_end >= 0);
+
+		fp->translated = GL_TRUE;
+		if (RADEON_DEBUG & DEBUG_PIXEL)
+			dump_program(fp);
+		r300UpdateStateParameters(fp->ctx, _NEW_PROGRAM);
+	}
+
+	update_params(fp);
+}
+
+/* Debug dump of the Mesa program and the per-node hardware TEX/ALU code. */
+static void dump_program(struct r300_fragment_program *fp)
+{
+	int n, i, j;
+	static int pc = 0;	/* incremented on every call, shown in the banner */
+
+	fprintf(stderr, "pc=%d*************************************\n", pc++);
+
+	fprintf(stderr, "Mesa program:\n");
+	fprintf(stderr, "-------------\n");
+	_mesa_print_program(&fp->mesa_program.Base);
+	fflush(stdout);		/* NOTE(review): assumes the Mesa dump goes to stdout */
+
+	fprintf(stderr, "Hardware program\n");
+	fprintf(stderr, "----------------\n");
+
+	for (n = 0; n < (fp->cur_node + 1); n++) {	/* nodes 0..cur_node */
+		fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "
+			"alu_end: %d, tex_end: %d\n", n,
+			fp->node[n].alu_offset,
+			fp->node[n].tex_offset,
+			fp->node[n].alu_end, fp->node[n].tex_end);
+
+		if (fp->tex.length) {
+			fprintf(stderr, "  TEX:\n");
+			for (i = fp->node[n].tex_offset;
+			     i <= fp->node[n].tex_offset + fp->node[n].tex_end;
+			     ++i) {
+				const char *instr;
+
+				switch ((fp->tex.
+					 inst[i] >> R300_FPITX_OPCODE_SHIFT) &
+					15) {
+				case R300_FPITX_OP_TEX:
+					instr = "TEX";
+					break;
+				case R300_FPITX_OP_KIL:
+					instr = "KIL";
+					break;
+				case R300_FPITX_OP_TXP:
+					instr = "TXP";
+					break;
+				case R300_FPITX_OP_TXB:
+					instr = "TXB";
+					break;
+				default:
+					instr = "UNKNOWN";
+				}
+
+				fprintf(stderr,
+					"    %s t%i, %c%i, texture[%i]   (%08x)\n",
+					instr,
+					(fp->tex.
+					 inst[i] >> R300_FPITX_DST_SHIFT) & 31,
+					(fp->tex.
+					 inst[i] & R300_FPITX_SRC_CONST) ? 'c' :
+					't',
+					(fp->tex.
+					 inst[i] >> R300_FPITX_SRC_SHIFT) & 31,
+					(fp->tex.
+					 inst[i] & R300_FPITX_IMAGE_MASK) >>
+					R300_FPITX_IMAGE_SHIFT,
+					fp->tex.inst[i]);
+			}
+		}
+
+		for (i = fp->node[n].alu_offset;
+		     i <= fp->node[n].alu_offset + fp->node[n].alu_end; ++i) {
+			char srcc[3][10], dstc[20];
+			char srca[3][10], dsta[20];
+			char argc[3][20];
+			char arga[3][20];
+			char flags[5], tmp[10];
+
+			/* Source selects: three 6-bit fields in inst1 (xyz) and inst3 (w) */
+			for (j = 0; j < 3; ++j) {
+				int regc = fp->alu.inst[i].inst1 >> (j * 6);
+				int rega = fp->alu.inst[i].inst3 >> (j * 6);
+
+				sprintf(srcc[j], "%c%i",
+					(regc & 32) ? 'c' : 't', regc & 31);
+				sprintf(srca[j], "%c%i",
+					(rega & 32) ? 'c' : 't', rega & 31);
+			}
+
+			/* xyz destination: temp write mask first, then output write mask */
+			dstc[0] = 0;
+			sprintf(flags, "%s%s%s",
+				(fp->alu.inst[i].
+				 inst1 & R300_FPI1_DSTC_REG_X) ? "x" : "",
+				(fp->alu.inst[i].
+				 inst1 & R300_FPI1_DSTC_REG_Y) ? "y" : "",
+				(fp->alu.inst[i].
+				 inst1 & R300_FPI1_DSTC_REG_Z) ? "z" : "");
+			if (flags[0] != 0) {
+				sprintf(dstc, "t%i.%s ",
+					(fp->alu.inst[i].
+					 inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
+					flags);
+			}
+			sprintf(flags, "%s%s%s",
+				(fp->alu.inst[i].
+				 inst1 & R300_FPI1_DSTC_OUTPUT_X) ? "x" : "",
+				(fp->alu.inst[i].
+				 inst1 & R300_FPI1_DSTC_OUTPUT_Y) ? "y" : "",
+				(fp->alu.inst[i].
+				 inst1 & R300_FPI1_DSTC_OUTPUT_Z) ? "z" : "");
+			if (flags[0] != 0) {
+				sprintf(tmp, "o%i.%s",
+					(fp->alu.inst[i].
+					 inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
+					flags);
+				strcat(dstc, tmp);
+			}
+
+			/* w destination: temp, output and depth writes are appended in turn */
+			dsta[0] = 0;
+			if (fp->alu.inst[i].inst3 & R300_FPI3_DSTA_REG) {
+				sprintf(dsta, "t%i.w ",
+					(fp->alu.inst[i].
+					 inst3 >> R300_FPI3_DSTA_SHIFT) & 31);
+			}
+			if (fp->alu.inst[i].inst3 & R300_FPI3_DSTA_OUTPUT) {
+				sprintf(tmp, "o%i.w ",
+					(fp->alu.inst[i].
+					 inst3 >> R300_FPI3_DSTA_SHIFT) & 31);
+				strcat(dsta, tmp);
+			}
+			if (fp->alu.inst[i].inst3 & R300_FPI3_DSTA_DEPTH) {
+				strcat(dsta, "Z");
+			}
+
+			fprintf(stderr,
+				"%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
+				"       w: %3s %3s %3s -> %-20s (%08x)\n", i,
+				srcc[0], srcc[1], srcc[2], dstc,
+				fp->alu.inst[i].inst1, srca[0], srca[1],
+				srca[2], dsta, fp->alu.inst[i].inst3);
+
+			/* Arguments: three 7-bit fields in inst0 (xyz) and inst2 (w) */
+			for (j = 0; j < 3; ++j) {
+				int regc = fp->alu.inst[i].inst0 >> (j * 7);
+				int rega = fp->alu.inst[i].inst2 >> (j * 7);
+				int d;
+				char buf[20];
+
+				d = regc & 31;	/* low 5 bits pick the operand/swizzle */
+				if (d < 12) {
+					switch (d % 4) {
+					case R300_FPI0_ARGC_SRC0C_XYZ:
+						sprintf(buf, "%s.xyz",
+							srcc[d / 4]);
+						break;
+					case R300_FPI0_ARGC_SRC0C_XXX:
+						sprintf(buf, "%s.xxx",
+							srcc[d / 4]);
+						break;
+					case R300_FPI0_ARGC_SRC0C_YYY:
+						sprintf(buf, "%s.yyy",
+							srcc[d / 4]);
+						break;
+					case R300_FPI0_ARGC_SRC0C_ZZZ:
+						sprintf(buf, "%s.zzz",
+							srcc[d / 4]);
+						break;
+					}
+				} else if (d < 15) {
+					sprintf(buf, "%s.www", srca[d - 12]);
+				} else if (d == 20) {
+					sprintf(buf, "0.0");
+				} else if (d == 21) {
+					sprintf(buf, "1.0");
+				} else if (d == 22) {
+					sprintf(buf, "0.5");
+				} else if (d >= 23 && d < 32) {
+					d -= 23;
+					switch (d / 3) {
+					case 0:
+						sprintf(buf, "%s.yzx",
+							srcc[d % 3]);
+						break;
+					case 1:
+						sprintf(buf, "%s.zxy",
+							srcc[d % 3]);
+						break;
+					case 2:
+						sprintf(buf, "%s.Wzy",
+							srcc[d % 3]);
+						break;
+					}
+				} else {
+					sprintf(buf, "%i", d);	/* no name for this selector: print it raw */
+				}
+
+				/* bit 5 prints as "-" (negate), bit 6 wraps the operand in "|" */
+				sprintf(argc[j], "%s%s%s%s",
+					(regc & 32) ? "-" : "",
+					(regc & 64) ? "|" : "",
+					buf, (regc & 64) ? "|" : "");
+
+				d = rega & 31;
+				if (d < 9) {
+					sprintf(buf, "%s.%c", srcc[d / 3],
+						'x' + (char)(d % 3));
+				} else if (d < 12) {
+					sprintf(buf, "%s.w", srca[d - 9]);
+				} else if (d == 16) {
+					sprintf(buf, "0.0");
+				} else if (d == 17) {
+					sprintf(buf, "1.0");
+				} else if (d == 18) {
+					sprintf(buf, "0.5");
+				} else {
+					sprintf(buf, "%i", d);
+				}
+
+				sprintf(arga[j], "%s%s%s%s",
+					(rega & 32) ? "-" : "",
+					(rega & 64) ? "|" : "",
+					buf, (rega & 64) ? "|" : "");
+			}
+
+			fprintf(stderr, "     xyz: %8s %8s %8s    op: %08x\n"
+				"       w: %8s %8s %8s    op: %08x\n",
+				argc[0], argc[1], argc[2],
+				fp->alu.inst[i].inst0, arga[0], arga[1],
+				arga[2], fp->alu.inst[i].inst2);
+		}
+	}
+}
diff --git a/r300/r300_fragprog.h b/r300/r300_fragprog.h
new file mode 100644
index 0000000..72fca77
--- /dev/null
+++ b/r300/r300_fragprog.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2005 Ben Skeggs.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/*
+ * Authors:
+ *   Ben Skeggs <darktama@iinet.net.au>
+ *   Jerome Glisse <j.glisse@gmail.com>
+ */
+#ifndef __R300_FRAGPROG_H_
+#define __R300_FRAGPROG_H_
+
+#include "glheader.h"
+#include "macros.h"
+#include "enums.h"
+#include "shader/program.h"
+#include "shader/prog_instruction.h"
+
+#include "r300_context.h"
+
+typedef struct r300_fragment_program_swizzle {
+	GLuint length;		/* presumably the number of inst[] entries in use - TODO confirm */
+	GLuint src[4];
+	GLuint inst[8];
+} r300_fragment_program_swizzle_t;
+
+/* supported hw opcodes */
+#define PFS_OP_MAD 0
+#define PFS_OP_DP3 1
+#define PFS_OP_DP4 2
+#define PFS_OP_MIN 3
+#define PFS_OP_MAX 4
+#define PFS_OP_CMP 5
+#define PFS_OP_FRC 6
+#define PFS_OP_EX2 7
+#define PFS_OP_LG2 8
+#define PFS_OP_RCP 9
+#define PFS_OP_RSQ 10
+#define PFS_OP_REPL_ALPHA 11
+#define PFS_OP_CMPH 12
+#define MAX_PFS_OP 12
+
+#define PFS_FLAG_SAT	(1 << 0)
+#define PFS_FLAG_ABS	(1 << 1)
+
+#define ARG_NEG			(1 << 5)	/* negate bit of an argument field */
+#define ARG_ABS			(1 << 6)	/* absolute-value bit of an argument field */
+#define ARG_MASK		(127 << 0)	/* one 7-bit argument field */
+#define ARG_STRIDE		7		/* bit distance between argument fields */
+#define SRC_CONST		(1 << 5)	/* source is a constant rather than a temp */
+#define SRC_MASK		(63 << 0)	/* one 6-bit source field */
+#define SRC_STRIDE		6		/* bit distance between source fields */
+/* No-op words: MAD with all-ZERO arguments and constant-0 sources. */
+#define NOP_INST0 (						 \
+		(R300_FPI0_OUTC_MAD) |				 \
+		(R300_FPI0_ARGC_ZERO << R300_FPI0_ARG0C_SHIFT) | \
+		(R300_FPI0_ARGC_ZERO << R300_FPI0_ARG1C_SHIFT) | \
+		(R300_FPI0_ARGC_ZERO << R300_FPI0_ARG2C_SHIFT))
+#define NOP_INST1 (					     \
+		((0 | SRC_CONST) << R300_FPI1_SRC0C_SHIFT) | \
+		((0 | SRC_CONST) << R300_FPI1_SRC1C_SHIFT) | \
+		((0 | SRC_CONST) << R300_FPI1_SRC2C_SHIFT))
+#define NOP_INST2 ( \
+		(R300_FPI2_OUTA_MAD) |				 \
+		(R300_FPI2_ARGA_ZERO << R300_FPI2_ARG0A_SHIFT) | \
+		(R300_FPI2_ARGA_ZERO << R300_FPI2_ARG1A_SHIFT) | \
+		(R300_FPI2_ARGA_ZERO << R300_FPI2_ARG2A_SHIFT))
+#define NOP_INST3 (					     \
+		((0 | SRC_CONST) << R300_FPI3_SRC0A_SHIFT) | \
+		((0 | SRC_CONST) << R300_FPI3_SRC1A_SHIFT) | \
+		((0 | SRC_CONST) << R300_FPI3_SRC2A_SHIFT))
+
+#define DRI_CONF_FP_OPTIMIZATION_SPEED   0
+#define DRI_CONF_FP_OPTIMIZATION_QUALITY 1
+
+struct r300_fragment_program;
+
+extern void r300TranslateFragmentShader(r300ContextPtr r300,
+					struct r300_fragment_program *fp);
+
+#endif
diff --git a/r300/r300_ioctl.c b/r300/r300_ioctl.c
new file mode 100644
index 0000000..ea94ce2
--- /dev/null
+++ b/r300/r300_ioctl.c
@@ -0,0 +1,719 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.
+Copyright (C) 2004 Nicolai Haehnle.
+All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ *
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#include <sched.h>
+#include <errno.h>
+
+#include "glheader.h"
+#include "imports.h"
+#include "macros.h"
+#include "context.h"
+#include "swrast/swrast.h"
+
+#include "r300_context.h"
+#include "radeon_ioctl.h"
+#include "r300_ioctl.h"
+#include "r300_cmdbuf.h"
+#include "r300_state.h"
+#include "r300_program.h"
+#include "radeon_reg.h"
+#include "r300_emit.h"
+
+#include "vblank.h"
+
+#define CLEARBUFFER_COLOR	0x1
+#define CLEARBUFFER_DEPTH	0x2
+#define CLEARBUFFER_STENCIL	0x4
+/* Emit one clear packet; buffer picks front(0)/back, flags is CLEARBUFFER_*. */
+static void r300ClearBuffer(r300ContextPtr r300, int flags, int buffer)
+{
+	GLcontext *ctx = r300->radeon.glCtx;
+	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
+	GLuint cboffset, cbpitch;
+	drm_r300_cmd_header_t *cmd2;
+	int cmd_reserved = 0;
+	int cmd_written = 0;
+	drm_radeon_cmd_header_t *cmd = NULL;	/* presumably used by the reg_start()/e32() macros - confirm */
+	r300ContextPtr rmesa = r300;
+
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "%s: %s buffer (%i,%i %ix%i)\n",
+			__FUNCTION__, buffer ? "back" : "front",
+			dPriv->x, dPriv->y, dPriv->w, dPriv->h);
+
+	if (buffer) {
+		cboffset = r300->radeon.radeonScreen->backOffset;
+		cbpitch = r300->radeon.radeonScreen->backPitch;
+	} else {
+		cboffset = r300->radeon.radeonScreen->frontOffset;
+		cbpitch = r300->radeon.radeonScreen->frontPitch;
+	}
+
+	cboffset += r300->radeon.radeonScreen->fbLocation;
+
+	cp_wait(r300, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+	end_3d(rmesa);
+
+	R300_STATECHANGE(r300, cb);
+	reg_start(R300_RB3D_COLOROFFSET0, 0);
+	e32(cboffset);
+
+	/* The pitch word also carries the color format and tiling bits */
+	if (r300->radeon.radeonScreen->cpp == 4)
+		cbpitch |= R300_COLOR_FORMAT_ARGB8888;
+	else
+		cbpitch |= R300_COLOR_FORMAT_RGB565;
+
+	if (r300->radeon.sarea->tiling_enabled)
+		cbpitch |= R300_COLOR_TILE_ENABLE;
+
+	reg_start(R300_RB3D_COLORPITCH0, 0);
+	e32(cbpitch);
+
+	R300_STATECHANGE(r300, cmk);
+	reg_start(R300_RB3D_COLORMASK, 0);
+
+	if (flags & CLEARBUFFER_COLOR) {
+		e32((ctx->Color.ColorMask[BCOMP] ? R300_COLORMASK0_B : 0) |
+		    (ctx->Color.ColorMask[GCOMP] ? R300_COLORMASK0_G : 0) |
+		    (ctx->Color.ColorMask[RCOMP] ? R300_COLORMASK0_R : 0) |
+		    (ctx->Color.ColorMask[ACOMP] ? R300_COLORMASK0_A : 0));
+	} else {
+		e32(0x0);	/* color not requested: write an all-zero mask */
+	}
+
+	R300_STATECHANGE(r300, zs);
+	reg_start(R300_RB3D_ZSTENCIL_CNTL_0, 2);
+
+	{
+		uint32_t t1, t2;
+
+		t1 = 0x0;
+		t2 = 0x0;
+
+		if (flags & CLEARBUFFER_DEPTH) {
+			t1 |= R300_RB3D_Z_WRITE_ONLY;
+			t2 |=
+			    (R300_ZS_ALWAYS << R300_RB3D_ZS1_DEPTH_FUNC_SHIFT);
+		} else {
+			t1 |= R300_RB3D_Z_DISABLED_1;	// disable
+		}
+
+		/* Stencil clear: function ALWAYS and every op REPLACE, front and back */
+		if (flags & CLEARBUFFER_STENCIL) {
+			t1 |= R300_RB3D_STENCIL_ENABLE;
+			t2 |=
+			    (R300_ZS_ALWAYS <<
+			     R300_RB3D_ZS1_FRONT_FUNC_SHIFT) |
+			    (R300_ZS_REPLACE <<
+			     R300_RB3D_ZS1_FRONT_FAIL_OP_SHIFT) |
+			    (R300_ZS_REPLACE <<
+			     R300_RB3D_ZS1_FRONT_ZPASS_OP_SHIFT) |
+			    (R300_ZS_REPLACE <<
+			     R300_RB3D_ZS1_FRONT_ZFAIL_OP_SHIFT) |
+			    (R300_ZS_ALWAYS <<
+			     R300_RB3D_ZS1_BACK_FUNC_SHIFT) |
+			    (R300_ZS_REPLACE <<
+			     R300_RB3D_ZS1_BACK_FAIL_OP_SHIFT) |
+			    (R300_ZS_REPLACE <<
+			     R300_RB3D_ZS1_BACK_ZPASS_OP_SHIFT) |
+			    (R300_ZS_REPLACE <<
+			     R300_RB3D_ZS1_BACK_ZFAIL_OP_SHIFT);
+		}
+
+		e32(t1);
+		e32(t2);
+		e32(r300->state.stencil.clear);
+	}
+
+	/* Clear packet: one header dword followed by eight packed floats */
+	cmd2 = (drm_r300_cmd_header_t *) r300AllocCmdBuf(r300, 9, __FUNCTION__);
+	cmd2[0].packet3.cmd_type = R300_CMD_PACKET3;
+	cmd2[0].packet3.packet = R300_CMD_PACKET3_CLEAR;
+	cmd2[1].u = r300PackFloat32(dPriv->w / 2.0);
+	cmd2[2].u = r300PackFloat32(dPriv->h / 2.0);
+	cmd2[3].u = r300PackFloat32(ctx->Depth.Clear);
+	cmd2[4].u = r300PackFloat32(1.0);
+	cmd2[5].u = r300PackFloat32(ctx->Color.ClearColor[0]);
+	cmd2[6].u = r300PackFloat32(ctx->Color.ClearColor[1]);
+	cmd2[7].u = r300PackFloat32(ctx->Color.ClearColor[2]);
+	cmd2[8].u = r300PackFloat32(ctx->Color.ClearColor[3]);
+
+	reg_start(R300_RB3D_DSTCACHE_CTLSTAT, 0);
+	e32(R300_RB3D_DSTCACHE_UNKNOWN_0A);
+
+	reg_start(R300_RB3D_ZCACHE_CTLSTAT, 0);
+	e32(R300_RB3D_ZCACHE_UNKNOWN_03);
+	cp_wait(rmesa, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+}
+/* Emit the fixed pipeline state used for clears (fog, texture, blend off). */
+static void r300EmitClearState(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	r300ContextPtr rmesa = r300;
+	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
+	int i;
+	int cmd_reserved = 0;
+	int cmd_written = 0;
+	drm_radeon_cmd_header_t *cmd = NULL;	/* presumably used by the reg_start()/e32() macros - confirm */
+	int has_tcl = 1;
+
+	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+		has_tcl = 0;
+
+	/* FIXME: the values written to R300_VAP_INPUT_ROUTE_0_0 and
+	 * R300_VAP_INPUT_ROUTE_0_1 are in fact known, however, the values are
+	 * quite complex; see the functions in r300_emit.c.
+	 *
+	 * I believe it would be a good idea to extend the functions in
+	 * r300_emit.c so that they can be used to setup the default values for
+	 * these registers, as well as the actual values used for rendering.
+	 */
+	R300_STATECHANGE(r300, vir[0]);
+	reg_start(R300_VAP_INPUT_ROUTE_0_0, 0);
+	if (!has_tcl)
+		e32(0x22030003);
+	else
+		e32(0x21030003);
+
+	/* disable fog */
+	R300_STATECHANGE(r300, fogs);
+	reg_start(R300_RE_FOG_STATE, 0);
+	e32(0x0);
+
+	R300_STATECHANGE(r300, vir[1]);
+	reg_start(R300_VAP_INPUT_ROUTE_1_0, 0);
+	e32(0xF688F688);
+
+	/* R300_VAP_INPUT_CNTL_0, R300_VAP_INPUT_CNTL_1 */
+	R300_STATECHANGE(r300, vic);
+	reg_start(R300_VAP_INPUT_CNTL_0, 1);
+	e32(R300_INPUT_CNTL_0_COLOR);
+	e32(R300_INPUT_CNTL_POS | R300_INPUT_CNTL_COLOR | R300_INPUT_CNTL_TC0);
+
+	if (!has_tcl) {
+		R300_STATECHANGE(r300, vte);
+		/* comes from fglrx startup of clear */
+		reg_start(R300_SE_VTE_CNTL, 1);
+		e32(R300_VTX_W0_FMT | R300_VPORT_X_SCALE_ENA |
+		    R300_VPORT_X_OFFSET_ENA | R300_VPORT_Y_SCALE_ENA |
+		    R300_VPORT_Y_OFFSET_ENA | R300_VPORT_Z_SCALE_ENA |
+		    R300_VPORT_Z_OFFSET_ENA);
+		e32(0x8);
+
+		reg_start(0x21dc, 0);
+		e32(0xaaaaaaaa);
+	}
+
+	R300_STATECHANGE(r300, vof);
+	reg_start(R300_VAP_OUTPUT_VTX_FMT_0, 1);
+	e32(R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT |
+	    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_PRESENT);
+	e32(0x0);			/* no textures */
+
+	R300_STATECHANGE(r300, txe);
+	reg_start(R300_TX_ENABLE, 0);
+	e32(0x0);
+
+	/* Six floats from XSCALE on: scale 1.0 with the drawable origin in x and y, then 1.0 and 0.0 */
+	R300_STATECHANGE(r300, vpt);
+	reg_start(R300_SE_VPORT_XSCALE, 5);
+	efloat(1.0);
+	efloat(dPriv->x);
+	efloat(1.0);
+	efloat(dPriv->y);
+	efloat(1.0);
+	efloat(0.0);
+
+	R300_STATECHANGE(r300, at);
+	reg_start(R300_PP_ALPHA_TEST, 0);
+	e32(0x0);
+
+	R300_STATECHANGE(r300, bld);
+	reg_start(R300_RB3D_CBLEND, 1);
+	e32(0x0);
+	e32(0x0);
+
+	R300_STATECHANGE(r300, vap_clip_cntl);
+	reg_start(R300_VAP_CLIP_CNTL, 0);
+	e32(R300_221C_CLEAR);
+
+	/* Point size is set from the drawable's width and height, each times 6 */
+	R300_STATECHANGE(r300, ps);
+	reg_start(R300_RE_POINTSIZE, 0);
+	e32(((dPriv->w * 6) << R300_POINTSIZE_X_SHIFT) |
+	    ((dPriv->h * 6) << R300_POINTSIZE_Y_SHIFT));
+
+	R300_STATECHANGE(r300, ri);
+	reg_start(R300_RS_INTERP_0, 8);
+	for (i = 0; i < 8; ++i) {
+		e32(R300_RS_INTERP_USED);
+	}
+
+	R300_STATECHANGE(r300, rc);
+	/* The second constant is needed to get glxgears display anything .. */
+	reg_start(R300_RS_CNTL_0, 1);
+	e32((1 << R300_RS_CNTL_CI_CNT_SHIFT) | R300_RS_CNTL_0_UNKNOWN_18);
+	e32(0x0);
+
+	R300_STATECHANGE(r300, rr);
+	reg_start(R300_RS_ROUTE_0, 0);
+	e32(R300_RS_ROUTE_0_COLOR);
+
+	R300_STATECHANGE(r300, fp);
+	reg_start(R300_PFS_CNTL_0, 2);
+	e32(0x0);
+	e32(0x0);
+	e32(0x0);
+	reg_start(R300_PFS_NODE_0, 3);
+	e32(0x0);
+	e32(0x0);
+	e32(0x0);
+	e32(R300_PFS_NODE_OUTPUT_COLOR);
+
+	/* One fragment instruction, written as four words (INSTR0..INSTR3) */
+	R300_STATECHANGE(r300, fpi[0]);
+	R300_STATECHANGE(r300, fpi[1]);
+	R300_STATECHANGE(r300, fpi[2]);
+	R300_STATECHANGE(r300, fpi[3]);
+
+	reg_start(R300_PFS_INSTR0_0, 0);
+	e32(FP_INSTRC(MAD, FP_ARGC(SRC0C_XYZ), FP_ARGC(ONE), FP_ARGC(ZERO)));
+
+	reg_start(R300_PFS_INSTR1_0, 0);
+	e32(FP_SELC(0, NO, XYZ, FP_TMP(0), 0, 0));
+
+	reg_start(R300_PFS_INSTR2_0, 0);
+	e32(FP_INSTRA(MAD, FP_ARGA(SRC0A), FP_ARGA(ONE), FP_ARGA(ZERO)));
+
+	reg_start(R300_PFS_INSTR3_0, 0);
+	e32(FP_SELA(0, NO, W, FP_TMP(0), 0, 0));
+
+	if (has_tcl) {
+		R300_STATECHANGE(r300, pvs);
+		reg_start(R300_VAP_PVS_CNTL_1, 2);
+		e32((0 << R300_PVS_CNTL_1_PROGRAM_START_SHIFT) |
+		    (0 << R300_PVS_CNTL_1_POS_END_SHIFT) |
+		    (1 << R300_PVS_CNTL_1_PROGRAM_END_SHIFT));
+		e32(0x0);
+		e32(1 << R300_PVS_CNTL_3_PROGRAM_UNKNOWN_SHIFT);
+
+		R300_STATECHANGE(r300, vpi);
+		vsf_start_fragment(0x0, 8);	/* eight dwords follow: two 4-dword groups */
+		e32(VP_OUT(ADD, OUT, 0, XYZW));
+		e32(VP_IN(IN, 0));
+		e32(VP_ZERO());
+		e32(0x0);
+
+		e32(VP_OUT(ADD, OUT, 1, XYZW));
+		e32(VP_IN(IN, 1));
+		e32(VP_ZERO());
+		e32(0x0);
+	}
+}
+
+/**
+ * Buffer clear.
+ *
+ * Front/back color, depth and (with a hardware stencil buffer) stencil are
+ * cleared by the hardware; any other bit left in mask goes to swrast.
+ */
+static void r300Clear(GLcontext * ctx, GLbitfield mask)
+{
+	const GLbitfield color_bufs =
+	    BUFFER_BIT_FRONT_LEFT | BUFFER_BIT_BACK_LEFT;
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
+	int flags, swapped;
+	int bits = 0;
+
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "r300Clear\n");
+
+	/* NOTE(review): the lock is taken and dropped at once; presumably
+	 * this refreshes the drawable's cliprects - confirm. */
+	LOCK_HARDWARE(&r300->radeon);
+	UNLOCK_HARDWARE(&r300->radeon);
+	if (dPriv->numClipRects == 0)
+		return;
+
+	/* Color buffers are always cleared by the hardware path. */
+	flags = mask & color_bufs;
+	mask &= ~color_bufs;
+
+	if (mask & BUFFER_BIT_DEPTH) {
+		bits |= CLEARBUFFER_DEPTH;
+		mask &= ~BUFFER_BIT_DEPTH;
+	}
+
+	/* Stencil stays in mask (for swrast) without a hardware stencil. */
+	if (r300->state.stencil.hw_stencil && (mask & BUFFER_BIT_STENCIL)) {
+		bits |= CLEARBUFFER_STENCIL;
+		mask &= ~BUFFER_BIT_STENCIL;
+	}
+
+	if (mask) {
+		if (RADEON_DEBUG & DEBUG_FALLBACKS)
+			fprintf(stderr, "%s: swrast clear, mask: %x\n",
+				__FUNCTION__, mask);
+		_swrast_Clear(ctx, mask);
+	}
+
+	/* pfCurrentPage == 1 exchanges the buffer index used for front/back. */
+	swapped = (r300->radeon.sarea->pfCurrentPage == 1);
+
+	/* Make sure it fits there. */
+	r300EnsureCmdBufSpace(r300, 421 * 3, __FUNCTION__);
+	if (flags || bits)
+		r300EmitClearState(ctx);
+
+	/* Depth/stencil bits ride along with the first clear issued only. */
+	if (flags & BUFFER_BIT_FRONT_LEFT) {
+		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, swapped);
+		bits = 0;
+	}
+
+	if (flags & BUFFER_BIT_BACK_LEFT) {
+		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, !swapped);
+		bits = 0;
+	}
+
+	if (bits)
+		r300ClearBuffer(r300, bits, 0);
+}
+
+/* Flush the command buffer when count_used exceeds count_reemit. */
+void r300Flush(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "%s\n", __FUNCTION__);
+	if (r300->cmdbuf.count_reemit < r300->cmdbuf.count_used)
+		r300FlushCmdBuf(r300, __FUNCTION__);
+}
+
+#ifdef USER_BUFFERS
+#include "r300_mem.h"
+
+/* Replace dma.current with a new buffer of >= size bytes; exits on failure. */
+static void r300RefillCurrentDmaRegion(r300ContextPtr rmesa, int size)
+{
+	struct r300_dma_buffer *dmabuf;
+	size = MAX2(size, RADEON_BUFFER_SIZE * 16);	/* enforce a minimum */
+
+	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
+		fprintf(stderr, "%s\n", __FUNCTION__);
+
+	if (rmesa->dma.flush)
+		rmesa->dma.flush(rmesa);
+
+	if (rmesa->dma.current.buf)
+		r300ReleaseDmaRegion(rmesa, &rmesa->dma.current, __FUNCTION__);
+
+	if (rmesa->dma.nr_released_bufs > 4)
+		r300FlushCmdBuf(rmesa, __FUNCTION__);
+
+	dmabuf = CALLOC_STRUCT(r300_dma_buffer);
+	if (!dmabuf)
+		goto fail;	/* host allocation failed */
+	dmabuf->buf = (void *)1;	/* hack */
+	dmabuf->refcount = 1;
+	dmabuf->id = r300_mem_alloc(rmesa, 4, size);
+	if (dmabuf->id == 0) {
+		/* Flush, wait for the hardware to idle, and retry once. */
+		LOCK_HARDWARE(&rmesa->radeon);	/* no need to validate */
+		r300FlushCmdBufLocked(rmesa, __FUNCTION__);
+		radeonWaitForIdleLocked(&rmesa->radeon);
+		dmabuf->id = r300_mem_alloc(rmesa, 4, size);
+		UNLOCK_HARDWARE(&rmesa->radeon);
+		if (dmabuf->id == 0)
+			goto fail;
+	}
+
+	rmesa->dma.current.buf = dmabuf;
+	rmesa->dma.current.address = r300_mem_ptr(rmesa, dmabuf->id);
+	rmesa->dma.current.end = size;
+	rmesa->dma.current.start = 0;
+	rmesa->dma.current.ptr = 0;
+	return;
+
+fail:
+	fprintf(stderr, "Error: Could not get dma buffer... exiting\n");
+	_mesa_exit(-1);
+}
+/* Drop region's reference to its buffer; the last reference frees it. */
+void r300ReleaseDmaRegion(r300ContextPtr rmesa,
+			  struct r300_dma_region *region, const char *caller)
+{
+	struct r300_dma_buffer *dmabuf;
+
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "%s from %s\n", __FUNCTION__, caller);
+	if (!region->buf)
+		return;
+	if (rmesa->dma.flush)
+		rmesa->dma.flush(rmesa);
+
+	dmabuf = region->buf;
+	if (--dmabuf->refcount == 0) {
+		r300_mem_free(rmesa, dmabuf->id);
+		FREE(dmabuf);
+		rmesa->dma.nr_released_bufs++;
+	}
+	region->buf = 0;
+	region->start = 0;
+}
+
+/* Allocates a region from rmesa->dma.current.  If current has no buffer or
+ * not enough space, grab a new buffer (and discard what was left of current)
+ */
+void r300AllocDmaRegion(r300ContextPtr rmesa,
+			struct r300_dma_region *region,
+			int bytes, int alignment)
+{
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
+	if (rmesa->dma.flush)
+		rmesa->dma.flush(rmesa);
+	if (region->buf)
+		r300ReleaseDmaRegion(rmesa, region, __FUNCTION__);
+
+	alignment--;
+	rmesa->dma.current.start = rmesa->dma.current.ptr =
+	    (rmesa->dma.current.ptr + alignment) & ~alignment;
+
+	/* buf may be NULL while ptr/end are stale from a released buffer. */
+	if (!rmesa->dma.current.buf
+	    || rmesa->dma.current.ptr + bytes > rmesa->dma.current.end)
+		r300RefillCurrentDmaRegion(rmesa, (bytes + 0x7) & ~0x7);
+
+	region->start = rmesa->dma.current.start;
+	region->ptr = rmesa->dma.current.start;
+	region->end = rmesa->dma.current.start + bytes;
+	region->address = rmesa->dma.current.address;
+	region->buf = rmesa->dma.current.buf;
+	region->buf->refcount++;
+
+	rmesa->dma.current.ptr += bytes;	/* bug - if alignment > 7 */
+	rmesa->dma.current.start =
+	    rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;
+
+	assert(rmesa->dma.current.ptr <= rmesa->dma.current.end);
+}
+
+#else
+/* Replace dma.current with a DMA buffer from the DRM; exits on failure. */
+static void r300RefillCurrentDmaRegion(r300ContextPtr rmesa)
+{
+	struct r300_dma_buffer *dmabuf;
+	int fd = rmesa->radeon.dri.fd;
+	int index = 0;
+	int size = 0;
+	drmDMAReq dma;
+	int ret;
+
+	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
+		fprintf(stderr, "%s\n", __FUNCTION__);
+
+	if (rmesa->dma.flush)
+		rmesa->dma.flush(rmesa);
+
+	if (rmesa->dma.current.buf)
+		r300ReleaseDmaRegion(rmesa, &rmesa->dma.current, __FUNCTION__);
+
+	if (rmesa->dma.nr_released_bufs > 4)
+		r300FlushCmdBuf(rmesa, __FUNCTION__);
+
+	/* Request a single buffer of RADEON_BUFFER_SIZE; nothing is sent. */
+	dma.context = rmesa->radeon.dri.hwContext;
+	dma.send_count = 0;
+	dma.send_list = NULL;
+	dma.send_sizes = NULL;
+	dma.flags = 0;
+	dma.request_count = 1;
+	dma.request_size = RADEON_BUFFER_SIZE;
+	dma.request_list = &index;
+	dma.request_sizes = &size;
+	dma.granted_count = 0;
+
+	LOCK_HARDWARE(&rmesa->radeon);	/* no need to validate */
+	ret = drmDMA(fd, &dma);
+	if (ret != 0) {
+		/* Flush pending releases and wait for idle, then retry once. */
+		if (rmesa->dma.nr_released_bufs)
+			r300FlushCmdBufLocked(rmesa, __FUNCTION__);
+
+		if (RADEON_DEBUG & DEBUG_DMA)
+			fprintf(stderr, "Waiting for buffers\n");
+
+		radeonWaitForIdleLocked(&rmesa->radeon);
+		ret = drmDMA(fd, &dma);
+	}
+
+	UNLOCK_HARDWARE(&rmesa->radeon);
+	if (ret != 0)
+		goto fail;	/* still no buffer after the retry */
+
+	if (RADEON_DEBUG & DEBUG_DMA)
+		fprintf(stderr, "Allocated buffer %d\n", index);
+
+	dmabuf = CALLOC_STRUCT(r300_dma_buffer);
+	if (!dmabuf)
+		goto fail;	/* host allocation failed */
+	dmabuf->buf = &rmesa->radeon.radeonScreen->buffers->list[index];
+	dmabuf->refcount = 1;
+
+	rmesa->dma.current.buf = dmabuf;
+	rmesa->dma.current.address = dmabuf->buf->address;
+	rmesa->dma.current.end = dmabuf->buf->total;
+	rmesa->dma.current.start = 0;
+	rmesa->dma.current.ptr = 0;
+	return;
+
+fail:
+	fprintf(stderr, "Error: Could not get dma buffer... exiting\n");
+	_mesa_exit(-1);
+}
+
+/* Drop region's reference; the last one queues a DMA_DISCARD for its buffer. */
+void r300ReleaseDmaRegion(r300ContextPtr rmesa,
+			  struct r300_dma_region *region, const char *caller)
+{
+	struct r300_dma_buffer *dmabuf;
+	drm_radeon_cmd_header_t *cmd;
+
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "%s from %s\n", __FUNCTION__, caller);
+
+	if (!region->buf)
+		return;
+
+	if (rmesa->dma.flush)
+		rmesa->dma.flush(rmesa);
+
+	dmabuf = region->buf;
+	if (--dmabuf->refcount == 0) {
+		if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
+			fprintf(stderr, "%s -- DISCARD BUF %d\n",
+				__FUNCTION__, dmabuf->buf->idx);
+		/* One header-sized command carrying the buffer index. */
+		cmd = (drm_radeon_cmd_header_t *)
+		    r300AllocCmdBuf(rmesa, sizeof(*cmd) / 4, __FUNCTION__);
+		cmd->dma.cmd_type = R300_CMD_DMA_DISCARD;
+		cmd->dma.buf_idx = dmabuf->buf->idx;
+		FREE(dmabuf);
+		rmesa->dma.nr_released_bufs++;
+	}
+
+	region->buf = 0;
+	region->start = 0;
+}
+
+/* Allocates a region from rmesa->dma.current.  If current has no buffer or
+ * not enough space, grab a new buffer (and discard what was left of current)
+ */
+void r300AllocDmaRegion(r300ContextPtr rmesa,
+			struct r300_dma_region *region,
+			int bytes, int alignment)
+{
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
+	if (rmesa->dma.flush)
+		rmesa->dma.flush(rmesa);
+	if (region->buf)
+		r300ReleaseDmaRegion(rmesa, region, __FUNCTION__);
+
+	alignment--;
+	rmesa->dma.current.start = rmesa->dma.current.ptr =
+	    (rmesa->dma.current.ptr + alignment) & ~alignment;
+
+	/* buf may be NULL while ptr/end are stale from a released buffer. */
+	if (!rmesa->dma.current.buf
+	    || rmesa->dma.current.ptr + bytes > rmesa->dma.current.end)
+		r300RefillCurrentDmaRegion(rmesa);
+
+	region->start = rmesa->dma.current.start;
+	region->ptr = rmesa->dma.current.start;
+	region->end = rmesa->dma.current.start + bytes;
+	region->address = rmesa->dma.current.address;
+	region->buf = rmesa->dma.current.buf;
+	region->buf->refcount++;
+
+	rmesa->dma.current.ptr += bytes;	/* bug - if alignment > 7 */
+	rmesa->dma.current.start =
+	    rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;
+
+	assert(rmesa->dma.current.ptr <= rmesa->dma.current.end);
+}
+
+#endif
+/* True when size >= 0 and pointer .. pointer + size all lie in the GART map. */
+GLboolean r300IsGartMemory(r300ContextPtr rmesa, const GLvoid * pointer,
+			   GLint size)
+{
+	const char *p = (const char *)pointer;
+	const char *start =
+	    (const char *)rmesa->radeon.radeonScreen->gartTextures.map;
+	const char *end = start + rmesa->radeon.radeonScreen->gartTextures.size;
+	/* Compare at pointer width: narrowing the distance to int first
+	 * could wrap a far-away 64-bit pointer into the valid range. */
+	int valid = (size >= 0 && p >= start && size < end - p);
+
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "r300IsGartMemory( %p ) : %d\n", pointer,
+			valid);
+	return valid;
+}
+/* Return gart_texture_offset + (pointer - map), or ~0 outside the GART map. */
+GLuint r300GartOffsetFromVirtual(r300ContextPtr rmesa, const GLvoid * pointer)
+{
+	const char *p = (const char *)pointer;
+	const char *start =
+	    (const char *)rmesa->radeon.radeonScreen->gartTextures.map;
+
+	/* Test the distance at pointer width; truncating it to int first
+	 * could make a far-away 64-bit pointer look like a valid offset. */
+	if (p < start
+	    || p - start > rmesa->radeon.radeonScreen->gartTextures.size)
+		return ~0;
+	return rmesa->radeon.radeonScreen->gart_texture_offset
+	    + (GLuint) (p - start);
+}
+/* Install r300Clear, r300Flush and radeonFinish into the driver table. */
+void r300InitIoctlFuncs(struct dd_function_table *functions)
+{
+	functions->Flush = r300Flush;
+	functions->Finish = radeonFinish;
+	functions->Clear = r300Clear;
+}
diff --git a/r300/r300_ioctl.h b/r300/r300_ioctl.h
new file mode 100644
index 0000000..7a19a2c
--- /dev/null
+++ b/r300/r300_ioctl.h
@@ -0,0 +1,59 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *   Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __R300_IOCTL_H__
+#define __R300_IOCTL_H__
+
+#include "r300_context.h"
+#include "radeon_drm.h"
+
+extern GLboolean r300IsGartMemory(r300ContextPtr rmesa,
+				  const GLvoid * pointer, GLint size);
+
+extern GLuint r300GartOffsetFromVirtual(r300ContextPtr rmesa,
+					const GLvoid * pointer);
+
+extern void r300Flush(GLcontext * ctx);
+
+extern void r300ReleaseDmaRegion(r300ContextPtr rmesa,
+				 struct r300_dma_region *region,
+				 const char *caller);
+extern void r300AllocDmaRegion(r300ContextPtr rmesa,
+			       struct r300_dma_region *region, int bytes,
+			       int alignment);
+
+extern void r300InitIoctlFuncs(struct dd_function_table *functions);
+
+#endif				/* __R300_IOCTL_H__ */
diff --git a/r300/r300_mem.c b/r300/r300_mem.c
new file mode 100644
index 0000000..f8f9d4f
--- /dev/null
+++ b/r300/r300_mem.c
@@ -0,0 +1,385 @@
+/*
+ * Copyright (C) 2005 Aapo Tahkola.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * \file
+ *
+ * \author Aapo Tahkola <aet@rasterburn.org>
+ */
+
+#include <unistd.h>
+
+#include "r300_context.h"
+#include "r300_cmdbuf.h"
+#include "r300_ioctl.h"
+#include "r300_mem.h"
+#include "radeon_ioctl.h"
+
+#ifdef USER_BUFFERS
+
+/* Grow the slot table to twice u_size; added entries start out zeroed. */
+static void resize_u_list(r300ContextPtr rmesa)
+{
+	void *temp = rmesa->rmm->u_list;
+	int nsize = rmesa->rmm->u_size * 2;
+
+	rmesa->rmm->u_list = _mesa_malloc(nsize * sizeof(*rmesa->rmm->u_list));
+	if (rmesa->rmm->u_list == NULL) {
+		fprintf(stderr, "%s: out of memory\n", __FUNCTION__);
+		exit(1);
+	}
+	_mesa_memset(rmesa->rmm->u_list, 0,
+		     nsize * sizeof(*rmesa->rmm->u_list));
+	if (temp) {
+		/* Presumably flushed because r300_mem_use() queued old addresses. */
+		r300FlushCmdBuf(rmesa, __FUNCTION__);
+		_mesa_memcpy(rmesa->rmm->u_list, temp,
+			     rmesa->rmm->u_size * sizeof(*rmesa->rmm->u_list));
+		_mesa_free(temp);
+	}
+	rmesa->rmm->u_size = nsize;
+}
+
+/* Allocate a zeroed memory manager for rmesa and build its slot table. */
+void r300_mem_init(r300ContextPtr rmesa)
+{
+	rmesa->rmm = calloc(1, sizeof(struct r300_memory_manager));
+
+	rmesa->rmm->u_size = 128;	/* resize_u_list() doubles this */
+	resize_u_list(rmesa);
+}
+
+/* Free the slot table, then the manager itself, clearing both pointers. */
+void r300_mem_destroy(r300ContextPtr rmesa)
+{
+	_mesa_free(rmesa->rmm->u_list);
+	rmesa->rmm->u_list = NULL;
+	_mesa_free(rmesa->rmm);
+	rmesa->rmm = NULL;
+}
+
+void *r300_mem_ptr(r300ContextPtr rmesa, int id)
+{
+	assert(id <= rmesa->rmm->u_last);	/* u_last: highest id in use */
+	return rmesa->rmm->u_list[id].ptr;	/* address stored for slot id */
+}
+
+/* Return the id of the slot whose buffer contains ptr, or 0 if none does. */
+int r300_mem_find(r300ContextPtr rmesa, void *ptr)
+{
+	int i;
+
+	/* u_list holds u_size entries, so the last valid index is u_size - 1. */
+	for (i = 1; i < rmesa->rmm->u_size; i++) {
+		char *base = rmesa->rmm->u_list[i].ptr;
+
+		if (base && (char *)ptr >= base &&
+		    (char *)ptr < base + rmesa->rmm->u_list[i].size)
+			return i;
+	}
+
+	fprintf(stderr, "%p failed\n", ptr);
+	return 0;
+}
+
+//#define MM_DEBUG
+int r300_mem_alloc(r300ContextPtr rmesa, int alignment, int size)
+{
+	drm_radeon_mem_alloc_t alloc;
+	int offset = 0, ret;
+	int i, free = -1;
+	int done_age;
+	drm_radeon_mem_free_t memfree;
+	int tries = 0;
+	static int bytes_wasted = 0, allocated = 0;
+	/* Returns a slot id for new GART memory, or 0 if the DRM alloc fails. */
+	if (size < 4096)
+		bytes_wasted += 4096 - size;
+
+	allocated += size;
+	/* The totals above are only read by the disabled report below. */
+#if 0
+	static int t = 0;
+	if (t != time(NULL)) {
+		t = time(NULL);
+		fprintf(stderr, "slots used %d, wasted %d kb, allocated %d\n",
+			rmesa->rmm->u_last, bytes_wasted / 1024,
+			allocated / 1024);
+	}
+#endif
+
+	memfree.region = RADEON_MEM_REGION_GART;
+
+      again:
+
+	done_age = radeonGetAge((radeonContextPtr) rmesa);
+	/* The scan below starts at u_last + 1, so that index must exist. */
+	if (rmesa->rmm->u_last + 1 >= rmesa->rmm->u_size)
+		resize_u_list(rmesa);
+
+	for (i = rmesa->rmm->u_last + 1; i > 0; i--) {
+		if (rmesa->rmm->u_list[i].ptr == NULL) {
+			free = i;
+			continue;
+		}
+		/* Freed by the caller, no queued uses left, and age reached. */
+		if (rmesa->rmm->u_list[i].h_pending == 0 &&
+		    rmesa->rmm->u_list[i].pending
+		    && rmesa->rmm->u_list[i].age <= done_age) {
+			memfree.region_offset =
+			    (char *)rmesa->rmm->u_list[i].ptr -
+			    (char *)rmesa->radeon.radeonScreen->gartTextures.
+			    map;
+
+			ret =
+			    drmCommandWrite(rmesa->radeon.radeonScreen->
+					    driScreen->fd, DRM_RADEON_FREE,
+					    &memfree, sizeof(memfree));
+
+			if (ret) {
+				fprintf(stderr, "Failed to free at %p\n",
+					rmesa->rmm->u_list[i].ptr);
+				fprintf(stderr, "ret = %s\n", strerror(-ret));
+				exit(1);
+			} else {
+#ifdef MM_DEBUG
+				fprintf(stderr, "really freed %d at age %x\n",
+					i,
+					radeonGetAge((radeonContextPtr) rmesa));
+#endif
+				if (i == rmesa->rmm->u_last)
+					rmesa->rmm->u_last--;
+
+				if (rmesa->rmm->u_list[i].size < 4096)
+					bytes_wasted -=
+					    4096 - rmesa->rmm->u_list[i].size;
+
+				allocated -= rmesa->rmm->u_list[i].size;
+				rmesa->rmm->u_list[i].pending = 0;
+				rmesa->rmm->u_list[i].ptr = NULL;
+				free = i;
+			}
+		}
+	}
+	rmesa->rmm->u_head = i;
+
+	if (free == -1) {
+		WARN_ONCE("Ran out of slots!\n");
+		//usleep(100);
+		r300FlushCmdBuf(rmesa, __FUNCTION__);
+		tries++;
+		if (tries > 100) {
+			WARN_ONCE("Ran out of slots!\n");
+			exit(1);
+		}
+		goto again;
+	}
+	/* A slot is available; now request the memory itself from the DRM. */
+	alloc.region = RADEON_MEM_REGION_GART;
+	alloc.alignment = alignment;
+	alloc.size = size;
+	alloc.region_offset = &offset;
+
+	ret =
+	    drmCommandWriteRead(rmesa->radeon.dri.fd, DRM_RADEON_ALLOC, &alloc,
+				sizeof(alloc));
+	if (ret) {
+#if 0
+		WARN_ONCE("Ran out of mem!\n");
+		r300FlushCmdBuf(rmesa, __FUNCTION__);
+		//usleep(100);
+		tries2++;
+		tries = 0;
+		if (tries2 > 100) {
+			WARN_ONCE("Ran out of GART memory!\n");
+			exit(1);
+		}
+		goto again;
+#else
+		WARN_ONCE
+		    ("Ran out of GART memory (for %d)!\nPlease consider adjusting GARTSize option.\n",
+		     size);
+		return 0;
+#endif
+	}
+
+	i = free;
+
+	if (i > rmesa->rmm->u_last)
+		rmesa->rmm->u_last = i;
+	/* offset (passed via alloc.region_offset) is added to the map base. */
+	rmesa->rmm->u_list[i].ptr =
+	    ((GLubyte *) rmesa->radeon.radeonScreen->gartTextures.map) + offset;
+	rmesa->rmm->u_list[i].size = size;
+	rmesa->rmm->u_list[i].age = 0;
+	//fprintf(stderr, "alloc %p at id %d\n", rmesa->rmm->u_list[i].ptr, i);
+
+#ifdef MM_DEBUG
+	fprintf(stderr, "allocated %d at age %x\n", i,
+		radeonGetAge((radeonContextPtr) rmesa));
+#endif
+
+	return i;
+}
+
+/* Queue an R300_CMD_SCRATCH packet for slot id and count the use. */
+void r300_mem_use(r300ContextPtr rmesa, int id)
+{
+	drm_r300_cmd_header_t *cmd;
+	uint64_t ull;
+
+#ifdef MM_DEBUG
+	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
+		radeonGetAge((radeonContextPtr) rmesa));
+#endif
+	assert(id <= rmesa->rmm->u_last);
+
+	if (id == 0)
+		return;
+
+	cmd = (drm_r300_cmd_header_t *)
+	    r300AllocCmdBuf(rmesa, 2 + sizeof(ull) / 4, __FUNCTION__);
+	cmd[0].scratch.cmd_type = R300_CMD_SCRATCH;
+	cmd[0].scratch.reg = R300_MEM_SCRATCH;
+	cmd[0].scratch.n_bufs = 1;
+	cmd[0].scratch.flags = 0;
+	cmd++;
+
+	/* Go through uintptr_t: intptr_t would sign-extend a 32-bit address. */
+	ull = (uint64_t) (uintptr_t) & rmesa->rmm->u_list[id].age;
+	_mesa_memcpy(cmd, &ull, sizeof(ull));
+	cmd += sizeof(ull) / 4;
+
+	cmd[0].u = /*id */ 0;
+
+	LOCK_HARDWARE(&rmesa->radeon);	/* Protect from DRM. */
+	rmesa->rmm->u_list[id].h_pending++;
+	UNLOCK_HARDWARE(&rmesa->radeon);
+}
+
+/* Slot id's distance from the GART map start, plus gart_texture_offset. */
+unsigned long r300_mem_offset(r300ContextPtr rmesa, int id)
+{
+	unsigned long offset;
+	char *buf, *map;
+
+	assert(id <= rmesa->rmm->u_last);
+	buf = (char *)rmesa->rmm->u_list[id].ptr;
+	map = (char *)rmesa->radeon.radeonScreen->gartTextures.map;
+	offset = buf - map;
+	return offset + rmesa->radeon.radeonScreen->gart_texture_offset;
+}
+
+/*
+ * Return the CPU address of slot id and mark it mapped.  R300_MEM_R requests
+ * are served at once; any other access first waits for the hardware and
+ * yields NULL when the buffer is still in use.
+ */
+void *r300_mem_map(r300ContextPtr rmesa, int id, int access)
+{
+	int tries = 0;
+
+#ifdef MM_DEBUG
+	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
+		radeonGetAge((radeonContextPtr) rmesa));
+#endif
+	assert(id <= rmesa->rmm->u_last);
+
+	if (access != R300_MEM_R) {
+		if (rmesa->rmm->u_list[id].h_pending)
+			r300FlushCmdBuf(rmesa, __FUNCTION__);
+
+		/* Uses are still queued even after the flush: give up. */
+		if (rmesa->rmm->u_list[id].h_pending)
+			return NULL;
+
+		while (rmesa->rmm->u_list[id].age >
+		       radeonGetAge((radeonContextPtr) rmesa) && tries < 1000) {
+			usleep(10);
+			tries++;
+		}
+
+		/*
+		 * Test the age rather than the retry count, so a buffer that
+		 * goes idle during the final sleep is not reported as busy.
+		 */
+		if (rmesa->rmm->u_list[id].age >
+		    radeonGetAge((radeonContextPtr) rmesa)) {
+			fprintf(stderr, "Idling failed (%x vs %x)\n",
+				rmesa->rmm->u_list[id].age,
+				radeonGetAge((radeonContextPtr) rmesa));
+			return NULL;
+		}
+	}
+
+	if (rmesa->rmm->u_list[id].mapped == 1)
+		WARN_ONCE("buffer %d already mapped\n", id);
+
+	rmesa->rmm->u_list[id].mapped = 1;
+	return r300_mem_ptr(rmesa, id);
+}
+
+/* Clear the mapped flag of slot id, warning if it was not set. */
+void r300_mem_unmap(r300ContextPtr rmesa, int id)
+{
+#ifdef MM_DEBUG
+	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
+		radeonGetAge((radeonContextPtr) rmesa));
+#endif
+	assert(id <= rmesa->rmm->u_last);
+
+	if (rmesa->rmm->u_list[id].mapped == 0)
+		WARN_ONCE("buffer %d not mapped\n", id);
+
+	rmesa->rmm->u_list[id].mapped = 0;
+}
+
+/*
+ * Mark slot id as released.  Nothing is returned to the DRM here;
+ * r300_mem_alloc() frees pending slots once they are no longer in use.
+ */
+void r300_mem_free(r300ContextPtr rmesa, int id)
+{
+#ifdef MM_DEBUG
+	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
+		radeonGetAge((radeonContextPtr) rmesa));
+#endif
+	assert(id <= rmesa->rmm->u_last);
+
+	/* Id 0 is what a failed r300_mem_alloc() returns; nothing to release. */
+	if (id == 0)
+		return;
+
+	if (rmesa->rmm->u_list[id].ptr == NULL) {
+		WARN_ONCE("Not allocated!\n");
+	} else if (rmesa->rmm->u_list[id].pending) {
+		WARN_ONCE("%p already pended!\n", rmesa->rmm->u_list[id].ptr);
+	} else {
+		rmesa->rmm->u_list[id].pending = 1;
+	}
+}
+#endif
diff --git a/r300/r300_mem.h b/r300/r300_mem.h
new file mode 100644
index 0000000..625a7f6
--- /dev/null
+++ b/r300/r300_mem.h
@@ -0,0 +1,37 @@
+#ifndef __R300_MEM_H__
+#define __R300_MEM_H__
+
+//#define R300_MEM_PDL 0
+#define R300_MEM_UL 1
+
+#define R300_MEM_R 1
+#define R300_MEM_W 2
+#define R300_MEM_RW (R300_MEM_R | R300_MEM_W)
+
+#define R300_MEM_SCRATCH 2
+
+struct r300_memory_manager {
+	struct {
+		void *ptr;		/* buffer address; NULL = slot free */
+		uint32_t size;		/* size passed to r300_mem_alloc() */
+		uint32_t age;		/* compared with radeonGetAge() */
+		uint32_t h_pending;	/* bumped by each r300_mem_use() */
+		int pending;		/* set by r300_mem_free() */
+		int mapped;		/* 1 while mapped via r300_mem_map() */
+	} *u_list;
+	int u_head, u_size, u_last;	/* u_size: entries; u_last: top used id */
+
+};
+
+extern void r300_mem_init(r300ContextPtr rmesa);
+extern void r300_mem_destroy(r300ContextPtr rmesa);
+extern void *r300_mem_ptr(r300ContextPtr rmesa, int id);
+extern int r300_mem_find(r300ContextPtr rmesa, void *ptr);
+extern int r300_mem_alloc(r300ContextPtr rmesa, int alignment, int size);
+extern void r300_mem_use(r300ContextPtr rmesa, int id);
+extern unsigned long r300_mem_offset(r300ContextPtr rmesa, int id);
+extern void *r300_mem_map(r300ContextPtr rmesa, int id, int access);
+extern void r300_mem_unmap(r300ContextPtr rmesa, int id);
+extern void r300_mem_free(r300ContextPtr rmesa, int id);
+
+#endif
diff --git a/r300/r300_program.h b/r300/r300_program.h
new file mode 100644
index 0000000..eddd783
--- /dev/null
+++ b/r300/r300_program.h
@@ -0,0 +1,150 @@
+/*
+Copyright (C) 2004 Nicolai Haehnle.  All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __R300_PROGRAM_H__
+#define __R300_PROGRAM_H__
+
+#include "r300_reg.h"
+
+/**
+ * Vertex program helper macros
+ */
+
+/* Produce out dword */
+#define VP_OUTCLASS_TMP		R300_VPI_OUT_REG_CLASS_TEMPORARY
+#define VP_OUTCLASS_OUT		R300_VPI_OUT_REG_CLASS_RESULT
+
+#define VP_OUTMASK_X	R300_VPI_OUT_WRITE_X
+#define VP_OUTMASK_Y	R300_VPI_OUT_WRITE_Y
+#define VP_OUTMASK_Z	R300_VPI_OUT_WRITE_Z
+#define VP_OUTMASK_W	R300_VPI_OUT_WRITE_W
+#define VP_OUTMASK_XY	(VP_OUTMASK_X|VP_OUTMASK_Y)
+#define VP_OUTMASK_XZ	(VP_OUTMASK_X|VP_OUTMASK_Z)
+#define VP_OUTMASK_XW	(VP_OUTMASK_X|VP_OUTMASK_W)
+#define VP_OUTMASK_XYZ	(VP_OUTMASK_XY|VP_OUTMASK_Z)
+#define VP_OUTMASK_XYW	(VP_OUTMASK_XY|VP_OUTMASK_W)
+#define VP_OUTMASK_XZW	(VP_OUTMASK_XZ|VP_OUTMASK_W)
+#define VP_OUTMASK_XYZW	(VP_OUTMASK_XYZ|VP_OUTMASK_W)
+#define VP_OUTMASK_YZ	(VP_OUTMASK_Y|VP_OUTMASK_Z)
+#define VP_OUTMASK_YW	(VP_OUTMASK_Y|VP_OUTMASK_W)
+#define VP_OUTMASK_YZW	(VP_OUTMASK_YZ|VP_OUTMASK_W)
+#define VP_OUTMASK_ZW	(VP_OUTMASK_Z|VP_OUTMASK_W)
+
+#define VP_OUT(instr,outclass,outidx,outmask) \
+	(R300_VPI_OUT_OP_##instr |				\
+	((outidx) << R300_VPI_OUT_REG_INDEX_SHIFT) |		\
+	VP_OUTCLASS_##outclass |				\
+	VP_OUTMASK_##outmask)
+
+/* Produce in dword */
+#define VP_INCLASS_TMP		R300_VPI_IN_REG_CLASS_TEMPORARY
+#define VP_INCLASS_IN		R300_VPI_IN_REG_CLASS_ATTRIBUTE
+#define VP_INCLASS_CONST	R300_VPI_IN_REG_CLASS_PARAMETER
+
+#define VP_IN(class,idx) \
+	(((idx) << R300_VPI_IN_REG_INDEX_SHIFT) |		\
+	VP_INCLASS_##class |					\
+	(R300_VPI_IN_SELECT_X << R300_VPI_IN_X_SHIFT) |		\
+	(R300_VPI_IN_SELECT_Y << R300_VPI_IN_Y_SHIFT) |		\
+	(R300_VPI_IN_SELECT_Z << R300_VPI_IN_Z_SHIFT) |		\
+	(R300_VPI_IN_SELECT_W << R300_VPI_IN_W_SHIFT))
+#define VP_ZERO() \
+	((R300_VPI_IN_SELECT_ZERO << R300_VPI_IN_X_SHIFT) |	\
+	(R300_VPI_IN_SELECT_ZERO << R300_VPI_IN_Y_SHIFT) |	\
+	(R300_VPI_IN_SELECT_ZERO << R300_VPI_IN_Z_SHIFT) |	\
+	(R300_VPI_IN_SELECT_ZERO << R300_VPI_IN_W_SHIFT))
+#define VP_ONE() \
+	((R300_VPI_IN_SELECT_ONE << R300_VPI_IN_X_SHIFT) |	\
+	(R300_VPI_IN_SELECT_ONE << R300_VPI_IN_Y_SHIFT) |	\
+	(R300_VPI_IN_SELECT_ONE << R300_VPI_IN_Z_SHIFT) |	\
+	(R300_VPI_IN_SELECT_ONE << R300_VPI_IN_W_SHIFT))
+
+#define VP_NEG(in,comp)		((in) ^ (R300_VPI_IN_NEG_##comp))
+#define VP_NEGALL(in,comp)	VP_NEG(VP_NEG(VP_NEG(VP_NEG((in),X),Y),Z),W) /* comp unused */
+
+/**
+ * Fragment program helper macros
+ */
+
+/* Produce unshifted source selectors */
+#define FP_TMP(idx) (idx)
+#define FP_CONST(idx) ((idx) | (1 << 5))
+
+/* Produce source/dest selector dword */
+#define FP_SELC_MASK_NO		0
+#define FP_SELC_MASK_X		1
+#define FP_SELC_MASK_Y		2
+#define FP_SELC_MASK_XY		3
+#define FP_SELC_MASK_Z		4
+#define FP_SELC_MASK_XZ		5
+#define FP_SELC_MASK_YZ		6
+#define FP_SELC_MASK_XYZ	7
+
+#define FP_SELC(destidx,regmask,outmask,src0,src1,src2) \
+	(((destidx) << R300_FPI1_DSTC_SHIFT) |		\
+	 (FP_SELC_MASK_##regmask << 23) |		\
+	 (FP_SELC_MASK_##outmask << 26) |		\
+	 ((src0) << R300_FPI1_SRC0C_SHIFT) |		\
+	 ((src1) << R300_FPI1_SRC1C_SHIFT) |		\
+	 ((src2) << R300_FPI1_SRC2C_SHIFT))
+
+#define FP_SELA_MASK_NO		0
+#define FP_SELA_MASK_W		1
+
+#define FP_SELA(destidx,regmask,outmask,src0,src1,src2) \
+	(((destidx) << R300_FPI3_DSTA_SHIFT) |		\
+	 (FP_SELA_MASK_##regmask << 23) |		\
+	 (FP_SELA_MASK_##outmask << 24) |		\
+	 ((src0) << R300_FPI3_SRC0A_SHIFT) |		\
+	 ((src1) << R300_FPI3_SRC1A_SHIFT) |		\
+	 ((src2) << R300_FPI3_SRC2A_SHIFT))
+
+/* Produce unshifted argument selectors */
+#define FP_ARGC(source)	R300_FPI0_ARGC_##source
+#define FP_ARGA(source) R300_FPI2_ARGA_##source
+#define FP_ABS(arg) ((arg) | (1 << 6))
+#define FP_NEG(arg) ((arg) ^ (1 << 5))
+
+/* Produce instruction dword */
+#define FP_INSTRC(opcode,arg0,arg1,arg2) \
+	(R300_FPI0_OUTC_##opcode | 		\
+	((arg0) << R300_FPI0_ARG0C_SHIFT) |	\
+	((arg1) << R300_FPI0_ARG1C_SHIFT) |	\
+	((arg2) << R300_FPI0_ARG2C_SHIFT))
+
+#define FP_INSTRA(opcode,arg0,arg1,arg2) \
+	(R300_FPI2_OUTA_##opcode | 		\
+	((arg0) << R300_FPI2_ARG0A_SHIFT) |	\
+	((arg1) << R300_FPI2_ARG1A_SHIFT) |	\
+	((arg2) << R300_FPI2_ARG2A_SHIFT))
+
+extern void debug_vp(GLcontext * ctx, struct gl_vertex_program *vp);
+
+#endif				/* __R300_PROGRAM_H__ */
diff --git a/r300/r300_reg.h b/r300/r300_reg.h
new file mode 100644
index 0000000..e5501b6
--- /dev/null
+++ b/r300/r300_reg.h
@@ -0,0 +1,1635 @@
+/**************************************************************************
+
+Copyright (C) 2004-2005 Nicolai Haehnle et al.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/* *INDENT-OFF* */
+
+#ifndef _R300_REG_H
+#define _R300_REG_H
+
+#define R300_MC_INIT_MISC_LAT_TIMER	0x180
+#	define R300_MC_MISC__MC_CPR_INIT_LAT_SHIFT	0
+#	define R300_MC_MISC__MC_VF_INIT_LAT_SHIFT	4
+#	define R300_MC_MISC__MC_DISP0R_INIT_LAT_SHIFT	8
+#	define R300_MC_MISC__MC_DISP1R_INIT_LAT_SHIFT	12
+#	define R300_MC_MISC__MC_FIXED_INIT_LAT_SHIFT	16
+#	define R300_MC_MISC__MC_E2R_INIT_LAT_SHIFT	20
+#	define R300_MC_MISC__MC_SAME_PAGE_PRIO_SHIFT	24
+#	define R300_MC_MISC__MC_GLOBW_INIT_LAT_SHIFT	28
+
+
+#define R300_MC_INIT_GFX_LAT_TIMER	0x154
+#	define R300_MC_MISC__MC_G3D0R_INIT_LAT_SHIFT	0
+#	define R300_MC_MISC__MC_G3D1R_INIT_LAT_SHIFT	4
+#	define R300_MC_MISC__MC_G3D2R_INIT_LAT_SHIFT	8
+#	define R300_MC_MISC__MC_G3D3R_INIT_LAT_SHIFT	12
+#	define R300_MC_MISC__MC_TX0R_INIT_LAT_SHIFT	16
+#	define R300_MC_MISC__MC_TX1R_INIT_LAT_SHIFT	20
+#	define R300_MC_MISC__MC_GLOBR_INIT_LAT_SHIFT	24
+#	define R300_MC_MISC__MC_GLOBW_FULL_LAT_SHIFT	28
+
+/*
+ * This file contains registers and constants for the R300. They have been
+ * found mostly by examining command buffers captured using glxtest, as well
+ * as by extrapolating some known registers and constants from the R200.
+ * I am fairly certain that they are correct unless stated otherwise
+ * in comments.
+ */
+
+#define R300_SE_VPORT_XSCALE                0x1D98
+#define R300_SE_VPORT_XOFFSET               0x1D9C
+#define R300_SE_VPORT_YSCALE                0x1DA0
+#define R300_SE_VPORT_YOFFSET               0x1DA4
+#define R300_SE_VPORT_ZSCALE                0x1DA8
+#define R300_SE_VPORT_ZOFFSET               0x1DAC
+
+
+/*
+ * Vertex Array Processing (VAP) Control
+ * Stolen from r200 code from Christoph Brill (It's a guess!)
+ */
+#define R300_VAP_CNTL	0x2080
+
+/* This register is written directly and also starts data section
+ * in many 3d CP_PACKET3's
+ */
+#define R300_VAP_VF_CNTL	0x2084
+#	define	R300_VAP_VF_CNTL__PRIM_TYPE__SHIFT              0
+#	define  R300_VAP_VF_CNTL__PRIM_NONE                     (0<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_POINTS                   (1<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_LINES                    (2<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_LINE_STRIP               (3<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_TRIANGLES                (4<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_TRIANGLE_FAN             (5<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_TRIANGLE_STRIP           (6<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_LINE_LOOP                (12<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_QUADS                    (13<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_QUAD_STRIP               (14<<0)
+#	define  R300_VAP_VF_CNTL__PRIM_POLYGON                  (15<<0)
+
+#	define	R300_VAP_VF_CNTL__PRIM_WALK__SHIFT              4
+	/* State based - direct writes to registers trigger vertex
+           generation */
+#	define	R300_VAP_VF_CNTL__PRIM_WALK_STATE_BASED         (0<<4)
+#	define	R300_VAP_VF_CNTL__PRIM_WALK_INDICES             (1<<4)
+#	define	R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST         (2<<4)
+#	define	R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_EMBEDDED     (3<<4)
+
+	/* I don't think I saw these three used.. */
+#	define	R300_VAP_VF_CNTL__COLOR_ORDER__SHIFT            6
+#	define	R300_VAP_VF_CNTL__TCL_OUTPUT_CTL_ENA__SHIFT     9
+#	define	R300_VAP_VF_CNTL__PROG_STREAM_ENA__SHIFT        10
+
+	/* index size - when not set the indices are assumed to be 16 bit */
+#	define	R300_VAP_VF_CNTL__INDEX_SIZE_32bit              (1<<11)
+	/* number of vertices */
+#	define	R300_VAP_VF_CNTL__NUM_VERTICES__SHIFT           16
+
+/* BEGIN: Wild guesses */
+#define R300_VAP_OUTPUT_VTX_FMT_0           0x2090
+#       define R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT     (1<<0)
+#       define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_PRESENT   (1<<1)
+#       define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT (1<<2)  /* GUESS */
+#       define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_2_PRESENT (1<<3)  /* GUESS */
+#       define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_3_PRESENT (1<<4)  /* GUESS */
+#       define R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT (1<<16) /* GUESS */
+
+#define R300_VAP_OUTPUT_VTX_FMT_1           0x2094
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_0_COMP_CNT_SHIFT 0
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_1_COMP_CNT_SHIFT 3
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_2_COMP_CNT_SHIFT 6
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_3_COMP_CNT_SHIFT 9
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_4_COMP_CNT_SHIFT 12
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_5_COMP_CNT_SHIFT 15
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_6_COMP_CNT_SHIFT 18
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_7_COMP_CNT_SHIFT 21
+/* END: Wild guesses */
+
+#define R300_SE_VTE_CNTL                  0x20b0
+#	define     R300_VPORT_X_SCALE_ENA                0x00000001
+#	define     R300_VPORT_X_OFFSET_ENA               0x00000002
+#	define     R300_VPORT_Y_SCALE_ENA                0x00000004
+#	define     R300_VPORT_Y_OFFSET_ENA               0x00000008
+#	define     R300_VPORT_Z_SCALE_ENA                0x00000010
+#	define     R300_VPORT_Z_OFFSET_ENA               0x00000020
+#	define     R300_VTX_XY_FMT                       0x00000100
+#	define     R300_VTX_Z_FMT                        0x00000200
+#	define     R300_VTX_W0_FMT                       0x00000400
+#	define     R300_VTX_W0_NORMALIZE                 0x00000800
+#	define     R300_VTX_ST_DENORMALIZED              0x00001000
+
+/* BEGIN: Vertex data assembly - lots of uncertainties */
+
+/* gap */
+
+#define R300_VAP_CNTL_STATUS              0x2140
+#	define R300_VC_NO_SWAP                  (0 << 0)
+#	define R300_VC_16BIT_SWAP               (1 << 0)
+#	define R300_VC_32BIT_SWAP               (2 << 0)
+#	define R300_VAP_TCL_BYPASS		(1 << 8)
+
+/* gap */
+
+/* Where do we get our vertex data?
+ *
+ * Vertex data comes either from immediate mode registers or from
+ * vertex arrays.
+ * There appears to be no mixed mode (though we can force the pitch of
+ * vertex arrays to 0, effectively reusing the same element over and over
+ * again).
+ *
+ * Immediate mode is controlled by the INPUT_CNTL registers. I am not sure
+ * if these registers influence vertex array processing.
+ *
+ * Vertex arrays are controlled via the 3D_LOAD_VBPNTR packet3.
+ *
+ * In both cases, vertex attributes are then passed through INPUT_ROUTE.
+ *
+ * Beginning with INPUT_ROUTE_0_0 is a list of WORDs that route vertex data
+ * into the vertex processor's input registers.
+ * The first word routes the first input, the second word the second, etc.
+ * The corresponding input is routed into the register with the given index.
+ * The list is ended by a word with INPUT_ROUTE_END set.
+ *
+ * Always set COMPONENTS_4 in immediate mode.
+ */
+
+#define R300_VAP_INPUT_ROUTE_0_0            0x2150
+#       define R300_INPUT_ROUTE_COMPONENTS_1     (0 << 0)
+#       define R300_INPUT_ROUTE_COMPONENTS_2     (1 << 0)
+#       define R300_INPUT_ROUTE_COMPONENTS_3     (2 << 0)
+#       define R300_INPUT_ROUTE_COMPONENTS_4     (3 << 0)
+#       define R300_INPUT_ROUTE_COMPONENTS_RGBA  (4 << 0) /* GUESS */
+#       define R300_VAP_INPUT_ROUTE_IDX_SHIFT    8
+#       define R300_VAP_INPUT_ROUTE_IDX_MASK     (31 << 8) /* GUESS */
+#       define R300_VAP_INPUT_ROUTE_END          (1 << 13)
+#       define R300_INPUT_ROUTE_IMMEDIATE_MODE   (0 << 14) /* GUESS */
+#       define R300_INPUT_ROUTE_FLOAT            (1 << 14) /* GUESS */
+#       define R300_INPUT_ROUTE_UNSIGNED_BYTE    (2 << 14) /* GUESS */
+#       define R300_INPUT_ROUTE_FLOAT_COLOR      (3 << 14) /* GUESS */
+#define R300_VAP_INPUT_ROUTE_0_1            0x2154
+#define R300_VAP_INPUT_ROUTE_0_2            0x2158
+#define R300_VAP_INPUT_ROUTE_0_3            0x215C
+#define R300_VAP_INPUT_ROUTE_0_4            0x2160
+#define R300_VAP_INPUT_ROUTE_0_5            0x2164
+#define R300_VAP_INPUT_ROUTE_0_6            0x2168
+#define R300_VAP_INPUT_ROUTE_0_7            0x216C
+
+/* gap */
+
+/* Notes:
+ *  - always set up to produce at least two attributes:
+ *    if vertex program uses only position, fglrx will set normal, too
+ *  - INPUT_CNTL_0_COLOR and INPUT_CNTL_COLOR bits are always equal.
+ */
+#define R300_VAP_INPUT_CNTL_0               0x2180
+#       define R300_INPUT_CNTL_0_COLOR           0x00000001
+#define R300_VAP_INPUT_CNTL_1               0x2184
+#       define R300_INPUT_CNTL_POS               0x00000001
+#       define R300_INPUT_CNTL_NORMAL            0x00000002
+#       define R300_INPUT_CNTL_COLOR             0x00000004
+#       define R300_INPUT_CNTL_TC0               0x00000400
+#       define R300_INPUT_CNTL_TC1               0x00000800
+#       define R300_INPUT_CNTL_TC2               0x00001000 /* GUESS */
+#       define R300_INPUT_CNTL_TC3               0x00002000 /* GUESS */
+#       define R300_INPUT_CNTL_TC4               0x00004000 /* GUESS */
+#       define R300_INPUT_CNTL_TC5               0x00008000 /* GUESS */
+#       define R300_INPUT_CNTL_TC6               0x00010000 /* GUESS */
+#       define R300_INPUT_CNTL_TC7               0x00020000 /* GUESS */
+
+/* gap */
+
+/* Words parallel to INPUT_ROUTE_0; All words that are active in INPUT_ROUTE_0
+ * are set to a swizzling bit pattern, other words are 0.
+ *
+ * In immediate mode, the pattern is always set to xyzw. In vertex array
+ * mode, the swizzling pattern is e.g. used to set zw components in texture
+ * coordinates with only two components.
+ */
+#define R300_VAP_INPUT_ROUTE_1_0            0x21E0
+#       define R300_INPUT_ROUTE_SELECT_X    0
+#       define R300_INPUT_ROUTE_SELECT_Y    1
+#       define R300_INPUT_ROUTE_SELECT_Z    2
+#       define R300_INPUT_ROUTE_SELECT_W    3
+#       define R300_INPUT_ROUTE_SELECT_ZERO 4
+#       define R300_INPUT_ROUTE_SELECT_ONE  5
+#       define R300_INPUT_ROUTE_SELECT_MASK 7
+#       define R300_INPUT_ROUTE_X_SHIFT     0
+#       define R300_INPUT_ROUTE_Y_SHIFT     3
+#       define R300_INPUT_ROUTE_Z_SHIFT     6
+#       define R300_INPUT_ROUTE_W_SHIFT     9
+#       define R300_INPUT_ROUTE_ENABLE      (15 << 12)
+#define R300_VAP_INPUT_ROUTE_1_1            0x21E4
+#define R300_VAP_INPUT_ROUTE_1_2            0x21E8
+#define R300_VAP_INPUT_ROUTE_1_3            0x21EC
+#define R300_VAP_INPUT_ROUTE_1_4            0x21F0
+#define R300_VAP_INPUT_ROUTE_1_5            0x21F4
+#define R300_VAP_INPUT_ROUTE_1_6            0x21F8
+#define R300_VAP_INPUT_ROUTE_1_7            0x21FC
+
+/* END: Vertex data assembly */
+
+/* gap */
+
+/* BEGIN: Upload vertex program and data */
+
+/*
+ * The programmable vertex shader unit has a memory bank of unknown size
+ * that can be written to in 16 byte units by writing the address into
+ * UPLOAD_ADDRESS, followed by data in UPLOAD_DATA (multiples of 4 DWORDs).
+ *
+ * Pointers into the memory bank are always in multiples of 16 bytes.
+ *
+ * The memory bank is divided into areas with fixed meaning.
+ *
+ * Starting at address UPLOAD_PROGRAM: Vertex program instructions.
+ * Native limits reported by drivers from ATI suggest size 256 (i.e. 4KB),
+ * whereas the difference between known addresses suggests size 512.
+ *
+ * Starting at address UPLOAD_PARAMETERS: Vertex program parameters.
+ * Native reported limits and the VPI layout suggest size 256, whereas
+ * difference between known addresses suggests size 512.
+ *
+ * At address UPLOAD_POINTSIZE is a vector (0, 0, ps, 0), where ps is the
+ * floating point pointsize. The exact purpose of this state is uncertain,
+ * as there is also the R300_RE_POINTSIZE register.
+ *
+ * Multiple vertex programs and parameter sets can be loaded at once,
+ * which could explain the size discrepancy.
+ */
+#define R300_VAP_PVS_UPLOAD_ADDRESS         0x2200
+#       define R300_PVS_UPLOAD_PROGRAM           0x00000000
+#       define R300_PVS_UPLOAD_PARAMETERS        0x00000200
+#       define R300_PVS_UPLOAD_CLIP_PLANE0       0x00000400
+#       define R300_PVS_UPLOAD_CLIP_PLANE1       0x00000401
+#       define R300_PVS_UPLOAD_CLIP_PLANE2       0x00000402
+#       define R300_PVS_UPLOAD_CLIP_PLANE3       0x00000403
+#       define R300_PVS_UPLOAD_CLIP_PLANE4       0x00000404
+#       define R300_PVS_UPLOAD_CLIP_PLANE5       0x00000405
+#       define R300_PVS_UPLOAD_POINTSIZE         0x00000406
+
+#       define R500_PVS_UPLOAD_CLIP_PLANE0       0x00000600
+#       define R500_PVS_UPLOAD_CLIP_PLANE1       0x00000601
+#       define R500_PVS_UPLOAD_CLIP_PLANE2       0x00000602
+#       define R500_PVS_UPLOAD_CLIP_PLANE3       0x00000603
+#       define R500_PVS_UPLOAD_CLIP_PLANE4       0x00000604
+#       define R500_PVS_UPLOAD_CLIP_PLANE5       0x00000605
+
+/* gap */
+
+#define R300_VAP_PVS_UPLOAD_DATA            0x2208
+
+/* END: Upload vertex program and data */
+
+/* gap */
+
+/* I do not know the purpose of this register. However, I do know that
+ * it is set to 221C_CLEAR for clear operations and to 221C_NORMAL
+ * for normal rendering.
+ *
+ * 2007-11-05: This register is the user clip plane control register, but there
+ * also seems to be a rendering mode control; the NORMAL/CLEAR defines.
+ *
+ * See bug #9871. http://bugs.freedesktop.org/attachment.cgi?id=10672&action=view
+ */
+#define R300_VAP_CLIP_CNTL                       0x221C
+#       define R300_221C_NORMAL                  0x00000000
+#       define R300_221C_CLEAR                   0x0001C000
+#define R300_VAP_UCP_ENABLE_0 (1 << 0)
+
+/* gap */
+
+/* Sometimes, END_OF_PKT and 0x2284=0 are the only commands sent between
+ * rendering commands and overwriting vertex program parameters.
+ * Therefore, I suspect writing zero to 0x2284 synchronizes the engine and
+ * avoids bugs caused by still running shaders reading bad data from memory.
+ */
+#define R300_VAP_PVS_WAITIDLE               0x2284 /* GUESS */
+
+/* Absolutely no clue what this register is about. */
+#define R300_VAP_UNKNOWN_2288               0x2288
+#       define R300_2288_R300                    0x00750000 /* -- nh */
+#       define R300_2288_RV350                   0x0000FFFF /* -- Vladimir */
+
+/* gap */
+
+/* Addresses are relative to the vertex program instruction area of the
+ * memory bank. PROGRAM_END points to the last instruction of the active
+ * program.
+ *
+ * The meaning of the two UNKNOWN fields is obviously not known. However,
+ * experiments so far have shown that both *must* point to an instruction
+ * inside the vertex program, otherwise the GPU locks up.
+ * fglrx usually sets CNTL_3_UNKNOWN to the end of the program and
+ * CNTL_1_UNKNOWN (presumably the POS_END field below) to the instruction
+ * where the last write to position takes place.
+ * Most likely this is used to ignore the rest of the program in cases
+ * where a group of verts isn't visible. For some reason this "section"
+ * is sometimes accepted for other instructions that have no relationship
+ * with position calculations.
+ */
+#define R300_VAP_PVS_CNTL_1                 0x22D0
+#       define R300_PVS_CNTL_1_PROGRAM_START_SHIFT   0
+#       define R300_PVS_CNTL_1_POS_END_SHIFT         10
+#       define R300_PVS_CNTL_1_PROGRAM_END_SHIFT     20
+/* Addresses are relative to the vertex program parameters area. */
+#define R300_VAP_PVS_CNTL_2                 0x22D4
+#       define R300_PVS_CNTL_2_PARAM_OFFSET_SHIFT 0
+#       define R300_PVS_CNTL_2_PARAM_COUNT_SHIFT  16
+#define R300_VAP_PVS_CNTL_3	           0x22D8
+#       define R300_PVS_CNTL_3_PROGRAM_UNKNOWN_SHIFT 10
+#       define R300_PVS_CNTL_3_PROGRAM_UNKNOWN2_SHIFT 0
+
+/* The entire range from 0x2300 to 0x24AC inclusive seems to be used for
+ * immediate vertices
+ */
+#define R300_VAP_VTX_COLOR_R                0x2464
+#define R300_VAP_VTX_COLOR_G                0x2468
+#define R300_VAP_VTX_COLOR_B                0x246C
+#define R300_VAP_VTX_POS_0_X_1              0x2490 /* used for glVertex2*() */
+#define R300_VAP_VTX_POS_0_Y_1              0x2494
+#define R300_VAP_VTX_COLOR_PKD              0x249C /* RGBA */
+#define R300_VAP_VTX_POS_0_X_2              0x24A0 /* used for glVertex3*() */
+#define R300_VAP_VTX_POS_0_Y_2              0x24A4
+#define R300_VAP_VTX_POS_0_Z_2              0x24A8
+/* write 0 to indicate end of packet? */
+#define R300_VAP_VTX_END_OF_PKT             0x24AC
+
+/* gap */
+
+/* These are values from r300_reg/r300_reg.h - they are known to be correct
+ * and are here so we can use one register file instead of several
+ * - Vladimir
+ */
+#define R300_GB_VAP_RASTER_VTX_FMT_0	0x4000
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__POS_PRESENT	(1<<0)
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_0_PRESENT	(1<<1)
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_1_PRESENT	(1<<2)
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_2_PRESENT	(1<<3)
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_3_PRESENT	(1<<4)
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_SPACE	(0xf<<5)
+#	define R300_GB_VAP_RASTER_VTX_FMT_0__PT_SIZE_PRESENT	(0x1<<16)
+
+#define R300_GB_VAP_RASTER_VTX_FMT_1	0x4004
+	/* each of the following is 3 bits wide, specifies number
+	   of components */
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_0_COMP_CNT_SHIFT	0
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_1_COMP_CNT_SHIFT	3
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_2_COMP_CNT_SHIFT	6
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_3_COMP_CNT_SHIFT	9
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_4_COMP_CNT_SHIFT	12
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_5_COMP_CNT_SHIFT	15
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_6_COMP_CNT_SHIFT	18
+#	define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_7_COMP_CNT_SHIFT	21
+
+/* UNK30 seems to enable point to quad transformation on textures
+ * (or something closely related to that).
+ * This bit is rather fatal at the time being due to shortcomings on the
+ * pixel shader side. NOTE(review): only R300_GB_UNK31 is defined below.
+ */
+#define R300_GB_ENABLE	0x4008
+#	define R300_GB_POINT_STUFF_ENABLE	(1<<0)
+#	define R300_GB_LINE_STUFF_ENABLE	(1<<1)
+#	define R300_GB_TRIANGLE_STUFF_ENABLE	(1<<2)
+#	define R300_GB_STENCIL_AUTO_ENABLE	(1<<4)
+#	define R300_GB_UNK31			(1u<<31) /* 1u: shifting a signed 1 into bit 31 is undefined */
+	/* each of the following is 2 bits wide */
+#define R300_GB_TEX_REPLICATE	0
+#define R300_GB_TEX_ST		1
+#define R300_GB_TEX_STR		2
+#	define R300_GB_TEX0_SOURCE_SHIFT	16
+#	define R300_GB_TEX1_SOURCE_SHIFT	18
+#	define R300_GB_TEX2_SOURCE_SHIFT	20
+#	define R300_GB_TEX3_SOURCE_SHIFT	22
+#	define R300_GB_TEX4_SOURCE_SHIFT	24
+#	define R300_GB_TEX5_SOURCE_SHIFT	26
+#	define R300_GB_TEX6_SOURCE_SHIFT	28
+#	define R300_GB_TEX7_SOURCE_SHIFT	30
+
+/* MSPOS - positions for multisample antialiasing (?) */
+#define R300_GB_MSPOS0	0x4010
+	/* shifts - each of the fields is 4 bits */
+#	define R300_GB_MSPOS0__MS_X0_SHIFT	0
+#	define R300_GB_MSPOS0__MS_Y0_SHIFT	4
+#	define R300_GB_MSPOS0__MS_X1_SHIFT	8
+#	define R300_GB_MSPOS0__MS_Y1_SHIFT	12
+#	define R300_GB_MSPOS0__MS_X2_SHIFT	16
+#	define R300_GB_MSPOS0__MS_Y2_SHIFT	20
+#	define R300_GB_MSPOS0__MSBD0_Y		24
+#	define R300_GB_MSPOS0__MSBD0_X		28
+
+#define R300_GB_MSPOS1	0x4014
+#	define R300_GB_MSPOS1__MS_X3_SHIFT	0
+#	define R300_GB_MSPOS1__MS_Y3_SHIFT	4
+#	define R300_GB_MSPOS1__MS_X4_SHIFT	8
+#	define R300_GB_MSPOS1__MS_Y4_SHIFT	12
+#	define R300_GB_MSPOS1__MS_X5_SHIFT	16
+#	define R300_GB_MSPOS1__MS_Y5_SHIFT	20
+#	define R300_GB_MSPOS1__MSBD1		24
+
+
+#define R300_GB_TILE_CONFIG	0x4018
+#	define R300_GB_TILE_ENABLE	(1<<0)
+#	define R300_GB_TILE_PIPE_COUNT_RV300	0
+#	define R300_GB_TILE_PIPE_COUNT_R300	(3<<1)
+#	define R300_GB_TILE_PIPE_COUNT_R420	(7<<1)
+#	define R300_GB_TILE_PIPE_COUNT_RV410	(3<<1)
+#	define R300_GB_TILE_SIZE_8		0
+#	define R300_GB_TILE_SIZE_16		(1<<4)
+#	define R300_GB_TILE_SIZE_32		(2<<4)
+#	define R300_GB_SUPER_SIZE_1		(0<<6)
+#	define R300_GB_SUPER_SIZE_2		(1<<6)
+#	define R300_GB_SUPER_SIZE_4		(2<<6)
+#	define R300_GB_SUPER_SIZE_8		(3<<6)
+#	define R300_GB_SUPER_SIZE_16		(4<<6)
+#	define R300_GB_SUPER_SIZE_32		(5<<6)
+#	define R300_GB_SUPER_SIZE_64		(6<<6)
+#	define R300_GB_SUPER_SIZE_128		(7<<6)
+#	define R300_GB_SUPER_X_SHIFT		9	/* 3 bits wide */
+#	define R300_GB_SUPER_Y_SHIFT		12	/* 3 bits wide */
+#	define R300_GB_SUPER_TILE_A		0
+#	define R300_GB_SUPER_TILE_B		(1<<15)
+#	define R300_GB_SUBPIXEL_1_12		0
+#	define R300_GB_SUBPIXEL_1_16		(1<<16)
+
+#define R300_GB_FIFO_SIZE	0x4024
+	/* each of the following is 2 bits wide */
+#define R300_GB_FIFO_SIZE_32	0
+#define R300_GB_FIFO_SIZE_64	1
+#define R300_GB_FIFO_SIZE_128	2
+#define R300_GB_FIFO_SIZE_256	3
+#	define R300_SC_IFIFO_SIZE_SHIFT	0
+#	define R300_SC_TZFIFO_SIZE_SHIFT	2
+#	define R300_SC_BFIFO_SIZE_SHIFT	4
+
+#	define R300_US_OFIFO_SIZE_SHIFT	12
+#	define R300_US_WFIFO_SIZE_SHIFT	14
+	/* the following use the same constants as above, but the meaning
+	   is times 2 (i.e. instead of 32 words it means 64) */
+#	define R300_RS_TFIFO_SIZE_SHIFT	6
+#	define R300_RS_CFIFO_SIZE_SHIFT	8
+#	define R300_US_RAM_SIZE_SHIFT		10
+	/* watermarks, 3 bits wide */
+#	define R300_RS_HIGHWATER_COL_SHIFT	16
+#	define R300_RS_HIGHWATER_TEX_SHIFT	19
+#	define R300_OFIFO_HIGHWATER_SHIFT	22	/* two bits only */
+#	define R300_CUBE_FIFO_HIGHWATER_COL_SHIFT	24
+
+#define R300_GB_SELECT	0x401C
+#	define R300_GB_FOG_SELECT_C0A		0
+#	define R300_GB_FOG_SELECT_C1A		1
+#	define R300_GB_FOG_SELECT_C2A		2
+#	define R300_GB_FOG_SELECT_C3A		3
+#	define R300_GB_FOG_SELECT_1_1_W	4
+#	define R300_GB_FOG_SELECT_Z		5
+#	define R300_GB_DEPTH_SELECT_Z		0
+#	define R300_GB_DEPTH_SELECT_1_1_W	(1<<3)
+#	define R300_GB_W_SELECT_1_W		0
+#	define R300_GB_W_SELECT_1		(1<<4)
+
+#define R300_GB_AA_CONFIG		0x4020
+#	define R300_AA_DISABLE			0x00
+#	define R300_AA_ENABLE			0x01
+#	define R300_AA_SUBSAMPLES_2		0
+#	define R300_AA_SUBSAMPLES_3		(1<<1)
+#	define R300_AA_SUBSAMPLES_4		(2<<1)
+#	define R300_AA_SUBSAMPLES_6		(3<<1)
+
+/* gap */
+
+/* Zero to flush caches. */
+#define R300_TX_CNTL                        0x4100
+#define R300_TX_FLUSH                       0x0
+
+/* The upper enable bits are guessed, based on fglrx reported limits. */
+#define R300_TX_ENABLE                      0x4104
+#       define R300_TX_ENABLE_0                  (1 << 0)
+#       define R300_TX_ENABLE_1                  (1 << 1)
+#       define R300_TX_ENABLE_2                  (1 << 2)
+#       define R300_TX_ENABLE_3                  (1 << 3)
+#       define R300_TX_ENABLE_4                  (1 << 4)
+#       define R300_TX_ENABLE_5                  (1 << 5)
+#       define R300_TX_ENABLE_6                  (1 << 6)
+#       define R300_TX_ENABLE_7                  (1 << 7)
+#       define R300_TX_ENABLE_8                  (1 << 8)
+#       define R300_TX_ENABLE_9                  (1 << 9)
+#       define R300_TX_ENABLE_10                 (1 << 10)
+#       define R300_TX_ENABLE_11                 (1 << 11)
+#       define R300_TX_ENABLE_12                 (1 << 12)
+#       define R300_TX_ENABLE_13                 (1 << 13)
+#       define R300_TX_ENABLE_14                 (1 << 14)
+#       define R300_TX_ENABLE_15                 (1 << 15)
+
+/* The pointsize is given in multiples of 6, hence POINTSIZE_MAX is the
+ * field mask divided by 6. The pointsize can be enormous: Clear() renders
+ * a single point that fills the entire framebuffer.
+ */
+#define R300_RE_POINTSIZE                   0x421C
+#       define R300_POINTSIZE_Y_SHIFT            0
+#       define R300_POINTSIZE_Y_MASK             (0xFFFF << 0) /* GUESS */
+#       define R300_POINTSIZE_X_SHIFT            16
+#       define R300_POINTSIZE_X_MASK             (0xFFFFu << 16) /* GUESS; 'u' avoids signed overflow into bit 31 */
+#       define R300_POINTSIZE_MAX             (R300_POINTSIZE_Y_MASK / 6)
+
+/* The line width is given in multiples of 6.
+ * In default mode lines are classified as vertical lines.
+ * HO: horizontal
+ * VE: vertical or horizontal
+ * HO & VE: no classification
+ */
+#define R300_RE_LINE_CNT                      0x4234
+#       define R300_LINESIZE_SHIFT            0
+#       define R300_LINESIZE_MASK             (0xFFFF << 0) /* GUESS */
+#       define R300_LINESIZE_MAX             (R300_LINESIZE_MASK / 6)
+#       define R300_LINE_CNT_HO               (1 << 16)
+#       define R300_LINE_CNT_VE               (1 << 17)
+
+/* Some sort of scale or clamp value for texcoordless textures. */
+#define R300_RE_UNK4238                       0x4238
+
+/* Something shade related */
+#define R300_RE_SHADE                         0x4274
+
+#define R300_RE_SHADE_MODEL                   0x4278
+#	define R300_RE_SHADE_MODEL_SMOOTH     0x3aaaa
+#	define R300_RE_SHADE_MODEL_FLAT       0x39595
+
+/* Dangerous */
+#define R300_RE_POLYGON_MODE                  0x4288
+#	define R300_PM_ENABLED                (1 << 0)
+#	define R300_PM_FRONT_POINT            (0 << 0)
+#	define R300_PM_BACK_POINT             (0 << 0)
+#	define R300_PM_FRONT_LINE             (1 << 4)
+#	define R300_PM_FRONT_FILL             (1 << 5)
+#	define R300_PM_BACK_LINE              (1 << 7)
+#	define R300_PM_BACK_FILL              (1 << 8)
+
+/* Fog parameters */
+#define R300_RE_FOG_SCALE                     0x4294
+#define R300_RE_FOG_START                     0x4298
+
+/* Not sure why there are duplicates of the factor and constant values.
+ * My best guess so far is that there are separate zbiases for test and write
+ * (presumably the _T_ and _W_ registers below). Ordering might be wrong.
+ * Some of the tests indicate that fgl has a fallback implementation of zbias
+ * via pixel shaders.
+ */
+#define R300_RE_ZBIAS_CNTL                    0x42A0 /* GUESS */
+#define R300_RE_ZBIAS_T_FACTOR                0x42A4
+#define R300_RE_ZBIAS_T_CONSTANT              0x42A8
+#define R300_RE_ZBIAS_W_FACTOR                0x42AC
+#define R300_RE_ZBIAS_W_CONSTANT              0x42B0
+
+/* This register needs to be set to (1<<1), i.e. R300_OCCLUSION_ON, for
+ * RV350 to correctly perform depth test (see --vb-triangles in r300_demo).
+ * Don't know about other chips. - Vladimir
+ * This is set to 3 when GL_POLYGON_OFFSET_FILL is on.
+ * My guess is that there are two bits for each zbias primitive
+ * (FILL, LINE, POINT):
+ *  one to enable depth test and one for depth write.
+ * Yet this doesn't explain why depth writes work ...
+ */
+#define R300_RE_OCCLUSION_CNTL		    0x42B4
+#	define R300_OCCLUSION_ON		(1<<1)
+
+#define R300_RE_CULL_CNTL                   0x42B8
+#       define R300_CULL_FRONT                   (1 << 0)
+#       define R300_CULL_BACK                    (1 << 1)
+#       define R300_FRONT_FACE_CCW               (0 << 2)
+#       define R300_FRONT_FACE_CW                (1 << 2)
+
+
+/* BEGIN: Rasterization / Interpolators - many guesses */
+
+/* 0_UNKNOWN_18 has always been set except for clear operations.
+ * TC_CNT is the number of incoming texture coordinate sets (i.e. it depends
+ * on the vertex program, *not* the fragment program)
+ */
+#define R300_RS_CNTL_0                      0x4300
+#       define R300_RS_CNTL_TC_CNT_SHIFT         2
+#       define R300_RS_CNTL_TC_CNT_MASK          (7 << 2)
+	/* number of color interpolators used */
+#	define R300_RS_CNTL_CI_CNT_SHIFT         7
+#       define R300_RS_CNTL_0_UNKNOWN_18         (1 << 18)
+	/* Guess: RS_CNTL_1 holds the index of the highest used RS_ROUTE_n
+	   register. */
+#define R300_RS_CNTL_1                      0x4304
+
+/* gap */
+
+/* Only used for texture coordinates.
+ * Use the source field to route texture coordinate input from the
+ * vertex program to the desired interpolator. Note that the source
+ * field is relative to the outputs the vertex program *actually*
+ * writes. If a vertex program only writes texcoord[1], this will
+ * be source index 0.
+ * Set INTERP_USED on all interpolators that produce data used by
+ * the fragment program. INTERP_USED looks like a swizzling mask,
+ * but I haven't seen it used that way.
+ *
+ * Note: The _UNKNOWN constants are always set in their respective
+ * register. I don't know if this is necessary.
+ */
+#define R300_RS_INTERP_0                    0x4310
+#define R300_RS_INTERP_1                    0x4314
+#       define R300_RS_INTERP_1_UNKNOWN          0x40
+#define R300_RS_INTERP_2                    0x4318
+#       define R300_RS_INTERP_2_UNKNOWN          0x80
+#define R300_RS_INTERP_3                    0x431C
+#       define R300_RS_INTERP_3_UNKNOWN          0xC0
+#define R300_RS_INTERP_4                    0x4320
+#define R300_RS_INTERP_5                    0x4324
+#define R300_RS_INTERP_6                    0x4328
+#define R300_RS_INTERP_7                    0x432C
+#       define R300_RS_INTERP_SRC_SHIFT          2
+#       define R300_RS_INTERP_SRC_MASK           (7 << 2)
+#       define R300_RS_INTERP_USED               0x00D10000
+
+/* These DWORDs control how vertex data is routed into fragment program
+ * registers, after interpolators.
+ */
+#define R300_RS_ROUTE_0                     0x4330
+#define R300_RS_ROUTE_1                     0x4334
+#define R300_RS_ROUTE_2                     0x4338
+#define R300_RS_ROUTE_3                     0x433C /* GUESS */
+#define R300_RS_ROUTE_4                     0x4340 /* GUESS */
+#define R300_RS_ROUTE_5                     0x4344 /* GUESS */
+#define R300_RS_ROUTE_6                     0x4348 /* GUESS */
+#define R300_RS_ROUTE_7                     0x434C /* GUESS */
+#       define R300_RS_ROUTE_SOURCE_INTERP_0     0
+#       define R300_RS_ROUTE_SOURCE_INTERP_1     1
+#       define R300_RS_ROUTE_SOURCE_INTERP_2     2
+#       define R300_RS_ROUTE_SOURCE_INTERP_3     3
+#       define R300_RS_ROUTE_SOURCE_INTERP_4     4
+#       define R300_RS_ROUTE_SOURCE_INTERP_5     5 /* GUESS */
+#       define R300_RS_ROUTE_SOURCE_INTERP_6     6 /* GUESS */
+#       define R300_RS_ROUTE_SOURCE_INTERP_7     7 /* GUESS */
+#       define R300_RS_ROUTE_ENABLE              (1 << 3) /* GUESS */
+#       define R300_RS_ROUTE_DEST_SHIFT          6
+#       define R300_RS_ROUTE_DEST_MASK           (31 << 6) /* GUESS */
+
+/* Special handling for color: When the fragment program uses color,
+ * the ROUTE_0_COLOR bit is set and ROUTE_0_COLOR_DEST contains the
+ * color register index.
+ *
+ * Apparently you may set the R300_RS_ROUTE_0_COLOR bit, but not provide any
+ * R300_RS_ROUTE_0_COLOR_DEST value; this setup is used for clearing the state.
+ * See r300_ioctl.c:r300EmitClearState. I'm not sure if this setup is strictly
+ * correct or not. - Oliver.
+ */
+#       define R300_RS_ROUTE_0_COLOR             (1 << 14)
+#       define R300_RS_ROUTE_0_COLOR_DEST_SHIFT  17
+#       define R300_RS_ROUTE_0_COLOR_DEST_MASK   (31 << 17) /* GUESS */
+/* As above, but for secondary color */
+#		define R300_RS_ROUTE_1_COLOR1            (1 << 14)
+#		define R300_RS_ROUTE_1_COLOR1_DEST_SHIFT 17
+#		define R300_RS_ROUTE_1_COLOR1_DEST_MASK  (31 << 17)
+#		define R300_RS_ROUTE_1_UNKNOWN11         (1 << 11)
+/* END: Rasterization / Interpolators - many guesses */
+
+/* BEGIN: Scissors and cliprects */
+
+/* There are four clipping rectangles. Their corner coordinates are inclusive.
+ * Every pixel is assigned a number from 0 to 15 by setting bits 0-3 depending
+ * on whether the pixel is inside cliprects 0-3, respectively. For example,
+ * if a pixel is inside cliprects 0 and 1, but outside 2 and 3, it is assigned
+ * the number 3 (binary 0011).
+ * Iff the bit corresponding to the pixel's number in RE_CLIPRECT_CNTL is set,
+ * the pixel is rasterized.
+ *
+ * In addition to this, there is a scissors rectangle. Only pixels inside the
+ * scissors rectangle are drawn. (coordinates are inclusive)
+ *
+ * For some reason, the top-left corner of the framebuffer is at (1440, 1440)
+ * for the purpose of clipping and scissors.
+ */
+#define R300_RE_CLIPRECT_TL_0               0x43B0
+#define R300_RE_CLIPRECT_BR_0               0x43B4
+#define R300_RE_CLIPRECT_TL_1               0x43B8
+#define R300_RE_CLIPRECT_BR_1               0x43BC
+#define R300_RE_CLIPRECT_TL_2               0x43C0
+#define R300_RE_CLIPRECT_BR_2               0x43C4
+#define R300_RE_CLIPRECT_TL_3               0x43C8
+#define R300_RE_CLIPRECT_BR_3               0x43CC
+#       define R300_CLIPRECT_OFFSET              1440
+#       define R300_CLIPRECT_MASK                0x1FFF
+#       define R300_CLIPRECT_X_SHIFT             0
+#       define R300_CLIPRECT_X_MASK              (0x1FFF << 0)
+#       define R300_CLIPRECT_Y_SHIFT             13
+#       define R300_CLIPRECT_Y_MASK              (0x1FFF << 13)
+#define R300_RE_CLIPRECT_CNTL               0x43D0
+#       define R300_CLIP_OUT                     (1 << 0)
+#       define R300_CLIP_0                       (1 << 1)
+#       define R300_CLIP_1                       (1 << 2)
+#       define R300_CLIP_10                      (1 << 3)
+#       define R300_CLIP_2                       (1 << 4)
+#       define R300_CLIP_20                      (1 << 5)
+#       define R300_CLIP_21                      (1 << 6)
+#       define R300_CLIP_210                     (1 << 7)
+#       define R300_CLIP_3                       (1 << 8)
+#       define R300_CLIP_30                      (1 << 9)
+#       define R300_CLIP_31                      (1 << 10)
+#       define R300_CLIP_310                     (1 << 11)
+#       define R300_CLIP_32                      (1 << 12)
+#       define R300_CLIP_320                     (1 << 13)
+#       define R300_CLIP_321                     (1 << 14)
+#       define R300_CLIP_3210                    (1 << 15)
+
+/* gap */
+
+#define R300_RE_SCISSORS_TL                 0x43E0
+#define R300_RE_SCISSORS_BR                 0x43E4
+#       define R300_SCISSORS_OFFSET              1440
+#       define R300_SCISSORS_X_SHIFT             0
+#       define R300_SCISSORS_X_MASK              (0x1FFF << 0)
+#       define R300_SCISSORS_Y_SHIFT             13
+#       define R300_SCISSORS_Y_MASK              (0x1FFF << 13)
+/* END: Scissors and cliprects */
+
+/* BEGIN: Texture specification */
+
+/*
+ * The texture specification dwords are grouped by meaning and not by texture
+ * unit. This means that e.g. the offset for texture image unit N is found in
+ * register TX_OFFSET_0 + (4*N)
+ */
+#define R300_TX_FILTER_0                    0x4400
+#       define R300_TX_REPEAT                    0
+#       define R300_TX_MIRRORED                  1
+#       define R300_TX_CLAMP                     4
+#       define R300_TX_CLAMP_TO_EDGE             2
+#       define R300_TX_CLAMP_TO_BORDER           6
+#       define R300_TX_WRAP_S_SHIFT              0
+#       define R300_TX_WRAP_S_MASK               (7 << 0)
+#       define R300_TX_WRAP_T_SHIFT              3
+#       define R300_TX_WRAP_T_MASK               (7 << 3)
+#       define R300_TX_WRAP_Q_SHIFT              6
+#       define R300_TX_WRAP_Q_MASK               (7 << 6)
+#       define R300_TX_MAG_FILTER_NEAREST        (1 << 9)
+#       define R300_TX_MAG_FILTER_LINEAR         (2 << 9)
+#       define R300_TX_MAG_FILTER_MASK           (3 << 9)
+#       define R300_TX_MIN_FILTER_NEAREST        (1 << 11)
+#       define R300_TX_MIN_FILTER_LINEAR         (2 << 11)
+#	define R300_TX_MIN_FILTER_NEAREST_MIP_NEAREST       (5  <<  11)
+#	define R300_TX_MIN_FILTER_NEAREST_MIP_LINEAR        (9  <<  11)
+#	define R300_TX_MIN_FILTER_LINEAR_MIP_NEAREST        (6  <<  11)
+#	define R300_TX_MIN_FILTER_LINEAR_MIP_LINEAR         (10 <<  11)
+
+/* NOTE: NEAREST doesn't seem to exist.
+ * I'm not setting MAG_FILTER_MASK and (3 << 11) on for all
+ * anisotropy modes because that would void the selected mag filter
+ */
+#	define R300_TX_MIN_FILTER_ANISO_NEAREST             (0 << 13)
+#	define R300_TX_MIN_FILTER_ANISO_LINEAR              (0 << 13)
+#	define R300_TX_MIN_FILTER_ANISO_NEAREST_MIP_NEAREST (1 << 13)
+#	define R300_TX_MIN_FILTER_ANISO_NEAREST_MIP_LINEAR  (2 << 13)
+#       define R300_TX_MIN_FILTER_MASK   ( (15 << 11) | (3 << 13) )
+#	define R300_TX_MAX_ANISO_1_TO_1  (0 << 21)
+#	define R300_TX_MAX_ANISO_2_TO_1  (2 << 21)
+#	define R300_TX_MAX_ANISO_4_TO_1  (4 << 21)
+#	define R300_TX_MAX_ANISO_8_TO_1  (6 << 21)
+#	define R300_TX_MAX_ANISO_16_TO_1 (8 << 21)
+#	define R300_TX_MAX_ANISO_MASK    (14 << 21)
+
+#define R300_TX_FILTER1_0                      0x4440
+#	define R300_CHROMA_KEY_MODE_DISABLE    0
+#	define R300_CHROMA_KEY_FORCE	       1
+#	define R300_CHROMA_KEY_BLEND           2
+#	define R300_MC_ROUND_NORMAL            (0<<2)
+#	define R300_MC_ROUND_MPEG4             (1<<2)
+#	define R300_LOD_BIAS_MASK	    0x1fff
+#	define R300_EDGE_ANISO_EDGE_DIAG       (0<<13)
+#	define R300_EDGE_ANISO_EDGE_ONLY       (1<<13)
+#	define R300_MC_COORD_TRUNCATE_DISABLE  (0<<14)
+#	define R300_MC_COORD_TRUNCATE_MPEG     (1<<14)
+#	define R300_TX_TRI_PERF_0_8            (0<<15)
+#	define R300_TX_TRI_PERF_1_8            (1<<15)
+#	define R300_TX_TRI_PERF_1_4            (2<<15)
+#	define R300_TX_TRI_PERF_3_8            (3<<15)
+#	define R300_ANISO_THRESHOLD_MASK       (7<<17)
+
+#define R300_TX_SIZE_0                      0x4480 /* width/height/mip-level fields below */
+#       define R300_TX_WIDTHMASK_SHIFT           0
+#       define R300_TX_WIDTHMASK_MASK            (2047 << 0)
+#       define R300_TX_HEIGHTMASK_SHIFT          11
+#       define R300_TX_HEIGHTMASK_MASK           (2047 << 11)
+#       define R300_TX_UNK23                     (1 << 23)
+#       define R300_TX_MAX_MIP_LEVEL_SHIFT       26
+#       define R300_TX_MAX_MIP_LEVEL_MASK        (0xf << 26)
+#       define R300_TX_SIZE_PROJECTED            (1<<30)
+#       define R300_TX_SIZE_TXPITCH_EN           (1u<<31) /* 1u: shifting a signed 1 into bit 31 is undefined */
+#define R300_TX_FORMAT_0                    0x44C0
+	/* The interpretation of the format word by Wladimir van der Laan */
+	/* The X, Y, Z and W refer to the layout of the components.
+	   They are given meanings as R, G, B and Alpha by the swizzle
+	   specification */
+#	define R300_TX_FORMAT_X8		    0x0
+#	define R300_TX_FORMAT_X16		    0x1
+#	define R300_TX_FORMAT_Y4X4		    0x2
+#	define R300_TX_FORMAT_Y8X8		    0x3
+#	define R300_TX_FORMAT_Y16X16		    0x4
+#	define R300_TX_FORMAT_Z3Y3X2		    0x5
+#	define R300_TX_FORMAT_Z5Y6X5		    0x6
+#	define R300_TX_FORMAT_Z6Y5X5		    0x7
+#	define R300_TX_FORMAT_Z11Y11X10		    0x8
+#	define R300_TX_FORMAT_Z10Y11X11		    0x9
+#	define R300_TX_FORMAT_W4Z4Y4X4		    0xA
+#	define R300_TX_FORMAT_W1Z5Y5X5		    0xB
+#	define R300_TX_FORMAT_W8Z8Y8X8		    0xC
+#	define R300_TX_FORMAT_W2Z10Y10X10	    0xD
+#	define R300_TX_FORMAT_W16Z16Y16X16	    0xE
+#	define R300_TX_FORMAT_DXT1	    	    0xF
+#	define R300_TX_FORMAT_DXT3	    	    0x10
+#	define R300_TX_FORMAT_DXT5	    	    0x11
+#	define R300_TX_FORMAT_D3DMFT_CxV8U8	    0x12     /* no swizzle */
+#	define R300_TX_FORMAT_A8R8G8B8	    	    0x13     /* no swizzle */
+#	define R300_TX_FORMAT_B8G8_B8G8	    	    0x14     /* no swizzle */
+#	define R300_TX_FORMAT_G8R8_G8B8	    	    0x15     /* no swizzle */
+	/* 0x16 - some 16 bit green format.. ?? */
+#	define R300_TX_FORMAT_UNK25		   (1 << 25) /* no swizzle */
+#	define R300_TX_FORMAT_CUBIC_MAP		   (1 << 26)
+
+	/* gap */
+	/* Floating point formats */
+	/* Note - hardware supports both 16 and 32 bit floating point */
+#	define R300_TX_FORMAT_FL_I16	    	    0x18
+#	define R300_TX_FORMAT_FL_I16A16	    	    0x19
+#	define R300_TX_FORMAT_FL_R16G16B16A16	    0x1A
+#	define R300_TX_FORMAT_FL_I32	    	    0x1B
+#	define R300_TX_FORMAT_FL_I32A32	    	    0x1C
+#	define R300_TX_FORMAT_FL_R32G32B32A32	    0x1D
+	/* alpha modes, convenience mostly */
+	/* if you have alpha, pick constant appropriate to the
+	   number of channels (1 for I8, 2 for I8A8, 4 for R8G8B8A8, etc.) */
+# 	define R300_TX_FORMAT_ALPHA_1CH		    0x000
+# 	define R300_TX_FORMAT_ALPHA_2CH		    0x200
+# 	define R300_TX_FORMAT_ALPHA_4CH		    0x600
+# 	define R300_TX_FORMAT_ALPHA_NONE	    0xA00
+	/* Swizzling */
+	/* constants */
+#	define R300_TX_FORMAT_X		0
+#	define R300_TX_FORMAT_Y		1
+#	define R300_TX_FORMAT_Z		2
+#	define R300_TX_FORMAT_W		3
+#	define R300_TX_FORMAT_ZERO	4
+#	define R300_TX_FORMAT_ONE	5
+	/* 2.0*Z, everything above 1.0 is set to 0.0 */
+#	define R300_TX_FORMAT_CUT_Z	6
+	/* 2.0*W, everything above 1.0 is set to 0.0 */
+#	define R300_TX_FORMAT_CUT_W	7
+
+#	define R300_TX_FORMAT_B_SHIFT	18
+#	define R300_TX_FORMAT_G_SHIFT	15
+#	define R300_TX_FORMAT_R_SHIFT	12
+#	define R300_TX_FORMAT_A_SHIFT	9
+	/* Builds a format word: FMT layout ORed with the four swizzle selects */
+#	define R300_EASY_TX_FORMAT(B, G, R, A, FMT)	(		\
+		(R300_TX_FORMAT_##FMT)					\
+		| ((R300_TX_FORMAT_##A) << R300_TX_FORMAT_A_SHIFT)	\
+		| ((R300_TX_FORMAT_##R) << R300_TX_FORMAT_R_SHIFT)	\
+		| ((R300_TX_FORMAT_##G) << R300_TX_FORMAT_G_SHIFT)	\
+		| ((R300_TX_FORMAT_##B) << R300_TX_FORMAT_B_SHIFT)	\
+		)
+	/* These can be ORed with result of R300_EASY_TX_FORMAT()
+	   We don't really know what they do. Take values from a
+           constant color ? */
+#	define R300_TX_FORMAT_CONST_X		(1<<5)
+#	define R300_TX_FORMAT_CONST_Y		(2<<5)
+#	define R300_TX_FORMAT_CONST_Z		(4<<5)
+#	define R300_TX_FORMAT_CONST_W		(8<<5)
+
+#	define R300_TX_FORMAT_YUV_MODE		0x00800000
+
+#define R300_TX_PITCH_0			    0x4500 /* obvious missing in gap */
+#define R300_TX_OFFSET_0                    0x4540
+	/* BEGIN: Guess from R200 */
+#       define R300_TXO_ENDIAN_NO_SWAP           (0 << 0)
+#       define R300_TXO_ENDIAN_BYTE_SWAP         (1 << 0)
+#       define R300_TXO_ENDIAN_WORD_SWAP         (2 << 0)
+#       define R300_TXO_ENDIAN_HALFDW_SWAP       (3 << 0)
+#       define R300_TXO_MACRO_TILE               (1 << 2)
+#       define R300_TXO_MICRO_TILE               (1 << 3)
+#       define R300_TXO_OFFSET_MASK              0xffffffe0
+#       define R300_TXO_OFFSET_SHIFT             5
+	/* END: Guess from R200 */
+
+/* 32 bit chroma key */
+#define R300_TX_CHROMA_KEY_0                      0x4580
+/* ff00ff00 == { 0, 1.0, 0, 1.0 } */
+#define R300_TX_BORDER_COLOR_0              0x45C0
+
+/* END: Texture specification */
+
+/* BEGIN: Fragment program instruction set */
+
+/* Fragment programs are written directly into register space.
+ * There are separate instruction streams for texture instructions and ALU
+ * instructions.
+ * In order to synchronize these streams, the program is divided into up
+ * to 4 nodes. Each node begins with a number of TEX operations, followed
+ * by a number of ALU operations.
+ * Constraints on the nodes:
+ *  - the first node can have zero TEX ops,
+ *  - all subsequent nodes must have at least one TEX op,
+ *  - all nodes must have at least one ALU op.
+ *
+ * The index of the last node is stored in PFS_CNTL_0: A value of 0 means
+ * 1 node, a value of 3 means 4 nodes.
+ * The total amount of instructions is defined in PFS_CNTL_2. The offsets are
+ * offsets into the respective instruction streams, while *_END points to the
+ * last instruction relative to this offset.
+ */
+#define R300_PFS_CNTL_0                     0x4600
+#       define R300_PFS_CNTL_LAST_NODES_SHIFT    0
+#       define R300_PFS_CNTL_LAST_NODES_MASK     (3 << 0)
+#       define R300_PFS_CNTL_FIRST_NODE_HAS_TEX  (1 << 3)
+#define R300_PFS_CNTL_1                     0x4604
+/* There is an unshifted value here which has so far always been equal to the
+ * index of the highest used temporary register.
+ */
+#define R300_PFS_CNTL_2                     0x4608
+#       define R300_PFS_CNTL_ALU_OFFSET_SHIFT    0
+#       define R300_PFS_CNTL_ALU_OFFSET_MASK     (63 << 0)
+#       define R300_PFS_CNTL_ALU_END_SHIFT       6
+#       define R300_PFS_CNTL_ALU_END_MASK        (63 << 6)
+#       define R300_PFS_CNTL_TEX_OFFSET_SHIFT    12
+#       define R300_PFS_CNTL_TEX_OFFSET_MASK     (31 << 12) /* GUESS */
+#       define R300_PFS_CNTL_TEX_END_SHIFT       18
+#       define R300_PFS_CNTL_TEX_END_MASK        (31 << 18) /* GUESS */
+
+/* gap */
+
+/* Nodes are stored backwards. The last active node is always stored in
+ * PFS_NODE_3.
+ * Example: In a 2-node program, NODE_0 and NODE_1 are set to 0. The
+ * first node is stored in NODE_2, the second node is stored in NODE_3.
+ *
+ * Offsets are relative to the master offset from PFS_CNTL_2.
+ * LAST_NODE is set for the last node, and only for the last node.
+ */
+#define R300_PFS_NODE_0                     0x4610
+#define R300_PFS_NODE_1                     0x4614
+#define R300_PFS_NODE_2                     0x4618
+#define R300_PFS_NODE_3                     0x461C
+#       define R300_PFS_NODE_ALU_OFFSET_SHIFT    0
+#       define R300_PFS_NODE_ALU_OFFSET_MASK     (63 << 0)
+#       define R300_PFS_NODE_ALU_END_SHIFT       6
+#       define R300_PFS_NODE_ALU_END_MASK        (63 << 6)
+#       define R300_PFS_NODE_TEX_OFFSET_SHIFT    12
+#       define R300_PFS_NODE_TEX_OFFSET_MASK     (31 << 12)
+#       define R300_PFS_NODE_TEX_END_SHIFT       17
+#       define R300_PFS_NODE_TEX_END_MASK        (31 << 17)
+/*#       define R300_PFS_NODE_LAST_NODE           (1 << 22) */
+#		define R300_PFS_NODE_OUTPUT_COLOR        (1 << 22)
+#		define R300_PFS_NODE_OUTPUT_DEPTH        (1 << 23)
+
+/* TEX
+ * As far as I can tell, texture instructions cannot write into output
+ * registers directly. A subsequent ALU instruction is always necessary,
+ * even if it's just MAD o0, r0, 1, 0
+ */
+#define R300_PFS_TEXI_0                     0x4620
+#	define R300_FPITX_SRC_SHIFT              0
+#	define R300_FPITX_SRC_MASK               (31 << 0)
+	/* GUESS */
+#	define R300_FPITX_SRC_CONST              (1 << 5)
+#	define R300_FPITX_DST_SHIFT              6
+#	define R300_FPITX_DST_MASK               (31 << 6)
+#	define R300_FPITX_IMAGE_SHIFT            11
+	/* GUESS based on layout and native limits */
+#       define R300_FPITX_IMAGE_MASK             (15 << 11)
+/* Unsure if these are opcodes, or some kind of bitfield, but this is how
+ * they were set when I checked
+ */
+#	define R300_FPITX_OPCODE_SHIFT		15
+#		define R300_FPITX_OP_TEX	1
+#		define R300_FPITX_OP_KIL	2
+#		define R300_FPITX_OP_TXP	3
+#		define R300_FPITX_OP_TXB	4
+#	define R300_FPITX_OPCODE_MASK           (7 << 15)
+
+/* ALU
+ * The ALU instruction register blocks are enumerated according to the order
+ * in which fglrx (presumably) writes them.
+ * I assume there is space for 64 instructions, since each block has space
+ * for a maximum of 64 DWORDs, and this matches reported native limits.
+ *
+ * The basic functional block seems to be one MAD for each color and alpha,
+ * and an adder that adds all components after the MUL.
+ *  - ADD, MUL, MAD etc.: use MAD with appropriate neutral operands
+ *  - DP4: Use OUTC_DP4, OUTA_DP4
+ *  - DP3: Use OUTC_DP3, OUTA_DP4, appropriate alpha operands
+ *  - DPH: Use OUTC_DP4, OUTA_DP4, appropriate alpha operands
+ *  - CMPH: If ARG2 > 0.5, return ARG0, else return ARG1
+ *  - CMP: If ARG2 < 0, return ARG1, else return ARG0
+ *  - FLR: use FRC+MAD
+ *  - XPD: use MAD+MAD
+ *  - SGE, SLT: use MAD+CMP
+ *  - RSQ: use ABS modifier for argument
+ *  - Use OUTC_REPL_ALPHA to write results of an alpha-only operation
+ *    (e.g. RCP) into color register
+ *  - apparently, there's no quick DST operation
+ *  - fglrx set FPI2_UNKNOWN_31 on a "MAD fragment.color, tmp0, tmp1, tmp2"
+ *  - fglrx set FPI2_UNKNOWN_31 on a "MAX r2, r1, c0"
+ *  - fglrx once set FPI0_UNKNOWN_31 on a "FRC r1, r1"
+ *
+ * Operand selection
+ * First stage selects three sources from the available registers and
+ * constant parameters. This is defined in INSTR1 (color) and INSTR3 (alpha).
+ * fglrx sorts the three source fields: Registers before constants,
+ * lower indices before higher indices; I do not know whether this is
+ * necessary.
+ *
+ * fglrx fills unused sources with "read constant 0"
+ * According to specs, you cannot select more than two different constants.
+ *
+ * Second stage selects the operands from the sources. This is defined in
+ * INSTR0 (color) and INSTR2 (alpha). You can also select the special constants
+ * zero and one.
+ * Swizzling and negation happens in this stage, as well.
+ *
+ * Important: Color and alpha seem to be mostly separate, i.e. their sources
+ * selection appears to be fully independent (the register storage is probably
+ * physically split into a color and an alpha section).
+ * However (because of the apparent physical split), there is some interaction
+ * WRT swizzling. If, for example, you want to load an R component into an
+ * Alpha operand, this R component is taken from a *color* source, not from
+ * an alpha source. The corresponding register doesn't even have to appear in
+ * the alpha sources list. (I hope this all makes sense to you)
+ *
+ * Destination selection
+ * The destination register index is in FPI1 (color) and FPI3 (alpha)
+ * together with enable bits.
+ * There are separate enable bits for writing into temporary registers
+ * (DSTC_REG_* /DSTA_REG) and program output registers (DSTC_OUTPUT_*
+ * /DSTA_OUTPUT). You can write to both at once, or not write at all (the
+ * same index must be used for both).
+ *
+ * Note: There is a special form for LRP
+ *  - Argument order is the same as in ARB_fragment_program.
+ *  - Operation is MAD
+ *  - ARG1 is set to ARGC_SRC1C_LRP/ARGC_SRC1A_LRP
+ *  - Set FPI0/FPI2_SPECIAL_LRP
+ * Arbitrary LRP (including support for swizzling) requires vanilla MAD+MAD
+ */
+#define R300_PFS_INSTR1_0                   0x46C0
+#       define R300_FPI1_SRC0C_SHIFT             0
+#       define R300_FPI1_SRC0C_MASK              (31 << 0)
+#       define R300_FPI1_SRC0C_CONST             (1 << 5)
+#       define R300_FPI1_SRC1C_SHIFT             6
+#       define R300_FPI1_SRC1C_MASK              (31 << 6)
+#       define R300_FPI1_SRC1C_CONST             (1 << 11)
+#       define R300_FPI1_SRC2C_SHIFT             12
+#       define R300_FPI1_SRC2C_MASK              (31 << 12)
+#       define R300_FPI1_SRC2C_CONST             (1 << 17)
+#       define R300_FPI1_SRC_MASK                0x0003ffff
+#       define R300_FPI1_DSTC_SHIFT              18
+#       define R300_FPI1_DSTC_MASK               (31 << 18)
+#		define R300_FPI1_DSTC_REG_MASK_SHIFT     23
+#       define R300_FPI1_DSTC_REG_X              (1 << 23)
+#       define R300_FPI1_DSTC_REG_Y              (1 << 24)
+#       define R300_FPI1_DSTC_REG_Z              (1 << 25)
+#		define R300_FPI1_DSTC_OUTPUT_MASK_SHIFT  26
+#       define R300_FPI1_DSTC_OUTPUT_X           (1 << 26)
+#       define R300_FPI1_DSTC_OUTPUT_Y           (1 << 27)
+#       define R300_FPI1_DSTC_OUTPUT_Z           (1 << 28)
+
+#define R300_PFS_INSTR3_0                   0x47C0
+#       define R300_FPI3_SRC0A_SHIFT             0
+#       define R300_FPI3_SRC0A_MASK              (31 << 0)
+#       define R300_FPI3_SRC0A_CONST             (1 << 5)
+#       define R300_FPI3_SRC1A_SHIFT             6
+#       define R300_FPI3_SRC1A_MASK              (31 << 6)
+#       define R300_FPI3_SRC1A_CONST             (1 << 11)
+#       define R300_FPI3_SRC2A_SHIFT             12
+#       define R300_FPI3_SRC2A_MASK              (31 << 12)
+#       define R300_FPI3_SRC2A_CONST             (1 << 17)
+#       define R300_FPI3_SRC_MASK                0x0003ffff
+#       define R300_FPI3_DSTA_SHIFT              18
+#       define R300_FPI3_DSTA_MASK               (31 << 18)
+#       define R300_FPI3_DSTA_REG                (1 << 23)
+#       define R300_FPI3_DSTA_OUTPUT             (1 << 24)
+#		define R300_FPI3_DSTA_DEPTH              (1 << 27)
+
+#define R300_PFS_INSTR0_0                   0x48C0
+#       define R300_FPI0_ARGC_SRC0C_XYZ          0
+#       define R300_FPI0_ARGC_SRC0C_XXX          1
+#       define R300_FPI0_ARGC_SRC0C_YYY          2
+#       define R300_FPI0_ARGC_SRC0C_ZZZ          3
+#       define R300_FPI0_ARGC_SRC1C_XYZ          4
+#       define R300_FPI0_ARGC_SRC1C_XXX          5
+#       define R300_FPI0_ARGC_SRC1C_YYY          6
+#       define R300_FPI0_ARGC_SRC1C_ZZZ          7
+#       define R300_FPI0_ARGC_SRC2C_XYZ          8
+#       define R300_FPI0_ARGC_SRC2C_XXX          9
+#       define R300_FPI0_ARGC_SRC2C_YYY          10
+#       define R300_FPI0_ARGC_SRC2C_ZZZ          11
+#       define R300_FPI0_ARGC_SRC0A              12
+#       define R300_FPI0_ARGC_SRC1A              13
+#       define R300_FPI0_ARGC_SRC2A              14
+#       define R300_FPI0_ARGC_SRC1C_LRP          15
+#       define R300_FPI0_ARGC_ZERO               20
+#       define R300_FPI0_ARGC_ONE                21
+	/* GUESS */
+#       define R300_FPI0_ARGC_HALF               22
+#       define R300_FPI0_ARGC_SRC0C_YZX          23
+#       define R300_FPI0_ARGC_SRC1C_YZX          24
+#       define R300_FPI0_ARGC_SRC2C_YZX          25
+#       define R300_FPI0_ARGC_SRC0C_ZXY          26
+#       define R300_FPI0_ARGC_SRC1C_ZXY          27
+#       define R300_FPI0_ARGC_SRC2C_ZXY          28
+#       define R300_FPI0_ARGC_SRC0CA_WZY         29
+#       define R300_FPI0_ARGC_SRC1CA_WZY         30
+#       define R300_FPI0_ARGC_SRC2CA_WZY         31
+
+#       define R300_FPI0_ARG0C_SHIFT             0
+#       define R300_FPI0_ARG0C_MASK              (31 << 0)
+#       define R300_FPI0_ARG0C_NEG               (1 << 5)
+#       define R300_FPI0_ARG0C_ABS               (1 << 6)
+#       define R300_FPI0_ARG1C_SHIFT             7
+#       define R300_FPI0_ARG1C_MASK              (31 << 7)
+#       define R300_FPI0_ARG1C_NEG               (1 << 12)
+#       define R300_FPI0_ARG1C_ABS               (1 << 13)
+#       define R300_FPI0_ARG2C_SHIFT             14
+#       define R300_FPI0_ARG2C_MASK              (31 << 14)
+#       define R300_FPI0_ARG2C_NEG               (1 << 19)
+#       define R300_FPI0_ARG2C_ABS               (1 << 20)
+#       define R300_FPI0_SPECIAL_LRP             (1 << 21)
+#       define R300_FPI0_OUTC_MAD                (0 << 23)
+#       define R300_FPI0_OUTC_DP3                (1 << 23)
+#       define R300_FPI0_OUTC_DP4                (2 << 23)
+#       define R300_FPI0_OUTC_MIN                (4 << 23)
+#       define R300_FPI0_OUTC_MAX                (5 << 23)
+#       define R300_FPI0_OUTC_CMPH               (7 << 23)
+#       define R300_FPI0_OUTC_CMP                (8 << 23)
+#       define R300_FPI0_OUTC_FRC                (9 << 23)
+#       define R300_FPI0_OUTC_REPL_ALPHA         (10 << 23)
+#       define R300_FPI0_OUTC_SAT                (1 << 30)
+#       define R300_FPI0_INSERT_NOP              (1 << 31)
+
+#define R300_PFS_INSTR2_0                   0x49C0
+#       define R300_FPI2_ARGA_SRC0C_X            0
+#       define R300_FPI2_ARGA_SRC0C_Y            1
+#       define R300_FPI2_ARGA_SRC0C_Z            2
+#       define R300_FPI2_ARGA_SRC1C_X            3
+#       define R300_FPI2_ARGA_SRC1C_Y            4
+#       define R300_FPI2_ARGA_SRC1C_Z            5
+#       define R300_FPI2_ARGA_SRC2C_X            6
+#       define R300_FPI2_ARGA_SRC2C_Y            7
+#       define R300_FPI2_ARGA_SRC2C_Z            8
+#       define R300_FPI2_ARGA_SRC0A              9
+#       define R300_FPI2_ARGA_SRC1A              10
+#       define R300_FPI2_ARGA_SRC2A              11
+#       define R300_FPI2_ARGA_SRC1A_LRP          15
+#       define R300_FPI2_ARGA_ZERO               16
+#       define R300_FPI2_ARGA_ONE                17
+	/* GUESS */
+#       define R300_FPI2_ARGA_HALF               18
+#       define R300_FPI2_ARG0A_SHIFT             0
+#       define R300_FPI2_ARG0A_MASK              (31 << 0)
+#       define R300_FPI2_ARG0A_NEG               (1 << 5)
+	/* GUESS */
+#	define R300_FPI2_ARG0A_ABS		 (1 << 6)
+#       define R300_FPI2_ARG1A_SHIFT             7
+#       define R300_FPI2_ARG1A_MASK              (31 << 7)
+#       define R300_FPI2_ARG1A_NEG               (1 << 12)
+	/* GUESS */
+#	define R300_FPI2_ARG1A_ABS		 (1 << 13)
+#       define R300_FPI2_ARG2A_SHIFT             14
+#       define R300_FPI2_ARG2A_MASK              (31 << 14)
+#       define R300_FPI2_ARG2A_NEG               (1 << 19)
+	/* GUESS */
+#	define R300_FPI2_ARG2A_ABS		 (1 << 20)
+#       define R300_FPI2_SPECIAL_LRP             (1 << 21)
+#       define R300_FPI2_OUTA_MAD                (0 << 23)
+#       define R300_FPI2_OUTA_DP4                (1 << 23)
+#       define R300_FPI2_OUTA_MIN                (2 << 23)
+#       define R300_FPI2_OUTA_MAX                (3 << 23)
+#       define R300_FPI2_OUTA_CMP                (6 << 23)
+#       define R300_FPI2_OUTA_FRC                (7 << 23)
+#       define R300_FPI2_OUTA_EX2                (8 << 23)
+#       define R300_FPI2_OUTA_LG2                (9 << 23)
+#       define R300_FPI2_OUTA_RCP                (10 << 23)
+#       define R300_FPI2_OUTA_RSQ                (11 << 23)
+#       define R300_FPI2_OUTA_SAT                (1 << 30)
+#       define R300_FPI2_UNKNOWN_31              (1 << 31)
+/* END: Fragment program instruction set */
+
+/* Fog state and color */
+#define R300_RE_FOG_STATE                   0x4BC0
+#       define R300_FOG_ENABLE                   (1 << 0)
+#	define R300_FOG_MODE_LINEAR              (0 << 1)
+#	define R300_FOG_MODE_EXP                 (1 << 1)
+#	define R300_FOG_MODE_EXP2                (2 << 1)
+#	define R300_FOG_MODE_MASK                (3 << 1)
+#define R300_FOG_COLOR_R                    0x4BC8
+#define R300_FOG_COLOR_G                    0x4BCC
+#define R300_FOG_COLOR_B                    0x4BD0
+
+#define R300_PP_ALPHA_TEST                  0x4BD4
+#       define R300_REF_ALPHA_MASK               0x000000ff
+#       define R300_ALPHA_TEST_FAIL              (0 << 8)
+#       define R300_ALPHA_TEST_LESS              (1 << 8)
+#       define R300_ALPHA_TEST_LEQUAL            (3 << 8)
+#       define R300_ALPHA_TEST_EQUAL             (2 << 8)
+#       define R300_ALPHA_TEST_GEQUAL            (6 << 8)
+#       define R300_ALPHA_TEST_GREATER           (4 << 8)
+#       define R300_ALPHA_TEST_NEQUAL            (5 << 8)
+#       define R300_ALPHA_TEST_PASS              (7 << 8)
+#       define R300_ALPHA_TEST_OP_MASK           (7 << 8)
+#       define R300_ALPHA_TEST_ENABLE            (1 << 11)
+
+/* gap */
+
+/* Fragment program parameters in 7.16 floating point */
+#define R300_PFS_PARAM_0_X                  0x4C00
+#define R300_PFS_PARAM_0_Y                  0x4C04
+#define R300_PFS_PARAM_0_Z                  0x4C08
+#define R300_PFS_PARAM_0_W                  0x4C0C
+/* GUESS: PARAM_31 is last, based on native limits reported by fglrx */
+#define R300_PFS_PARAM_31_X                 0x4DF0
+#define R300_PFS_PARAM_31_Y                 0x4DF4
+#define R300_PFS_PARAM_31_Z                 0x4DF8
+#define R300_PFS_PARAM_31_W                 0x4DFC
+
+/* Notes:
+ * - AFAIK fglrx always sets BLEND_UNKNOWN when blending is used in
+ *   the application
+ * - AFAIK fglrx always sets BLEND_NO_SEPARATE when CBLEND and ABLEND
+ *   are set to the same function (both registers are always set up
+ *   completely in any case)
+ * - Most blend flags are simply copied from R200 and not tested yet
+ */
+#define R300_RB3D_CBLEND                    0x4E04
+#define R300_RB3D_ABLEND                    0x4E08
+/* the following only appear in CBLEND */
+#       define R300_BLEND_ENABLE                     (1 << 0)
+#       define R300_BLEND_UNKNOWN                    (3 << 1)
+#       define R300_BLEND_NO_SEPARATE                (1 << 3)
+/* shared between CBLEND and ABLEND; FCN_MASK spans all 3 COMB_FCN bits */
+#       define R300_FCN_MASK                         (7  << 12)
+#       define R300_COMB_FCN_ADD_CLAMP               (0  << 12)
+#       define R300_COMB_FCN_ADD_NOCLAMP             (1  << 12)
+#       define R300_COMB_FCN_SUB_CLAMP               (2  << 12)
+#       define R300_COMB_FCN_SUB_NOCLAMP             (3  << 12)
+#       define R300_COMB_FCN_MIN                     (4  << 12)
+#       define R300_COMB_FCN_MAX                     (5  << 12)
+#       define R300_COMB_FCN_RSUB_CLAMP              (6  << 12)
+#       define R300_COMB_FCN_RSUB_NOCLAMP            (7  << 12)
+#       define R300_BLEND_GL_ZERO                    (32)
+#       define R300_BLEND_GL_ONE                     (33)
+#       define R300_BLEND_GL_SRC_COLOR               (34)
+#       define R300_BLEND_GL_ONE_MINUS_SRC_COLOR     (35)
+#       define R300_BLEND_GL_DST_COLOR               (36)
+#       define R300_BLEND_GL_ONE_MINUS_DST_COLOR     (37)
+#       define R300_BLEND_GL_SRC_ALPHA               (38)
+#       define R300_BLEND_GL_ONE_MINUS_SRC_ALPHA     (39)
+#       define R300_BLEND_GL_DST_ALPHA               (40)
+#       define R300_BLEND_GL_ONE_MINUS_DST_ALPHA     (41)
+#       define R300_BLEND_GL_SRC_ALPHA_SATURATE      (42)
+#       define R300_BLEND_GL_CONST_COLOR             (43)
+#       define R300_BLEND_GL_ONE_MINUS_CONST_COLOR   (44)
+#       define R300_BLEND_GL_CONST_ALPHA             (45)
+#       define R300_BLEND_GL_ONE_MINUS_CONST_ALPHA   (46)
+#       define R300_BLEND_MASK                       (63)
+#       define R300_SRC_BLEND_SHIFT                  (16)
+#       define R300_DST_BLEND_SHIFT                  (24)
+#define R300_RB3D_BLEND_COLOR               0x4E10
+#define R300_RB3D_COLORMASK                 0x4E0C
+#       define R300_COLORMASK0_B                 (1<<0)
+#       define R300_COLORMASK0_G                 (1<<1)
+#       define R300_COLORMASK0_R                 (1<<2)
+#       define R300_COLORMASK0_A                 (1<<3)
+
+/* gap */
+
+#define R300_RB3D_COLOROFFSET0              0x4E28
+#       define R300_COLOROFFSET_MASK             0xFFFFFFF0 /* GUESS */
+#define R300_RB3D_COLOROFFSET1              0x4E2C /* GUESS */
+#define R300_RB3D_COLOROFFSET2              0x4E30 /* GUESS */
+#define R300_RB3D_COLOROFFSET3              0x4E34 /* GUESS */
+
+/* gap */
+
+/* Bit 16: Larger tiles
+ * Bit 17: 4x2 tiles
+ * Bit 18: Extremely weird tile-like layout, but with some pixels duplicated?
+ */
+#define R300_RB3D_COLORPITCH0               0x4E38
+#       define R300_COLORPITCH_MASK              0x00001FF8 /* GUESS */
+#       define R300_COLOR_TILE_ENABLE            (1 << 16) /* GUESS */
+#       define R300_COLOR_MICROTILE_ENABLE       (1 << 17) /* GUESS */
+#       define R300_COLOR_ENDIAN_NO_SWAP         (0 << 18) /* GUESS */
+#       define R300_COLOR_ENDIAN_WORD_SWAP       (1 << 18) /* GUESS */
+#       define R300_COLOR_ENDIAN_DWORD_SWAP      (2 << 18) /* GUESS */
+#       define R300_COLOR_FORMAT_RGB565          (2 << 22)
+#       define R300_COLOR_FORMAT_ARGB8888        (3 << 22)
+#define R300_RB3D_COLORPITCH1               0x4E3C /* GUESS */
+#define R300_RB3D_COLORPITCH2               0x4E40 /* GUESS */
+#define R300_RB3D_COLORPITCH3               0x4E44 /* GUESS */
+
+/* gap */
+
+/* Guess by Vladimir.
+ * Set to 0A before 3D operations, set to 02 afterwards.
+ */
+#define R300_RB3D_DSTCACHE_CTLSTAT          0x4E4C
+#       define R300_RB3D_DSTCACHE_UNKNOWN_02             0x00000002
+#       define R300_RB3D_DSTCACHE_UNKNOWN_0A             0x0000000A
+
+/* gap */
+/* There seems to be no "write only" setting, so use Z-test = ALWAYS
+ * for this.
+ * Bit (1<<4) is the "test" bit (compare 0x16 with 0x06), so plain write is 6  - vd
+ */
+#define R300_RB3D_ZSTENCIL_CNTL_0                   0x4F00
+#       define R300_RB3D_Z_DISABLED_1            0x00000010
+#       define R300_RB3D_Z_DISABLED_2            0x00000014
+#       define R300_RB3D_Z_TEST                  0x00000012
+#       define R300_RB3D_Z_TEST_AND_WRITE        0x00000016
+#       define R300_RB3D_Z_WRITE_ONLY        	 0x00000006
+
+#       define R300_RB3D_Z_TEST                  0x00000012
+#       define R300_RB3D_Z_TEST_AND_WRITE        0x00000016
+#       define R300_RB3D_Z_WRITE_ONLY        	 0x00000006
+#	define R300_RB3D_STENCIL_ENABLE		 0x00000001
+
+#define R300_RB3D_ZSTENCIL_CNTL_1                   0x4F04
+	/* functions */
+#	define R300_ZS_NEVER			0
+#	define R300_ZS_LESS			1
+#	define R300_ZS_LEQUAL			2
+#	define R300_ZS_EQUAL			3
+#	define R300_ZS_GEQUAL			4
+#	define R300_ZS_GREATER			5
+#	define R300_ZS_NOTEQUAL			6
+#	define R300_ZS_ALWAYS			7
+#       define R300_ZS_MASK                     7
+	/* operations */
+#	define R300_ZS_KEEP			0
+#	define R300_ZS_ZERO			1
+#	define R300_ZS_REPLACE			2
+#	define R300_ZS_INCR			3
+#	define R300_ZS_DECR			4
+#	define R300_ZS_INVERT			5
+#	define R300_ZS_INCR_WRAP		6
+#	define R300_ZS_DECR_WRAP		7
+	/* front and back refer to operations done for front
+	   and back faces, i.e. separate stencil function support */
+#	define R300_RB3D_ZS1_DEPTH_FUNC_SHIFT		0
+#	define R300_RB3D_ZS1_FRONT_FUNC_SHIFT		3
+#	define R300_RB3D_ZS1_FRONT_FAIL_OP_SHIFT	6
+#	define R300_RB3D_ZS1_FRONT_ZPASS_OP_SHIFT	9
+#	define R300_RB3D_ZS1_FRONT_ZFAIL_OP_SHIFT      12
+#	define R300_RB3D_ZS1_BACK_FUNC_SHIFT           15
+#	define R300_RB3D_ZS1_BACK_FAIL_OP_SHIFT        18
+#	define R300_RB3D_ZS1_BACK_ZPASS_OP_SHIFT       21
+#	define R300_RB3D_ZS1_BACK_ZFAIL_OP_SHIFT       24
+
+#define R300_RB3D_ZSTENCIL_CNTL_2                   0x4F08
+#	define R300_RB3D_ZS2_STENCIL_REF_SHIFT		0
+#	define R300_RB3D_ZS2_STENCIL_MASK		0xFF
+#	define R300_RB3D_ZS2_STENCIL_MASK_SHIFT	        8
+#	define R300_RB3D_ZS2_STENCIL_WRITE_MASK_SHIFT	16
+
+/* gap */
+
+#define R300_RB3D_ZSTENCIL_FORMAT                   0x4F10
+#	define R300_DEPTH_FORMAT_16BIT_INT_Z     (0 << 0)
+#	define R300_DEPTH_FORMAT_24BIT_INT_Z     (2 << 0)
+	/* 16 bit format or some additional bit? */
+#	define R300_DEPTH_FORMAT_UNK32          (32 << 0)
+
+#define R300_RB3D_EARLY_Z                           0x4F14
+#	define R300_EARLY_Z_DISABLE              (0 << 0)
+#	define R300_EARLY_Z_ENABLE               (1 << 0)
+
+/* gap */
+
+#define R300_RB3D_ZCACHE_CTLSTAT            0x4F18 /* GUESS */
+#       define R300_RB3D_ZCACHE_UNKNOWN_01  0x1
+#       define R300_RB3D_ZCACHE_UNKNOWN_03  0x3
+
+/* gap */
+
+#define R300_RB3D_DEPTHOFFSET               0x4F20
+#define R300_RB3D_DEPTHPITCH                0x4F24
+#       define R300_DEPTHPITCH_MASK              0x00001FF8 /* GUESS */
+#       define R300_DEPTH_TILE_ENABLE            (1 << 16) /* GUESS */
+#       define R300_DEPTH_MICROTILE_ENABLE       (1 << 17) /* GUESS */
+#       define R300_DEPTH_ENDIAN_NO_SWAP         (0 << 18) /* GUESS */
+#       define R300_DEPTH_ENDIAN_WORD_SWAP       (1 << 18) /* GUESS */
+#       define R300_DEPTH_ENDIAN_DWORD_SWAP      (2 << 18) /* GUESS */
+
+/* BEGIN: Vertex program instruction set */
+
+/* Every instruction is four dwords long:
+ *  DWORD 0: output and opcode
+ *  DWORD 1: first argument
+ *  DWORD 2: second argument
+ *  DWORD 3: third argument
+ *
+ * Notes:
+ *  - ABS r, a is implemented as MAX r, a, -a
+ *  - MOV is implemented as ADD to zero
+ *  - XPD is implemented as MUL + MAD
+ *  - FLR is implemented as FRC + ADD
+ *  - apparently, fglrx tries to schedule instructions so that there is at
+ *    least one instruction between the write to a temporary and the first
+ *    read from said temporary; however, violations of this scheduling are
+ *    allowed
+ *  - register indices seem to be unrelated with OpenGL aliasing to
+ *    conventional state
+ *  - only one attribute and one parameter can be loaded at a time; however,
+ *    the same attribute/parameter can be used for more than one argument
+ *  - the second software argument for POW is the third hardware argument
+ *    (no idea why)
+ *  - MAD with only temporaries as input seems to use VPI_OUT_OP_MAD_2
+ *
+ * There is some magic surrounding LIT:
+ *   The single argument is replicated across all three inputs, but swizzled:
+ *     First argument: xyzy
+ *     Second argument: xyzx
+ *     Third argument: xyzw
+ *   Whenever the result is used later in the fragment program, fglrx forces
+ *   x and w to be 1.0 in the input selection; I don't know whether this is
+ *   strictly necessary
+ */
+#define R300_VPI_OUT_OP_DOT                     (1 << 0)
+#define R300_VPI_OUT_OP_MUL                     (2 << 0)
+#define R300_VPI_OUT_OP_ADD                     (3 << 0)
+#define R300_VPI_OUT_OP_MAD                     (4 << 0)
+#define R300_VPI_OUT_OP_DST                     (5 << 0)
+#define R300_VPI_OUT_OP_FRC                     (6 << 0)
+#define R300_VPI_OUT_OP_MAX                     (7 << 0)
+#define R300_VPI_OUT_OP_MIN                     (8 << 0)
+#define R300_VPI_OUT_OP_SGE                     (9 << 0)
+#define R300_VPI_OUT_OP_SLT                     (10 << 0)
+	/* Used in GL_POINT_DISTANCE_ATTENUATION_ARB, vector(scalar, vector) */
+#define R300_VPI_OUT_OP_UNK12                   (12 << 0)
+#define R300_VPI_OUT_OP_ARL                     (13 << 0)
+#define R300_VPI_OUT_OP_EXP                     (65 << 0)
+#define R300_VPI_OUT_OP_LOG                     (66 << 0)
+	/* Used in fog computations, scalar(scalar) */
+#define R300_VPI_OUT_OP_UNK67                   (67 << 0)
+#define R300_VPI_OUT_OP_LIT                     (68 << 0)
+#define R300_VPI_OUT_OP_POW                     (69 << 0)
+#define R300_VPI_OUT_OP_RCP                     (70 << 0)
+#define R300_VPI_OUT_OP_RSQ                     (72 << 0)
+	/* Used in GL_POINT_DISTANCE_ATTENUATION_ARB, scalar(scalar) */
+#define R300_VPI_OUT_OP_UNK73                   (73 << 0)
+#define R300_VPI_OUT_OP_EX2                     (75 << 0)
+#define R300_VPI_OUT_OP_LG2                     (76 << 0)
+#define R300_VPI_OUT_OP_MAD_2                   (128 << 0)
+	/* all temps, vector(scalar, vector, vector) */
+#define R300_VPI_OUT_OP_UNK129                  (129 << 0)
+
+#define R300_VPI_OUT_REG_CLASS_TEMPORARY        (0 << 8)
+#define R300_VPI_OUT_REG_CLASS_ADDR             (1 << 8)
+#define R300_VPI_OUT_REG_CLASS_RESULT           (2 << 8)
+#define R300_VPI_OUT_REG_CLASS_MASK             (31 << 8)
+
+#define R300_VPI_OUT_REG_INDEX_SHIFT            13
+	/* GUESS based on fglrx native limits */
+#define R300_VPI_OUT_REG_INDEX_MASK             (31 << 13)
+
+#define R300_VPI_OUT_WRITE_X                    (1 << 20)
+#define R300_VPI_OUT_WRITE_Y                    (1 << 21)
+#define R300_VPI_OUT_WRITE_Z                    (1 << 22)
+#define R300_VPI_OUT_WRITE_W                    (1 << 23)
+
+#define R300_VPI_IN_REG_CLASS_TEMPORARY         (0 << 0)
+#define R300_VPI_IN_REG_CLASS_ATTRIBUTE         (1 << 0)
+#define R300_VPI_IN_REG_CLASS_PARAMETER         (2 << 0)
+#define R300_VPI_IN_REG_CLASS_NONE              (9 << 0)
+#define R300_VPI_IN_REG_CLASS_MASK              (31 << 0)
+
+#define R300_VPI_IN_REG_INDEX_SHIFT             5
+	/* GUESS based on fglrx native limits */
+#define R300_VPI_IN_REG_INDEX_MASK              (255 << 5)
+
+/* The R300 can select components from the input register arbitrarily.
+ * Use the following constants, shifted by the component shift you
+ * want to select
+ */
+#define R300_VPI_IN_SELECT_X    0
+#define R300_VPI_IN_SELECT_Y    1
+#define R300_VPI_IN_SELECT_Z    2
+#define R300_VPI_IN_SELECT_W    3
+#define R300_VPI_IN_SELECT_ZERO 4
+#define R300_VPI_IN_SELECT_ONE  5
+#define R300_VPI_IN_SELECT_MASK 7
+
+#define R300_VPI_IN_X_SHIFT                     13
+#define R300_VPI_IN_Y_SHIFT                     16
+#define R300_VPI_IN_Z_SHIFT                     19
+#define R300_VPI_IN_W_SHIFT                     22
+
+#define R300_VPI_IN_NEG_X                       (1 << 25)
+#define R300_VPI_IN_NEG_Y                       (1 << 26)
+#define R300_VPI_IN_NEG_Z                       (1 << 27)
+#define R300_VPI_IN_NEG_W                       (1 << 28)
+/* END: Vertex program instruction set */
+
+/* BEGIN: Packet 3 commands */
+
+/* A primitive emission dword. */
+#define R300_PRIM_TYPE_NONE                     (0 << 0)
+#define R300_PRIM_TYPE_POINT                    (1 << 0)
+#define R300_PRIM_TYPE_LINE                     (2 << 0)
+#define R300_PRIM_TYPE_LINE_STRIP               (3 << 0)
+#define R300_PRIM_TYPE_TRI_LIST                 (4 << 0)
+#define R300_PRIM_TYPE_TRI_FAN                  (5 << 0)
+#define R300_PRIM_TYPE_TRI_STRIP                (6 << 0)
+#define R300_PRIM_TYPE_TRI_TYPE2                (7 << 0)
+#define R300_PRIM_TYPE_RECT_LIST                (8 << 0)
+#define R300_PRIM_TYPE_3VRT_POINT_LIST          (9 << 0)
+#define R300_PRIM_TYPE_3VRT_LINE_LIST           (10 << 0)
+	/* GUESS (based on r200) */
+#define R300_PRIM_TYPE_POINT_SPRITES            (11 << 0)
+#define R300_PRIM_TYPE_LINE_LOOP                (12 << 0)
+#define R300_PRIM_TYPE_QUADS                    (13 << 0)
+#define R300_PRIM_TYPE_QUAD_STRIP               (14 << 0)
+#define R300_PRIM_TYPE_POLYGON                  (15 << 0)
+#define R300_PRIM_TYPE_MASK                     0xF
+#define R300_PRIM_WALK_IND                      (1 << 4)
+#define R300_PRIM_WALK_LIST                     (2 << 4)
+#define R300_PRIM_WALK_RING                     (3 << 4)
+#define R300_PRIM_WALK_MASK                     (3 << 4)
+	/* GUESS (based on r200) */
+#define R300_PRIM_COLOR_ORDER_BGRA              (0 << 6)
+#define R300_PRIM_COLOR_ORDER_RGBA              (1 << 6)
+#define R300_PRIM_NUM_VERTICES_SHIFT            16
+#define R300_PRIM_NUM_VERTICES_MASK             0xffff
+
+/* Draw a primitive from vertex data in arrays loaded via 3D_LOAD_VBPNTR.
+ * Two parameter dwords:
+ * 0. The first parameter appears to be always 0
+ * 1. The second parameter is a standard primitive emission dword.
+ */
+#define R300_PACKET3_3D_DRAW_VBUF           0x00002800
+
+/* Specify the full set of vertex arrays as (address, stride).
+ * The first parameter is the number of vertex arrays specified.
+ * The rest of the command is a variable length list of blocks, where
+ * each block is three dwords long and specifies two arrays.
+ * The first dword of a block is split into two words, the less significant
+ * word refers to the first array, the more significant word to the second
+ * array in the block.
+ * The low byte of each word contains the size of an array entry in dwords,
+ * the high byte contains the stride of the array.
+ * The second dword of a block contains the pointer to the first array,
+ * the third dword of a block contains the pointer to the second array.
+ * Note that if the total number of arrays is odd, the third dword of
+ * the last block is omitted.
+ */
+#define R300_PACKET3_3D_LOAD_VBPNTR         0x00002F00
+
+#define R300_PACKET3_INDX_BUFFER            0x00003300
+#    define R300_EB_UNK1_SHIFT                      24
+#    define R300_EB_UNK1                    (0x80<<24)
+#    define R300_EB_UNK2                        0x0810
+#define R300_PACKET3_3D_DRAW_INDX_2         0x00003600
+
+/* END: Packet 3 commands */
+
+
+/* Color formats for 2d packets
+ */
+#define R300_CP_COLOR_FORMAT_CI8	2
+#define R300_CP_COLOR_FORMAT_ARGB1555	3
+#define R300_CP_COLOR_FORMAT_RGB565	4
+#define R300_CP_COLOR_FORMAT_ARGB8888	6
+#define R300_CP_COLOR_FORMAT_RGB332	7
+#define R300_CP_COLOR_FORMAT_RGB8	9
+#define R300_CP_COLOR_FORMAT_ARGB4444	15
+
+/*
+ * CP type-3 packets
+ */
+#define R300_CP_CMD_BITBLT_MULTI	0xC0009B00
+
+#endif /* _R300_REG_H */
+
+/* *INDENT-ON* */
diff --git a/r300/r300_render.c b/r300/r300_render.c
new file mode 100644
index 0000000..cc13e9a
--- /dev/null
+++ b/r300/r300_render.c
@@ -0,0 +1,536 @@
+/**************************************************************************
+
+Copyright (C) 2004 Nicolai Haehnle.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ATI, VA LINUX SYSTEMS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \brief R300 Render (Vertex Buffer Implementation)
+ *
+ * The immediate implementation has been removed from CVS in favor of the vertex
+ * buffer implementation.
+ *
+ * The render functions are called by the pipeline manager to render a batch of
+ * primitives. They return TRUE to pass on to the next stage (i.e. software
+ * rasterization) or FALSE to indicate that the pipeline has finished after
+ * rendering something.
+ *
+ * When falling back to software TCL still attempt to use hardware
+ * rasterization.
+ *
+ * I am not sure that the cache-related registers are set up correctly, but
+ * obviously this does work... Further investigation is needed.
+ *
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#include "glheader.h"
+#include "state.h"
+#include "imports.h"
+#include "enums.h"
+#include "macros.h"
+#include "context.h"
+#include "dd.h"
+#include "simple_list.h"
+#include "api_arrayelt.h"
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+#include "vbo/vbo.h"
+#include "tnl/tnl.h"
+#include "tnl/t_vp_build.h"
+#include "radeon_reg.h"
+#include "radeon_macros.h"
+#include "radeon_ioctl.h"
+#include "radeon_state.h"
+#include "r300_context.h"
+#include "r300_ioctl.h"
+#include "r300_state.h"
+#include "r300_reg.h"
+#include "r300_tex.h"
+#include "r300_emit.h"
+extern int future_hw_tcl_on;
+
+/**
+ * \brief Map the mode bits of a GL primitive to an R300 VAP_VF_CNTL code.
+ */
+static int r300PrimitiveType(r300ContextPtr rmesa, GLcontext * ctx, int prim)
+{
+	int hw_prim = -1;	/* what an unrecognised mode yields */
+	switch (prim & PRIM_MODE_MASK) {
+	case GL_POINTS:
+		hw_prim = R300_VAP_VF_CNTL__PRIM_POINTS;
+		break;
+	case GL_LINES:
+		hw_prim = R300_VAP_VF_CNTL__PRIM_LINES;
+		break;
+	case GL_LINE_STRIP:
+		hw_prim = R300_VAP_VF_CNTL__PRIM_LINE_STRIP;
+		break;
+	case GL_LINE_LOOP:
+		hw_prim = R300_VAP_VF_CNTL__PRIM_LINE_LOOP;
+		break;
+	case GL_TRIANGLES:
+		hw_prim = R300_VAP_VF_CNTL__PRIM_TRIANGLES;
+		break;
+	case GL_TRIANGLE_STRIP:
+		hw_prim = R300_VAP_VF_CNTL__PRIM_TRIANGLE_STRIP;
+		break;
+	case GL_TRIANGLE_FAN:
+		hw_prim = R300_VAP_VF_CNTL__PRIM_TRIANGLE_FAN;
+		break;
+	case GL_QUADS:
+		hw_prim = R300_VAP_VF_CNTL__PRIM_QUADS;
+		break;
+	case GL_QUAD_STRIP:
+		hw_prim = R300_VAP_VF_CNTL__PRIM_QUAD_STRIP;
+		break;
+	case GL_POLYGON:
+		hw_prim = R300_VAP_VF_CNTL__PRIM_POLYGON;
+		break;
+	default:
+		assert(0);
+	}
+	return hw_prim;
+}
+
+/*
+ * Trim a vertex count to what the given primitive mode can actually consume.
+ *
+ *  - Points use every vertex.
+ *  - Independent lines, triangles and quads drop the trailing vertices that
+ *    do not complete a primitive.
+ *  - Strips, loops, fans and polygons yield 0 when there are too few
+ *    vertices to form even one primitive; a quad strip additionally drops
+ *    an unpaired trailing vertex.
+ *
+ * Returns the usable vertex count; an unrecognised mode asserts and returns
+ * -1.  The rmesa argument is not used.
+ */
+static int r300NumVerts(r300ContextPtr rmesa, int num_verts, int prim)
+{
+	const int mode = prim & PRIM_MODE_MASK;
+
+	switch (mode) {
+	case GL_POINTS:
+		return num_verts;
+
+	/* Independent primitives: round down to a whole primitive. */
+	case GL_LINES:
+		return num_verts - num_verts % 2;
+	case GL_TRIANGLES:
+		return num_verts - num_verts % 3;
+	case GL_QUADS:
+		return num_verts - num_verts % 4;
+
+	/* Connected lines need two vertices to draw anything. */
+	case GL_LINE_STRIP:
+	case GL_LINE_LOOP:
+		return (num_verts < 2) ? 0 : num_verts;
+
+	/* Connected triangles and polygons need three. */
+	case GL_TRIANGLE_STRIP:
+	case GL_TRIANGLE_FAN:
+	case GL_POLYGON:
+		return (num_verts < 3) ? 0 : num_verts;
+
+	/* A quad strip needs four, then grows two vertices at a time. */
+	case GL_QUAD_STRIP:
+		if (num_verts < 4)
+			return 0;
+		return num_verts - num_verts % 2;
+
+	default:
+		assert(0);
+		return -1;
+	}
+}
+
+/* Make an index array reachable by the hardware: indices that already lie in
+ * the GART texture mapping are used in place, others are copied into a newly
+ * allocated DMA region.  The result is described by rmesa->state.elt_dma. */
+static void r300EmitElts(GLcontext * ctx, void *elts, unsigned long n_elts,
+			 int elt_size)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct r300_dma_region *rvb = &rmesa->state.elt_dma;
+	const unsigned long bytes = n_elts * elt_size;
+
+	assert(elt_size == 2 || elt_size == 4);
+	if (r300IsGartMemory(rmesa, elts, bytes)) {
+		rvb->address = rmesa->radeon.radeonScreen->gartTextures.map;
+		rvb->start = ((char *)elts) - rvb->address;
+		rvb->aos_offset = rvb->start +
+		    rmesa->radeon.radeonScreen->gart_texture_offset;
+		return;
+	}
+	if (r300IsGartMemory(rmesa, elts, 1)) {	/* starts inside, ends outside */
+		WARN_ONCE("Pointer not within GART memory!\n");
+		_mesa_exit(-1);
+	}
+
+	r300AllocDmaRegion(rmesa, rvb, bytes, elt_size);
+	rvb->aos_offset = GET_START(rvb);
+	memcpy(rvb->address + rvb->start, elts, bytes);
+}
+
+static void r300FireEB(r300ContextPtr rmesa, unsigned long addr,
+		       int vertex_count, int type, int elt_size)
+{
+	int cmd_reserved = 0;	/* not named below; presumably used by e32() */
+	int cmd_written = 0;
+	drm_radeon_cmd_header_t *cmd = NULL;
+	unsigned long t_addr;
+	unsigned long magic_1, magic_2;
+	/* Emit an indexed draw of vertex_count elements stored at addr. */
+	assert(elt_size == 2 || elt_size == 4);
+
+	if (addr & (elt_size - 1)) {
+		WARN_ONCE("Badly aligned buffer\n");
+		return;
+	}
+	/* These three values are only consumed when OPTIMIZE_ELTS is defined. */
+	magic_1 = (addr % 32) / 4;
+	t_addr = addr & ~0x1d;
+	magic_2 = (vertex_count + 1 + (t_addr & 0x2)) / 2 + magic_1;
+	/* First packet: draw command with count, type and index size. */
+	start_packet3(RADEON_CP_PACKET3_3D_DRAW_INDX_2, 0);
+	if (elt_size == 4) {
+		e32(R300_VAP_VF_CNTL__PRIM_WALK_INDICES |
+		    (vertex_count << 16) | type |
+		    R300_VAP_VF_CNTL__INDEX_SIZE_32bit);
+	} else {
+		e32(R300_VAP_VF_CNTL__PRIM_WALK_INDICES |
+		    (vertex_count << 16) | type);
+	}
+	/* Second packet: index buffer address followed by a size dword. */
+	start_packet3(RADEON_CP_PACKET3_INDX_BUFFER, 2);
+#ifdef OPTIMIZE_ELTS
+	if (elt_size == 4) {
+		e32(R300_EB_UNK1 | (0 << 16) | R300_EB_UNK2);
+		e32(addr);
+	} else {
+		e32(R300_EB_UNK1 | (magic_1 << 16) | R300_EB_UNK2);
+		e32(t_addr);
+	}
+#else
+	e32(R300_EB_UNK1 | (0 << 16) | R300_EB_UNK2);
+	e32(addr);
+#endif
+	/* 16-bit indices are apparently counted two per dword, rounded up. */
+	if (elt_size == 4) {
+		e32(vertex_count);
+	} else {
+#ifdef OPTIMIZE_ELTS
+		e32(magic_2);
+#else
+		e32((vertex_count + 1) / 2);
+#endif
+	}
+}
+
+static void r300EmitAOS(r300ContextPtr rmesa, GLuint nr, GLuint offset)
+{
+	int sz = 1 + (nr >> 1) * 3 + (nr & 1) * 2;
+	int i;
+	int cmd_reserved = 0;
+	int cmd_written = 0;
+	drm_radeon_cmd_header_t *cmd = NULL;
+	/* sz = count dword + 3 dwords per array pair + 2 for an odd last one */
+	if (RADEON_DEBUG & DEBUG_VERTS)
+		fprintf(stderr, "%s: nr=%d, ofs=0x%08x\n", __FUNCTION__, nr,
+			offset);
+	/* Load nr array pointers; each is advanced by offset * 4 * stride. */
+	start_packet3(RADEON_CP_PACKET3_3D_LOAD_VBPNTR, sz - 1);
+	e32(nr);
+	for (i = 0; i + 1 < nr; i += 2) {
+		e32((rmesa->state.aos[i].aos_size << 0)
+		    | (rmesa->state.aos[i].aos_stride << 8)
+		    | (rmesa->state.aos[i + 1].aos_size << 16)
+		    | (rmesa->state.aos[i + 1].aos_stride << 24)
+		    );
+		e32(rmesa->state.aos[i].aos_offset +
+		    offset * 4 * rmesa->state.aos[i].aos_stride);
+		e32(rmesa->state.aos[i + 1].aos_offset +
+		    offset * 4 * rmesa->state.aos[i + 1].aos_stride);
+	}
+	/* With an odd nr the last array gets a two-dword block of its own. */
+	if (nr & 1) {
+		e32((rmesa->state.aos[nr - 1].aos_size << 0)
+		    | (rmesa->state.aos[nr - 1].aos_stride << 8)
+		    );
+		e32(rmesa->state.aos[nr - 1].aos_offset +
+		    offset * 4 * rmesa->state.aos[nr - 1].aos_stride);
+	}
+}
+
+static void r300FireAOS(r300ContextPtr rmesa, int vertex_count, int type)
+{
+	int cmd_reserved = 0;
+	int cmd_written = 0;
+	drm_radeon_cmd_header_t *cmd = NULL;
+	/* Emit one 3D_DRAW_VBUF_2 packet: vertex-list walk of vertex_count. */
+	start_packet3(RADEON_CP_PACKET3_3D_DRAW_VBUF_2, 0);
+	e32(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (vertex_count << 16)
+	    | type);
+}
+
+/* Draw vertices [start, end) of one primitive, indexed if the VB has Elts. */
+static void r300RunRenderPrimitive(r300ContextPtr rmesa, GLcontext * ctx,
+				   int start, int end, int prim)
+{
+	const int hw_type = r300PrimitiveType(rmesa, ctx, prim);
+	const int count = r300NumVerts(rmesa, end - start, prim);
+
+	if (hw_type < 0 || count <= 0)
+		return;
+
+	/* Both draw paths begin by pointing the hardware at the arrays. */
+	r300EmitAOS(rmesa, rmesa->state.aos_count, start);
+	if (!rmesa->state.VB.Elts) {
+		r300FireAOS(rmesa, count, hw_type);
+		return;
+	}
+
+	if (count > 65535) {
+		/* not implemented yet */
+		WARN_ONCE("Too many elts\n");
+		return;
+	}
+	r300EmitElts(ctx, rmesa->state.VB.Elts, count,
+		     rmesa->state.VB.elt_size);
+	r300FireEB(rmesa, rmesa->state.elt_dma.aos_offset, count, hw_type,
+		   rmesa->state.VB.elt_size);
+}
+
+/* Fill rvb attribute slot a from TNL array vb->b; type is always GL_FLOAT. */
+#define CONV_VB(a, b)						\
+	do {							\
+		rvb->AttribPtr[(a)].size = vb->b->size;		\
+		rvb->AttribPtr[(a)].type = GL_FLOAT;		\
+		rvb->AttribPtr[(a)].stride = vb->b->stride;	\
+		rvb->AttribPtr[(a)].data = vb->b->data;		\
+	} while (0)
+
+static void radeon_vb_to_rvb(r300ContextPtr rmesa,
+			     struct radeon_vertex_buffer *rvb,
+			     struct vertex_buffer *vb)
+{
+	GLcontext *ctx = rmesa->radeon.glCtx;
+	int n;
+
+	memset(rvb, 0, sizeof(*rvb));
+	rvb->Count = vb->Count;
+	rvb->Primitive = vb->Primitive;
+	rvb->PrimitiveCount = vb->PrimitiveCount;
+	rvb->LockFirst = rvb->LockCount = 0;
+	rvb->lock_uptodate = GL_FALSE;
+
+	/* Indices are passed through as 32-bit, with range 0..Count. */
+	rvb->Elts = vb->Elts;
+	rvb->elt_size = 4;
+	rvb->elt_min = 0;
+	rvb->elt_max = vb->Count;
+
+	/* Position comes from ObjPtr with hardware TCL, else from ClipPtr. */
+	if (hw_tcl_on) {
+		CONV_VB(VERT_ATTRIB_POS, ObjPtr);
+	} else {
+		assert(vb->ClipPtr);
+		CONV_VB(VERT_ATTRIB_POS, ClipPtr);
+	}
+	CONV_VB(VERT_ATTRIB_NORMAL, NormalPtr);
+	CONV_VB(VERT_ATTRIB_COLOR0, ColorPtr[0]);
+	CONV_VB(VERT_ATTRIB_COLOR1, SecondaryColorPtr[0]);
+	CONV_VB(VERT_ATTRIB_FOG, FogCoordPtr);
+	for (n = 0; n < ctx->Const.MaxTextureCoordUnits; n++)
+		CONV_VB(VERT_ATTRIB_TEX0 + n, TexCoordPtr[n]);
+	for (n = 0; n < MAX_VERTEX_PROGRAM_ATTRIBS; n++)
+		CONV_VB(VERT_ATTRIB_GENERIC0 + n,
+			AttribPtr[VERT_ATTRIB_GENERIC0 + n]);
+}
+
+static GLboolean r300RunRender(GLcontext * ctx,
+			       struct tnl_pipeline_stage *stage)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct radeon_vertex_buffer *VB = &rmesa->state.VB;
+	int i;
+	int cmd_reserved = 0;
+	int cmd_written = 0;
+	drm_radeon_cmd_header_t *cmd = NULL;
+	/* GL_FALSE once drawn; GL_TRUE if r300EmitArrays() returns nonzero. */
+	if (RADEON_DEBUG & DEBUG_PRIMS)
+		fprintf(stderr, "%s\n", __FUNCTION__);
+	/* When run as a pipeline stage, take the batch from the TNL module. */
+	if (stage) {
+		TNLcontext *tnl = TNL_CONTEXT(ctx);
+		radeon_vb_to_rvb(rmesa, VB, &tnl->vb);
+	}
+
+	r300UpdateShaders(rmesa);
+	if (r300EmitArrays(ctx))
+		return GL_TRUE;
+
+	r300UpdateShaderStates(rmesa);
+	/* The same two cache writes are emitted before and after the draws. */
+	reg_start(R300_RB3D_DSTCACHE_CTLSTAT, 0);
+	e32(R300_RB3D_DSTCACHE_UNKNOWN_0A);
+
+	reg_start(R300_RB3D_ZCACHE_CTLSTAT, 0);
+	e32(R300_RB3D_ZCACHE_UNKNOWN_03);
+
+	r300EmitState(rmesa);
+	/* One r300RunRenderPrimitive() call per primitive in the batch. */
+	for (i = 0; i < VB->PrimitiveCount; i++) {
+		GLuint prim = _tnl_translate_prim(&VB->Primitive[i]);
+		GLuint start = VB->Primitive[i].start;
+		GLuint end = VB->Primitive[i].start + VB->Primitive[i].count;
+		r300RunRenderPrimitive(rmesa, ctx, start, end, prim);
+	}
+
+	reg_start(R300_RB3D_DSTCACHE_CTLSTAT, 0);
+	e32(R300_RB3D_DSTCACHE_UNKNOWN_0A);
+
+	reg_start(R300_RB3D_ZCACHE_CTLSTAT, 0);
+	e32(R300_RB3D_ZCACHE_UNKNOWN_03);
+
+#ifdef USER_BUFFERS
+	r300UseArrays(ctx);
+#endif
+
+	r300ReleaseArrays(ctx);
+
+	return GL_FALSE;
+}
+
+#define FALLBACK_IF(expr)						\
+	do {								\
+		if (expr) {						\
+			if (1 || RADEON_DEBUG & DEBUG_FALLBACKS)	\
+				WARN_ONCE("Software fallback:%s\n",	\
+					  #expr);			\
+			return R300_FALLBACK_RAST;			\
+		}							\
+	} while(0)
+/* r300Fallback: R300_FALLBACK_RAST if any check below fires, else NONE. */
+static int r300Fallback(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_fragment_program *fp = (struct r300_fragment_program *)
+	    (char *)ctx->FragmentProgram._Current;
+	/* A fragment program that still fails to translate forces a fallback. */
+	if (fp) {
+		if (!fp->translated)
+			r300TranslateFragmentShader(r300, fp);
+		FALLBACK_IF(!fp->translated);
+	}
+	/* FALLBACK_IF warns (always, because of the "1 ||") and returns RAST. */
+	FALLBACK_IF(ctx->RenderMode != GL_RENDER);
+	/* Two-sided stencil is accepted only if both faces share ref and masks. */
+	FALLBACK_IF(ctx->Stencil._TestTwoSide
+		    && (ctx->Stencil.Ref[0] != ctx->Stencil.Ref[1]
+			|| ctx->Stencil.ValueMask[0] !=
+			ctx->Stencil.ValueMask[1]
+			|| ctx->Stencil.WriteMask[0] !=
+			ctx->Stencil.WriteMask[1]));
+
+	FALLBACK_IF(ctx->Color.ColorLogicOpEnabled);
+
+	if (ctx->Extensions.NV_point_sprite || ctx->Extensions.ARB_point_sprite)
+		FALLBACK_IF(ctx->Point.PointSprite);
+	/* These checks are skipped when disable_lowimpact_fallback is set. */
+	if (!r300->disable_lowimpact_fallback) {
+		FALLBACK_IF(ctx->Polygon.OffsetPoint);
+		FALLBACK_IF(ctx->Polygon.OffsetLine);
+		FALLBACK_IF(ctx->Polygon.StippleFlag);
+		FALLBACK_IF(ctx->Multisample.Enabled);
+		FALLBACK_IF(ctx->Line.StippleFlag);
+		FALLBACK_IF(ctx->Line.SmoothFlag);
+		FALLBACK_IF(ctx->Point.SmoothFlag);
+	}
+
+	return R300_FALLBACK_NONE;
+}
+
+/* Rasterization-only stage: GL_TRUE defers the batch to the next stage. */
+static GLboolean r300RunNonTCLRender(GLcontext * ctx,
+				     struct tnl_pipeline_stage *stage)
+{
+	if (RADEON_DEBUG & DEBUG_PRIMS)
+		fprintf(stderr, "%s\n", __FUNCTION__);
+
+	/* Any fallback at or above RAST means we must not draw here. */
+	return (r300Fallback(ctx) >= R300_FALLBACK_RAST)
+	    ? GL_TRUE : r300RunRender(ctx, stage);
+}
+
+/*
+ * Hardware TCL stage.  Latches future_hw_tcl_on into hw_tcl_on, then renders
+ * unless TCL is off, a fallback applies or the current vertex program is not
+ * native; in those cases hw_tcl_on ends up false and GL_TRUE is returned.
+ */
+static GLboolean r300RunTCLRender(GLcontext * ctx,
+				  struct tnl_pipeline_stage *stage)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+
+	hw_tcl_on = future_hw_tcl_on;
+	if (RADEON_DEBUG & DEBUG_PRIMS)
+		fprintf(stderr, "%s\n", __FUNCTION__);
+
+	if (hw_tcl_on == GL_FALSE)
+		return GL_TRUE;
+	if (r300Fallback(ctx) >= R300_FALLBACK_TCL)
+		goto no_hw_tcl;
+
+	r300UpdateShaders(rmesa);
+	if (((struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx))->native
+	    == GL_FALSE)
+		goto no_hw_tcl;
+
+	return r300RunRender(ctx, stage);
+no_hw_tcl:
+	hw_tcl_on = GL_FALSE;
+	return GL_TRUE;
+}
+
+const struct tnl_pipeline_stage _r300_render_stage = {
+	"r300 Hardware Rasterization",	/* presumably the stage name */
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	r300RunNonTCLRender	/* presumably the run hook - TODO confirm */
+};
+
+const struct tnl_pipeline_stage _r300_tcl_stage = {
+	"r300 Hardware Transform, Clipping and Lighting",	/* name? */
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	r300RunTCLRender	/* presumably the run hook - TODO confirm */
+};
diff --git a/r300/r300_shader.c b/r300/r300_shader.c
new file mode 100644
index 0000000..59fe17b
--- /dev/null
+++ b/r300/r300_shader.c
@@ -0,0 +1,73 @@
+#include "glheader.h"
+#include "macros.h"
+#include "enums.h"
+
+#include "program.h"
+#include "tnl/tnl.h"
+#include "r300_context.h"
+#include "r300_fragprog.h"
+
+/* NewProgram hook: allocate the driver program object; NULL on failure. */
+static struct gl_program *r300NewProgram(GLcontext * ctx, GLenum target,
+					 GLuint id)
+{
+	struct r300_vertex_program_cont *vp;
+	struct r300_fragment_program *fp;
+	switch (target) {
+	case GL_VERTEX_STATE_PROGRAM_NV:
+	case GL_VERTEX_PROGRAM_ARB:
+		vp = CALLOC_STRUCT(r300_vertex_program_cont);
+		if (!vp)
+			break;	/* out of memory */
+		return _mesa_init_vertex_program(ctx, &vp->mesa_program,
+						 target, id);
+	case GL_FRAGMENT_PROGRAM_ARB:
+	case GL_FRAGMENT_PROGRAM_NV:
+		fp = CALLOC_STRUCT(r300_fragment_program);
+		if (!fp)
+			break;	/* out of memory */
+		fp->ctx = ctx;	/* both fragment targets get the back pointer */
+		return _mesa_init_fragment_program(ctx, &fp->mesa_program,
+						   target, id);
+	default:
+		_mesa_problem(ctx, "Bad target in r300NewProgram");
+	}
+	return NULL;
+}
+
+static void r300DeleteProgram(GLcontext * ctx, struct gl_program *prog)
+{
+	_mesa_delete_program(ctx, prog);	/* no driver-side cleanup here */
+}
+
+/* Program text changed: reset what the driver derived from the old text. */
+static void
+r300ProgramStringNotify(GLcontext * ctx, GLenum target, struct gl_program *prog)
+{
+	if (target == GL_VERTEX_PROGRAM_ARB) {
+		struct r300_vertex_program_cont *cont = (void *)prog;
+
+		cont->progs = NULL;
+	} else if (target == GL_FRAGMENT_PROGRAM_ARB) {
+		struct r300_fragment_program *frag =
+		    (struct r300_fragment_program *)prog;
+
+		frag->translated = GL_FALSE;
+	}
+	/* need this for tcl fallbacks */
+	_tnl_program_string(ctx, target, prog);
+}
+
+static GLboolean
+r300IsProgramNative(GLcontext * ctx, GLenum target, struct gl_program *prog)
+{
+	return 1;	/* unconditionally reports the program as native */
+}
+
+void r300InitShaderFuncs(struct dd_function_table *functions)
+{
+	functions->IsProgramNative = r300IsProgramNative;
+	functions->ProgramStringNotify = r300ProgramStringNotify;
+	functions->DeleteProgram = r300DeleteProgram;
+	functions->NewProgram = r300NewProgram;	/* program object hooks */
+}
diff --git a/r300/r300_state.c b/r300/r300_state.c
new file mode 100644
index 0000000..2aaf041
--- /dev/null
+++ b/r300/r300_state.c
@@ -0,0 +1,2375 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.
+Copyright (C) 2004 Nicolai Haehnle.
+All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#include "glheader.h"
+#include "state.h"
+#include "imports.h"
+#include "enums.h"
+#include "macros.h"
+#include "context.h"
+#include "dd.h"
+#include "simple_list.h"
+
+#include "api_arrayelt.h"
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_statevars.h"
+#include "vbo/vbo.h"
+#include "tnl/tnl.h"
+#include "texformat.h"
+
+#include "radeon_ioctl.h"
+#include "radeon_state.h"
+#include "r300_context.h"
+#include "r300_ioctl.h"
+#include "r300_state.h"
+#include "r300_reg.h"
+#include "r300_emit.h"
+#include "r300_fragprog.h"
+#include "r300_tex.h"
+
+#include "drirenderbuffer.h"
+
+/* BlendColor hook: clamp cf[] to bytes and pack it, fourth component first. */
+static void r300BlendColor(GLcontext * ctx, const GLfloat cf[4])
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	GLubyte rgba[4];
+	int c;
+
+	R300_STATECHANGE(rmesa, blend_color);
+	for (c = 0; c < 4; c++) {
+		CLAMPED_FLOAT_TO_UBYTE(rgba[c], cf[c]);
+	}
+
+	rmesa->hw.blend_color.cmd[1] =
+	    PACK_COLOR_8888(rgba[3], rgba[0], rgba[1], rgba[2]);
+}
+
+/**
+ * Translate a GL blend factor into the hardware blend factor code.  The
+ * same routine serves source and destination, for both RGB and alpha.
+ *
+ * \returns
+ * The unshifted register field; the caller shifts it into the source or
+ * destination position.  GL_SRC_ALPHA_SATURATE as a destination maps to
+ * zero; an unknown factor is reported on stderr and maps to one (source)
+ * or zero (destination).
+ * \todo
+ * Both source/destination special cases look like error paths; check
+ * whether they can be removed.
+ */
+static int blend_factor(GLenum factor, GLboolean is_src)
+{
+	int hw_factor;
+	switch (factor) {
+	case GL_ZERO:
+		hw_factor = R300_BLEND_GL_ZERO;
+		break;
+	case GL_ONE:
+		hw_factor = R300_BLEND_GL_ONE;
+		break;
+	case GL_DST_COLOR:
+		hw_factor = R300_BLEND_GL_DST_COLOR;
+		break;
+	case GL_ONE_MINUS_DST_COLOR:
+		hw_factor = R300_BLEND_GL_ONE_MINUS_DST_COLOR;
+		break;
+	case GL_SRC_COLOR:
+		hw_factor = R300_BLEND_GL_SRC_COLOR;
+		break;
+	case GL_ONE_MINUS_SRC_COLOR:
+		hw_factor = R300_BLEND_GL_ONE_MINUS_SRC_COLOR;
+		break;
+	case GL_SRC_ALPHA:
+		hw_factor = R300_BLEND_GL_SRC_ALPHA;
+		break;
+	case GL_ONE_MINUS_SRC_ALPHA:
+		hw_factor = R300_BLEND_GL_ONE_MINUS_SRC_ALPHA;
+		break;
+	case GL_DST_ALPHA:
+		hw_factor = R300_BLEND_GL_DST_ALPHA;
+		break;
+	case GL_ONE_MINUS_DST_ALPHA:
+		hw_factor = R300_BLEND_GL_ONE_MINUS_DST_ALPHA;
+		break;
+	case GL_SRC_ALPHA_SATURATE:
+		hw_factor = is_src ? R300_BLEND_GL_SRC_ALPHA_SATURATE
+		    : R300_BLEND_GL_ZERO;
+		break;
+	case GL_CONSTANT_COLOR:
+		hw_factor = R300_BLEND_GL_CONST_COLOR;
+		break;
+	case GL_ONE_MINUS_CONSTANT_COLOR:
+		hw_factor = R300_BLEND_GL_ONE_MINUS_CONST_COLOR;
+		break;
+	case GL_CONSTANT_ALPHA:
+		hw_factor = R300_BLEND_GL_CONST_ALPHA;
+		break;
+	case GL_ONE_MINUS_CONSTANT_ALPHA:
+		hw_factor = R300_BLEND_GL_ONE_MINUS_CONST_ALPHA;
+		break;
+	default:
+		fprintf(stderr, "unknown blend factor %x\n", factor);
+		hw_factor = is_src ? R300_BLEND_GL_ONE : R300_BLEND_GL_ZERO;
+	}
+	return hw_factor;
+}
+
+/**
+ * Sets both the blend equation and the blend function.
+ * This is done in a single
+ * function because some blend equations (i.e., \c GL_MIN and \c GL_MAX)
+ * change the interpretation of the blend function.
+ * Also, make sure that blend function and blend equation are set to their
+ * default value if color blending is not enabled, since at least blend
+ * equations GL_MIN and GL_FUNC_REVERSE_SUBTRACT will cause wrong results
+ * otherwise for unknown reasons.
+ */
+
+/* helper: pack both blend words; R300_STATECHANGE only if one changed */
+static void r300SetBlendCntl(r300ContextPtr r300, int func, int eqn,
+			     int cbits, int funcA, int eqnA)
+{
+	const GLuint ablend = eqnA | funcA;
+	GLuint cblend = eqn | func;
+
+#if 0
+	fprintf(stderr,
+		"eqnA=%08x funcA=%08x eqn=%08x func=%08x cbits=%08x\n",
+		eqnA, funcA, eqn, func, cbits);
+#endif
+
+	/* Some blend factor combinations don't seem to work when the
+	 * BLEND_NO_SEPARATE bit is set.
+	 *
+	 * Especially problematic candidates are the ONE_MINUS_* flags,
+	 * but I can't see a real pattern.
+	 */
+#if 0
+	if (ablend == cblend) {
+		cblend |= R300_BLEND_NO_SEPARATE;
+	}
+#endif
+	cblend |= cbits;
+
+	if (ablend == r300->hw.bld.cmd[R300_BLD_ABLEND] &&
+	    cblend == r300->hw.bld.cmd[R300_BLD_CBLEND])
+		return;		/* nothing changed */
+
+	R300_STATECHANGE(r300, bld);
+	r300->hw.bld.cmd[R300_BLD_ABLEND] = ablend;
+	r300->hw.bld.cmd[R300_BLD_CBLEND] = cblend;
+}
+
+static void r300SetBlendState(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	int func = (R300_BLEND_GL_ONE << R300_SRC_BLEND_SHIFT) |
+	    (R300_BLEND_GL_ZERO << R300_DST_BLEND_SHIFT);
+	int eqn = R300_COMB_FCN_ADD_CLAMP;
+	int funcA = (R300_BLEND_GL_ONE << R300_SRC_BLEND_SHIFT) |
+	    (R300_BLEND_GL_ZERO << R300_DST_BLEND_SHIFT);
+	int eqnA = R300_COMB_FCN_ADD_CLAMP;
+	/* Logic op active or blending off: program the ONE/ZERO, ADD defaults. */
+	if (RGBA_LOGICOP_ENABLED(ctx) || !ctx->Color.BlendEnabled) {
+		r300SetBlendCntl(r300, func, eqn, 0, func, eqn);
+		return;
+	}
+	/* RGB factors and equation. */
+	func =
+	    (blend_factor(ctx->Color.BlendSrcRGB, GL_TRUE) <<
+	     R300_SRC_BLEND_SHIFT) | (blend_factor(ctx->Color.BlendDstRGB,
+						   GL_FALSE) <<
+				      R300_DST_BLEND_SHIFT);
+
+	switch (ctx->Color.BlendEquationRGB) {
+	case GL_FUNC_ADD:
+		eqn = R300_COMB_FCN_ADD_CLAMP;
+		break;
+
+	case GL_FUNC_SUBTRACT:
+		eqn = R300_COMB_FCN_SUB_CLAMP;
+		break;
+
+	case GL_FUNC_REVERSE_SUBTRACT:
+		eqn = R300_COMB_FCN_RSUB_CLAMP;
+		break;
+		/* For MIN and MAX both factors are forced to ONE. */
+	case GL_MIN:
+		eqn = R300_COMB_FCN_MIN;
+		func = (R300_BLEND_GL_ONE << R300_SRC_BLEND_SHIFT) |
+		    (R300_BLEND_GL_ONE << R300_DST_BLEND_SHIFT);
+		break;
+
+	case GL_MAX:
+		eqn = R300_COMB_FCN_MAX;
+		func = (R300_BLEND_GL_ONE << R300_SRC_BLEND_SHIFT) |
+		    (R300_BLEND_GL_ONE << R300_DST_BLEND_SHIFT);
+		break;
+
+	default:
+		fprintf(stderr,
+			"[%s:%u] Invalid RGB blend equation (0x%04x).\n",
+			__FUNCTION__, __LINE__, ctx->Color.BlendEquationRGB);
+		return;
+	}
+	/* Alpha factors and equation, translated the same way. */
+	funcA =
+	    (blend_factor(ctx->Color.BlendSrcA, GL_TRUE) <<
+	     R300_SRC_BLEND_SHIFT) | (blend_factor(ctx->Color.BlendDstA,
+						   GL_FALSE) <<
+				      R300_DST_BLEND_SHIFT);
+
+	switch (ctx->Color.BlendEquationA) {
+	case GL_FUNC_ADD:
+		eqnA = R300_COMB_FCN_ADD_CLAMP;
+		break;
+
+	case GL_FUNC_SUBTRACT:
+		eqnA = R300_COMB_FCN_SUB_CLAMP;
+		break;
+
+	case GL_FUNC_REVERSE_SUBTRACT:
+		eqnA = R300_COMB_FCN_RSUB_CLAMP;
+		break;
+
+	case GL_MIN:
+		eqnA = R300_COMB_FCN_MIN;
+		funcA = (R300_BLEND_GL_ONE << R300_SRC_BLEND_SHIFT) |
+		    (R300_BLEND_GL_ONE << R300_DST_BLEND_SHIFT);
+		break;
+
+	case GL_MAX:
+		eqnA = R300_COMB_FCN_MAX;
+		funcA = (R300_BLEND_GL_ONE << R300_SRC_BLEND_SHIFT) |
+		    (R300_BLEND_GL_ONE << R300_DST_BLEND_SHIFT);
+		break;
+
+	default:
+		fprintf(stderr,
+			"[%s:%u] Invalid A blend equation (0x%04x).\n",
+			__FUNCTION__, __LINE__, ctx->Color.BlendEquationA);
+		return;
+	}
+
+	r300SetBlendCntl(r300,
+			 func, eqn,
+			 R300_BLEND_UNKNOWN | R300_BLEND_ENABLE, funcA, eqnA);
+}
+
+static void r300BlendEquationSeparate(GLcontext * ctx,
+				      GLenum modeRGB, GLenum modeA)
+{
+	r300SetBlendState(ctx);	/* modes are re-read from ctx->Color */
+}
+
+static void r300BlendFuncSeparate(GLcontext * ctx,
+				  GLenum sfactorRGB, GLenum dfactorRGB,
+				  GLenum sfactorA, GLenum dfactorA)
+{
+	r300SetBlendState(ctx);	/* factors are re-read from ctx->Color */
+}
+
+/**
+ * Recompute the cull register from ctx->Polygon (0 when culling is off).
+ */
+static void r300UpdateCulling(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	uint32_t cull = 0;
+	R300_STATECHANGE(r300, cul);
+	if (ctx->Polygon.CullFlag) {
+		switch (ctx->Polygon.CullFaceMode) {
+		case GL_FRONT_AND_BACK:
+			cull = R300_CULL_FRONT | R300_CULL_BACK;
+			break;
+		case GL_FRONT:
+			cull = R300_CULL_FRONT;
+			break;
+		default:
+			cull = R300_CULL_BACK;
+		}
+		cull |= (ctx->Polygon.FrontFace == GL_CW) ?
+		    R300_FRONT_FACE_CW : R300_FRONT_FACE_CCW;
+	}
+	r300->hw.cul.cmd[R300_CUL_CULL] = cull;
+}
+
+/*
+ * Update register R300_RB3D_EARLY_Z (0x4F14).
+ *
+ * Early Z is enabled only while the depth test is effective (enabled and
+ * not GL_NEVER) and the alpha test is not (disabled, or GL_ALWAYS).  In
+ * every other combination it is disabled.
+ */
+static void r300SetEarlyZState(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	const GLboolean alpha_test_active =
+	    ctx->Color.AlphaEnabled && ctx->Color.AlphaFunc != GL_ALWAYS;
+	const GLboolean depth_test_active =
+	    ctx->Depth.Test && ctx->Depth.Func != GL_NEVER;
+
+	R300_STATECHANGE(r300, zstencil_format);
+
+	if (depth_test_active && !alpha_test_active)
+		r300->hw.zstencil_format.cmd[2] = R300_EARLY_Z_ENABLE;
+	else
+		r300->hw.zstencil_format.cmd[2] = R300_EARLY_Z_DISABLE;
+}
+
+/*
+ * Program the alpha test from ctx->Color; it is turned off when disabled or
+ * when the function is GL_ALWAYS.  Early Z is refreshed afterwards.
+ */
+static void r300SetAlphaState(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	GLboolean test_active = ctx->Color.AlphaEnabled;
+	uint32_t func_bits = 0x0;
+	uint32_t alpha_test = 0x0;
+	GLubyte ref;
+
+	CLAMPED_FLOAT_TO_UBYTE(ref, ctx->Color.AlphaRef);
+
+	switch (ctx->Color.AlphaFunc) {
+	case GL_NEVER:
+		func_bits = R300_ALPHA_TEST_FAIL;
+		break;
+	case GL_LESS:
+		func_bits = R300_ALPHA_TEST_LESS;
+		break;
+	case GL_EQUAL:
+		func_bits = R300_ALPHA_TEST_EQUAL;
+		break;
+	case GL_LEQUAL:
+		func_bits = R300_ALPHA_TEST_LEQUAL;
+		break;
+	case GL_GREATER:
+		func_bits = R300_ALPHA_TEST_GREATER;
+		break;
+	case GL_NOTEQUAL:
+		func_bits = R300_ALPHA_TEST_NEQUAL;
+		break;
+	case GL_GEQUAL:
+		func_bits = R300_ALPHA_TEST_GEQUAL;
+		break;
+	case GL_ALWAYS:
+		test_active = GL_FALSE;
+		break;
+	}
+
+	if (test_active)
+		alpha_test = func_bits | R300_ALPHA_TEST_ENABLE |
+		    (ref & R300_REF_ALPHA_MASK);
+
+	R300_STATECHANGE(r300, at);
+	r300->hw.at.cmd[R300_AT_ALPHA_TEST] = alpha_test;
+	r300SetEarlyZState(ctx);
+}
+
+static void r300AlphaFunc(GLcontext * ctx, GLenum func, GLfloat ref)
+{
+	(void)func;	/* both values are re-read from ctx->Color instead */
+	(void)ref;
+	r300SetAlphaState(ctx);
+}
+
+/*
+ * Translate a GL comparison function into the R300 Z/stencil function code.
+ * GL_NEVER..GL_ALWAYS are the consecutive enum values 0x0200..0x0207, which
+ * lets the function index a table; any other value translates to 0.
+ */
+static int translate_func(int func)
+{
+	static const int zs_func[8] = {
+		R300_ZS_NEVER,		/* GL_NEVER */
+		R300_ZS_LESS,		/* GL_LESS */
+		R300_ZS_EQUAL,		/* GL_EQUAL */
+		R300_ZS_LEQUAL,		/* GL_LEQUAL */
+		R300_ZS_GREATER,	/* GL_GREATER */
+		R300_ZS_NOTEQUAL,	/* GL_NOTEQUAL */
+		R300_ZS_GEQUAL,		/* GL_GEQUAL */
+		R300_ZS_ALWAYS		/* GL_ALWAYS */
+	};
+
+	if (func < GL_NEVER || func > GL_ALWAYS)
+		return 0;
+	return zs_func[func - GL_NEVER];
+}
+
+/* Rebuild the depth part of the Z/stencil control words from ctx->Depth;
+ * only the stencil-enable bit of CNTL_0 is carried over. */
+static void r300SetDepthState(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	const GLboolean depth_on =
+	    ctx->Depth.Test && ctx->Depth.Func != GL_NEVER;
+	int zfunc;
+
+	R300_STATECHANGE(r300, zs);
+	r300->hw.zs.cmd[R300_ZS_CNTL_0] &= R300_RB3D_STENCIL_ENABLE;
+	r300->hw.zs.cmd[R300_ZS_CNTL_1] &=
+	    ~(R300_ZS_MASK << R300_RB3D_ZS1_DEPTH_FUNC_SHIFT);
+
+	if (!depth_on) {
+		/* Test off (or GL_NEVER): Z disabled, function "never". */
+		zfunc = translate_func(GL_NEVER);
+		r300->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_RB3D_Z_DISABLED_1;
+	} else {
+		zfunc = translate_func(ctx->Depth.Func);
+		r300->hw.zs.cmd[R300_ZS_CNTL_0] |= ctx->Depth.Mask ?
+		    R300_RB3D_Z_TEST_AND_WRITE : R300_RB3D_Z_TEST;
+	}
+	r300->hw.zs.cmd[R300_ZS_CNTL_1] |=
+	    zfunc << R300_RB3D_ZS1_DEPTH_FUNC_SHIFT;
+	r300SetEarlyZState(ctx);
+}
+
+static void r300ClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq );
+
+/**
+ * Handle glEnable()/glDisable() for the capabilities this driver tracks;
+ * anything else is forwarded to radeonEnable().
+ * \note Mesa already filters redundant calls to glEnable/glDisable.
+ */
+static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	GLuint p;
+	if (RADEON_DEBUG & DEBUG_STATE)
+		fprintf(stderr, "%s( %s = %s )\n", __FUNCTION__,
+			_mesa_lookup_enum_by_nr(cap),
+			state ? "GL_TRUE" : "GL_FALSE");
+
+	switch (cap) {
+		/* Fast track this one...
+		 */
+	case GL_TEXTURE_1D:
+	case GL_TEXTURE_2D:
+	case GL_TEXTURE_3D:
+		break;
+
+	case GL_FOG:
+		R300_STATECHANGE(r300, fogs);
+		if (state) {
+			r300->hw.fogs.cmd[R300_FOGS_STATE] |= R300_FOG_ENABLE;
+			/* Push every current fog parameter through Fogfv. */
+			ctx->Driver.Fogfv(ctx, GL_FOG_MODE, NULL);
+			ctx->Driver.Fogfv(ctx, GL_FOG_DENSITY,
+					  &ctx->Fog.Density);
+			ctx->Driver.Fogfv(ctx, GL_FOG_START, &ctx->Fog.Start);
+			ctx->Driver.Fogfv(ctx, GL_FOG_END, &ctx->Fog.End);
+			ctx->Driver.Fogfv(ctx, GL_FOG_COLOR, ctx->Fog.Color);
+		} else {
+			r300->hw.fogs.cmd[R300_FOGS_STATE] &= ~R300_FOG_ENABLE;
+		}
+
+		break;
+
+	case GL_ALPHA_TEST:
+		r300SetAlphaState(ctx);
+		break;
+
+	case GL_BLEND:
+	case GL_COLOR_LOGIC_OP:
+		r300SetBlendState(ctx);
+		break;
+
+		/* User clip planes: toggle the per-plane VAP UCP enable bit. */
+	case GL_CLIP_PLANE0:
+	case GL_CLIP_PLANE1:
+	case GL_CLIP_PLANE2:
+	case GL_CLIP_PLANE3:
+	case GL_CLIP_PLANE4:
+	case GL_CLIP_PLANE5:
+		/* no VAP UCP on non-TCL chipsets */
+		if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+			return;
+
+		p = cap-GL_CLIP_PLANE0;
+		R300_STATECHANGE( r300, vap_clip_cntl );
+		if (state) {
+			r300->hw.vap_clip_cntl.cmd[1] |= (R300_VAP_UCP_ENABLE_0<<p);
+			r300ClipPlane( ctx, cap, NULL );
+		}
+		else {
+			r300->hw.vap_clip_cntl.cmd[1] &= ~(R300_VAP_UCP_ENABLE_0<<p);
+		}
+		break;
+	case GL_DEPTH_TEST:
+		r300SetDepthState(ctx);
+		break;
+
+	case GL_STENCIL_TEST:
+		if (r300->state.stencil.hw_stencil) {
+			R300_STATECHANGE(r300, zs);
+			if (state) {
+				r300->hw.zs.cmd[R300_ZS_CNTL_0] |=
+				    R300_RB3D_STENCIL_ENABLE;
+			} else {
+				r300->hw.zs.cmd[R300_ZS_CNTL_0] &=
+				    ~R300_RB3D_STENCIL_ENABLE;
+			}
+		} else {
+#if R200_MERGED
+			FALLBACK(&r300->radeon, RADEON_FALLBACK_STENCIL, state);
+#endif
+		}
+		break;
+
+	case GL_CULL_FACE:
+		r300UpdateCulling(ctx);
+		break;
+
+	case GL_POLYGON_OFFSET_POINT:
+	case GL_POLYGON_OFFSET_LINE:
+		break;
+
+	case GL_POLYGON_OFFSET_FILL:
+		R300_STATECHANGE(r300, occlusion_cntl);	/* bits 0-1 toggled */
+		if (state) {
+			r300->hw.occlusion_cntl.cmd[1] |= (3 << 0);
+		} else {
+			r300->hw.occlusion_cntl.cmd[1] &= ~(3 << 0);
+		}
+		break;
+	default:
+		radeonEnable(ctx, cap, state);
+		return;
+	}
+}
+
+/*
+ * Program the polygon-mode register from ctx->Polygon.
+ *
+ * With both faces set to GL_FILL the value is 0.  Otherwise the per-face
+ * modes are encoded, with front and back swapped when the GL front face is
+ * not GL_CCW.  A face mode other than LINE, POINT or FILL contributes no
+ * bits.  The state atom is only touched when the value actually changes.
+ */
+static void r300UpdatePolygonMode(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	const GLboolean ccw_front = (ctx->Polygon.FrontFace == GL_CCW);
+	const GLenum front = ccw_front ?
+	    ctx->Polygon.FrontMode : ctx->Polygon.BackMode;
+	const GLenum back = ccw_front ?
+	    ctx->Polygon.BackMode : ctx->Polygon.FrontMode;
+	uint32_t mode = 0;
+
+	/* Polygon mode is only switched on when some face is not filled. */
+	if (ctx->Polygon.FrontMode != GL_FILL ||
+	    ctx->Polygon.BackMode != GL_FILL) {
+		mode = R300_PM_ENABLED;
+
+		/* bits for the "front" slot */
+		if (front == GL_LINE)
+			mode |= R300_PM_FRONT_LINE;
+		else if (front == GL_POINT)
+			mode |= R300_PM_FRONT_POINT;
+		else if (front == GL_FILL)
+			mode |= R300_PM_FRONT_FILL;
+
+		/* bits for the "back" slot */
+		if (back == GL_LINE)
+			mode |= R300_PM_BACK_LINE;
+		else if (back == GL_POINT)
+			mode |= R300_PM_BACK_POINT;
+		else if (back == GL_FILL)
+			mode |= R300_PM_BACK_FILL;
+	}
+
+	/* Leave the atom alone if the register already holds this value. */
+	if (r300->hw.polygon_mode.cmd[1] == mode)
+		return;
+
+	R300_STATECHANGE(r300, polygon_mode);
+
+	r300->hw.polygon_mode.cmd[1] = mode;
+
+}
+
+/**
+ * Change the culling mode.
+ *
+ * The \p mode argument is not consulted; the work is delegated to
+ * r300UpdateCulling().
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r300CullFace(GLcontext * ctx, GLenum mode)
+{
+	r300UpdateCulling(ctx);
+
+	(void)mode;
+}
+
+/**
+ * Change the polygon orientation.
+ *
+ * The \p mode argument is not consulted.  Culling is refreshed first, then
+ * the polygon mode (which reads ctx->Polygon.FrontFace).
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r300FrontFace(GLcontext * ctx, GLenum mode)
+{
+	r300UpdateCulling(ctx);
+	r300UpdatePolygonMode(ctx);
+
+	(void)mode;
+}
+
+/**
+ * Change the depth testing function.
+ *
+ * The \p func argument is not consulted; r300SetDepthState() is called to
+ * do the update.
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r300DepthFunc(GLcontext * ctx, GLenum func)
+{
+	r300SetDepthState(ctx);
+
+	(void)func;
+}
+
+/**
+ * Enable/Disable depth writing.
+ *
+ * The \p mask argument is not consulted; r300SetDepthState() is called to
+ * do the update.
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r300DepthMask(GLcontext * ctx, GLboolean mask)
+{
+	r300SetDepthState(ctx);
+
+	(void)mask;
+}
+
+/**
+ * Handle glColorMask().
+ *
+ * Each non-zero channel flag contributes its R300_COLORMASK0_* bit.  The
+ * cmk atom is only updated when the resulting word differs from the value
+ * currently stored in it.
+ */
+static void r300ColorMask(GLcontext * ctx,
+			  GLboolean r, GLboolean g, GLboolean b, GLboolean a)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	int channels = 0;
+
+	if (r)
+		channels |= R300_COLORMASK0_R;
+	if (g)
+		channels |= R300_COLORMASK0_G;
+	if (b)
+		channels |= R300_COLORMASK0_B;
+	if (a)
+		channels |= R300_COLORMASK0_A;
+
+	if (channels == r300->hw.cmk.cmd[R300_CMK_COLORMASK])
+		return;
+
+	R300_STATECHANGE(r300, cmk);
+	r300->hw.cmk.cmd[R300_CMK_COLORMASK] = channels;
+}
+
+/* =============================================================
+ * Fog
+ */
+/**
+ * Mirror a change of GL fog state into the hardware fog atoms.
+ *
+ * All values are read from ctx->Fog; \p param is never dereferenced and may
+ * be NULL.
+ *
+ * \param ctx   GL context.
+ * \param pname fog parameter that changed (GL_FOG_MODE, GL_FOG_DENSITY,
+ *              GL_FOG_START, GL_FOG_END, GL_FOG_COLOR, GL_FOG_COORD_SRC);
+ *              any other value is ignored.
+ * \param param unused.
+ *
+ * The mode bits are written to the fogs atom, the colour to fogc and the
+ * scale/start pair to fogp.  fogp is only flagged when one of its two
+ * values actually changes.
+ */
+static void r300Fogfv(GLcontext * ctx, GLenum pname, const GLfloat * param)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	union {
+		int i;
+		float f;
+	} fogScale, fogStart;
+
+	(void)param;
+
+	/* Start from the cached values so untouched fields compare equal. */
+	fogScale.i = r300->hw.fogp.cmd[R300_FOGP_SCALE];
+	fogStart.i = r300->hw.fogp.cmd[R300_FOGP_START];
+
+	switch (pname) {
+	case GL_FOG_MODE:
+		if (!ctx->Fog.Enabled)
+			return;
+		switch (ctx->Fog.Mode) {
+		case GL_LINEAR:
+			R300_STATECHANGE(r300, fogs);
+			r300->hw.fogs.cmd[R300_FOGS_STATE] =
+			    (r300->hw.fogs.
+			     cmd[R300_FOGS_STATE] & ~R300_FOG_MODE_MASK) |
+			    R300_FOG_MODE_LINEAR;
+
+			/* Avoid dividing by zero for a degenerate range. */
+			if (ctx->Fog.Start == ctx->Fog.End) {
+				fogScale.f = -1.0;
+				fogStart.f = 1.0;
+			} else {
+				fogScale.f =
+				    1.0 / (ctx->Fog.End - ctx->Fog.Start);
+				fogStart.f =
+				    -ctx->Fog.Start / (ctx->Fog.End -
+						       ctx->Fog.Start);
+			}
+			break;
+		case GL_EXP:
+			R300_STATECHANGE(r300, fogs);
+			r300->hw.fogs.cmd[R300_FOGS_STATE] =
+			    (r300->hw.fogs.
+			     cmd[R300_FOGS_STATE] & ~R300_FOG_MODE_MASK) |
+			    R300_FOG_MODE_EXP;
+			fogScale.f = 0.0933 * ctx->Fog.Density;
+			fogStart.f = 0.0;
+			break;
+		case GL_EXP2:
+			R300_STATECHANGE(r300, fogs);
+			r300->hw.fogs.cmd[R300_FOGS_STATE] =
+			    (r300->hw.fogs.
+			     cmd[R300_FOGS_STATE] & ~R300_FOG_MODE_MASK) |
+			    R300_FOG_MODE_EXP2;
+			fogScale.f = 0.3 * ctx->Fog.Density;
+			fogStart.f = 0.0;
+			/*
+			 * Must not fall into the default case: that returns
+			 * before the new scale/start reach the fogp atom.
+			 */
+			break;
+		default:
+			return;
+		}
+		break;
+	case GL_FOG_DENSITY:
+		switch (ctx->Fog.Mode) {
+		case GL_EXP:
+			fogScale.f = 0.0933 * ctx->Fog.Density;
+			fogStart.f = 0.0;
+			break;
+		case GL_EXP2:
+			fogScale.f = 0.3 * ctx->Fog.Density;
+			fogStart.f = 0.0;
+			break;
+		default:
+			break;
+		}
+		break;
+	case GL_FOG_START:
+	case GL_FOG_END:
+		if (ctx->Fog.Mode == GL_LINEAR) {
+			if (ctx->Fog.Start == ctx->Fog.End) {
+				fogScale.f = -1.0;
+				fogStart.f = 1.0;
+			} else {
+				fogScale.f =
+				    1.0 / (ctx->Fog.End - ctx->Fog.Start);
+				fogStart.f =
+				    -ctx->Fog.Start / (ctx->Fog.End -
+						       ctx->Fog.Start);
+			}
+		}
+		break;
+	case GL_FOG_COLOR:
+		/* Each channel is scaled by 1023 and masked to 10 bits. */
+		R300_STATECHANGE(r300, fogc);
+		r300->hw.fogc.cmd[R300_FOGC_R] =
+		    (GLuint) (ctx->Fog.Color[0] * 1023.0F) & 0x3FF;
+		r300->hw.fogc.cmd[R300_FOGC_G] =
+		    (GLuint) (ctx->Fog.Color[1] * 1023.0F) & 0x3FF;
+		r300->hw.fogc.cmd[R300_FOGC_B] =
+		    (GLuint) (ctx->Fog.Color[2] * 1023.0F) & 0x3FF;
+		break;
+	case GL_FOG_COORD_SRC:
+		break;
+	default:
+		return;
+	}
+
+	if (fogScale.i != r300->hw.fogp.cmd[R300_FOGP_SCALE] ||
+	    fogStart.i != r300->hw.fogp.cmd[R300_FOGP_START]) {
+		R300_STATECHANGE(r300, fogp);
+		r300->hw.fogp.cmd[R300_FOGP_SCALE] = fogScale.i;
+		r300->hw.fogp.cmd[R300_FOGP_START] = fogStart.i;
+	}
+}
+
+/* =============================================================
+ * Point state
+ */
+/**
+ * Program the point size into the ps atom.
+ *
+ * The \p size argument is ignored; ctx->Point._Size is used instead.  The
+ * value is multiplied by 6, truncated to int and written to both the X and
+ * Y fields of the point size word.
+ */
+static void r300PointSize(GLcontext * ctx, GLfloat size)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	const GLfloat effective = ctx->Point._Size;
+	const int hw_size = (int)(effective * 6);
+
+	(void)size;
+
+	R300_STATECHANGE(r300, ps);
+	r300->hw.ps.cmd[R300_PS_POINTSIZE] =
+	    (hw_size << R300_POINTSIZE_X_SHIFT) |
+	    (hw_size << R300_POINTSIZE_Y_SHIFT);
+}
+
+/* =============================================================
+ * Line state
+ */
+/**
+ * Program the line width into the lcntl atom.
+ *
+ * The \p widthf argument is ignored; ctx->Line._Width is used instead.  The
+ * value is multiplied by 6.0, truncated to int, and R300_LINE_CNT_VE is
+ * OR-ed into the same word.
+ */
+static void r300LineWidth(GLcontext * ctx, GLfloat widthf)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	const GLfloat effective = ctx->Line._Width;
+
+	(void)widthf;
+
+	R300_STATECHANGE(r300, lcntl);
+	r300->hw.lcntl.cmd[1] = (int)(effective * 6.0);
+	r300->hw.lcntl.cmd[1] |= R300_LINE_CNT_VE;
+}
+
+/**
+ * Change the polygon rasterization mode.
+ *
+ * Neither \p face nor \p mode is consulted; r300UpdatePolygonMode() rebuilds
+ * the hardware word from the context.
+ */
+static void r300PolygonMode(GLcontext * ctx, GLenum face, GLenum mode)
+{
+	r300UpdatePolygonMode(ctx);
+
+	(void)face;
+	(void)mode;
+}
+
+/* =============================================================
+ * Stencil
+ */
+
+/**
+ * Translate a GL stencil operation into the matching R300_ZS_* code.
+ *
+ * \param op GL stencil operation enum.
+ * \return the R300_ZS_* value for \p op; unknown operations emit a one-time
+ *         warning and are treated as R300_ZS_KEEP.
+ */
+static int translate_stencil_op(int op)
+{
+	switch (op) {
+	case GL_KEEP:
+		return R300_ZS_KEEP;
+	case GL_ZERO:
+		return R300_ZS_ZERO;
+	case GL_REPLACE:
+		return R300_ZS_REPLACE;
+	case GL_INCR:
+		return R300_ZS_INCR;
+	case GL_DECR:
+		return R300_ZS_DECR;
+	case GL_INCR_WRAP_EXT:
+		return R300_ZS_INCR_WRAP;
+	case GL_DECR_WRAP_EXT:
+		return R300_ZS_DECR_WRAP;
+	case GL_INVERT:
+		return R300_ZS_INVERT;
+	default:
+		WARN_ONCE("Do not know how to translate stencil op");
+		return R300_ZS_KEEP;
+	}
+}
+
+/**
+ * Select flat or smooth shading in the shade atom.
+ *
+ * \param mode GL_FLAT or GL_SMOOTH; any other value leaves the stored word
+ *             unchanged.
+ */
+static void r300ShadeModel(GLcontext * ctx, GLenum mode)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+
+	/* R300_STATECHANGE runs before the mode is validated. */
+	R300_STATECHANGE(rmesa, shade);
+
+	if (mode == GL_FLAT)
+		rmesa->hw.shade.cmd[2] = R300_RE_SHADE_MODEL_FLAT;
+	else if (mode == GL_SMOOTH)
+		rmesa->hw.shade.cmd[2] = R300_RE_SHADE_MODEL_SMOOTH;
+}
+
+/**
+ * Update the stencil compare functions, reference value and value mask.
+ *
+ * The arguments other than \p ctx are not consulted; everything is read
+ * from ctx->Stencil.  The front function always comes from Function[0].  The
+ * back function comes from Function[1] when ctx->Stencil._TestTwoSide is
+ * set and otherwise repeats the front function.  Reference and value mask
+ * are taken from index 0 and truncated to 8 bits.
+ */
+static void r300StencilFuncSeparate(GLcontext * ctx, GLenum face,
+				    GLenum func, GLint ref, GLuint mask)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	const GLuint ref_bits =
+	    (ctx->Stencil.Ref[0] & 0xff) << R300_RB3D_ZS2_STENCIL_REF_SHIFT;
+	const GLuint mask_bits =
+	    (ctx->Stencil.ValueMask[0] & 0xff) <<
+	    R300_RB3D_ZS2_STENCIL_MASK_SHIFT;
+	GLuint front_func;
+	GLuint back_func;
+
+	R300_STATECHANGE(rmesa, zs);
+
+	/* Clear the fields that are rebuilt below. */
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] &=
+	    ~((R300_ZS_MASK << R300_RB3D_ZS1_FRONT_FUNC_SHIFT) |
+	      (R300_ZS_MASK << R300_RB3D_ZS1_BACK_FUNC_SHIFT));
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_2] &=
+	    ~((R300_RB3D_ZS2_STENCIL_MASK <<
+	       R300_RB3D_ZS2_STENCIL_REF_SHIFT) |
+	      (R300_RB3D_ZS2_STENCIL_MASK << R300_RB3D_ZS2_STENCIL_MASK_SHIFT));
+
+	front_func = translate_func(ctx->Stencil.Function[0]);
+	back_func = front_func;
+	if (ctx->Stencil._TestTwoSide)
+		back_func = translate_func(ctx->Stencil.Function[1]);
+
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
+	    (front_func << R300_RB3D_ZS1_FRONT_FUNC_SHIFT);
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
+	    (back_func << R300_RB3D_ZS1_BACK_FUNC_SHIFT);
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_2] |= ref_bits | mask_bits;
+}
+
+/**
+ * Update the stencil write mask field of the zs atom.
+ *
+ * \p face and \p mask are not consulted; the low 8 bits of
+ * ctx->Stencil.WriteMask[0] are used.
+ */
+static void r300StencilMaskSeparate(GLcontext * ctx, GLenum face, GLuint mask)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	const GLuint write_bits =
+	    (ctx->Stencil.WriteMask[0] & 0xff) <<
+	    R300_RB3D_ZS2_STENCIL_WRITE_MASK_SHIFT;
+
+	R300_STATECHANGE(rmesa, zs);
+
+	/* Clear the old write mask, then insert the new one. */
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_2] &=
+	    ~(R300_RB3D_ZS2_STENCIL_MASK <<
+	      R300_RB3D_ZS2_STENCIL_WRITE_MASK_SHIFT);
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_2] |= write_bits;
+}
+
+/**
+ * Update the stencil fail / zfail / zpass operations for both faces.
+ *
+ * The arguments other than \p ctx are not consulted; the operations are
+ * read from ctx->Stencil.  Front-face operations always come from index 0.
+ * Back-face operations come from index 1 when ctx->Stencil._TestTwoSide is
+ * set and from index 0 otherwise.
+ */
+static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
+				  GLenum fail, GLenum zfail, GLenum zpass)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	const int back = ctx->Stencil._TestTwoSide ? 1 : 0;
+
+	R300_STATECHANGE(rmesa, zs);
+
+	/*
+	 * Keep only the three compare-function fields; everything else in
+	 * the word is cleared and the op fields are rebuilt below.
+	 */
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] &=
+	    (R300_ZS_MASK << R300_RB3D_ZS1_DEPTH_FUNC_SHIFT) |
+	    (R300_ZS_MASK << R300_RB3D_ZS1_FRONT_FUNC_SHIFT) |
+	    (R300_ZS_MASK << R300_RB3D_ZS1_BACK_FUNC_SHIFT);
+
+	/* Front face. */
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
+	    (translate_stencil_op(ctx->Stencil.FailFunc[0]) <<
+	     R300_RB3D_ZS1_FRONT_FAIL_OP_SHIFT)
+	    | (translate_stencil_op(ctx->Stencil.ZFailFunc[0]) <<
+	       R300_RB3D_ZS1_FRONT_ZFAIL_OP_SHIFT)
+	    | (translate_stencil_op(ctx->Stencil.ZPassFunc[0]) <<
+	       R300_RB3D_ZS1_FRONT_ZPASS_OP_SHIFT);
+
+	/* Back face. */
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
+	    (translate_stencil_op(ctx->Stencil.FailFunc[back]) <<
+	     R300_RB3D_ZS1_BACK_FAIL_OP_SHIFT)
+	    | (translate_stencil_op(ctx->Stencil.ZFailFunc[back]) <<
+	       R300_RB3D_ZS1_BACK_ZFAIL_OP_SHIFT)
+	    | (translate_stencil_op(ctx->Stencil.ZPassFunc[back]) <<
+	       R300_RB3D_ZS1_BACK_ZPASS_OP_SHIFT);
+}
+
+/**
+ * Cache the word used for stencil clears in rmesa->state.stencil.clear.
+ *
+ * \p s is not consulted.  The word combines the low 8 bits of
+ * ctx->Stencil.Clear, a fully set value-mask field and the low 8 bits of
+ * ctx->Stencil.WriteMask[0] in the write-mask field.
+ */
+static void r300ClearStencil(GLcontext * ctx, GLint s)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	const GLuint clear_value = (GLuint) (ctx->Stencil.Clear & 0xff);
+	const GLuint value_mask =
+	    R300_RB3D_ZS2_STENCIL_MASK << R300_RB3D_ZS2_STENCIL_MASK_SHIFT;
+	const GLuint write_mask =
+	    (ctx->Stencil.WriteMask[0] & 0xff) <<
+	    R300_RB3D_ZS2_STENCIL_WRITE_MASK_SHIFT;
+
+	rmesa->state.stencil.clear = clear_value | value_mask | write_mask;
+}
+
+/* =============================================================
+ * Window position and viewport transformation
+ */
+
+/*
+ * To correctly position primitives:
+ */
+#define SUBPIXEL_X 0.125
+#define SUBPIXEL_Y 0.125
+
+/**
+ * Recompute the viewport transform and store it in the vpt atom.
+ *
+ * Scale and translation come from ctx->Viewport._WindowMap.  X/Y
+ * translation additionally includes the drawable position (zero when no
+ * drawable is bound) and the SUBPIXEL_X / SUBPIXEL_Y bias, the Y axis is
+ * negated, and Z is multiplied by rmesa->state.depth.scale.
+ */
+static void r300UpdateWindow(GLcontext * ctx)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	__DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+	const GLfloat *m = ctx->Viewport._WindowMap.m;
+	GLfloat xoffset = 0;
+	GLfloat yoffset = 0;
+	GLfloat sx, sy, sz;
+	GLfloat tx, ty, tz;
+
+	if (dPriv) {
+		xoffset = (GLfloat) dPriv->x;
+		yoffset = (GLfloat) dPriv->y + dPriv->h;
+	}
+
+	sx = m[MAT_SX];
+	sy = -m[MAT_SY];
+	sz = m[MAT_SZ] * rmesa->state.depth.scale;
+
+	tx = m[MAT_TX] + xoffset + SUBPIXEL_X;
+	ty = (-m[MAT_TY]) + yoffset + SUBPIXEL_Y;
+	tz = m[MAT_TZ] * rmesa->state.depth.scale;
+
+	R300_FIREVERTICES(rmesa);
+	R300_STATECHANGE(rmesa, vpt);
+
+	rmesa->hw.vpt.cmd[R300_VPT_XSCALE] = r300PackFloat32(sx);
+	rmesa->hw.vpt.cmd[R300_VPT_YSCALE] = r300PackFloat32(sy);
+	rmesa->hw.vpt.cmd[R300_VPT_ZSCALE] = r300PackFloat32(sz);
+	rmesa->hw.vpt.cmd[R300_VPT_XOFFSET] = r300PackFloat32(tx);
+	rmesa->hw.vpt.cmd[R300_VPT_YOFFSET] = r300PackFloat32(ty);
+	rmesa->hw.vpt.cmd[R300_VPT_ZOFFSET] = r300PackFloat32(tz);
+}
+
+/**
+ * Handle a viewport change.
+ *
+ * None of the rectangle arguments is consulted; r300UpdateWindow() rebuilds
+ * the transform from the context.  Viewport changes are deliberately not
+ * pipelined because they conflict with the window offset that
+ * r300UpdateWindow() folds into the same values.
+ */
+static void r300Viewport(GLcontext * ctx, GLint x, GLint y,
+			 GLsizei width, GLsizei height)
+{
+	(void)x;
+	(void)y;
+	(void)width;
+	(void)height;
+
+	r300UpdateWindow(ctx);
+}
+
+/**
+ * Handle a depth range change.
+ *
+ * \p nearval and \p farval are not consulted; r300UpdateWindow() rebuilds
+ * the transform from the context.
+ */
+static void r300DepthRange(GLcontext * ctx, GLclampd nearval, GLclampd farval)
+{
+	(void)nearval;
+	(void)farval;
+
+	r300UpdateWindow(ctx);
+}
+
+/**
+ * Refresh the X/Y viewport translation after the drawable moved.
+ *
+ * Only the two offset words of the vpt atom are recomputed, and the atom is
+ * only touched when one of them changes.  radeonUpdateScissor() is always
+ * called afterwards.  The drawable pointer is dereferenced unconditionally.
+ */
+void r300UpdateViewportOffset(GLcontext * ctx)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	__DRIdrawablePrivate *dPriv = ((radeonContextPtr) rmesa)->dri.drawable;
+	const GLfloat *m = ctx->Viewport._WindowMap.m;
+	GLfloat win_x = (GLfloat) dPriv->x;
+	GLfloat win_bottom = (GLfloat) dPriv->y + dPriv->h;
+	GLfloat tx = m[MAT_TX] + win_x + SUBPIXEL_X;
+	GLfloat ty = (-m[MAT_TY]) + win_bottom + SUBPIXEL_Y;
+	int stale;
+
+	stale = rmesa->hw.vpt.cmd[R300_VPT_XOFFSET] != r300PackFloat32(tx) ||
+	    rmesa->hw.vpt.cmd[R300_VPT_YOFFSET] != r300PackFloat32(ty);
+
+	if (stale) {
+		/* Note: this should also modify whatever data the context
+		 * reset code uses...
+		 */
+		R300_STATECHANGE(rmesa, vpt);
+		rmesa->hw.vpt.cmd[R300_VPT_XOFFSET] = r300PackFloat32(tx);
+		rmesa->hw.vpt.cmd[R300_VPT_YOFFSET] = r300PackFloat32(ty);
+	}
+
+	radeonUpdateScissor(ctx);
+}
+
+/**
+ * Tell the card where to render (offset, pitch).
+ * Affected by glDrawBuffer, etc.
+ *
+ * Only the cases where the draw buffer mask is exactly front-left or
+ * exactly back-left are handled; for any other mask the function returns
+ * without touching state.  The (possibly page-flipped) offset and pitch of
+ * the selected renderbuffer are written to the cb atom, together with the
+ * colour format chosen from the screen's bytes per pixel and the tiling
+ * flag from the SAREA.
+ */
+void r300UpdateDrawBuffer(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct gl_framebuffer *fb = ctx->DrawBuffer;
+	driRenderbuffer *drb;
+	int attachment;
+
+	if (fb->_ColorDrawBufferMask[0] == BUFFER_BIT_FRONT_LEFT)
+		attachment = BUFFER_FRONT_LEFT;	/* draw to front */
+	else if (fb->_ColorDrawBufferMask[0] == BUFFER_BIT_BACK_LEFT)
+		attachment = BUFFER_BACK_LEFT;	/* draw to back */
+	else
+		return;		/* drawing to multiple buffers, or none */
+
+	drb = (driRenderbuffer *) fb->Attachment[attachment].Renderbuffer;
+
+	assert(drb);
+	assert(drb->flippedPitch);
+
+	R300_STATECHANGE(r300, cb);
+
+	r300->hw.cb.cmd[R300_CB_OFFSET] =
+	    drb->flippedOffset + r300->radeon.radeonScreen->fbLocation;
+
+	r300->hw.cb.cmd[R300_CB_PITCH] = drb->flippedPitch;
+	r300->hw.cb.cmd[R300_CB_PITCH] |=
+	    (r300->radeon.radeonScreen->cpp == 4) ?
+	    R300_COLOR_FORMAT_ARGB8888 : R300_COLOR_FORMAT_RGB565;
+
+	if (r300->radeon.sarea->tiling_enabled)
+		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
+}
+
+/**
+ * Fill in the value of one driver-internal program state parameter.
+ *
+ * Only STATE_INTERNAL entries are handled, and of those only
+ * STATE_R300_WINDOW_DIMENSION and STATE_R300_TEXRECT_FACTOR; for anything
+ * else \p value is left untouched.
+ *
+ * \param ctx   GL context.
+ * \param state state index tuple; state[2] is the texture unit for
+ *              STATE_R300_TEXRECT_FACTOR.
+ * \param value receives four floats.
+ */
+static void
+r300FetchStateParameter(GLcontext * ctx,
+			const gl_state_index state[STATE_LENGTH],
+			GLfloat * value)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+
+	if (state[0] != STATE_INTERNAL)
+		return;
+
+	if (state[1] == STATE_R300_WINDOW_DIMENSION) {
+		value[0] = r300->radeon.dri.drawable->w * 0.5f;	/* width*0.5 */
+		value[1] = r300->radeon.dri.drawable->h * 0.5f;	/* height*0.5 */
+		value[2] = 0.5F;	/* for moving range [-1 1] -> [0 1] */
+		value[3] = 1.0F;	/* not used */
+	} else if (state[1] == STATE_R300_TEXRECT_FACTOR) {
+		struct gl_texture_object *rect =
+		    ctx->Texture.Unit[state[2]].CurrentRect;
+		struct gl_texture_image *base = NULL;
+
+		if (rect)
+			base = rect->Image[0][rect->BaseLevel];
+
+		/* Reciprocal of the base level size, or 1 without an image. */
+		if (base) {
+			value[0] = 1.0 / base->Width2;
+			value[1] = 1.0 / base->Height2;
+		} else {
+			value[0] = 1.0;
+			value[1] = 1.0;
+		}
+		value[2] = 1.0;
+		value[3] = 1.0;
+	}
+}
+
+/**
+ * Update R300's own internal state parameters.
+ *
+ * Runs only when \p new_state contains _NEW_BUFFERS or _NEW_PROGRAM.  Every
+ * PROGRAM_STATE_VAR entry in the current fragment program's parameter list
+ * is passed to r300FetchStateParameter(), which refreshes the internal
+ * values it knows about.
+ */
+void r300UpdateStateParameters(GLcontext * ctx, GLuint new_state)
+{
+	struct r300_fragment_program *fp;
+	struct gl_program_parameter_list *paramList;
+	GLuint idx;
+
+	if ((new_state & (_NEW_BUFFERS | _NEW_PROGRAM)) == 0)
+		return;
+
+	fp = (struct r300_fragment_program *)ctx->FragmentProgram._Current;
+	if (fp == NULL)
+		return;
+
+	paramList = fp->mesa_program.Base.Parameters;
+	if (paramList == NULL)
+		return;
+
+	for (idx = 0; idx < paramList->NumParameters; idx++) {
+		if (paramList->Parameters[idx].Type != PROGRAM_STATE_VAR)
+			continue;
+
+		r300FetchStateParameter(ctx,
+					paramList->Parameters[idx].StateIndexes,
+					paramList->ParameterValues[idx]);
+	}
+}
+
+/* =============================================================
+ * Polygon state
+ */
+/**
+ * Program the polygon offset factor and constant into the zbs atom.
+ *
+ * \p units is multiplied by 4 for a 16-bit depth buffer and by 2 for a
+ * 24-bit one (unchanged otherwise); \p factor is multiplied by 12.  The
+ * same pair is written to both the T and the W slots.
+ */
+static void r300PolygonOffset(GLcontext * ctx, GLfloat factor, GLfloat units)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	GLfloat constant = units;
+
+	if (ctx->Visual.depthBits == 16)
+		constant *= 4.0;
+	else if (ctx->Visual.depthBits == 24)
+		constant *= 2.0;
+
+	factor *= 12.0;
+
+	R300_STATECHANGE(rmesa, zbs);
+	rmesa->hw.zbs.cmd[R300_ZBS_T_FACTOR] = r300PackFloat32(factor);
+	rmesa->hw.zbs.cmd[R300_ZBS_T_CONSTANT] = r300PackFloat32(constant);
+	rmesa->hw.zbs.cmd[R300_ZBS_W_FACTOR] = r300PackFloat32(factor);
+	rmesa->hw.zbs.cmd[R300_ZBS_W_CONSTANT] = r300PackFloat32(constant);
+}
+
+/* Routing and texture-related */
+
+/* r300 doesn't handle GL_CLAMP and GL_MIRROR_CLAMP_EXT correctly when the
+ * filter is NEAREST.  Since texwrap produces the same results for GL_CLAMP
+ * and GL_CLAMP_TO_EDGE we use the latter instead.
+ * We need to recalculate wrap modes whenever the filter mode is changed
+ * because someone might do:
+ * glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+ * glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP);
+ * glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+ * Since r300 completely ignores R300_TX_CLAMP when either min or mag is
+ * nearest, it can't handle combinations where only one of them is nearest.
+ *
+ * Returns the filter word \p f, adjusted as follows when at least one of
+ * the S/T/Q wrap fields is R300_TX_CLAMP:
+ *  - neither filter nearest: unchanged;
+ *  - exactly one filter nearest: that filter is forced to linear;
+ *  - both nearest: every clamped axis becomes R300_TX_CLAMP_TO_EDGE.
+ */
+static unsigned long gen_fixed_filter(unsigned long f)
+{
+	const int wrap_shift[3] = {
+		R300_TX_WRAP_S_SHIFT,
+		R300_TX_WRAP_T_SHIFT,
+		R300_TX_WRAP_Q_SHIFT
+	};
+	unsigned long clamped_axes = 0;
+	int mag_nearest, min_nearest;
+	int axis;
+
+	/* We ignore the MIRROR bit so we don't have to do everything twice */
+	for (axis = 0; axis < 3; axis++) {
+		if ((f & ((7 - 1) << wrap_shift[axis])) ==
+		    (R300_TX_CLAMP << wrap_shift[axis]))
+			clamped_axes |= 1UL << axis;
+	}
+
+	if (!clamped_axes)
+		return f;
+
+	mag_nearest =
+	    (f & R300_TX_MAG_FILTER_MASK) == R300_TX_MAG_FILTER_NEAREST;
+	min_nearest =
+	    (f & R300_TX_MIN_FILTER_MASK) == R300_TX_MIN_FILTER_NEAREST;
+
+	/* TODO: Check for anisotropic filters too */
+	if (!mag_nearest && !min_nearest)
+		return f;
+
+	/* r300 can't handle mixed modes, hence we force nearest to linear */
+	if (mag_nearest && !min_nearest) {
+		f &= ~R300_TX_MAG_FILTER_NEAREST;
+		f |= R300_TX_MAG_FILTER_LINEAR;
+		return f;
+	}
+
+	if (min_nearest && !mag_nearest) {
+		f &= ~R300_TX_MIN_FILTER_NEAREST;
+		f |= R300_TX_MIN_FILTER_LINEAR;
+		return f;
+	}
+
+	/* Both are nearest */
+	for (axis = 0; axis < 3; axis++) {
+		if (clamped_axes & (1UL << axis)) {
+			f &= ~((7 - 1) << wrap_shift[axis]);
+			f |= R300_TX_CLAMP_TO_EDGE << wrap_shift[axis];
+		}
+	}
+	return f;
+}
+
+/**
+ * Emit texture unit state for every enabled GL texture unit and patch the
+ * texture instructions of the current fragment program to match.
+ *
+ * Enabled GL units are packed into consecutive hardware units (hw_tmu).
+ * The GL-unit to hardware-unit mapping is kept in tmu_mappings so that the
+ * image field of each fragment program texture instruction can be
+ * rewritten; instructions that reference an unmapped unit, and KIL
+ * instructions, are emitted with a zero image field.
+ *
+ * Exits the process when ctx->Const.MaxTextureUnits exceeds
+ * R300_MAX_TEXTURE_UNITS.  Returns before touching the fpt atom when no
+ * fragment program is current.
+ */
+static void r300SetupTextures(GLcontext * ctx)
+{
+	int i, mtu;
+	struct r300_tex_obj *t;
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	int hw_tmu = 0;
+	int last_hw_tmu = -1;	/* -1 translates into no setup costs for fields */
+	int tmu_mappings[R300_MAX_TEXTURE_UNITS];
+	struct r300_fragment_program *fp = (struct r300_fragment_program *)
+	    (char *)ctx->FragmentProgram._Current;
+
+	/*
+	 * Mark every slot as unmapped.  An initializer of the form
+	 * "{ -1, }" would only set element 0 and zero the rest.
+	 */
+	for (i = 0; i < R300_MAX_TEXTURE_UNITS; i++)
+		tmu_mappings[i] = -1;
+
+	R300_STATECHANGE(r300, txe);
+	R300_STATECHANGE(r300, tex.filter);
+	R300_STATECHANGE(r300, tex.filter_1);
+	R300_STATECHANGE(r300, tex.size);
+	R300_STATECHANGE(r300, tex.format);
+	R300_STATECHANGE(r300, tex.pitch);
+	R300_STATECHANGE(r300, tex.offset);
+	R300_STATECHANGE(r300, tex.chroma_key);
+	R300_STATECHANGE(r300, tex.border_color);
+
+	r300->hw.txe.cmd[R300_TXE_ENABLE] = 0x0;
+
+	mtu = r300->radeon.glCtx->Const.MaxTextureUnits;
+	if (RADEON_DEBUG & DEBUG_STATE)
+		fprintf(stderr, "mtu=%d\n", mtu);
+
+	if (mtu > R300_MAX_TEXTURE_UNITS) {
+		fprintf(stderr,
+			"Aiiee ! mtu=%d is greater than R300_MAX_TEXTURE_UNITS=%d\n",
+			mtu, R300_MAX_TEXTURE_UNITS);
+		_mesa_exit(-1);
+	}
+
+	/* We cannot let disabled tmu offsets pass DRM */
+	for (i = 0; i < mtu; i++) {
+		if (ctx->Texture.Unit[i]._ReallyEnabled) {
+
+#if 0				/* Enables old behaviour */
+			hw_tmu = i;
+#endif
+			tmu_mappings[i] = hw_tmu;
+
+			t = r300->state.texture.unit[i].texobj;
+			/* XXX questionable fix for bug 9170: */
+			if (!t)
+				continue;
+
+			if ((t->format & 0xffffff00) == 0xffffff00) {
+				WARN_ONCE
+				    ("unknown texture format (entry %x) encountered. Help me !\n",
+				     t->format & 0xff);
+			}
+
+			if (RADEON_DEBUG & DEBUG_STATE)
+				fprintf(stderr,
+					"Activating texture unit %d\n", i);
+
+			r300->hw.txe.cmd[R300_TXE_ENABLE] |= (1 << hw_tmu);
+
+			r300->hw.tex.filter.cmd[R300_TEX_VALUE_0 +
+						hw_tmu] =
+			    gen_fixed_filter(t->filter) | (hw_tmu << 28);
+			/* Currently disabled! */
+			r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] = 0x0;	//0x20501f80;
+			r300->hw.tex.size.cmd[R300_TEX_VALUE_0 + hw_tmu] =
+			    t->size;
+			r300->hw.tex.format.cmd[R300_TEX_VALUE_0 +
+						hw_tmu] = t->format;
+			r300->hw.tex.pitch.cmd[R300_TEX_VALUE_0 + hw_tmu] =
+			    t->pitch_reg;
+			r300->hw.tex.offset.cmd[R300_TEX_VALUE_0 +
+						hw_tmu] = t->offset;
+
+			if (t->offset & R300_TXO_MACRO_TILE) {
+				WARN_ONCE("macro tiling enabled!\n");
+			}
+
+			if (t->offset & R300_TXO_MICRO_TILE) {
+				WARN_ONCE("micro tiling enabled!\n");
+			}
+
+			r300->hw.tex.chroma_key.cmd[R300_TEX_VALUE_0 +
+						    hw_tmu] = 0x0;
+			r300->hw.tex.border_color.cmd[R300_TEX_VALUE_0 +
+						      hw_tmu] =
+			    t->pp_border_color;
+
+			last_hw_tmu = hw_tmu;
+
+			hw_tmu++;
+		}
+	}
+
+	/* Each packet covers the hardware units 0..last_hw_tmu. */
+	r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_FILTER_0, last_hw_tmu + 1);
+	r300->hw.tex.filter_1.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_FILTER1_0, last_hw_tmu + 1);
+	r300->hw.tex.size.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_SIZE_0, last_hw_tmu + 1);
+	r300->hw.tex.format.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_FORMAT_0, last_hw_tmu + 1);
+	r300->hw.tex.pitch.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_PITCH_0, last_hw_tmu + 1);
+	r300->hw.tex.offset.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_OFFSET_0, last_hw_tmu + 1);
+	r300->hw.tex.chroma_key.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_CHROMA_KEY_0, last_hw_tmu + 1);
+	r300->hw.tex.border_color.cmd[R300_TEX_CMD_0] =
+	    cmdpacket0(R300_TX_BORDER_COLOR_0, last_hw_tmu + 1);
+
+	if (!fp)		/* should only happen once, just after context is created */
+		return;
+
+	R300_STATECHANGE(r300, fpt);
+
+	for (i = 0; i < fp->tex.length; i++) {
+		int unit;
+		int opcode;
+		unsigned long val;
+
+		unit = fp->tex.inst[i] >> R300_FPITX_IMAGE_SHIFT;
+		unit &= 15;
+
+		val = fp->tex.inst[i];
+		val &= ~R300_FPITX_IMAGE_MASK;
+
+		opcode =
+		    (val & R300_FPITX_OPCODE_MASK) >> R300_FPITX_OPCODE_SHIFT;
+
+		/*
+		 * Only non-KIL instructions that reference a mapped unit get
+		 * an image index.  unit is a 4-bit field, so it is range
+		 * checked before indexing tmu_mappings.  An unmapped unit
+		 * means the corresponding texture image is incomplete
+		 * (e.g. incomplete mipmaps etc.).
+		 */
+		if (opcode != R300_FPITX_OP_KIL &&
+		    unit < R300_MAX_TEXTURE_UNITS && tmu_mappings[unit] >= 0)
+			val |= tmu_mappings[unit] << R300_FPITX_IMAGE_SHIFT;
+
+		r300->hw.fpt.cmd[R300_FPT_INSTR_0 + i] = val;
+	}
+
+	r300->hw.fpt.cmd[R300_FPT_CMD_0] =
+	    cmdpacket0(R300_PFS_TEXI_0, fp->tex.length);
+
+	if (RADEON_DEBUG & DEBUG_STATE)
+		fprintf(stderr, "TX_ENABLE: %08x  last_hw_tmu=%d\n",
+			r300->hw.txe.cmd[R300_TXE_ENABLE], last_hw_tmu);
+}
+
+/**
+ * Set of vertex outputs available to the rasterizer setup.  Which member is
+ * meaningful depends on hw_tcl_on (see the per-member comments).
+ */
+union r300_outputs_written {
+	GLuint vp_outputs;	/* hw_tcl_on */
+	 DECLARE_RENDERINPUTS(index_bitset);	/* !hw_tcl_on */
+};
+
+/*
+ * Test whether an output is present in ow: bit vp_result of vp_outputs when
+ * hw_tcl_on is set, otherwise tnl_attrib in index_bitset.  The ow argument
+ * is parenthesized in both branches so any lvalue expression can be passed.
+ */
+#define R300_OUTPUTS_WRITTEN_TEST(ow, vp_result, tnl_attrib) \
+	((hw_tcl_on) ? (ow).vp_outputs & (1 << (vp_result)) : \
+	RENDERINPUTS_TEST( ((ow).index_bitset), (tnl_attrib) ))
+
+/**
+ * Program the rasterizer routing atoms (ri, rc, rr) so that the inputs read
+ * by the current fragment program are fed from the available vertex
+ * outputs.
+ *
+ * The set of available outputs comes from the current vertex shader key
+ * when hw_tcl_on is set, and from r300->state.render_inputs_bitset
+ * otherwise.  Returns early (after printing a message) when no fragment
+ * program is current.  Inputs that cannot be satisfied only produce
+ * warnings, except for a missing free texcoord for WPOS, which exits.
+ */
+static void r300SetupRSUnit(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	/* I'm still unsure if these are needed */
+	GLuint interp_magic[8] = {
+		0x00,
+		R300_RS_INTERP_1_UNKNOWN,
+		R300_RS_INTERP_2_UNKNOWN,
+		R300_RS_INTERP_3_UNKNOWN,
+		0x00,
+		0x00,
+		0x00,
+		0x00
+	};
+	union r300_outputs_written OutputsWritten;
+	GLuint InputsRead;
+	int fp_reg, high_rr;
+	int in_texcoords, col_interp_nr;
+	int i;
+
+	if (hw_tcl_on)
+		OutputsWritten.vp_outputs =
+		    CURRENT_VERTEX_SHADER(ctx)->key.OutputsWritten;
+	else
+		RENDERINPUTS_COPY(OutputsWritten.index_bitset,
+				  r300->state.render_inputs_bitset);
+
+	if (ctx->FragmentProgram._Current)
+		InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
+	else {
+		fprintf(stderr, "No ctx->FragmentProgram._Current!!\n");
+		return;		/* This should only ever happen once.. */
+	}
+
+	R300_STATECHANGE(r300, ri);
+	R300_STATECHANGE(r300, rc);
+	R300_STATECHANGE(r300, rr);
+
+	/*
+	 * fp_reg: next fragment program register to route into.
+	 * high_rr: highest route slot used so far.
+	 * in_texcoords / col_interp_nr: texcoord and colour counts written
+	 * to the rc atom at the end.
+	 */
+	fp_reg = in_texcoords = col_interp_nr = high_rr = 0;
+
+	r300->hw.rr.cmd[R300_RR_ROUTE_1] = 0;
+
+	/*
+	 * WPOS is replaced by the first texcoord the program does not read;
+	 * if all of them are in use there is nowhere to put it.
+	 */
+	if (InputsRead & FRAG_BIT_WPOS) {
+		for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
+			if (!(InputsRead & (FRAG_BIT_TEX0 << i)))
+				break;
+
+		if (i == ctx->Const.MaxTextureUnits) {
+			fprintf(stderr, "\tno free texcoord found...\n");
+			_mesa_exit(-1);
+		}
+
+		InputsRead |= (FRAG_BIT_TEX0 << i);
+		InputsRead &= ~FRAG_BIT_WPOS;
+	}
+
+	/* One interpolator entry per texture unit; routes only for read ones. */
+	for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+		r300->hw.ri.cmd[R300_RI_INTERP_0 + i] = 0
+		    | R300_RS_INTERP_USED
+		    | (in_texcoords << R300_RS_INTERP_SRC_SHIFT)
+		    | interp_magic[i];
+
+		r300->hw.rr.cmd[R300_RR_ROUTE_0 + fp_reg] = 0;
+		if (InputsRead & (FRAG_BIT_TEX0 << i)) {
+			//assert(r300->state.texture.tc_count != 0);
+			r300->hw.rr.cmd[R300_RR_ROUTE_0 + fp_reg] |= R300_RS_ROUTE_ENABLE | i	/* source INTERP */
+			    | (fp_reg << R300_RS_ROUTE_DEST_SHIFT);
+			high_rr = fp_reg;
+
+			if (!R300_OUTPUTS_WRITTEN_TEST
+			    (OutputsWritten, VERT_RESULT_TEX0 + i,
+			     _TNL_ATTRIB_TEX(i))) {
+				/* Passing invalid data here can lock the GPU. */
+				WARN_ONCE
+				    ("fragprog wants coords for tex%d, vp doesn't provide them!\n",
+				     i);
+				//_mesa_print_program(&CURRENT_VERTEX_SHADER(ctx)->Base);
+				//_mesa_exit(-1);
+			}
+			InputsRead &= ~(FRAG_BIT_TEX0 << i);
+			fp_reg++;
+		}
+		/* Need to count all coords enabled at vof */
+		if (R300_OUTPUTS_WRITTEN_TEST
+		    (OutputsWritten, VERT_RESULT_TEX0 + i, _TNL_ATTRIB_TEX(i)))
+			in_texcoords++;
+	}
+
+	if (InputsRead & FRAG_BIT_COL0) {
+		/* When col0 is not provided the route is skipped entirely. */
+		if (!R300_OUTPUTS_WRITTEN_TEST
+		    (OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
+			WARN_ONCE
+			    ("fragprog wants col0, vp doesn't provide it\n");
+			goto out;	/* FIXME */
+			//_mesa_print_program(&CURRENT_VERTEX_SHADER(ctx)->Base);
+			//_mesa_exit(-1);
+		}
+
+		r300->hw.rr.cmd[R300_RR_ROUTE_0] |= 0
+		    | R300_RS_ROUTE_0_COLOR
+		    | (fp_reg++ << R300_RS_ROUTE_0_COLOR_DEST_SHIFT);
+		InputsRead &= ~FRAG_BIT_COL0;
+		col_interp_nr++;
+	}
+      out:
+
+	if (InputsRead & FRAG_BIT_COL1) {
+		/* Unlike col0, a missing col1 only warns; it is still routed. */
+		if (!R300_OUTPUTS_WRITTEN_TEST
+		    (OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
+			WARN_ONCE
+			    ("fragprog wants col1, vp doesn't provide it\n");
+			//_mesa_exit(-1);
+		}
+
+		r300->hw.rr.cmd[R300_RR_ROUTE_1] |=
+		    R300_RS_ROUTE_1_UNKNOWN11 | R300_RS_ROUTE_1_COLOR1 |
+		    (fp_reg++ << R300_RS_ROUTE_1_COLOR1_DEST_SHIFT);
+		InputsRead &= ~FRAG_BIT_COL1;
+		if (high_rr < 1)
+			high_rr = 1;
+		col_interp_nr++;
+	}
+
+	/* Need at least one. This might still lock as the values are undefined... */
+	if (in_texcoords == 0 && col_interp_nr == 0) {
+		r300->hw.rr.cmd[R300_RR_ROUTE_0] |= 0
+		    | R300_RS_ROUTE_0_COLOR
+		    | (fp_reg++ << R300_RS_ROUTE_0_COLOR_DEST_SHIFT);
+		col_interp_nr++;
+	}
+
+	r300->hw.rc.cmd[1] = 0 | (in_texcoords << R300_RS_CNTL_TC_CNT_SHIFT)
+	    | (col_interp_nr << R300_RS_CNTL_CI_CNT_SHIFT)
+	    | R300_RS_CNTL_0_UNKNOWN_18;
+
+	assert(high_rr >= 0);
+	r300->hw.rr.cmd[R300_RR_CMD_0] =
+	    cmdpacket0(R300_RS_ROUTE_0, high_rr + 1);
+	r300->hw.rc.cmd[2] = 0xC0 | high_rr;
+
+	/* Any bit still set is an input this function has no routing for. */
+	if (InputsRead)
+		WARN_ONCE("Don't know how to satisfy InputsRead=0x%08x\n",
+			  InputsRead);
+}
+
+/* Access the vpu.count field of the command header stored at ptr. */
+#define vpucount(ptr) (((drm_r300_cmd_header_t*)(ptr))->vpu.count)
+
+/*
+ * Raise the vpu.count field of the command header at ptr to new_count/4 if
+ * that is larger than the current count; the count is never lowered here.
+ * Asserts that new_count/4 is below 256.
+ */
+#define bump_vpu_count(ptr, new_count)   do{\
+	drm_r300_cmd_header_t* _p=((drm_r300_cmd_header_t*)(ptr));\
+	int _nc=(new_count)/4; \
+	assert(_nc < 256); \
+	if(_nc>_p->vpu.count)_p->vpu.count=_nc;\
+	}while(0)
+
+/**
+ * Copy a vertex shader fragment into one of the vertex program state atoms.
+ *
+ * \param r300 driver context.
+ * \param dest destination selector: bits 8..11 choose the atom (0 = vpi,
+ *             2 = vpp, 4 = vps) and the low 8 bits give the start offset in
+ *             units of four dwords.
+ * \param vsf  fragment to copy; nothing happens when its length is zero.
+ *
+ * The selected atom's vpu count is raised to cover the copied data.  The
+ * process exits when the length is not a multiple of four or when the
+ * destination selector is unknown.
+ */
+static inline void setup_vertex_shader_fragment(r300ContextPtr r300, int dest,
+						struct
+						r300_vertex_shader_fragment
+						*vsf)
+{
+	int i;
+	const int base = 4 * (dest & 0xff);
+
+	if (vsf->length == 0)
+		return;
+
+	if (vsf->length & 0x3) {
+		fprintf(stderr,
+			"VERTEX_SHADER_FRAGMENT must have length divisible by 4\n");
+		_mesa_exit(-1);
+	}
+
+	switch ((dest >> 8) & 0xf) {
+	case 0:
+		R300_STATECHANGE(r300, vpi);
+		for (i = 0; i < vsf->length; i++)
+			r300->hw.vpi.cmd[R300_VPI_INSTR_0 + i + base] =
+			    (vsf->body.d[i]);
+		bump_vpu_count(r300->hw.vpi.cmd, vsf->length + base);
+		break;
+
+	case 2:
+		R300_STATECHANGE(r300, vpp);
+		for (i = 0; i < vsf->length; i++)
+			r300->hw.vpp.cmd[R300_VPP_PARAM_0 + i + base] =
+			    (vsf->body.d[i]);
+		bump_vpu_count(r300->hw.vpp.cmd, vsf->length + base);
+		break;
+	case 4:
+		R300_STATECHANGE(r300, vps);
+		for (i = 0; i < vsf->length; i++)
+			r300->hw.vps.cmd[1 + i + base] = (vsf->body.d[i]);
+		bump_vpu_count(r300->hw.vps.cmd, vsf->length + base);
+		break;
+	default:
+		fprintf(stderr,
+			"%s:%s don't know how to handle dest %04x\n",
+			__FILE__, __FUNCTION__, dest);
+		_mesa_exit(-1);
+	}
+}
+
+/* just a skeleton for now.. */
+
+/* Generate a vertex shader that simply transforms vertex and texture coordinates,
+   while leaving colors intact. Nothing fancy (like lights)
+
+   If implementing lights make a copy first, so it is easy to switch between the two versions
+
+   The program is built in r300->state.vertex_shader: one MUL instruction is
+   appended for every entry of r300->state.sw_tcl_inputs[] that is not -1,
+   each writing to the next output register.  NOTE(review): the
+   VSF_ATTR_UNITY / VSF_UNITY operands look like a multiply-by-one
+   pass-through; confirm against the macro definitions. */
+static void r300GenerateSimpleVertexShader(r300ContextPtr r300)
+{
+	int i;
+	GLuint o_reg = 0;
+
+	/* Allocate parameters */
+	r300->state.vap_param.transform_offset = 0x0;	/* transform matrix */
+	r300->state.vertex_shader.param_offset = 0x0;
+	r300->state.vertex_shader.param_count = 0x4;	/* 4 vector values - 4x4 matrix */
+
+	r300->state.vertex_shader.program_start = 0x0;
+	r300->state.vertex_shader.unknown_ptr1 = 0x4;	/* magic value ? */
+	r300->state.vertex_shader.program_end = 0x0;
+
+	r300->state.vertex_shader.unknown_ptr2 = 0x0;	/* magic value */
+	r300->state.vertex_shader.unknown_ptr3 = 0x4;	/* magic value */
+
+	r300->state.vertex_shader.unknown1.length = 0;
+	r300->state.vertex_shader.unknown2.length = 0;
+
+/* Append one instruction at program_end and advance program_end. */
+#define WRITE_OP(oper,source1,source2,source3)	{\
+	r300->state.vertex_shader.program.body.i[r300->state.vertex_shader.program_end].op=(oper); \
+	r300->state.vertex_shader.program.body.i[r300->state.vertex_shader.program_end].src[0]=(source1); \
+	r300->state.vertex_shader.program.body.i[r300->state.vertex_shader.program_end].src[1]=(source2); \
+	r300->state.vertex_shader.program.body.i[r300->state.vertex_shader.program_end].src[2]=(source3); \
+	r300->state.vertex_shader.program_end++; \
+	}
+
+	for (i = VERT_ATTRIB_POS; i < VERT_ATTRIB_MAX; i++)
+		if (r300->state.sw_tcl_inputs[i] != -1) {
+			WRITE_OP(EASY_VSF_OP(MUL, o_reg++, ALL, RESULT),
+				 VSF_REG(r300->state.sw_tcl_inputs[i]),
+				 VSF_ATTR_UNITY(r300->state.
+						sw_tcl_inputs[i]),
+				 VSF_UNITY(r300->state.sw_tcl_inputs[i])
+			    )
+
+		}
+
+	/* program_end now indexes the last instruction written; the length
+	 * below is (program_end + 1) instructions of 4 dwords each. */
+	r300->state.vertex_shader.program_end--;	/* r300 wants program length to be one more - no idea why */
+	r300->state.vertex_shader.program.length =
+	    (r300->state.vertex_shader.program_end + 1) * 4;
+
+	/* These overwrite the placeholder values assigned at the top. */
+	r300->state.vertex_shader.unknown_ptr1 = r300->state.vertex_shader.program_end;	/* magic value ? */
+	r300->state.vertex_shader.unknown_ptr2 = r300->state.vertex_shader.program_end;	/* magic value ? */
+	r300->state.vertex_shader.unknown_ptr3 = r300->state.vertex_shader.program_end;	/* magic value ? */
+
+}
+/* Program the current vertex program: parameters into the vpp atom, code via setup_vertex_shader_fragment(), control words into pvs. */
+static void r300SetupVertexProgram(r300ContextPtr rmesa)
+{
+	GLcontext *ctx = rmesa->radeon.glCtx;
+	int inst_count;
+	int param_count;
+	struct r300_vertex_program *prog =
+	    (struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx);
+
+	((drm_r300_cmd_header_t *) rmesa->hw.vpp.cmd)->vpu.count = 0;
+	R300_STATECHANGE(rmesa, vpp);
+	param_count =
+	    r300VertexProgUpdateParams(ctx, (struct r300_vertex_program_cont *)
+				       ctx->VertexProgram._Current /*prog */ ,
+				       (float *)&rmesa->hw.vpp.
+				       cmd[R300_VPP_PARAM_0]);
+	bump_vpu_count(rmesa->hw.vpp.cmd, param_count);
+	param_count /= 4;	/* presumably dwords -> 4-component vectors for PVS_CNTL_2; TODO confirm */
+
+	/* Reset state, in case we don't use something */
+	((drm_r300_cmd_header_t *) rmesa->hw.vpi.cmd)->vpu.count = 0;
+	((drm_r300_cmd_header_t *) rmesa->hw.vps.cmd)->vpu.count = 0;
+
+	setup_vertex_shader_fragment(rmesa, VSF_DEST_PROGRAM, &(prog->program));
+
+#if 0
+	setup_vertex_shader_fragment(rmesa, VSF_DEST_UNKNOWN1,
+				     &(rmesa->state.vertex_shader.unknown1));
+	setup_vertex_shader_fragment(rmesa, VSF_DEST_UNKNOWN2,
+				     &(rmesa->state.vertex_shader.unknown2));
+#endif
+
+	inst_count = prog->program.length / 4 - 1;	/* used below as POS_END, PROGRAM_END and the low field of CNTL_3 */
+
+	R300_STATECHANGE(rmesa, pvs);
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_1] =
+	    (0 << R300_PVS_CNTL_1_PROGRAM_START_SHIFT)
+	    | (inst_count /*pos_end */  << R300_PVS_CNTL_1_POS_END_SHIFT)
+	    | (inst_count << R300_PVS_CNTL_1_PROGRAM_END_SHIFT);
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_2] =
+	    (0 << R300_PVS_CNTL_2_PARAM_OFFSET_SHIFT)
+	    | (param_count << R300_PVS_CNTL_2_PARAM_COUNT_SHIFT);
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_3] =
+	    (0 /*rmesa->state.vertex_shader.unknown_ptr2 */  <<
+	     R300_PVS_CNTL_3_PROGRAM_UNKNOWN_SHIFT)
+	    | (inst_count /*rmesa->state.vertex_shader.unknown_ptr3 */  <<
+	       0);
+
+	/* This is done for vertex shader fragments, but also needs to be done for vap_pvs,
+	   so I leave it as a reminder */
+#if 0
+	reg_start(R300_VAP_PVS_WAITIDLE, 0);
+	e32(0x00000000);
+#endif
+}
+/* Set up the vertex shader atoms: the translated program when hw TCL is on and one exists, otherwise the generated simple shader. */
+static void r300SetupVertexShader(r300ContextPtr rmesa)
+{
+	GLcontext *ctx = rmesa->radeon.glCtx;
+
+	/* Reset state, in case we don't use something */
+	((drm_r300_cmd_header_t *) rmesa->hw.vpp.cmd)->vpu.count = 0;
+	((drm_r300_cmd_header_t *) rmesa->hw.vpi.cmd)->vpu.count = 0;
+	((drm_r300_cmd_header_t *) rmesa->hw.vps.cmd)->vpu.count = 0;
+
+	/* Not sure why this doesn't work...
+	   0x400 area might have something to do with pixel shaders as it appears right after pfs programming.
+	   0x406 is set to { 0.0, 0.0, 1.0, 0.0 } most of the time but should change with smooth points and in other rare cases. */
+	//setup_vertex_shader_fragment(rmesa, 0x406, &unk4);
+	if (hw_tcl_on
+	    && ((struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx))->
+	    translated) {
+		r300SetupVertexProgram(rmesa);
+		return;
+	}
+
+	/* This needs to be replaced by vertex shader generation code */
+	r300GenerateSimpleVertexShader(rmesa);
+
+	setup_vertex_shader_fragment(rmesa, VSF_DEST_PROGRAM,
+				     &(rmesa->state.vertex_shader.program));
+
+#if 0
+	setup_vertex_shader_fragment(rmesa, VSF_DEST_UNKNOWN1,
+				     &(rmesa->state.vertex_shader.unknown1));
+	setup_vertex_shader_fragment(rmesa, VSF_DEST_UNKNOWN2,
+				     &(rmesa->state.vertex_shader.unknown2));
+#endif
+	/* Control words come from the values r300GenerateSimpleVertexShader() left in state.vertex_shader. */
+	R300_STATECHANGE(rmesa, pvs);
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_1] =
+	    (rmesa->state.vertex_shader.
+	     program_start << R300_PVS_CNTL_1_PROGRAM_START_SHIFT)
+	    | (rmesa->state.vertex_shader.
+	       unknown_ptr1 << R300_PVS_CNTL_1_POS_END_SHIFT)
+	    | (rmesa->state.vertex_shader.
+	       program_end << R300_PVS_CNTL_1_PROGRAM_END_SHIFT);
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_2] =
+	    (rmesa->state.vertex_shader.
+	     param_offset << R300_PVS_CNTL_2_PARAM_OFFSET_SHIFT)
+	    | (rmesa->state.vertex_shader.
+	       param_count << R300_PVS_CNTL_2_PARAM_COUNT_SHIFT);
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_3] =
+	    (rmesa->state.vertex_shader.
+	     unknown_ptr2 << R300_PVS_CNTL_3_PROGRAM_UNKNOWN_SHIFT)
+	    | (rmesa->state.vertex_shader.unknown_ptr3 << 0);
+
+	/* This is done for vertex shader fragments, but also needs to be done for vap_pvs,
+	   so I leave it as a reminder */
+#if 0
+	reg_start(R300_VAP_PVS_WAITIDLE, 0);
+	e32(0x00000000);
+#endif
+}
+
+/**
+ * Completely recalculates hardware state based on the Mesa state, then sets hw.all_dirty.
+ */
+static void r300ResetHwState(r300ContextPtr r300)
+{
+	GLcontext *ctx = r300->radeon.glCtx;
+	int has_tcl = 1;	/* cleared just below when the chip lacks RADEON_CHIPSET_TCL */
+
+	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+		has_tcl = 0;
+
+	if (RADEON_DEBUG & DEBUG_STATE)
+		fprintf(stderr, "%s\n", __FUNCTION__);
+
+	/* This is a place to initialize registers which
+	   have bitfields accessed by different functions
+	   and not all bits are used */
+
+	/* go and compute register values from GL state */
+
+	r300UpdateWindow(ctx);
+
+	r300ColorMask(ctx,
+		      ctx->Color.ColorMask[RCOMP],
+		      ctx->Color.ColorMask[GCOMP],
+		      ctx->Color.ColorMask[BCOMP], ctx->Color.ColorMask[ACOMP]);
+
+	r300Enable(ctx, GL_DEPTH_TEST, ctx->Depth.Test);
+	r300DepthMask(ctx, ctx->Depth.Mask);
+	r300DepthFunc(ctx, ctx->Depth.Func);
+
+	/* stencil */
+	r300Enable(ctx, GL_STENCIL_TEST, ctx->Stencil.Enabled);
+	r300StencilMaskSeparate(ctx, 0, ctx->Stencil.WriteMask[0]);
+	r300StencilFuncSeparate(ctx, 0, ctx->Stencil.Function[0],
+				ctx->Stencil.Ref[0], ctx->Stencil.ValueMask[0]);
+	r300StencilOpSeparate(ctx, 0, ctx->Stencil.FailFunc[0],
+			      ctx->Stencil.ZFailFunc[0],
+			      ctx->Stencil.ZPassFunc[0]);
+
+	r300UpdateCulling(ctx);
+
+	r300UpdateTextureState(ctx);
+
+	r300SetBlendState(ctx);
+
+	r300AlphaFunc(ctx, ctx->Color.AlphaFunc, ctx->Color.AlphaRef);
+	r300Enable(ctx, GL_ALPHA_TEST, ctx->Color.AlphaEnabled);
+
+	/* Initialize magic registers
+	   TODO : learn what they really do, or get rid of
+	   those we don't have to touch */
+	if (!has_tcl)
+		r300->hw.vap_cntl.cmd[1] = 0x0014045a;
+	else
+		r300->hw.vap_cntl.cmd[1] = 0x0030045A;	//0x0030065a /* Dangerous */
+	r300->hw.vte.cmd[1] = R300_VPORT_X_SCALE_ENA
+	    | R300_VPORT_X_OFFSET_ENA
+	    | R300_VPORT_Y_SCALE_ENA
+	    | R300_VPORT_Y_OFFSET_ENA
+	    | R300_VPORT_Z_SCALE_ENA
+	    | R300_VPORT_Z_OFFSET_ENA | R300_VTX_W0_FMT;
+	r300->hw.vte.cmd[2] = 0x00000008;
+
+	r300->hw.unk2134.cmd[1] = 0x00FFFFFF;
+	r300->hw.unk2134.cmd[2] = 0x00000000;
+	if (_mesa_little_endian())
+		r300->hw.vap_cntl_status.cmd[1] = R300_VC_NO_SWAP;
+	else
+		r300->hw.vap_cntl_status.cmd[1] = R300_VC_32BIT_SWAP;
+
+	/* disable VAP/TCL on non-TCL capable chips */
+	if (!has_tcl)
+		r300->hw.vap_cntl_status.cmd[1] |= R300_VAP_TCL_BYPASS;
+
+	r300->hw.unk21DC.cmd[1] = 0xAAAAAAAA;
+
+	r300->hw.vap_clip_cntl.cmd[1] = R300_221C_NORMAL;
+
+	r300->hw.unk2220.cmd[1] = r300PackFloat32(1.0);
+	r300->hw.unk2220.cmd[2] = r300PackFloat32(1.0);
+	r300->hw.unk2220.cmd[3] = r300PackFloat32(1.0);
+	r300->hw.unk2220.cmd[4] = r300PackFloat32(1.0);
+
+	/* what about other chips than r300 or rv350??? */
+	if (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R300)
+		r300->hw.unk2288.cmd[1] = R300_2288_R300;
+	else
+		r300->hw.unk2288.cmd[1] = R300_2288_RV350;
+
+	r300->hw.gb_enable.cmd[1] = R300_GB_POINT_STUFF_ENABLE
+	    | R300_GB_LINE_STUFF_ENABLE
+	    | R300_GB_TRIANGLE_STUFF_ENABLE /*| R300_GB_UNK31 */ ;
+
+	r300->hw.gb_misc.cmd[R300_GB_MISC_MSPOS_0] = 0x66666666;
+	r300->hw.gb_misc.cmd[R300_GB_MISC_MSPOS_1] = 0x06666666;
+	if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R300) ||
+	    (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R350))
+		r300->hw.gb_misc.cmd[R300_GB_MISC_TILE_CONFIG] =
+		    R300_GB_TILE_ENABLE | R300_GB_TILE_PIPE_COUNT_R300 |
+		    R300_GB_TILE_SIZE_16;
+	else if (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV410)
+		r300->hw.gb_misc.cmd[R300_GB_MISC_TILE_CONFIG] =
+		    R300_GB_TILE_ENABLE | R300_GB_TILE_PIPE_COUNT_RV410 |
+		    R300_GB_TILE_SIZE_16;
+	else if (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R420)
+		r300->hw.gb_misc.cmd[R300_GB_MISC_TILE_CONFIG] =
+		    R300_GB_TILE_ENABLE | R300_GB_TILE_PIPE_COUNT_R420 |
+		    R300_GB_TILE_SIZE_16;
+	else
+		r300->hw.gb_misc.cmd[R300_GB_MISC_TILE_CONFIG] =
+		    R300_GB_TILE_ENABLE | R300_GB_TILE_PIPE_COUNT_RV300 |
+		    R300_GB_TILE_SIZE_16;
+	/* set to 0 when fog is disabled? */
+	r300->hw.gb_misc.cmd[R300_GB_MISC_SELECT] = R300_GB_FOG_SELECT_1_1_W;
+	r300->hw.gb_misc.cmd[R300_GB_MISC_AA_CONFIG] = R300_AA_DISABLE;	/* No antialiasing */
+
+	r300->hw.unk4200.cmd[1] = r300PackFloat32(0.0);
+	r300->hw.unk4200.cmd[2] = r300PackFloat32(0.0);
+	r300->hw.unk4200.cmd[3] = r300PackFloat32(1.0);
+	r300->hw.unk4200.cmd[4] = r300PackFloat32(1.0);
+
+	r300->hw.unk4214.cmd[1] = 0x00050005;
+
+	r300PointSize(ctx, 0.0);
+
+	r300->hw.unk4230.cmd[1] = 0x18000006;
+	r300->hw.unk4230.cmd[2] = 0x00020006;
+	r300->hw.unk4230.cmd[3] = r300PackFloat32(1.0 / 192.0);
+
+	r300LineWidth(ctx, 0.0);
+
+	r300->hw.unk4260.cmd[1] = 0;
+	r300->hw.unk4260.cmd[2] = r300PackFloat32(0.0);
+	r300->hw.unk4260.cmd[3] = r300PackFloat32(1.0);
+
+	r300->hw.shade.cmd[1] = 0x00000002;
+	r300ShadeModel(ctx, ctx->Light.ShadeModel);
+	r300->hw.shade.cmd[3] = 0x00000000;
+	r300->hw.shade.cmd[4] = 0x00000000;
+
+	r300PolygonMode(ctx, GL_FRONT, ctx->Polygon.FrontMode);
+	r300PolygonMode(ctx, GL_BACK, ctx->Polygon.BackMode);
+	r300->hw.polygon_mode.cmd[2] = 0x00000001;
+	r300->hw.polygon_mode.cmd[3] = 0x00000000;
+	r300->hw.zbias_cntl.cmd[1] = 0x00000000;
+
+	r300PolygonOffset(ctx, ctx->Polygon.OffsetFactor,
+			  ctx->Polygon.OffsetUnits);
+	r300Enable(ctx, GL_POLYGON_OFFSET_FILL, ctx->Polygon.OffsetFill);
+
+	r300->hw.unk42C0.cmd[1] = 0x4B7FFFFF;
+	r300->hw.unk42C0.cmd[2] = 0x00000000;
+
+	r300->hw.unk43A4.cmd[1] = 0x0000001C;
+	r300->hw.unk43A4.cmd[2] = 0x2DA49525;
+
+	r300->hw.unk43E8.cmd[1] = 0x00FFFFFF;
+
+	r300->hw.unk46A4.cmd[1] = 0x00001B01;
+	r300->hw.unk46A4.cmd[2] = 0x00001B0F;
+	r300->hw.unk46A4.cmd[3] = 0x00001B0F;
+	r300->hw.unk46A4.cmd[4] = 0x00001B0F;
+	r300->hw.unk46A4.cmd[5] = 0x00000001;
+	/* Fog state is replayed through the Driver.Fogfv hook; the MODE and COORDINATE_SOURCE calls pass NULL params. */
+	r300Enable(ctx, GL_FOG, ctx->Fog.Enabled);
+	ctx->Driver.Fogfv(ctx, GL_FOG_MODE, NULL);
+	ctx->Driver.Fogfv(ctx, GL_FOG_DENSITY, &ctx->Fog.Density);
+	ctx->Driver.Fogfv(ctx, GL_FOG_START, &ctx->Fog.Start);
+	ctx->Driver.Fogfv(ctx, GL_FOG_END, &ctx->Fog.End);
+	ctx->Driver.Fogfv(ctx, GL_FOG_COLOR, ctx->Fog.Color);
+	ctx->Driver.Fogfv(ctx, GL_FOG_COORDINATE_SOURCE_EXT, NULL);
+
+	r300->hw.at.cmd[R300_AT_UNKNOWN] = 0;
+	r300->hw.unk4BD8.cmd[1] = 0;
+
+	r300->hw.unk4E00.cmd[1] = 0;
+
+	r300BlendColor(ctx, ctx->Color.BlendColor);
+	r300->hw.blend_color.cmd[2] = 0;
+	r300->hw.blend_color.cmd[3] = 0;
+
+	/* Again, r300ClearBuffer uses this */
+	r300->hw.cb.cmd[R300_CB_OFFSET] =
+	    r300->radeon.state.color.drawOffset +
+	    r300->radeon.radeonScreen->fbLocation;
+	r300->hw.cb.cmd[R300_CB_PITCH] = r300->radeon.state.color.drawPitch;
+
+	if (r300->radeon.radeonScreen->cpp == 4)
+		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_ARGB8888;
+	else
+		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_RGB565;
+
+	if (r300->radeon.sarea->tiling_enabled)
+		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
+
+	r300->hw.unk4E50.cmd[1] = 0;
+	r300->hw.unk4E50.cmd[2] = 0;
+	r300->hw.unk4E50.cmd[3] = 0;
+	r300->hw.unk4E50.cmd[4] = 0;
+	r300->hw.unk4E50.cmd[5] = 0;
+	r300->hw.unk4E50.cmd[6] = 0;
+	r300->hw.unk4E50.cmd[7] = 0;
+	r300->hw.unk4E50.cmd[8] = 0;
+	r300->hw.unk4E50.cmd[9] = 0;
+
+	r300->hw.unk4E88.cmd[1] = 0;
+
+	r300->hw.unk4EA0.cmd[1] = 0x00000000;
+	r300->hw.unk4EA0.cmd[2] = 0xffffffff;
+	/* Only 16- and 24-bit depth visuals are handled; any other size is reported and passed to _mesa_exit(). */
+	switch (ctx->Visual.depthBits) {
+	case 16:
+		r300->hw.zstencil_format.cmd[1] = R300_DEPTH_FORMAT_16BIT_INT_Z;
+		break;
+	case 24:
+		r300->hw.zstencil_format.cmd[1] = R300_DEPTH_FORMAT_24BIT_INT_Z;
+		break;
+	default:
+		fprintf(stderr, "Error: Unsupported depth %d... exiting\n",
+			ctx->Visual.depthBits);
+		_mesa_exit(-1);
+
+	}
+	/* z compress? */
+	//r300->hw.zstencil_format.cmd[1] |= R300_DEPTH_FORMAT_UNK32;
+
+	r300->hw.zstencil_format.cmd[3] = 0x00000003;
+	r300->hw.zstencil_format.cmd[4] = 0x00000000;
+
+	r300->hw.zb.cmd[R300_ZB_OFFSET] =
+	    r300->radeon.radeonScreen->depthOffset +
+	    r300->radeon.radeonScreen->fbLocation;
+	r300->hw.zb.cmd[R300_ZB_PITCH] = r300->radeon.radeonScreen->depthPitch;
+
+	if (r300->radeon.sarea->tiling_enabled) {
+		/* Turn off when clearing buffers ? */
+		r300->hw.zb.cmd[R300_ZB_PITCH] |= R300_DEPTH_TILE_ENABLE;
+
+		if (ctx->Visual.depthBits == 24)
+			r300->hw.zb.cmd[R300_ZB_PITCH] |=
+			    R300_DEPTH_MICROTILE_ENABLE;
+	}
+
+	r300->hw.unk4F28.cmd[1] = 0;
+
+	r300->hw.unk4F30.cmd[1] = 0;
+	r300->hw.unk4F30.cmd[2] = 0;
+
+	r300->hw.unk4F44.cmd[1] = 0;
+
+	r300->hw.unk4F54.cmd[1] = 0;
+
+	if (has_tcl) {
+		r300->hw.vps.cmd[R300_VPS_ZERO_0] = 0;
+		r300->hw.vps.cmd[R300_VPS_ZERO_1] = 0;
+		r300->hw.vps.cmd[R300_VPS_POINTSIZE] = r300PackFloat32(1.0);
+		r300->hw.vps.cmd[R300_VPS_ZERO_3] = 0;
+	}
+//END: TODO
+	r300->hw.all_dirty = GL_TRUE;
+}
+
+
+extern void _tnl_UpdateFixedFunctionProgram(GLcontext * ctx);
+
+extern int future_hw_tcl_on;
+void r300UpdateShaders(r300ContextPtr rmesa)
+{
+	GLcontext *ctx;
+	struct r300_vertex_program *vp;
+	int i;
+	/* Does nothing unless GL state changed (NewGLState != 0) while hw_tcl_on is set. */
+	ctx = rmesa->radeon.glCtx;
+
+	if (rmesa->NewGLState && hw_tcl_on) {
+		rmesa->NewGLState = 0;
+		/* Point the material attribute slots at dummy_attrib while the fixed-function program is updated; the saved pointers are restored right after. */
+		for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) {
+			rmesa->temp_attrib[i] =
+			    TNL_CONTEXT(ctx)->vb.AttribPtr[i];
+			TNL_CONTEXT(ctx)->vb.AttribPtr[i] =
+			    &rmesa->dummy_attrib[i];
+		}
+
+		_tnl_UpdateFixedFunctionProgram(ctx);
+
+		for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) {
+			TNL_CONTEXT(ctx)->vb.AttribPtr[i] =
+			    rmesa->temp_attrib[i];
+		}
+
+		r300SelectVertexShader(rmesa);
+		vp = (struct r300_vertex_program *)
+		    CURRENT_VERTEX_SHADER(ctx);
+		/*if (vp->translated == GL_FALSE)
+		   r300TranslateVertexShader(vp); */
+		if (vp->translated == GL_FALSE) {
+			fprintf(stderr, "Failing back to sw-tcl\n");
+			hw_tcl_on = future_hw_tcl_on = 0;	/* hw TCL stays off from here on */
+			r300ResetHwState(rmesa);
+
+			return;
+		}
+		r300UpdateStateParameters(ctx, _NEW_PROGRAM);
+	}
+
+}
+/* Translate the current fragment program and copy its instructions, node table, control words and constants into the fpi/fp/fpp atoms. */
+static void r300SetupPixelShader(r300ContextPtr rmesa)
+{
+	GLcontext *ctx = rmesa->radeon.glCtx;
+	struct r300_fragment_program *fp = (struct r300_fragment_program *)
+	    (char *)ctx->FragmentProgram._Current;
+	int i, k;
+
+	if (!fp)		/* should only happen once, just after context is created */
+		return;
+
+	r300TranslateFragmentShader(rmesa, fp);
+	if (!fp->translated) {
+		fprintf(stderr, "%s: No valid fragment shader, exiting\n",
+			__FUNCTION__);
+		return;		/* NOTE: despite the message, this only skips the setup; nothing exits */
+	}
+#define OUTPUT_FIELD(st, reg, field)  \
+		R300_STATECHANGE(rmesa, st); \
+		for(i=0;i<=fp->alu_end;i++) \
+			rmesa->hw.st.cmd[R300_FPI_INSTR_0+i]=fp->alu.inst[i].field;\
+		rmesa->hw.st.cmd[R300_FPI_CMD_0]=cmdpacket0(reg, fp->alu_end+1);
+	/* One atom per instruction word: each call copies that word of ALU instructions 0..alu_end. */
+	OUTPUT_FIELD(fpi[0], R300_PFS_INSTR0_0, inst0);
+	OUTPUT_FIELD(fpi[1], R300_PFS_INSTR1_0, inst1);
+	OUTPUT_FIELD(fpi[2], R300_PFS_INSTR2_0, inst2);
+	OUTPUT_FIELD(fpi[3], R300_PFS_INSTR3_0, inst3);
+#undef OUTPUT_FIELD
+
+	R300_STATECHANGE(rmesa, fp);
+	/* I just want to say, the way these nodes are stored.. weird.. */
+	/* Used nodes fill the last (cur_node + 1) NODE slots in order; the leading slots are zeroed. */
+	for (i = 0, k = (4 - (fp->cur_node + 1)); i < 4; i++, k++) {
+		if (i < (fp->cur_node + 1)) {
+			rmesa->hw.fp.cmd[R300_FP_NODE0 + k] =
+			    (fp->node[i].
+			     alu_offset << R300_PFS_NODE_ALU_OFFSET_SHIFT)
+			    | (fp->node[i].
+			       alu_end << R300_PFS_NODE_ALU_END_SHIFT)
+			    | (fp->node[i].
+			       tex_offset << R300_PFS_NODE_TEX_OFFSET_SHIFT)
+			    | (fp->node[i].
+			       tex_end << R300_PFS_NODE_TEX_END_SHIFT)
+			    | fp->node[i].flags;	/*  ( (k==3) ? R300_PFS_NODE_LAST_NODE : 0); */
+		} else {
+			rmesa->hw.fp.cmd[R300_FP_NODE0 + (3 - i)] = 0;
+		}
+	}
+
+	/*  PFS_CNTL_0 */
+	rmesa->hw.fp.cmd[R300_FP_CNTL0] =
+	    fp->cur_node | (fp->first_node_has_tex << 3);
+	/* PFS_CNTL_1 */
+	rmesa->hw.fp.cmd[R300_FP_CNTL1] = fp->max_temp_idx;
+	/* PFS_CNTL_2 */
+	rmesa->hw.fp.cmd[R300_FP_CNTL2] =
+	    (fp->alu_offset << R300_PFS_CNTL_ALU_OFFSET_SHIFT)
+	    | (fp->alu_end << R300_PFS_CNTL_ALU_END_SHIFT)
+	    | (fp->tex_offset << R300_PFS_CNTL_TEX_OFFSET_SHIFT)
+	    | (fp->tex_end << R300_PFS_CNTL_TEX_END_SHIFT);
+	/* Constants: four r300PackFloat24()-packed components per entry, const_nr entries. */
+	R300_STATECHANGE(rmesa, fpp);
+	for (i = 0; i < fp->const_nr; i++) {
+		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] =
+		    r300PackFloat24(fp->constant[i][0]);
+		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] =
+		    r300PackFloat24(fp->constant[i][1]);
+		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] =
+		    r300PackFloat24(fp->constant[i][2]);
+		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 3] =
+		    r300PackFloat24(fp->constant[i][3]);
+	}
+	rmesa->hw.fpp.cmd[R300_FPP_CMD_0] =
+	    cmdpacket0(R300_PFS_PARAM_0_X, fp->const_nr * 4);
+}
+/* Refresh texture state, the pixel shader, the vertex shader (TCL chips only) and the RS unit. */
+void r300UpdateShaderStates(r300ContextPtr rmesa)
+{
+	GLcontext *const ctx = rmesa->radeon.glCtx;
+
+	r300UpdateTextureState(ctx);
+	r300SetupPixelShader(rmesa);
+	r300SetupTextures(ctx);
+
+	/* Chips without the TCL flag get no vertex shader setup. */
+	if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+		r300SetupVertexShader(rmesa);
+	}
+	r300SetupRSUnit(ctx);
+}
+
+/**
+ * Called by Mesa after an internal state update: forwards new_state to the
+ * core modules, refreshes dependent driver state and accumulates NewGLState.
+ */
+static void r300InvalidateState(GLcontext * ctx, GLuint new_state)
+{
+	const GLuint drawbuf_bits = _NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL;
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+
+	_swrast_InvalidateState(ctx, new_state);
+	_swsetup_InvalidateState(ctx, new_state);
+	_vbo_InvalidateState(ctx, new_state);
+	_tnl_InvalidateState(ctx, new_state);
+	_ae_invalidate_state(ctx, new_state);
+
+	if (new_state & drawbuf_bits)
+		r300UpdateDrawBuffer(ctx);
+	r300UpdateStateParameters(ctx, new_state);
+
+	rmesa->NewGLState |= new_state;
+}
+
+/**
+ * Calculate initial hardware state and register state functions.
+ * Assumes that the command buffer and state atoms have been
+ * initialized already.
+ *
+ * The depth scale and stencil clear value are derived from the visual's
+ * depth size; only 16 and 24 bits are accepted, any other size is
+ * reported on stderr and passed to _mesa_exit(-1).
+ */
+void r300InitState(r300ContextPtr r300)
+{
+	GLcontext *ctx = r300->radeon.glCtx;
+
+	radeonInitState(&r300->radeon);
+
+	switch (ctx->Visual.depthBits) {
+	case 16:
+		r300->state.depth.scale = 1.0 / (GLfloat) 0xffff;
+		r300->state.stencil.clear = 0x00000000;
+		break;
+	case 24:
+		r300->state.depth.scale = 1.0 / (GLfloat) 0xffffff;
+		r300->state.stencil.clear = 0x00ff0000;
+		break;
+	default:
+		fprintf(stderr, "Error: Unsupported depth %d... exiting\n",
+			ctx->Visual.depthBits);
+		_mesa_exit(-1);
+	}
+
+	/* Only have hw stencil when depth buffer is 24 bits deep */
+	r300->state.stencil.hw_stencil = (ctx->Visual.stencilBits > 0 &&
+					  ctx->Visual.depthBits == 24);
+
+	/* Zero the cached texture state before the first full reset. */
+	memset(&(r300->state.texture), 0, sizeof(r300->state.texture));
+
+	r300ResetHwState(r300);
+}
+/* RenderMode driver hook: currently a stub; both the context lookup and mode are deliberately unused. */
+static void r300RenderMode(GLcontext * ctx, GLenum mode)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	(void)rmesa;
+	(void)mode;
+}
+/* ClipPlane hook: copy plane `plane` from ctx->Transform._ClipUserPlane into its vpucp atom and mark it dirty (eq is not read). */
+static void r300ClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq )
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	GLint p = (GLint) plane - (GLint) GL_CLIP_PLANE0;
+	const GLfloat *coef = ctx->Transform._ClipUserPlane[p];
+	/* r300PackFloat32(), as used for the other float registers in this file, avoids reading the floats through a GLint pointer. */
+	R300_STATECHANGE( rmesa, vpucp[p] );
+	rmesa->hw.vpucp[p].cmd[R300_VPUCP_X] = r300PackFloat32(coef[0]);
+	rmesa->hw.vpucp[p].cmd[R300_VPUCP_Y] = r300PackFloat32(coef[1]);
+	rmesa->hw.vpucp[p].cmd[R300_VPUCP_Z] = r300PackFloat32(coef[2]);
+	rmesa->hw.vpucp[p].cmd[R300_VPUCP_W] = r300PackFloat32(coef[3]);
+}
+
+/* Re-emit every user clip plane whose bit is set in ctx->Transform.ClipPlanesEnabled into its vpucp atom. */
+void r300UpdateClipPlanes( GLcontext *ctx )
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	GLuint p;
+
+	for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
+		if (ctx->Transform.ClipPlanesEnabled & (1 << p)) {
+			const GLfloat *coef = ctx->Transform._ClipUserPlane[p];
+			/* r300PackFloat32() avoids reading the floats through a GLint pointer. */
+			R300_STATECHANGE( rmesa, vpucp[p] );
+			rmesa->hw.vpucp[p].cmd[R300_VPUCP_X] = r300PackFloat32(coef[0]);
+			rmesa->hw.vpucp[p].cmd[R300_VPUCP_Y] = r300PackFloat32(coef[1]);
+			rmesa->hw.vpucp[p].cmd[R300_VPUCP_Z] = r300PackFloat32(coef[2]);
+			rmesa->hw.vpucp[p].cmd[R300_VPUCP_W] = r300PackFloat32(coef[3]);
+		}
+	}
+}
+
+/**
+ * Initialize driver's state callback functions: radeonInitStateFuncs() runs first, then the r300 hooks below are assigned.
+ */
+void r300InitStateFuncs(struct dd_function_table *functions)
+{
+	radeonInitStateFuncs(functions);
+
+	functions->UpdateState = r300InvalidateState;
+	functions->AlphaFunc = r300AlphaFunc;
+	functions->BlendColor = r300BlendColor;
+	functions->BlendEquationSeparate = r300BlendEquationSeparate;
+	functions->BlendFuncSeparate = r300BlendFuncSeparate;
+	functions->Enable = r300Enable;
+	functions->ColorMask = r300ColorMask;
+	functions->DepthFunc = r300DepthFunc;
+	functions->DepthMask = r300DepthMask;
+	functions->CullFace = r300CullFace;
+	functions->Fogfv = r300Fogfv;
+	functions->FrontFace = r300FrontFace;
+	functions->ShadeModel = r300ShadeModel;
+
+	/* Stencil related */
+	functions->ClearStencil = r300ClearStencil;
+	functions->StencilFuncSeparate = r300StencilFuncSeparate;
+	functions->StencilMaskSeparate = r300StencilMaskSeparate;
+	functions->StencilOpSeparate = r300StencilOpSeparate;
+
+	/* Viewport related */
+	functions->Viewport = r300Viewport;
+	functions->DepthRange = r300DepthRange;
+	functions->PointSize = r300PointSize;
+	functions->LineWidth = r300LineWidth;
+
+	functions->PolygonOffset = r300PolygonOffset;
+	functions->PolygonMode = r300PolygonMode;
+
+	functions->RenderMode = r300RenderMode;	/* currently a no-op stub */
+
+	functions->ClipPlane = r300ClipPlane;
+}
diff --git a/r300/r300_state.h b/r300/r300_state.h
new file mode 100644
index 0000000..21a49b7
--- /dev/null
+++ b/r300/r300_state.h
@@ -0,0 +1,70 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __R300_STATE_H__
+#define __R300_STATE_H__
+
+#include "r300_context.h"
+/* Mark state atom `atom` (a member of (r300)->hw, e.g. vpp or fpi[0]) and the hw state as a whole dirty. */
+#define R300_STATECHANGE(r300, atom) \
+	do {						\
+		(r300)->hw.atom.dirty = GL_TRUE;	\
+		(r300)->hw.is_dirty = GL_TRUE;		\
+	} while(0)
+/* Pass state atom `atom` of (r300)->hw to r300PrintStateAtom(). */
+#define R300_PRINT_STATE(r300, atom) \
+		r300PrintStateAtom((r300), &(r300)->hw.atom)
+
+/* Fire the buffered vertices: calls r300Flush() when cmdbuf.count_used or dma.flush is non-zero.
+   TODO: This has not been implemented yet (NOTE(review): the macro below does flush - confirm whether this TODO is stale)
+ */
+#define R300_FIREVERTICES( r300 )			\
+do {							\
+    \
+   if ( (r300)->cmdbuf.count_used || (r300)->dma.flush ) {	\
+      r300Flush( (r300)->radeon.glCtx );		\
+   }							\
+    \
+} while (0)
+
+extern void r300UpdateStateParameters(GLcontext * ctx, GLuint new_state);
+extern void r300InitState(r300ContextPtr r300);
+extern void r300InitStateFuncs(struct dd_function_table *functions);
+extern void r300UpdateViewportOffset(GLcontext * ctx);
+extern void r300UpdateDrawBuffer(GLcontext * ctx);
+
+extern void r300UpdateShaders(r300ContextPtr rmesa);
+extern void r300UpdateShaderStates(r300ContextPtr rmesa);
+
+#endif				/* __R300_STATE_H__ */
diff --git a/r300/r300_tex.c b/r300/r300_tex.c
new file mode 100644
index 0000000..2a21c61
--- /dev/null
+++ b/r300/r300_tex.c
@@ -0,0 +1,1166 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "colormac.h"
+#include "context.h"
+#include "enums.h"
+#include "image.h"
+#include "simple_list.h"
+#include "texformat.h"
+#include "texstore.h"
+#include "texmem.h"
+#include "teximage.h"
+#include "texobj.h"
+
+#include "r300_context.h"
+#include "r300_state.h"
+#include "r300_ioctl.h"
+#include "r300_tex.h"
+
+#include "xmlpool.h"
+
+/**
+ * Set the texture wrap modes.
+ *
+ * \param t Texture object whose wrap modes are to be set
+ * \param swrap Wrap mode for the \a s texture coordinate
+ * \param twrap Wrap mode for the \a t texture coordinate
+ * \param rwrap Wrap mode for the \a r texture coordinate (stored in the Q wrap field)
+ */
+static void r300SetTexWrap(r300TexObjPtr t, GLenum swrap, GLenum twrap,
+			   GLenum rwrap)
+{
+	unsigned long hw_swrap = 0, hw_twrap = 0, hw_qwrap = 0;	/* stay 0 when the GL mode is not recognized */
+
+	t->filter &=
+	    ~(R300_TX_WRAP_S_MASK | R300_TX_WRAP_T_MASK | R300_TX_WRAP_Q_MASK);
+
+	switch (swrap) {
+	case GL_REPEAT:
+		hw_swrap |= R300_TX_REPEAT;
+		break;
+	case GL_CLAMP:
+		hw_swrap |= R300_TX_CLAMP;
+		break;
+	case GL_CLAMP_TO_EDGE:
+		hw_swrap |= R300_TX_CLAMP_TO_EDGE;
+		break;
+	case GL_CLAMP_TO_BORDER:
+		hw_swrap |= R300_TX_CLAMP_TO_BORDER;
+		break;
+	case GL_MIRRORED_REPEAT:
+		hw_swrap |= R300_TX_REPEAT | R300_TX_MIRRORED;
+		break;
+	case GL_MIRROR_CLAMP_EXT:
+		hw_swrap |= R300_TX_CLAMP | R300_TX_MIRRORED;
+		break;
+	case GL_MIRROR_CLAMP_TO_EDGE_EXT:
+		hw_swrap |= R300_TX_CLAMP_TO_EDGE | R300_TX_MIRRORED;
+		break;
+	case GL_MIRROR_CLAMP_TO_BORDER_EXT:
+		hw_swrap |= R300_TX_CLAMP_TO_BORDER | R300_TX_MIRRORED;
+		break;
+	default:
+		_mesa_problem(NULL, "bad S wrap mode in %s", __FUNCTION__);
+	}
+
+	switch (twrap) {
+	case GL_REPEAT:
+		hw_twrap |= R300_TX_REPEAT;
+		break;
+	case GL_CLAMP:
+		hw_twrap |= R300_TX_CLAMP;
+		break;
+	case GL_CLAMP_TO_EDGE:
+		hw_twrap |= R300_TX_CLAMP_TO_EDGE;
+		break;
+	case GL_CLAMP_TO_BORDER:
+		hw_twrap |= R300_TX_CLAMP_TO_BORDER;
+		break;
+	case GL_MIRRORED_REPEAT:
+		hw_twrap |= R300_TX_REPEAT | R300_TX_MIRRORED;
+		break;
+	case GL_MIRROR_CLAMP_EXT:
+		hw_twrap |= R300_TX_CLAMP | R300_TX_MIRRORED;
+		break;
+	case GL_MIRROR_CLAMP_TO_EDGE_EXT:
+		hw_twrap |= R300_TX_CLAMP_TO_EDGE | R300_TX_MIRRORED;
+		break;
+	case GL_MIRROR_CLAMP_TO_BORDER_EXT:
+		hw_twrap |= R300_TX_CLAMP_TO_BORDER | R300_TX_MIRRORED;
+		break;
+	default:
+		_mesa_problem(NULL, "bad T wrap mode in %s", __FUNCTION__);
+	}
+
+	switch (rwrap) {
+	case GL_REPEAT:
+		hw_qwrap |= R300_TX_REPEAT;
+		break;
+	case GL_CLAMP:
+		hw_qwrap |= R300_TX_CLAMP;
+		break;
+	case GL_CLAMP_TO_EDGE:
+		hw_qwrap |= R300_TX_CLAMP_TO_EDGE;
+		break;
+	case GL_CLAMP_TO_BORDER:
+		hw_qwrap |= R300_TX_CLAMP_TO_BORDER;
+		break;
+	case GL_MIRRORED_REPEAT:
+		hw_qwrap |= R300_TX_REPEAT | R300_TX_MIRRORED;
+		break;
+	case GL_MIRROR_CLAMP_EXT:
+		hw_qwrap |= R300_TX_CLAMP | R300_TX_MIRRORED;
+		break;
+	case GL_MIRROR_CLAMP_TO_EDGE_EXT:
+		hw_qwrap |= R300_TX_CLAMP_TO_EDGE | R300_TX_MIRRORED;
+		break;
+	case GL_MIRROR_CLAMP_TO_BORDER_EXT:
+		hw_qwrap |= R300_TX_CLAMP_TO_BORDER | R300_TX_MIRRORED;
+		break;
+	default:
+		_mesa_problem(NULL, "bad R wrap mode in %s", __FUNCTION__);
+	}
+	/* Merge the three translated modes into the wrap fields that were cleared at the top. */
+	t->filter |= hw_swrap << R300_TX_WRAP_S_SHIFT;
+	t->filter |= hw_twrap << R300_TX_WRAP_T_SHIFT;
+	t->filter |= hw_qwrap << R300_TX_WRAP_Q_SHIFT;
+}
+/* Set the max-anisotropy field of t->filter to the first ratio bucket (1, 2, 4, 8, otherwise 16 to 1) that covers `max`. */
+static void r300SetTexMaxAnisotropy(r300TexObjPtr t, GLfloat max)
+{
+	GLuint ratio_bits;
+
+	if (max <= 1.0)
+		ratio_bits = R300_TX_MAX_ANISO_1_TO_1;
+	else if (max <= 2.0)
+		ratio_bits = R300_TX_MAX_ANISO_2_TO_1;
+	else if (max <= 4.0)
+		ratio_bits = R300_TX_MAX_ANISO_4_TO_1;
+	else if (max <= 8.0)
+		ratio_bits = R300_TX_MAX_ANISO_8_TO_1;
+	else
+		ratio_bits = R300_TX_MAX_ANISO_16_TO_1;
+
+	t->filter = (t->filter & ~R300_TX_MAX_ANISO_MASK) | ratio_bits;
+}
+
+/**
+ * Set the texture magnification and minification modes.
+ *
+ * \param t Texture whose filter modes are to be set
+ * \param minf Texture minification mode
+ * \param magf Texture magnification mode
+ * Enums not listed in the switches leave the corresponding (already cleared) field empty.
+ */
+static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf)
+{
+	GLuint anisotropy = (t->filter & R300_TX_MAX_ANISO_MASK);
+
+	t->filter &= ~(R300_TX_MIN_FILTER_MASK | R300_TX_MAG_FILTER_MASK);
+
+	if (anisotropy == R300_TX_MAX_ANISO_1_TO_1) {
+		switch (minf) {
+		case GL_NEAREST:
+			t->filter |= R300_TX_MIN_FILTER_NEAREST;
+			break;
+		case GL_LINEAR:
+			t->filter |= R300_TX_MIN_FILTER_LINEAR;
+			break;
+		case GL_NEAREST_MIPMAP_NEAREST:
+			t->filter |= R300_TX_MIN_FILTER_NEAREST_MIP_NEAREST;
+			break;
+		case GL_NEAREST_MIPMAP_LINEAR:
+			t->filter |= R300_TX_MIN_FILTER_NEAREST_MIP_LINEAR;
+			break;
+		case GL_LINEAR_MIPMAP_NEAREST:
+			t->filter |= R300_TX_MIN_FILTER_LINEAR_MIP_NEAREST;
+			break;
+		case GL_LINEAR_MIPMAP_LINEAR:
+			t->filter |= R300_TX_MIN_FILTER_LINEAR_MIP_LINEAR;
+			break;
+		}
+	} else {	/* anisotropic: the LINEAR_MIPMAP_* modes share the ANISO_NEAREST_MIP_* encodings */
+		switch (minf) {
+		case GL_NEAREST:
+			t->filter |= R300_TX_MIN_FILTER_ANISO_NEAREST;
+			break;
+		case GL_LINEAR:
+			t->filter |= R300_TX_MIN_FILTER_ANISO_LINEAR;
+			break;
+		case GL_NEAREST_MIPMAP_NEAREST:
+		case GL_LINEAR_MIPMAP_NEAREST:
+			t->filter |=
+			    R300_TX_MIN_FILTER_ANISO_NEAREST_MIP_NEAREST;
+			break;
+		case GL_NEAREST_MIPMAP_LINEAR:
+		case GL_LINEAR_MIPMAP_LINEAR:
+			t->filter |=
+			    R300_TX_MIN_FILTER_ANISO_NEAREST_MIP_LINEAR;
+			break;
+		}
+	}
+
+	/* Note we don't have 3D mipmaps so only use the mag filter setting
+	 * to set the 3D texture filter mode.
+	 */
+	switch (magf) {
+	case GL_NEAREST:
+		t->filter |= R300_TX_MAG_FILTER_NEAREST;
+		break;
+	case GL_LINEAR:
+		t->filter |= R300_TX_MAG_FILTER_LINEAR;
+		break;
+	}
+}
+/* Pack the four border color bytes c[0..3] with PACK_COLOR_8888 and store the result in t->pp_border_color. */
+static void r300SetTexBorderColor(r300TexObjPtr t, GLubyte c[4])
+{
+	t->pp_border_color = PACK_COLOR_8888(c[0], c[1], c[2], c[3]);
+}
+
+/**
+ * Create the driver-private texture object for \a texObj.
+ *
+ * The result is also stored in texObj->DriverData (NULL when the allocation
+ * fails); wrap, anisotropy, filter and border state are seeded from texObj.
+ */
+static r300TexObjPtr r300AllocTexObj(struct gl_texture_object *texObj)
+{
+	r300TexObjPtr obj = CALLOC_STRUCT(r300_tex_obj);
+
+	/* Stored unconditionally, so DriverData is NULL after a failed allocation. */
+	texObj->DriverData = obj;
+	if (obj == NULL)
+		return NULL;
+
+	if (RADEON_DEBUG & DEBUG_TEXTURE) {
+		fprintf(stderr, "%s( %p, %p )\n", __FUNCTION__,
+			(void *)texObj, (void *)obj);
+	}
+
+	/* Initialize non-image-dependent parts of the state. */
+	obj->base.tObj = texObj;
+	obj->border_fallback = GL_FALSE;
+
+	make_empty_list(&obj->base);
+
+	r300SetTexWrap(obj, texObj->WrapS, texObj->WrapT, texObj->WrapR);
+	r300SetTexMaxAnisotropy(obj, texObj->MaxAnisotropy);
+	r300SetTexFilter(obj, texObj->MinFilter, texObj->MagFilter);
+	r300SetTexBorderColor(obj, texObj->_BorderChan);
+
+	return obj;
+}
+
+/* Pick the 8888 layout that matches the source data so a memcpy suffices. */
+static const struct gl_texture_format *r300Choose8888TexFormat(GLenum srcFormat,
+							       GLenum srcType)
+{
+	const GLuint probe = 1;
+	const GLboolean hostIsLE = (*((const GLubyte *)&probe) != 0);
+	/* GL_UNSIGNED_BYTE is handled as one of the packed types, per endianness. */
+	const GLenum packedType = (srcType != GL_UNSIGNED_BYTE) ? srcType :
+	    (hostIsLE ? GL_UNSIGNED_INT_8_8_8_8_REV : GL_UNSIGNED_INT_8_8_8_8);
+
+	switch (srcFormat) {
+	case GL_RGBA:
+		if (packedType == GL_UNSIGNED_INT_8_8_8_8)
+			return &_mesa_texformat_rgba8888;
+		if (packedType == GL_UNSIGNED_INT_8_8_8_8_REV)
+			return &_mesa_texformat_rgba8888_rev;
+		break;
+	case GL_ABGR_EXT:
+		if (packedType == GL_UNSIGNED_INT_8_8_8_8_REV)
+			return &_mesa_texformat_rgba8888;
+		if (packedType == GL_UNSIGNED_INT_8_8_8_8)
+			return &_mesa_texformat_rgba8888_rev;
+		break;
+	case GL_BGRA:
+		if (packedType == GL_UNSIGNED_INT_8_8_8_8)
+			return &_mesa_texformat_argb8888_rev;
+		if (packedType == GL_UNSIGNED_INT_8_8_8_8_REV)
+			return &_mesa_texformat_argb8888;
+		break;
+	}
+
+	return _dri_texformat_argb8888;
+}
+
+static const struct gl_texture_format *r300ChooseTextureFormat(GLcontext * ctx,
+							       GLint
+							       internalFormat,
+							       GLenum format,
+							       GLenum type)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	const GLboolean do32bpt =
+	    (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_32);
+	const GLboolean force16bpt =
+	    (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FORCE_16);
+	(void)format;	/* NOTE(review): format is nevertheless used below */
+	/* Maps internalFormat, refined by type and texture_depth, to a format. */
+#if 0
+	fprintf(stderr, "InternalFormat=%s(%d) type=%s format=%s\n",
+		_mesa_lookup_enum_by_nr(internalFormat), internalFormat,
+		_mesa_lookup_enum_by_nr(type), _mesa_lookup_enum_by_nr(format));
+	fprintf(stderr, "do32bpt=%d force16bpt=%d\n", do32bpt, force16bpt);
+#endif
+
+	switch (internalFormat) {
+	case 4:		/* legacy component-count form of GL_RGBA */
+	case GL_RGBA:
+	case GL_COMPRESSED_RGBA:
+		switch (type) {
+		case GL_UNSIGNED_INT_10_10_10_2:
+		case GL_UNSIGNED_INT_2_10_10_10_REV:
+			return do32bpt ? _dri_texformat_argb8888 :
+			    _dri_texformat_argb1555;
+		case GL_UNSIGNED_SHORT_4_4_4_4:
+		case GL_UNSIGNED_SHORT_4_4_4_4_REV:
+			return _dri_texformat_argb4444;
+		case GL_UNSIGNED_SHORT_5_5_5_1:
+		case GL_UNSIGNED_SHORT_1_5_5_5_REV:
+			return _dri_texformat_argb1555;
+		default:
+			return do32bpt ? r300Choose8888TexFormat(format, type) :
+			    _dri_texformat_argb4444;
+		}
+
+	case 3:		/* legacy component-count form of GL_RGB */
+	case GL_RGB:
+	case GL_COMPRESSED_RGB:
+		switch (type) {
+		case GL_UNSIGNED_SHORT_4_4_4_4:
+		case GL_UNSIGNED_SHORT_4_4_4_4_REV:
+			return _dri_texformat_argb4444;
+		case GL_UNSIGNED_SHORT_5_5_5_1:
+		case GL_UNSIGNED_SHORT_1_5_5_5_REV:
+			return _dri_texformat_argb1555;
+		case GL_UNSIGNED_SHORT_5_6_5:
+		case GL_UNSIGNED_SHORT_5_6_5_REV:
+			return _dri_texformat_rgb565;
+		default:
+			return do32bpt ? _dri_texformat_argb8888 :
+			    _dri_texformat_rgb565;
+		}
+
+	case GL_RGBA8:
+	case GL_RGB10_A2:
+	case GL_RGBA12:
+	case GL_RGBA16:
+		return !force16bpt ?
+		    r300Choose8888TexFormat(format,
+					    type) : _dri_texformat_argb4444;
+
+	case GL_RGBA4:
+	case GL_RGBA2:
+		return _dri_texformat_argb4444;
+
+	case GL_RGB5_A1:
+		return _dri_texformat_argb1555;
+
+	case GL_RGB8:
+	case GL_RGB10:
+	case GL_RGB12:
+	case GL_RGB16:
+		return !force16bpt ? _dri_texformat_argb8888 :
+		    _dri_texformat_rgb565;
+
+	case GL_RGB5:
+	case GL_RGB4:
+	case GL_R3_G3_B2:
+		return _dri_texformat_rgb565;
+
+	case GL_ALPHA:
+	case GL_ALPHA4:
+	case GL_ALPHA8:
+	case GL_ALPHA12:
+	case GL_ALPHA16:
+	case GL_COMPRESSED_ALPHA:
+		return _dri_texformat_a8;
+
+	case 1:		/* legacy component-count form of GL_LUMINANCE */
+	case GL_LUMINANCE:
+	case GL_LUMINANCE4:
+	case GL_LUMINANCE8:
+	case GL_LUMINANCE12:
+	case GL_LUMINANCE16:
+	case GL_COMPRESSED_LUMINANCE:
+		return _dri_texformat_l8;
+
+	case 2:		/* legacy component-count form of GL_LUMINANCE_ALPHA */
+	case GL_LUMINANCE_ALPHA:
+	case GL_LUMINANCE4_ALPHA4:
+	case GL_LUMINANCE6_ALPHA2:
+	case GL_LUMINANCE8_ALPHA8:
+	case GL_LUMINANCE12_ALPHA4:
+	case GL_LUMINANCE12_ALPHA12:
+	case GL_LUMINANCE16_ALPHA16:
+	case GL_COMPRESSED_LUMINANCE_ALPHA:
+		return _dri_texformat_al88;
+
+	case GL_INTENSITY:
+	case GL_INTENSITY4:
+	case GL_INTENSITY8:
+	case GL_INTENSITY12:
+	case GL_INTENSITY16:
+	case GL_COMPRESSED_INTENSITY:
+		return _dri_texformat_i8;
+
+	case GL_YCBCR_MESA:
+		if (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
+		    type == GL_UNSIGNED_BYTE)
+			return &_mesa_texformat_ycbcr;
+		else
+			return &_mesa_texformat_ycbcr_rev;
+
+	case GL_RGB_S3TC:
+	case GL_RGB4_S3TC:
+	case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
+		return &_mesa_texformat_rgb_dxt1;
+
+	case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
+		return &_mesa_texformat_rgba_dxt1;
+
+	case GL_RGBA_S3TC:
+	case GL_RGBA4_S3TC:
+	case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
+		return &_mesa_texformat_rgba_dxt3;
+
+	case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
+		return &_mesa_texformat_rgba_dxt5;
+
+	case GL_ALPHA16F_ARB:
+		return &_mesa_texformat_alpha_float16;
+	case GL_ALPHA32F_ARB:
+		return &_mesa_texformat_alpha_float32;
+	case GL_LUMINANCE16F_ARB:
+		return &_mesa_texformat_luminance_float16;
+	case GL_LUMINANCE32F_ARB:
+		return &_mesa_texformat_luminance_float32;
+	case GL_LUMINANCE_ALPHA16F_ARB:
+		return &_mesa_texformat_luminance_alpha_float16;
+	case GL_LUMINANCE_ALPHA32F_ARB:
+		return &_mesa_texformat_luminance_alpha_float32;
+	case GL_INTENSITY16F_ARB:
+		return &_mesa_texformat_intensity_float16;
+	case GL_INTENSITY32F_ARB:
+		return &_mesa_texformat_intensity_float32;
+	case GL_RGB16F_ARB:	/* RGB float requests get the RGBA float formats */
+		return &_mesa_texformat_rgba_float16;
+	case GL_RGB32F_ARB:
+		return &_mesa_texformat_rgba_float32;
+	case GL_RGBA16F_ARB:
+		return &_mesa_texformat_rgba_float16;
+	case GL_RGBA32F_ARB:
+		return &_mesa_texformat_rgba_float32;
+
+	default:	/* unknown internalFormat: report it and return NULL */
+		_mesa_problem(ctx,
+			      "unexpected internalFormat 0x%x in r300ChooseTextureFormat",
+			      (int)internalFormat);
+		return NULL;
+	}
+
+	return NULL;		/* never get here */
+}
+
+static GLboolean
+r300ValidateClientStorage(GLcontext * ctx, GLenum target,
+			  GLint internalFormat,
+			  GLint srcWidth, GLint srcHeight,
+			  GLenum format, GLenum type, const void *pixels,
+			  const struct gl_pixelstore_attrib *packing,
+			  struct gl_texture_object *texObj,
+			  struct gl_texture_image *texImage)
+{
+	/* Decide whether the application's own pixel buffer can back this
+	 * image directly.  On success texImage is pointed at 'pixels' and 1
+	 * is returned.  On failure 0 is returned; texImage->TexFormat may
+	 * already have been changed by then.
+	 */
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	GLint rowBytes;
+
+	if (RADEON_DEBUG & DEBUG_TEXTURE)
+		fprintf(stderr, "intformat %s format %s type %s\n",
+			_mesa_lookup_enum_by_nr(internalFormat),
+			_mesa_lookup_enum_by_nr(format),
+			_mesa_lookup_enum_by_nr(type));
+
+	/* Client storage must be enabled, and the image must not involve
+	 * image transfer state, compression or mipmap generation.
+	 */
+	if (!ctx->Unpack.ClientStorage || ctx->_ImageTransferState ||
+	    texImage->IsCompressed || texObj->GenerateMipmap)
+		return 0;
+
+	/* Only a few internalFormat/format/type combinations are accepted.
+	 * This list is incomplete, may be different on ppc???
+	 */
+	switch (internalFormat) {
+	case GL_RGBA:
+		if (format != GL_BGRA || type != GL_UNSIGNED_INT_8_8_8_8_REV)
+			return 0;
+		texImage->TexFormat = _dri_texformat_argb8888;
+		break;
+
+	case GL_RGB:
+		if (format != GL_RGB || type != GL_UNSIGNED_SHORT_5_6_5)
+			return 0;
+		texImage->TexFormat = _dri_texformat_rgb565;
+		break;
+
+	case GL_YCBCR_MESA:
+		if (format != GL_YCBCR_MESA)
+			return 0;
+		if (type == GL_UNSIGNED_SHORT_8_8_REV_APPLE)
+			texImage->TexFormat = &_mesa_texformat_ycbcr_rev;
+		else if (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
+			 type == GL_UNSIGNED_BYTE)
+			texImage->TexFormat = &_mesa_texformat_ycbcr;
+		else
+			return 0;
+		break;
+
+	default:
+		return 0;
+	}
+
+	/* Could deal with these packing issues, but currently don't:
+	 */
+	if (packing->SkipPixels || packing->SkipRows ||
+	    packing->SwapBytes || packing->LsbFirst)
+		return 0;
+
+	rowBytes = _mesa_image_row_stride(packing, srcWidth, format, type);
+
+	if (RADEON_DEBUG & DEBUG_TEXTURE)
+		fprintf(stderr, "%s: srcRowStride %d/%x\n",
+			__FUNCTION__, rowBytes, rowBytes);
+
+	/* The image must pass r300IsGartMemory with a 64-byte multiple row
+	 * pitch.  Checking at upload time instead would mean storing the
+	 * pitch, as packing details might change before then.
+	 */
+	if (!r300IsGartMemory(rmesa, pixels, srcHeight * rowBytes))
+		return 0;
+	if (rowBytes & 63)
+		return 0;
+
+	/* From here on the texture simply references the client's pixels.
+	 * NOTE: future calls to TexSubImage will overwrite the client data;
+	 * this is explicitly mentioned in the extension spec.
+	 */
+	texImage->Data = (void *)pixels;
+	texImage->IsClientData = GL_TRUE;
+	texImage->RowStride = rowBytes / texImage->TexFormat->TexelBytes;
+
+	return 1;
+}
+
+static void r300TexImage1D(GLcontext * ctx, GLenum target, GLint level,
+			   GLint internalFormat,
+			   GLint width, GLint border,
+			   GLenum format, GLenum type, const GLvoid * pixels,
+			   const struct gl_pixelstore_attrib *packing,
+			   struct gl_texture_object *texObj,
+			   struct gl_texture_image *texImage)
+{
+	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+
+	/* Swap out an existing driver object, or create it on first use. */
+	if (t == NULL)
+		t = (driTextureObject *) r300AllocTexObj(texObj);
+	else
+		driSwapOutTextureObject(t);
+	if (t == NULL) {
+		_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage1D");
+		return;
+	}
+
+	/* Use the caller's packing; this will call ChooseTextureFormat. */
+	_mesa_store_teximage1d(ctx, target, level, internalFormat,
+			       width, border, format, type, pixels,
+			       packing, texObj, texImage);
+
+	t->dirty_images[0] |= (1 << level);	/* flag this level as dirty */
+}
+
+static void r300TexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
+			      GLint xoffset,
+			      GLsizei width,
+			      GLenum format, GLenum type,
+			      const GLvoid * pixels,
+			      const struct gl_pixelstore_attrib *packing,
+			      struct gl_texture_object *texObj,
+			      struct gl_texture_image *texImage)
+{
+	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+
+	/* The driver object is expected to exist here (hence the assert);
+	 * without it, it is created on the spot instead of swapped out. */
+	assert(t);
+	if (t == NULL)
+		t = (driTextureObject *) r300AllocTexObj(texObj);
+	else
+		driSwapOutTextureObject(t);
+	if (t == NULL) {
+		_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage1D");
+		return;
+	}
+
+	_mesa_store_texsubimage1d(ctx, target, level, xoffset, width, format,
+				  type, pixels, packing, texObj, texImage);
+
+	t->dirty_images[0] |= (1 << level);	/* flag this level as dirty */
+}
+
+static void r300TexImage2D(GLcontext * ctx, GLenum target, GLint level,
+			   GLint internalFormat,
+			   GLint width, GLint height, GLint border,
+			   GLenum format, GLenum type, const GLvoid * pixels,
+			   const struct gl_pixelstore_attrib *packing,
+			   struct gl_texture_object *texObj,
+			   struct gl_texture_image *texImage)
+{
+	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+	GLuint face = 0;	/* stays 0 for an ordinary 2D image */
+
+	/* A cube map face target selects the matching dirty_images slot. */
+	switch (target) {
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+		face = (GLuint) (target - GL_TEXTURE_CUBE_MAP_POSITIVE_X);
+		ASSERT(face < 6);
+		break;
+	default:
+		break;
+	}
+
+	/* Swap out an existing driver object, or create it on first use. */
+	if (t == NULL)
+		t = (driTextureObject *) r300AllocTexObj(texObj);
+	else
+		driSwapOutTextureObject(t);
+	if (t == NULL) {
+		_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage2D");
+		return;
+	}
+
+	texImage->IsClientData = GL_FALSE;
+
+	if (r300ValidateClientStorage(ctx, target, internalFormat,
+				      width, height, format, type, pixels,
+				      packing, texObj, texImage)) {
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "%s: Using client storage\n",
+				__FUNCTION__);
+		return;
+	}
+
+	if (RADEON_DEBUG & DEBUG_TEXTURE)
+		fprintf(stderr, "%s: Using normal storage\n", __FUNCTION__);
+
+	/* Normal path: copy (to cached memory) and eventually upload
+	 * via another copy to GART memory and then a blit...  Could
+	 * eliminate one copy by going straight to (permanent) GART.
+	 *
+	 * The copy uses the caller's packing, the same one that was
+	 * offered to r300ValidateClientStorage above.
+	 *
+	 * Note, this will call r300ChooseTextureFormat.
+	 */
+	_mesa_store_teximage2d(ctx, target, level, internalFormat,
+			       width, height, border, format, type,
+			       pixels, packing, texObj, texImage);
+
+	t->dirty_images[face] |= (1 << level);
+}
+
+static void r300TexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
+			      GLint xoffset, GLint yoffset,
+			      GLsizei width, GLsizei height,
+			      GLenum format, GLenum type,
+			      const GLvoid * pixels,
+			      const struct gl_pixelstore_attrib *packing,
+			      struct gl_texture_object *texObj,
+			      struct gl_texture_image *texImage)
+{
+	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+	GLuint face = 0;	/* stays 0 for an ordinary 2D image */
+
+	/* A cube map face target selects the matching dirty_images slot. */
+	switch (target) {
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+		face = (GLuint) (target - GL_TEXTURE_CUBE_MAP_POSITIVE_X);
+		ASSERT(face < 6);
+		break;
+	default:
+		break;
+	}
+
+	/* The driver object is expected to exist here (hence the assert);
+	 * without it, it is created on the spot instead of swapped out. */
+	assert(t);
+	if (t == NULL)
+		t = (driTextureObject *) r300AllocTexObj(texObj);
+	else
+		driSwapOutTextureObject(t);
+	if (t == NULL) {
+		_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage2D");
+		return;
+	}
+
+	_mesa_store_texsubimage2d(ctx, target, level, xoffset, yoffset,
+				  width, height, format, type, pixels,
+				  packing, texObj, texImage);
+
+	t->dirty_images[face] |= (1 << level);
+}
+
+static void r300CompressedTexImage2D(GLcontext * ctx, GLenum target,
+				     GLint level, GLint internalFormat,
+				     GLint width, GLint height, GLint border,
+				     GLsizei imageSize, const GLvoid * data,
+				     struct gl_texture_object *texObj,
+				     struct gl_texture_image *texImage)
+{
+	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+	GLuint face = 0;	/* stays 0 for an ordinary 2D image */
+
+	/* A cube map face target selects the matching dirty_images slot. */
+	switch (target) {
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+		face = (GLuint) (target - GL_TEXTURE_CUBE_MAP_POSITIVE_X);
+		ASSERT(face < 6);
+		break;
+	default:
+		break;
+	}
+
+	/* Swap out an existing driver object, or create it on first use. */
+	if (t == NULL)
+		t = (driTextureObject *) r300AllocTexObj(texObj);
+	else
+		driSwapOutTextureObject(t);
+	if (t == NULL) {
+		_mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexImage2D");
+		return;
+	}
+
+	texImage->IsClientData = GL_FALSE;
+
+	/* Client storage is not attempted: r300ValidateClientStorage wants
+	 * format/type/pixels/packing, which this entry point does not get.
+	 * It would never evaluate to true anyway currently.
+	 */
+#if 0
+	if (r300ValidateClientStorage(ctx, target,
+				      internalFormat,
+				      width, height,
+				      format, type, pixels,
+				      packing, texObj, texImage)) {
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "%s: Using client storage\n",
+				__FUNCTION__);
+	} else
+#endif
+	{
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "%s: Using normal storage\n",
+				__FUNCTION__);
+
+		/* Normal path: copy (to cached memory) and eventually upload
+		 * via another copy to GART memory and then a blit...  Could
+		 * eliminate one copy by going straight to (permanent) GART.
+		 *
+		 * Note, this will call r300ChooseTextureFormat.
+		 */
+		_mesa_store_compressed_teximage2d(ctx, target, level,
+						  internalFormat, width, height,
+						  border, imageSize, data,
+						  texObj, texImage);
+
+		t->dirty_images[face] |= (1 << level);
+	}
+}
+
+static void r300CompressedTexSubImage2D(GLcontext * ctx, GLenum target,
+					GLint level, GLint xoffset,
+					GLint yoffset, GLsizei width,
+					GLsizei height, GLenum format,
+					GLsizei imageSize, const GLvoid * data,
+					struct gl_texture_object *texObj,
+					struct gl_texture_image *texImage)
+{
+	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+	GLuint face = 0;	/* stays 0 for an ordinary 2D image */
+
+	/* A cube map face target selects the matching dirty_images slot. */
+	switch (target) {
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+		face = (GLuint) (target - GL_TEXTURE_CUBE_MAP_POSITIVE_X);
+		ASSERT(face < 6);
+		break;
+	default:
+		break;
+	}
+
+	/* The driver object is expected to exist here (hence the assert);
+	 * without it, it is created on the spot instead of swapped out. */
+	assert(t);
+	if (t == NULL)
+		t = (driTextureObject *) r300AllocTexObj(texObj);
+	else
+		driSwapOutTextureObject(t);
+	if (t == NULL) {
+		_mesa_error(ctx, GL_OUT_OF_MEMORY,
+			    "glCompressedTexSubImage2D");
+		return;
+	}
+
+	_mesa_store_compressed_texsubimage2d(ctx, target, level, xoffset,
+					     yoffset, width, height, format,
+					     imageSize, data, texObj, texImage);
+
+	t->dirty_images[face] |= (1 << level);
+}
+
+static void r300TexImage3D(GLcontext * ctx, GLenum target, GLint level,
+			   GLint internalFormat,
+			   GLint width, GLint height, GLint depth,
+			   GLint border,
+			   GLenum format, GLenum type, const GLvoid * pixels,
+			   const struct gl_pixelstore_attrib *packing,
+			   struct gl_texture_object *texObj,
+			   struct gl_texture_image *texImage)
+{
+	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+
+	/* Swap out an existing driver object, or create it on first use. */
+	if (t == NULL)
+		t = (driTextureObject *) r300AllocTexObj(texObj);
+	else
+		driSwapOutTextureObject(t);
+	if (t == NULL) {
+		_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage3D");
+		return;
+	}
+
+	texImage->IsClientData = GL_FALSE;
+
+#if 0
+	if (r300ValidateClientStorage(ctx, target,
+				      internalFormat,
+				      width, height,
+				      format, type, pixels,
+				      packing, texObj, texImage)) {
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "%s: Using client storage\n",
+				__FUNCTION__);
+	} else
+#endif
+	{
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "%s: Using normal storage\n",
+				__FUNCTION__);
+
+		/* Normal path: copy (to cached memory) and eventually upload
+		 * via another copy to GART memory and then a blit...  Could
+		 * eliminate one copy by going straight to (permanent) GART.
+		 * The copy uses the caller's packing, not ctx->Unpack.
+		 * Note, this will call r300ChooseTextureFormat.
+		 */
+		_mesa_store_teximage3d(ctx, target, level, internalFormat,
+				       width, height, depth, border,
+				       format, type, pixels,
+				       packing, texObj, texImage);
+
+		t->dirty_images[0] |= (1 << level);
+	}
+}
+
+static void
+r300TexSubImage3D(GLcontext * ctx, GLenum target, GLint level,
+		  GLint xoffset, GLint yoffset, GLint zoffset,
+		  GLsizei width, GLsizei height, GLsizei depth,
+		  GLenum format, GLenum type,
+		  const GLvoid * pixels,
+		  const struct gl_pixelstore_attrib *packing,
+		  struct gl_texture_object *texObj,
+		  struct gl_texture_image *texImage)
+{
+	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+
+	/* The driver object is expected to exist here (hence the assert);
+	 * without it, it is created on the spot instead of swapped out. */
+	assert(t);
+	if (t == NULL) {
+		t = (driTextureObject *) r300AllocTexObj(texObj);
+		if (t == NULL) {
+			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage3D");
+			return;
+		}
+		texObj->DriverData = t;	/* r300AllocTexObj stored it too */
+	} else {
+		driSwapOutTextureObject(t);
+	}
+
+	_mesa_store_texsubimage3d(ctx, target, level,
+				  xoffset, yoffset, zoffset,
+				  width, height, depth, format, type,
+				  pixels, packing, texObj, texImage);
+
+	t->dirty_images[0] |= (1 << level);	/* flag this level as dirty */
+}
+
+static void r300TexEnv(GLcontext * ctx, GLenum target,
+		       GLenum pname, const GLfloat * param)
+{	/* Debug trace only: the LOD bias handling below is compiled out. */
+	if (RADEON_DEBUG & DEBUG_STATE) {
+		fprintf(stderr, "%s( %s )\n",
+			__FUNCTION__, _mesa_lookup_enum_by_nr(pname));
+	}
+
+	/* This is incorrect: Need to maintain this data for each of
+	 * GL_TEXTURE_{123}D, GL_TEXTURE_RECTANGLE_NV, etc, and switch
+	 * between them according to _ReallyEnabled.
+	 */
+	switch (pname) {
+	case GL_TEXTURE_LOD_BIAS_EXT:{
+#if 0				/* Needs to be relocated in order to make sure we got the right tmu */
+			GLfloat bias, min;
+			GLuint b;
+			/* NOTE(review): rmesa and unit are not declared in this function. */
+			/* The R300's LOD bias is a signed 2's complement value with a
+			 * range of -16.0 <= bias < 16.0.
+			 *
+			 * NOTE: Add a small bias to the bias for conform mipsel.c test.
+			 */
+			bias = *param + .01;
+			min =
+			    driQueryOptionb(&rmesa->radeon.optionCache,
+					    "no_neg_lod_bias") ? 0.0 : -16.0;
+			bias = CLAMP(bias, min, 16.0);
+
+			/* 0.0 - 16.0 == 0x0 - 0x1000 */
+			/* 0.0 - -16.0 == 0x1001 - 0x1fff */
+			b = 0x1000 / 16.0 * bias;
+			b &= R300_LOD_BIAS_MASK;
+
+			if (b !=
+			    (rmesa->hw.tex.unknown1.
+			     cmd[R300_TEX_VALUE_0 +
+				 unit] & R300_LOD_BIAS_MASK)) {
+				R300_STATECHANGE(rmesa, tex.unknown1);
+				rmesa->hw.tex.unknown1.cmd[R300_TEX_VALUE_0 +
+							   unit] &=
+				    ~R300_LOD_BIAS_MASK;
+				rmesa->hw.tex.unknown1.cmd[R300_TEX_VALUE_0 +
+							   unit] |= b;
+			}
+#endif
+			break;	/* no-op while the code above is disabled */
+		}
+
+	default:
+		return;
+	}
+}
+
+/**
+ * Apply a texture parameter change to the driver texture object and flag the
+ * object dirty; the update itself happens at the next UpdateTextureState.
+ */
+
+static void r300TexParameter(GLcontext * ctx, GLenum target,
+			     struct gl_texture_object *texObj,
+			     GLenum pname, const GLfloat * params)
+{
+	r300TexObjPtr t = (r300TexObjPtr) texObj->DriverData;
+
+	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE))
+		fprintf(stderr, "%s( %s )\n", __FUNCTION__,
+			_mesa_lookup_enum_by_nr(pname));
+
+	/* No driver object yet; r300AllocTexObj copies texObj's parameters. */
+	if (t == NULL)
+		return;
+
+	switch (pname) {
+	case GL_TEXTURE_MIN_FILTER:
+	case GL_TEXTURE_MAG_FILTER:
+	case GL_TEXTURE_MAX_ANISOTROPY_EXT:
+		r300SetTexMaxAnisotropy(t, texObj->MaxAnisotropy);
+		r300SetTexFilter(t, texObj->MinFilter, texObj->MagFilter);
+		break;
+
+	case GL_TEXTURE_WRAP_S:
+	case GL_TEXTURE_WRAP_T:
+	case GL_TEXTURE_WRAP_R:
+		r300SetTexWrap(t, texObj->WrapS, texObj->WrapT, texObj->WrapR);
+		break;
+
+	case GL_TEXTURE_BORDER_COLOR:
+		r300SetTexBorderColor(t, texObj->_BorderChan);
+		break;
+
+	case GL_TEXTURE_BASE_LEVEL:
+	case GL_TEXTURE_MAX_LEVEL:
+	case GL_TEXTURE_MIN_LOD:
+	case GL_TEXTURE_MAX_LOD:
+		/* No LOD clamping is available: swap the texture out so that
+		 * the right subset of mipmap levels is loaded next time.
+		 */
+		driSwapOutTextureObject((driTextureObject *) t);
+		break;
+
+	default:
+		return;
+	}
+
+	/* Mark this texobj as dirty (one bit per tex unit) */
+	t->dirty_state = TEX_ALL;
+}
+
+static void r300BindTexture(GLcontext * ctx, GLenum target,
+			    struct gl_texture_object *texObj)
+{
+	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE))
+		fprintf(stderr, "%s( %p ) unit=%d\n", __FUNCTION__,
+			(void *)texObj, ctx->Texture.CurrentUnit);
+
+	/* Nothing is bound in the driver here; for the targets listed the
+	 * texture object is merely checked for having driver data. */
+	if (target != GL_TEXTURE_1D && target != GL_TEXTURE_2D &&
+	    target != GL_TEXTURE_3D && target != GL_TEXTURE_CUBE_MAP &&
+	    target != GL_TEXTURE_RECTANGLE_NV)
+		return;
+
+	assert(texObj->DriverData != NULL);
+}
+
+static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+
+	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE))
+		fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
+			(void *)texObj,
+			_mesa_lookup_enum_by_nr(texObj->Target));
+
+	/* When a driver object exists: R300_FIREVERTICES first (given a
+	 * context), then destroy the driver object. */
+	if (t != NULL && rmesa) {
+		R300_FIREVERTICES(rmesa);
+	}
+	if (t != NULL)
+		driDestroyTextureObject(t);
+
+	/* Free mipmap images and the texture object itself */
+	_mesa_delete_texture_object(ctx, texObj);
+}
+
+/**
+ * Allocate a new texture object.
+ * Called via ctx->Driver.NewTextureObject, which includes the default
+ * texture objects allocated during context creation.
+ * The core Mesa object gets its MaxAnisotropy replaced by the context's
+ * initialMaxAnisotropy and a driver-private object attached to it.
+ */
+static struct gl_texture_object *r300NewTextureObject(GLcontext * ctx,
+						      GLuint name,
+						      GLenum target)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct gl_texture_object *texObj =
+	    _mesa_new_texture_object(ctx, name, target);
+
+	if (texObj == NULL)
+		return NULL;
+
+	/* Set the anisotropy first: r300AllocTexObj reads it from texObj. */
+	texObj->MaxAnisotropy = rmesa->initialMaxAnisotropy;
+	(void)r300AllocTexObj(texObj);
+	return texObj;
+}
+
+void r300InitTextureFuncs(struct dd_function_table *functions)
+{
+	/* Only the hooks this driver implements are replaced here;
+	 * _mesa_init_driver_functions() has already filled in the rest.
+	 */
+	functions->ChooseTextureFormat = r300ChooseTextureFormat;
+	functions->TexImage1D = r300TexImage1D;
+	functions->TexSubImage1D = r300TexSubImage1D;
+	functions->TexImage2D = r300TexImage2D;
+	functions->TexSubImage2D = r300TexSubImage2D;
+	functions->TexImage3D = r300TexImage3D;
+	functions->TexSubImage3D = r300TexSubImage3D;
+	functions->CompressedTexImage2D = r300CompressedTexImage2D;
+	functions->CompressedTexSubImage2D = r300CompressedTexSubImage2D;
+
+	functions->NewTextureObject = r300NewTextureObject;
+	functions->BindTexture = r300BindTexture;
+	functions->DeleteTexture = r300DeleteTexture;
+	functions->IsTextureResident = driIsTextureResident;
+
+	functions->TexEnv = r300TexEnv;
+	functions->TexParameter = r300TexParameter;
+
+	driInitTextureFormats();
+}
diff --git a/r300/r300_tex.h b/r300/r300_tex.h
new file mode 100644
index 0000000..f67a8e6
--- /dev/null
+++ b/r300/r300_tex.h
@@ -0,0 +1,51 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef __r300_TEX_H__
+#define __r300_TEX_H__
+
+extern void r300SetTexOffset(__DRIcontext *pDRICtx, GLint texname,
+			     unsigned long long offset, GLint depth,
+			     GLuint pitch);
+
+extern void r300UpdateTextureState(GLcontext * ctx);
+
+extern int r300UploadTexImages(r300ContextPtr rmesa, r300TexObjPtr t,
+			       GLuint face);
+
+extern void r300DestroyTexObj(r300ContextPtr rmesa, r300TexObjPtr t);
+
+extern void r300InitTextureFuncs(struct dd_function_table *functions);
+
+#endif				/* __r300_TEX_H__ */
diff --git a/r300/r300_texmem.c b/r300/r300_texmem.c
new file mode 100644
index 0000000..e2e8355
--- /dev/null
+++ b/r300/r300_texmem.c
@@ -0,0 +1,584 @@
+/**************************************************************************
+
+Copyright (C) Tungsten Graphics 2002.  All Rights Reserved.
+The Weather Channel, Inc. funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86
+license. This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation on the rights to use, copy, modify, merge, publish,
+distribute, sub license, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NON-INFRINGEMENT. IN NO EVENT SHALL ATI, VA LINUX SYSTEMS AND/OR THEIR
+SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Gareth Hughes <gareth@valinux.com>
+ *
+ * \author Kevin E. Martin <martin@valinux.com>
+ */
+
+#include <errno.h>
+
+#include "glheader.h"
+#include "imports.h"
+#include "context.h"
+#include "colormac.h"
+#include "macros.h"
+#include "simple_list.h"
+#include "radeon_reg.h"		/* gets definition for usleep */
+#include "r300_context.h"
+#include "r300_state.h"
+#include "r300_cmdbuf.h"
+#include "radeon_ioctl.h"
+#include "r300_tex.h"
+#include "r300_ioctl.h"
+#include <unistd.h>		/* for usleep() */
+
+#ifdef USER_BUFFERS
+#include "r300_mem.h"
+#endif
+
+/**
+ * Destroy the device-dependent state tied to texture \a t: every texture
+ * unit of \a rmesa that still points at \a t is reset to NULL.
+ * A NULL \a rmesa is accepted; only the optional debug trace runs then.
+ */
+void r300DestroyTexObj(r300ContextPtr rmesa, r300TexObjPtr t)
+{
+	unsigned u;
+
+	if (RADEON_DEBUG & DEBUG_TEXTURE) {
+		fprintf(stderr, "%s( %p, %p )\n", __FUNCTION__,
+			(void *)t, (void *)t->base.tObj);
+	}
+
+	if (rmesa == NULL)
+		return;
+
+	for (u = 0; u < rmesa->radeon.glCtx->Const.MaxTextureUnits; u++) {
+		if (rmesa->state.texture.unit[u].texobj != t)
+			continue;
+		rmesa->state.texture.unit[u].texobj = NULL;
+		/* The disabled code below is meant to shorten the state
+		   pushed to the hardware by not programming unneeded units.
+		   This does not appear to be worthwhile on R300 */
+#if 0
+		remove_from_list(&rmesa->hw.tex[u]);
+		make_empty_list(&rmesa->hw.tex[u]);
+		remove_from_list(&rmesa->hw.cube[u]);
+		make_empty_list(&rmesa->hw.cube[u]);
+#endif
+	}
+}
+
+/* ------------------------------------------------------------
+ * Texture image conversions
+ */
+
+/**
+ * Blit a texture image that already lives in GART client storage to the
+ * slot reserved for \a hwlevel inside the texture at \c t->bufAddr.
+ *
+ * \param rmesa    Context pointer
+ * \param t        Driver texture object receiving the image
+ * \param texImage Mesa image whose \c Data pointer is the blit source
+ * \param hwlevel  Hardware mipmap level to fill
+ * \param x,y      Offset applied to both source and destination
+ * \param width,height  Ignored; replaced by the full image size
+ *
+ * Texel sizes other than 1, 2, 4, 8 or 16 bytes return without uploading.
+ */
+static void r300UploadGARTClientSubImage(r300ContextPtr rmesa,
+					 r300TexObjPtr t,
+					 struct gl_texture_image *texImage,
+					 GLint hwlevel,
+					 GLint x, GLint y,
+					 GLint width, GLint height)
+{
+	const struct gl_texture_format *texFormat = texImage->TexFormat;
+	GLuint pitch;
+	int blit_format;
+	int srcOffset;
+
+	switch (texFormat->TexelBytes) {
+	case 2:
+		blit_format = R300_CP_COLOR_FORMAT_RGB565;
+		break;
+	case 4:
+		blit_format = R300_CP_COLOR_FORMAT_ARGB8888;
+		break;
+	case 1:
+	case 8:
+	case 16:
+		/* wide texels go out as CI8; the width is scaled below */
+		blit_format = R300_CP_COLOR_FORMAT_CI8;
+		break;
+	default:
+		return;
+	}
+
+	/*
+	 * XXX it appears that we always upload the full image, not a subimage.
+	 * I.e. x==0, y==0, width=texWidth, height=texHeight.  If this is ever
+	 * changed, the src pitch will have to change.
+	 * Source and destination share the byte pitch of the base level.
+	 */
+	pitch = t->image[0][0].width * texFormat->TexelBytes;
+
+	t->image[0][hwlevel].data = texImage->Data;
+	srcOffset = r300GartOffsetFromVirtual(rmesa, texImage->Data);
+
+	assert(srcOffset != ~0);
+
+	/* Don't currently need to cope with small pitches? */
+	height = texImage->Height;
+	width = texImage->Width;
+	if (texFormat->TexelBytes > 4)
+		width *= texFormat->TexelBytes;
+
+	r300EmitWait(rmesa, R300_WAIT_3D);
+	r300EmitBlit(rmesa, blit_format,
+		     pitch, srcOffset,
+		     pitch, t->bufAddr,
+		     x, y,
+		     t->image[0][hwlevel].x + x,
+		     t->image[0][hwlevel].y + y,
+		     width, height);
+	r300EmitWait(rmesa, R300_WAIT_2D);
+}
+
+/**
+ * Upload a rectangle-texture image into \a t.  \a width and \a height are
+ * overwritten with the full image size and \a x / \a y are never read, so
+ * the whole image is always sent.  Client-storage images are either
+ * textured from in place or blitted directly; all other images are copied
+ * into DMA regions and blitted in bands of whole rows.
+ */
+static void r300UploadRectSubImage(r300ContextPtr rmesa,
+				   r300TexObjPtr t,
+				   struct gl_texture_image *texImage,
+				   GLint x, GLint y, GLint width, GLint height)
+{
+	const struct gl_texture_format *texFormat = texImage->TexFormat;
+	int blit_format, dstPitch, done;
+
+	switch (texFormat->TexelBytes) {
+	case 1:
+		blit_format = R300_CP_COLOR_FORMAT_CI8;
+		break;
+	case 2:
+		blit_format = R300_CP_COLOR_FORMAT_RGB565;
+		break;
+	case 4:
+		blit_format = R300_CP_COLOR_FORMAT_ARGB8888;
+		break;
+	case 8:
+	case 16:
+		blit_format = R300_CP_COLOR_FORMAT_CI8;
+		break;
+	default:
+		return;
+	}
+
+	t->image[0][0].data = texImage->Data;
+
+	/* Currently don't need to cope with small pitches. */
+	width = texImage->Width;
+	height = texImage->Height;
+	dstPitch = t->pitch;
+
+	if (texFormat->TexelBytes > 4) {
+		width *= texFormat->TexelBytes;
+	}
+
+	if (rmesa->prefer_gart_client_texturing && texImage->IsClientData) {
+		/* In this case, could also use GART texturing.  This is
+		 * currently disabled, but has been tested & works. */
+		t->offset = r300GartOffsetFromVirtual(rmesa, texImage->Data);
+		t->pitch = texImage->RowStride * texFormat->TexelBytes - 32;
+
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr,
+				"Using GART texturing for rectangular client texture\n");
+
+		/* Release FB memory allocated for this image: */
+		/* FIXME This may not be correct as driSwapOutTextureObject sets
+		 * FIXME dirty_images.  It may be fine, though.
+		 */
+		if (t->base.memBlock) {
+			driSwapOutTextureObject((driTextureObject *) t);
+		}
+	} else if (texImage->IsClientData) {
+		/* Data already in GART memory, with usable pitch. */
+		GLuint srcPitch;
+		srcPitch = texImage->RowStride * texFormat->TexelBytes;
+		r300EmitBlit(rmesa,
+			     blit_format,
+			     srcPitch,
+			     r300GartOffsetFromVirtual(rmesa, texImage->Data),
+			     dstPitch, t->bufAddr, 0, 0, 0, 0, width, height);
+	} else {
+		/* Data not in GART memory, or bad pitch: stage it row by row. */
+		for (done = 0; done < height;) {
+			struct r300_dma_region region;
+			int lines =
+			    MIN2(height - done, RADEON_BUFFER_SIZE / dstPitch);
+			int src_pitch;
+			char *tex;
+
+			src_pitch = texImage->RowStride * texFormat->TexelBytes;
+
+			tex = (char *)texImage->Data + done * src_pitch;
+
+			memset(&region, 0, sizeof(region));
+			r300AllocDmaRegion(rmesa, &region, lines * dstPitch,
+					   1024);
+
+			/* Copy texdata to dma, repitching rows when needed: */
+			if (RADEON_DEBUG & DEBUG_TEXTURE)
+				fprintf(stderr,
+					"%s: src_pitch %d dst_pitch %d\n",
+					__FUNCTION__, src_pitch, dstPitch);
+
+			if (src_pitch == dstPitch) {
+				memcpy(region.address + region.start, tex,
+				       lines * src_pitch);
+			} else {
+				char *buf = region.address + region.start;
+				int i;
+				for (i = 0; i < lines; i++) {
+					memcpy(buf, tex, src_pitch);
+					buf += dstPitch;
+					tex += src_pitch;
+				}
+			}
+
+			r300EmitWait(rmesa, R300_WAIT_3D);
+
+			/* Blit this band to the texture, 'done' rows down */
+			r300EmitBlit(rmesa,
+				     blit_format,
+				     dstPitch, GET_START(&region),
+				     dstPitch | (t->tile_bits >> 16),
+				     t->bufAddr, 0, 0, 0, done, width, lines);
+
+			r300EmitWait(rmesa, R300_WAIT_2D);
+#ifdef USER_BUFFERS
+			r300_mem_use(rmesa, region.buf->id);
+#endif
+
+			r300ReleaseDmaRegion(rmesa, &region, __FUNCTION__);
+			done += lines;
+		}
+	}
+}
+
+/**
+ * Upload hardware mipmap level \a hwlevel of face \a face of texture \a t,
+ * via the blit helpers (rectangle / client storage) or DRM_RADEON_TEXTURE.
+ */
+static void r300UploadSubImage(r300ContextPtr rmesa, r300TexObjPtr t,
+			       GLint hwlevel,
+			       GLint x, GLint y, GLint width, GLint height,
+			       GLuint face)
+{
+	struct gl_texture_image *texImage = NULL;
+	GLuint offset;
+	GLint imageWidth, imageHeight;
+	GLint ret;
+	drm_radeon_texture_t tex;
+	drm_radeon_tex_image_t tmp;
+	const int level = hwlevel + t->base.firstLevel;	/* Mesa level index */
+
+	if (RADEON_DEBUG & DEBUG_TEXTURE) {
+		fprintf(stderr,
+			"%s( %p, %p ) level/width/height/face = %d/%d/%d/%u\n",
+			__FUNCTION__, (void *)t, (void *)t->base.tObj, level,
+			width, height, face);
+	}
+
+	ASSERT(face < 6);
+
+	/* Ensure we have a valid texture to upload */
+	if ((hwlevel < 0) || (hwlevel >= RADEON_MAX_TEXTURE_LEVELS)) {
+		_mesa_problem(NULL, "bad texture level in %s", __FUNCTION__);
+		return;
+	}
+
+	texImage = t->base.tObj->Image[face][level];
+
+	if (!texImage) {
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "%s: texImage %d is NULL!\n",
+				__FUNCTION__, level);
+		return;
+	}
+	if (!texImage->Data) {
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "%s: image data is NULL!\n",
+				__FUNCTION__);
+		return;
+	}
+
+	if (t->base.tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+		assert(level == 0);
+		assert(hwlevel == 0);
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "%s: image data is rectangular\n",
+				__FUNCTION__);
+		r300UploadRectSubImage(rmesa, t, texImage, x, y, width, height);
+		return;
+	} else if (texImage->IsClientData) {
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr,
+				"%s: image data is in GART client storage\n",
+				__FUNCTION__);
+		r300UploadGARTClientSubImage(rmesa, t, texImage, hwlevel, x, y,
+					     width, height);
+		return;
+	} else if (RADEON_DEBUG & DEBUG_TEXTURE)
+		fprintf(stderr, "%s: image data is in normal memory\n",
+			__FUNCTION__);
+
+	imageWidth = texImage->Width;
+	imageHeight = texImage->Height;
+
+	offset = t->bufAddr + t->base.totalSize / 6 * face;	/* face 0 starts at bufAddr */
+
+	if (RADEON_DEBUG & (DEBUG_TEXTURE | DEBUG_IOCTL)) {
+		GLint imageX = 0;
+		GLint imageY = 0;
+		GLint blitX = t->image[face][hwlevel].x;
+		GLint blitY = t->image[face][hwlevel].y;
+		GLint blitWidth = t->image[face][hwlevel].width;
+		GLint blitHeight = t->image[face][hwlevel].height;
+		fprintf(stderr, "   upload image: %d,%d at %d,%d\n",
+			imageWidth, imageHeight, imageX, imageY);
+		fprintf(stderr, "   upload  blit: %d,%d at %d,%d\n",
+			blitWidth, blitHeight, blitX, blitY);
+		fprintf(stderr, "       blit ofs: 0x%07x level: %d/%d\n",
+			(GLuint) offset, hwlevel, level);
+	}
+
+	t->image[face][hwlevel].data = texImage->Data;
+
+	/* Init the DRM_RADEON_TEXTURE command / drm_radeon_texture_t struct.
+	 * NOTE: we always use a 1KB-wide blit and I8 texture format.
+	 * We used to use 1, 2 and 4-byte texels and used to use the texture
+	 * width to dictate the blit width - but that won't work for compressed
+	 * textures. (Brian)
+	 * NOTE: can't do that with texture tiling. (sroland)
+	 */
+	tex.offset = offset;
+	tex.image = &tmp;
+	/* copy (x,y,width,height,data) */
+	memcpy(&tmp, &t->image[face][hwlevel], sizeof(tmp));
+
+	if (texImage->TexFormat->TexelBytes > 4) {
+		const int log2TexelBytes =
+		    (3 + (texImage->TexFormat->TexelBytes >> 4));	/* 8 -> 3, 16 -> 4 */
+		tex.format = RADEON_TXFORMAT_I8;	/* any 1-byte texel format */
+		tex.pitch =
+		    MAX2((texImage->Width * texImage->TexFormat->TexelBytes) /
+			 64, 1);
+		tex.height = imageHeight;
+		tex.width = imageWidth << log2TexelBytes;
+		tex.offset += (tmp.x << log2TexelBytes) & ~1023;
+		tmp.x = tmp.x % (1024 >> log2TexelBytes);
+		tmp.width = tmp.width << log2TexelBytes;
+	} else if (texImage->TexFormat->TexelBytes) {
+		/* use multi-byte upload scheme */
+		tex.height = imageHeight;
+		tex.width = imageWidth;
+		switch (texImage->TexFormat->TexelBytes) {	/* NOTE(review): no default; 3-byte texels would leave tex.format unset */
+		case 1:
+			tex.format = RADEON_TXFORMAT_I8;
+			break;
+		case 2:
+			tex.format = RADEON_TXFORMAT_AI88;
+			break;
+		case 4:
+			tex.format = RADEON_TXFORMAT_ARGB8888;
+			break;
+		}
+		tex.pitch =
+		    MAX2((texImage->Width * texImage->TexFormat->TexelBytes) /
+			 64, 1);
+		tex.offset += tmp.x & ~1023;
+		tmp.x = tmp.x % 1024;
+
+		if (t->tile_bits & R300_TXO_MICRO_TILE) {
+			/* need something like "tiled coordinates" ? */
+			tmp.y = tmp.x / (tex.pitch * 128) * 2;
+			tmp.x =
+			    tmp.x % (tex.pitch * 128) / 2 /
+			    texImage->TexFormat->TexelBytes;
+			tex.pitch |= RADEON_DST_TILE_MICRO >> 22;
+		} else {
+			tmp.x = tmp.x >> (texImage->TexFormat->TexelBytes >> 1);
+		}
+#if 1
+		if ((t->tile_bits & R300_TXO_MACRO_TILE) &&
+		    (texImage->Width * texImage->TexFormat->TexelBytes >= 256)
+		    && ((!(t->tile_bits & R300_TXO_MICRO_TILE)
+			 && (texImage->Height >= 8))
+			|| (texImage->Height >= 16))) {
+			/* weird: R200 disables macro tiling if mip width is smaller than 256 bytes,
+			   OR if height is smaller than 8 automatically, but if micro tiling is active
+			   the limit is height 16 instead ? */
+			tex.pitch |= RADEON_DST_TILE_MACRO >> 22;
+		}
+#endif
+	} else {
+		/* In case of for instance 8x8 texture (2x2 dxt blocks),
+		   padding after the first two blocks is needed (only
+		   with dxt1 since 2 dxt3/dxt5 blocks already use 32 Byte). */
+		/* set tex.height to 1/4 since 1 "macropixel" (dxt-block)
+		   has 4 real pixels. Needed so the kernel module reads
+		   the right amount of data. */
+		tex.format = RADEON_TXFORMAT_I8;	/* any 1-byte texel format */
+		tex.pitch = (R300_BLIT_WIDTH_BYTES / 64);
+		tex.height = (imageHeight + 3) / 4;
+		tex.width = (imageWidth + 3) / 4;
+		if ((t->format & R300_TX_FORMAT_DXT1) == R300_TX_FORMAT_DXT1) {
+			tex.width *= 8;
+		} else {
+			tex.width *= 16;
+		}
+	}
+
+	LOCK_HARDWARE(&rmesa->radeon);
+	do {
+		ret =
+		    drmCommandWriteRead(rmesa->radeon.dri.fd,
+					DRM_RADEON_TEXTURE, &tex,
+					sizeof(drm_radeon_texture_t));
+		if (ret) {	/* NOTE(review): taken for any error, not only -EAGAIN */
+			if (RADEON_DEBUG & DEBUG_IOCTL)
+				fprintf(stderr,
+					"DRM_RADEON_TEXTURE:  again!\n");
+			usleep(1);
+		}
+	} while (ret == -EAGAIN);
+
+	UNLOCK_HARDWARE(&rmesa->radeon);
+
+	if (ret) {
+		fprintf(stderr, "DRM_RADEON_TEXTURE: return = %d\n", ret);
+		fprintf(stderr, "   offset=0x%08x\n", offset);
+		fprintf(stderr, "   image width=%d height=%d\n",
+			imageWidth, imageHeight);
+		fprintf(stderr, "    blit width=%d height=%d data=%p\n",
+			t->image[face][hwlevel].width,
+			t->image[face][hwlevel].height,
+			t->image[face][hwlevel].data);
+		_mesa_exit(-1);
+	}
+}
+
+/**
+ * Upload the texture images associated with texture \a t.  This might
+ * require the allocation of texture memory.
+ *
+ * \param rmesa Context pointer
+ * \param t Texture to be uploaded; NULL is tolerated and is a no-op
+ * \param face Cube map face to be uploaded.  Zero for non-cube maps.
+ * \return 0 on success or when nothing had to be done, -1 when no texture
+ *         heap could hold the texture.
+ */
+int r300UploadTexImages(r300ContextPtr rmesa, r300TexObjPtr t, GLuint face)
+{
+	int numLevels;
+
+	/* Check for NULL before t is dereferenced anywhere below. */
+	if (!t || t->image_override)
+		return 0;
+
+	numLevels = t->base.lastLevel - t->base.firstLevel + 1;
+
+	if (RADEON_DEBUG & (DEBUG_TEXTURE | DEBUG_IOCTL)) {
+		fprintf(stderr, "%s( %p, %p ) sz=%d lvls=%d-%d\n", __FUNCTION__,
+			(void *)rmesa->radeon.glCtx, (void *)t->base.tObj,
+			t->base.totalSize, t->base.firstLevel,
+			t->base.lastLevel);
+	}
+
+	if (t->base.totalSize == 0)
+		return 0;
+
+	if (RADEON_DEBUG & DEBUG_SYNC) {
+		fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
+		radeonFinish(rmesa->radeon.glCtx);
+	}
+
+	LOCK_HARDWARE(&rmesa->radeon);
+
+	if (t->base.memBlock == NULL) {
+		int heap;
+
+		heap = driAllocateTexture(rmesa->texture_heaps, rmesa->nr_heaps,
+					  (driTextureObject *) t);
+		if (heap == -1) {
+			UNLOCK_HARDWARE(&rmesa->radeon);
+			return -1;
+		}
+
+		/* Set the base offset of the texture image */
+		t->bufAddr = rmesa->radeon.radeonScreen->texOffset[heap]
+		    + t->base.memBlock->ofs;
+		t->offset = t->bufAddr;
+
+		if (!(t->base.tObj->Image[0][0]->IsClientData)) {
+			/* hope it's safe to add that here... */
+			t->offset |= t->tile_bits;
+		}
+
+		/* Mark this texobj as dirty on all units: */
+		t->dirty_state = TEX_ALL;
+	}
+
+	/* Let the world know we've used this memory recently. */
+	driUpdateTextureLRU((driTextureObject *) t);
+	UNLOCK_HARDWARE(&rmesa->radeon);
+
+	/* Upload any images that are new */
+	if (t->base.dirty_images[face]) {
+		int i;
+		for (i = 0; i < numLevels; i++) {
+			const int level = i + t->base.firstLevel;
+			if (t->base.dirty_images[face] & (1 << level)) {
+				r300UploadSubImage(rmesa, t, i, 0, 0,
+						   t->image[face][i].width,
+						   t->image[face][i].height,
+						   face);
+			}
+		}
+		t->base.dirty_images[face] = 0;
+	}
+
+	if (RADEON_DEBUG & DEBUG_SYNC) {
+		fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
+		radeonFinish(rmesa->radeon.glCtx);
+	}
+
+	return 0;
+}
diff --git a/r300/r300_texstate.c b/r300/r300_texstate.c
new file mode 100644
index 0000000..8203189
--- /dev/null
+++ b/r300/r300_texstate.c
@@ -0,0 +1,620 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ *
+ * \todo Enable R300 texture tiling code?
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "context.h"
+#include "macros.h"
+#include "texformat.h"
+#include "teximage.h"
+#include "texobj.h"
+#include "enums.h"
+
+#include "r300_context.h"
+#include "r300_state.h"
+#include "r300_ioctl.h"
+#include "radeon_ioctl.h"
+#include "r300_tex.h"
+#include "r300_reg.h"
+
+#define VALID_FORMAT(f) ( ((f) <= MESA_FORMAT_RGBA_DXT5			\
+			   || ((f) >= MESA_FORMAT_RGBA_FLOAT32 &&	\
+			       (f) <= MESA_FORMAT_INTENSITY_FLOAT16))	\
+			  && tx_table_le[f].flag )	/* in range and flagged; always reads the LE table */
+
+#define _ASSIGN(entry, format)				\
+	[ MESA_FORMAT_ ## entry ] = { format, 0, 1}	/* { format, filter = 0, flag = 1 } */
+
+/*
+ * Note that the _REV formats are the same as the non-REV formats.  This is
+ * because the REV and non-REV formats are identical as a byte string, but
+ * differ when accessed as 16-bit or 32-bit words depending on the endianness of
+ * the host.  Since the textures are transferred to the R300 as a byte string
+ * (i.e. without any byte-swapping), the R300 sees the REV and non-REV formats
+ * identically.  -- paulus
+ */
+
+static const struct tx_table {
+	GLuint format, filter, flag;	/* flag != 0 marks an entry filled in by _ASSIGN */
+} tx_table_be[] = {	/* indexed by MESA_FORMAT_*; used when the host is not little-endian */
+	/* *INDENT-OFF* */
+	_ASSIGN(RGBA8888, R300_EASY_TX_FORMAT(Z, Y, X, W, W8Z8Y8X8)),
+	_ASSIGN(RGBA8888_REV, R300_EASY_TX_FORMAT(Y, Z, W, X, W8Z8Y8X8)),
+	_ASSIGN(ARGB8888, R300_EASY_TX_FORMAT(W, Z, Y, X, W8Z8Y8X8)),
+	_ASSIGN(ARGB8888_REV, R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8)),
+	_ASSIGN(RGB888, 0xffffffff),	/* NOTE(review): the LE table maps RGB888 to W8Z8Y8X8 -- confirm this placeholder */
+	_ASSIGN(RGB565, R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5)),
+	_ASSIGN(RGB565_REV, R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5)),
+	_ASSIGN(ARGB4444, R300_EASY_TX_FORMAT(X, Y, Z, W, W4Z4Y4X4)),
+	_ASSIGN(ARGB4444_REV, R300_EASY_TX_FORMAT(X, Y, Z, W, W4Z4Y4X4)),
+	_ASSIGN(ARGB1555, R300_EASY_TX_FORMAT(X, Y, Z, W, W1Z5Y5X5)),
+	_ASSIGN(ARGB1555_REV, R300_EASY_TX_FORMAT(X, Y, Z, W, W1Z5Y5X5)),
+	_ASSIGN(AL88, R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8)),
+	_ASSIGN(AL88_REV, R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8)),
+	_ASSIGN(RGB332, R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z3Y3X2)),
+	_ASSIGN(A8, R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X8)),
+	_ASSIGN(L8, R300_EASY_TX_FORMAT(X, X, X, ONE, X8)),
+	_ASSIGN(I8, R300_EASY_TX_FORMAT(X, X, X, X, X8)),
+	_ASSIGN(CI8, R300_EASY_TX_FORMAT(X, X, X, X, X8)),
+	_ASSIGN(YCBCR, R300_EASY_TX_FORMAT(X, Y, Z, ONE, G8R8_G8B8)|R300_TX_FORMAT_YUV_MODE ),
+	_ASSIGN(YCBCR_REV, R300_EASY_TX_FORMAT(X, Y, Z, ONE, G8R8_G8B8)|R300_TX_FORMAT_YUV_MODE),
+	_ASSIGN(RGB_DXT1, R300_EASY_TX_FORMAT(X, Y, Z, ONE, DXT1)),
+	_ASSIGN(RGBA_DXT1, R300_EASY_TX_FORMAT(X, Y, Z, W, DXT1)),
+	_ASSIGN(RGBA_DXT3, R300_EASY_TX_FORMAT(X, Y, Z, W, DXT3)),
+	_ASSIGN(RGBA_DXT5, R300_EASY_TX_FORMAT(Y, Z, W, X, DXT5)),
+	_ASSIGN(RGBA_FLOAT32, R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R32G32B32A32)),
+	_ASSIGN(RGBA_FLOAT16, R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16)),
+	_ASSIGN(RGB_FLOAT32, 0xffffffff),
+	_ASSIGN(RGB_FLOAT16, 0xffffffff),
+	_ASSIGN(ALPHA_FLOAT32, R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, FL_I32)),
+	_ASSIGN(ALPHA_FLOAT16, R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, FL_I16)),
+	_ASSIGN(LUMINANCE_FLOAT32, R300_EASY_TX_FORMAT(X, X, X, ONE, FL_I32)),
+	_ASSIGN(LUMINANCE_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, ONE, FL_I16)),
+	_ASSIGN(LUMINANCE_ALPHA_FLOAT32, R300_EASY_TX_FORMAT(X, X, X, Y, FL_I32A32)),
+	_ASSIGN(LUMINANCE_ALPHA_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, Y, FL_I16A16)),
+	_ASSIGN(INTENSITY_FLOAT32, R300_EASY_TX_FORMAT(X, X, X, X, FL_I32)),
+	_ASSIGN(INTENSITY_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, X, FL_I16)),
+	/* *INDENT-ON* */
+};
+
+static const struct tx_table tx_table_le[] = {	/* little-endian hosts; also consulted by VALID_FORMAT */
+	/* *INDENT-OFF* */
+	_ASSIGN(RGBA8888, R300_EASY_TX_FORMAT(Y, Z, W, X, W8Z8Y8X8)),
+	_ASSIGN(RGBA8888_REV, R300_EASY_TX_FORMAT(Z, Y, X, W, W8Z8Y8X8)),
+	_ASSIGN(ARGB8888, R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8)),
+	_ASSIGN(ARGB8888_REV, R300_EASY_TX_FORMAT(W, Z, Y, X, W8Z8Y8X8)),
+	_ASSIGN(RGB888, R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8)),
+	_ASSIGN(RGB565, R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5)),
+	_ASSIGN(RGB565_REV, R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5)),
+	_ASSIGN(ARGB4444, R300_EASY_TX_FORMAT(X, Y, Z, W, W4Z4Y4X4)),
+	_ASSIGN(ARGB4444_REV, R300_EASY_TX_FORMAT(X, Y, Z, W, W4Z4Y4X4)),
+	_ASSIGN(ARGB1555, R300_EASY_TX_FORMAT(X, Y, Z, W, W1Z5Y5X5)),
+	_ASSIGN(ARGB1555_REV, R300_EASY_TX_FORMAT(X, Y, Z, W, W1Z5Y5X5)),
+	_ASSIGN(AL88, R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8)),
+	_ASSIGN(AL88_REV, R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8)),
+	_ASSIGN(RGB332, R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z3Y3X2)),
+	_ASSIGN(A8, R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X8)),
+	_ASSIGN(L8, R300_EASY_TX_FORMAT(X, X, X, ONE, X8)),
+	_ASSIGN(I8, R300_EASY_TX_FORMAT(X, X, X, X, X8)),
+	_ASSIGN(CI8, R300_EASY_TX_FORMAT(X, X, X, X, X8)),
+	_ASSIGN(YCBCR, R300_EASY_TX_FORMAT(X, Y, Z, ONE, G8R8_G8B8)|R300_TX_FORMAT_YUV_MODE ),
+	_ASSIGN(YCBCR_REV, R300_EASY_TX_FORMAT(X, Y, Z, ONE, G8R8_G8B8)|R300_TX_FORMAT_YUV_MODE),
+	_ASSIGN(RGB_DXT1, R300_EASY_TX_FORMAT(X, Y, Z, ONE, DXT1)),
+	_ASSIGN(RGBA_DXT1, R300_EASY_TX_FORMAT(X, Y, Z, W, DXT1)),
+	_ASSIGN(RGBA_DXT3, R300_EASY_TX_FORMAT(X, Y, Z, W, DXT3)),
+	_ASSIGN(RGBA_DXT5, R300_EASY_TX_FORMAT(Y, Z, W, X, DXT5)),
+	_ASSIGN(RGBA_FLOAT32, R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R32G32B32A32)),
+	_ASSIGN(RGBA_FLOAT16, R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16)),
+	_ASSIGN(RGB_FLOAT32, 0xffffffff),	/* NOTE(review): 0xffffffff looks like a "no hardware format" placeholder -- confirm */
+	_ASSIGN(RGB_FLOAT16, 0xffffffff),
+	_ASSIGN(ALPHA_FLOAT32, R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, FL_I32)),
+	_ASSIGN(ALPHA_FLOAT16, R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, FL_I16)),
+	_ASSIGN(LUMINANCE_FLOAT32, R300_EASY_TX_FORMAT(X, X, X, ONE, FL_I32)),
+	_ASSIGN(LUMINANCE_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, ONE, FL_I16)),
+	_ASSIGN(LUMINANCE_ALPHA_FLOAT32, R300_EASY_TX_FORMAT(X, X, X, Y, FL_I32A32)),
+	_ASSIGN(LUMINANCE_ALPHA_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, Y, FL_I16A16)),
+	_ASSIGN(INTENSITY_FLOAT32, R300_EASY_TX_FORMAT(X, X, X, X, FL_I32)),
+	_ASSIGN(INTENSITY_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, X, FL_I16)),
+	/* *INDENT-ON* */
+};
+
+#undef _ASSIGN
+
+/**
+ * This function computes the number of bytes of storage needed for
+ * the given texture object (all mipmap levels, all cube faces).
+ * The \c image[face][level].x/y/width/height parameters for upload/blitting
+ * are computed here.  \c format, \c size, \c pitch, etc. are set here too;
+ * an unsupported texture format is reported and nothing else is touched.
+ *
+ * \param rmesa Context pointer
+ * \param tObj GL texture object whose images are to be posted to
+ *                 hardware state.
+ */
+static void r300SetTexImages(r300ContextPtr rmesa,
+			     struct gl_texture_object *tObj)
+{
+	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+	const struct gl_texture_image *baseImage =
+	    tObj->Image[0][tObj->BaseLevel];
+	GLint curOffset, blitWidth;
+	GLint i, texelBytes;
+	GLint numLevels;
+	GLint log2Width, log2Height, log2Depth;	/* log2Depth is assigned but never read */
+
+	/* Set the hardware texture format
+	 */
+	if (!t->image_override && VALID_FORMAT(baseImage->TexFormat->MesaFormat)) {
+		if (_mesa_little_endian()) {
+			t->format =
+			    tx_table_le[baseImage->TexFormat->MesaFormat].
+			    format;
+			t->filter |=
+			    tx_table_le[baseImage->TexFormat->MesaFormat].
+			    filter;
+		} else {
+			t->format =
+			    tx_table_be[baseImage->TexFormat->MesaFormat].
+			    format;
+			t->filter |=
+			    tx_table_be[baseImage->TexFormat->MesaFormat].
+			    filter;
+		}
+	} else if (!t->image_override) {
+		_mesa_problem(NULL, "unexpected texture format in %s",
+			      __FUNCTION__);
+		return;
+	}
+
+	texelBytes = baseImage->TexFormat->TexelBytes;
+
+	/* Compute which mipmap levels we really want to send to the hardware.
+	 */
+	driCalculateTextureFirstLastLevel((driTextureObject *) t);
+	log2Width = tObj->Image[0][t->base.firstLevel]->WidthLog2;
+	log2Height = tObj->Image[0][t->base.firstLevel]->HeightLog2;
+	log2Depth = tObj->Image[0][t->base.firstLevel]->DepthLog2;
+
+	numLevels = t->base.lastLevel - t->base.firstLevel + 1;
+
+	assert(numLevels <= RADEON_MAX_TEXTURE_LEVELS);
+
+	/* Calculate mipmap offsets and dimensions for blitting (uploading)
+	 * The idea is that we lay out the mipmap levels within a block of
+	 * memory organized as a rectangle of width BLIT_WIDTH_BYTES.
+	 */
+	curOffset = 0;
+	blitWidth = R300_BLIT_WIDTH_BYTES;
+	t->tile_bits = 0;
+
+	/* figure out if this texture is suitable for tiling. */
+#if 0				/* Disabled for now */
+	if (texelBytes) {
+		if ((tObj->Target != GL_TEXTURE_RECTANGLE_NV) &&
+		    /* texrect might be able to use micro tiling too in theory? */
+		    (baseImage->Height > 1)) {
+
+			/* allow 32 (bytes) x 1 mip (which will use two times the space
+			   the non-tiled version would use) max if base texture is large enough */
+			if ((numLevels == 1) ||
+			    (((baseImage->Width * texelBytes /
+			       baseImage->Height) <= 32)
+			     && (baseImage->Width * texelBytes > 64))
+			    ||
+			    ((baseImage->Width * texelBytes /
+			      baseImage->Height) <= 16)) {
+				t->tile_bits |= R300_TXO_MICRO_TILE;
+			}
+		}
+
+		if (tObj->Target != GL_TEXTURE_RECTANGLE_NV) {
+			/* we can set macro tiling even for small textures, they will be untiled anyway */
+			t->tile_bits |= R300_TXO_MACRO_TILE;
+		}
+	}
+#endif
+
+	for (i = 0; i < numLevels; i++) {
+		const struct gl_texture_image *texImage;
+		GLuint size;
+
+		texImage = tObj->Image[0][i + t->base.firstLevel];
+		if (!texImage)
+			break;	/* layout stops here; numLevels itself is left unchanged */
+
+		/* find image size in bytes */
+		if (texImage->IsCompressed) {
+			if ((t->format & R300_TX_FORMAT_DXT1) ==
+			    R300_TX_FORMAT_DXT1) {
+				// fprintf(stderr,"DXT 1 %d %08X\n", texImage->Width, t->format);
+				if ((texImage->Width + 3) < 8)	/* width one block */
+					size = texImage->CompressedSize * 4;
+				else if ((texImage->Width + 3) < 16)
+					size = texImage->CompressedSize * 2;
+				else
+					size = texImage->CompressedSize;
+			} else {
+				/* DXT3/5, 16 bytes per block */
+				WARN_ONCE
+				    ("DXT 3/5 suffers from multitexturing problems!\n");
+				// fprintf(stderr,"DXT 3/5 %d\n", texImage->Width);
+				if ((texImage->Width + 3) < 8)
+					size = texImage->CompressedSize * 2;
+				else
+					size = texImage->CompressedSize;
+			}
+		} else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+			/* rows are padded to a multiple of 64 bytes */
+			size =
+			    ((texImage->Width * texelBytes +
+			      63) & ~63) * texImage->Height;
+			blitWidth = 64 / texelBytes;
+		} else if (t->tile_bits & R300_TXO_MICRO_TILE) {
+			/* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
+			   though the actual offset may be different (if texture is less than
+			   32 bytes width) to the untiled case */
+			int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
+			size =
+			    (w * ((texImage->Height + 1) / 2)) *
+			    texImage->Depth;
+			blitWidth = MAX2(texImage->Width, 64 / texelBytes);
+		} else {
+			int w = (texImage->Width * texelBytes + 31) & ~31;
+			size = w * texImage->Height * texImage->Depth;
+			blitWidth = MAX2(texImage->Width, 64 / texelBytes);
+		}
+		assert(size > 0);
+
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "w=%d h=%d d=%d tb=%d intFormat=%d\n",
+				texImage->Width, texImage->Height,
+				texImage->Depth,
+				texImage->TexFormat->TexelBytes,
+				texImage->InternalFormat);
+
+		/* Align to 32-byte offset.  It is faster to do this unconditionally
+		 * (no branch penalty).
+		 */
+
+		curOffset = (curOffset + 0x1f) & ~0x1f;
+
+		if (texelBytes) {
+			/* fix x and y coords up later together with offset */
+			t->image[0][i].x = curOffset;
+			t->image[0][i].y = 0;
+			t->image[0][i].width =
+			    MIN2(size / texelBytes, blitWidth);
+			t->image[0][i].height =
+			    (size / texelBytes) / t->image[0][i].width;
+		} else {
+			/* texelBytes == 0: position inside a BLIT_WIDTH_BYTES-wide rectangle */
+			t->image[0][i].x = curOffset % R300_BLIT_WIDTH_BYTES;
+			t->image[0][i].y = curOffset / R300_BLIT_WIDTH_BYTES;
+			t->image[0][i].width =
+			    MIN2(size, R300_BLIT_WIDTH_BYTES);
+			t->image[0][i].height = size / t->image[0][i].width;
+		}
+
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr,
+				"level %d: %dx%d x=%d y=%d w=%d h=%d size=%d at %d\n",
+				i, texImage->Width, texImage->Height,
+				t->image[0][i].x, t->image[0][i].y,
+				t->image[0][i].width, t->image[0][i].height,
+				size, curOffset);
+
+		curOffset += size;
+	}
+
+	/* Align the total size of texture memory block.
+	 */
+	t->base.totalSize =
+	    (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
+
+	/* Setup remaining cube face blits, if needed */
+	if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
+		GLuint face;
+		for (face = 1; face < 6; face++) {
+			for (i = 0; i < numLevels; i++) {
+				t->image[face][i].x = t->image[0][i].x;
+				t->image[face][i].y = t->image[0][i].y;
+				t->image[face][i].width = t->image[0][i].width;
+				t->image[face][i].height =
+				    t->image[0][i].height;
+			}
+		}
+		t->base.totalSize *= 6;	/* total texmem needed */
+	}
+
+	if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
+		ASSERT(log2Width == log2Height);
+		t->format |= R300_TX_FORMAT_CUBIC_MAP;
+	}
+
+	/* Pack (width - 1), (height - 1) and (numLevels - 1) of the first level */
+	t->size =
+	    (((tObj->Image[0][t->base.firstLevel]->Width -
+	       1) << R300_TX_WIDTHMASK_SHIFT)
+	     | ((tObj->Image[0][t->base.firstLevel]->Height - 1) <<
+		R300_TX_HEIGHTMASK_SHIFT))
+	    | ((numLevels - 1) << R300_TX_MAX_MIP_LEVEL_SHIFT);
+
+	/* Only need to round to nearest 32 for textures, but the blitter
+	 * requires 64-byte aligned pitches, and we may/may not need the
+	 * blitter.   NPOT only!
+	 */
+	if (baseImage->IsCompressed) {
+		t->pitch =
+		    (tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
+	} else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+		unsigned int align = blitWidth - 1;
+		t->pitch = ((tObj->Image[0][t->base.firstLevel]->Width *
+			     texelBytes) + 63) & ~(63);
+		t->size |= R300_TX_SIZE_TXPITCH_EN;
+		if (!t->image_override)
+			t->pitch_reg =
+			    (((tObj->Image[0][t->base.firstLevel]->Width) +
+			      align) & ~align) - 1;
+	} else {
+		t->pitch =
+		    ((tObj->Image[0][t->base.firstLevel]->Width *
+		      texelBytes) + 63) & ~(63);
+	}
+
+	t->dirty_state = TEX_ALL;
+
+	/* FYI: r300UploadTexImages( rmesa, t ) used to be called here */
+}
+
+/* ================================================================
+ * Texture unit state management
+ */
+
+static GLboolean r300EnableTexture2D(GLcontext * ctx, int unit)
+{
+	/* Validate the 1D/2D texture bound to `unit`: if its images are dirty,
+	 * fire pending vertices, lay the images out again and upload them.
+	 * GL_FALSE is returned when the object then has neither a texture
+	 * memory block nor an image override. */
+	struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
+	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+
+	ASSERT(tObj->Target == GL_TEXTURE_2D || tObj->Target == GL_TEXTURE_1D);
+	if (!t->base.dirty_images[0])
+		return GL_TRUE;
+
+	R300_FIREVERTICES(rmesa);
+	r300SetTexImages(rmesa, tObj);
+	r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
+
+	return (t->base.memBlock || t->image_override) ? GL_TRUE : GL_FALSE;
+}
+
+static GLboolean r300EnableTexture3D(GLcontext * ctx, int unit)
+{
+	/* Validate the 3D texture bound to `unit`.  Minification filters other
+	 * than GL_NEAREST/GL_LINEAR are rejected up front; dirty images are
+	 * laid out and uploaded again, and GL_FALSE is returned if no texture
+	 * memory block is attached afterwards. */
+	struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
+	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+
+	ASSERT(tObj->Target == GL_TEXTURE_3D);
+
+	/* r300 does not support mipmaps for 3D textures. */
+	if (tObj->MinFilter != GL_NEAREST && tObj->MinFilter != GL_LINEAR)
+		return GL_FALSE;
+
+	if (!t->base.dirty_images[0])
+		return GL_TRUE;
+
+	R300_FIREVERTICES(rmesa);
+	r300SetTexImages(rmesa, tObj);
+	r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
+	return t->base.memBlock ? GL_TRUE : GL_FALSE;
+}
+
+static GLboolean r300EnableTextureCube(GLcontext * ctx, int unit)
+{
+	/* Validate the cube map bound to `unit`.  The memory layout is
+	 * computed once for the whole object if any face is dirty, then each
+	 * dirty face is uploaded on its own.  Returns GL_FALSE when the
+	 * object has no texture memory block afterwards. */
+	struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
+	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	GLboolean anyDirty = GL_FALSE;
+	GLuint face;
+
+	ASSERT(tObj->Target == GL_TEXTURE_CUBE_MAP);
+
+	for (face = 0; face < 6; face++) {
+		if (t->base.dirty_images[face])
+			anyDirty = GL_TRUE;
+	}
+
+	if (anyDirty) {
+		/* flush, then lay out memory space once for all faces */
+		R300_FIREVERTICES(rmesa);
+		r300SetTexImages(rmesa, tObj);
+	}
+
+	/* upload (per face) */
+	for (face = 0; face < 6; face++) {
+		if (!t->base.dirty_images[face])
+			continue;
+		r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData,
+				    face);
+	}
+	/* no memory block: texmem alloc failed, use s/w fallback */
+	return t->base.memBlock ? GL_TRUE : GL_FALSE;
+}
+
+static GLboolean r300EnableTextureRect(GLcontext * ctx, int unit)
+{
+	/* Validate the rectangle texture bound to `unit`.  After re-uploading
+	 * dirty images this fails only if there is no texture memory block,
+	 * no image override, and GART client texturing is not preferred. */
+	struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
+	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+
+	ASSERT(tObj->Target == GL_TEXTURE_RECTANGLE_NV);
+	if (!t->base.dirty_images[0])
+		return GL_TRUE;
+
+	R300_FIREVERTICES(rmesa);
+	r300SetTexImages(rmesa, tObj);
+	r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
+
+	if (t->base.memBlock || t->image_override)
+		return GL_TRUE;
+	return rmesa->prefer_gart_client_texturing ? GL_TRUE : GL_FALSE;
+}
+
+static GLboolean r300UpdateTexture(GLcontext * ctx, int unit)
+{
+	/* Record the texture currently selected on `unit` as the one bound in
+	 * the driver's per-unit state.  Returns GL_FALSE for bordered textures
+	 * and for objects flagged with border_fallback. */
+	struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
+	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+
+	/* Fallback if there's a texture border */
+	if (tObj->Image[0][tObj->BaseLevel]->Border > 0)
+		return GL_FALSE;
+
+	/* Nothing to rebind when the unit already tracks this object. */
+	if (rmesa->state.texture.unit[unit].texobj == t)
+		return !t->border_fallback;
+
+	/* The old texture is no longer bound to this texture unit.
+	 * Mark it as such.
+	 */
+	if (rmesa->state.texture.unit[unit].texobj != NULL)
+		rmesa->state.texture.unit[unit].texobj->base.bound &=
+		    ~(1UL << unit);
+
+	/* Track the new object and flag this unit in its dirty state. */
+	rmesa->state.texture.unit[unit].texobj = t;
+	t->base.bound |= (1UL << unit);
+	t->dirty_state |= 1 << unit;
+	driUpdateTextureLRU((driTextureObject *) t);	/* XXX: should be locked! */
+
+	return !t->border_fallback;
+}
+
+void r300SetTexOffset(__DRIcontext *pDRICtx, GLint texname,
+		      unsigned long long offset, GLint depth, GLuint pitch)
+{
+	/* Mark texture `texname` as image-overridden (even when `offset` is 0)
+	 * and, for a non-zero `offset`, store the offset, pitch register and
+	 * tx_table_le format/filter bits chosen by `depth` (unlisted depths
+	 * are treated like 24).  Unknown texture names are ignored. */
+	r300ContextPtr rmesa =
+		(r300ContextPtr)((__DRIcontextPrivate*)pDRICtx->private)->driverPrivate;
+	struct gl_texture_object *tObj =
+		_mesa_lookup_texture(rmesa->radeon.glCtx, texname);
+	r300TexObjPtr t;
+	int idx, pitchDiv;
+
+	if (!tObj)
+		return;
+
+	t = (r300TexObjPtr) tObj->DriverData;
+	t->image_override = GL_TRUE;
+	if (!offset)
+		return;
+
+	switch (depth) {
+	case 16:
+		idx = 5;
+		pitchDiv = 2;
+		break;
+	case 32:
+		idx = 2;
+		pitchDiv = 4;
+		break;
+	case 24:
+	default:
+		idx = 4;
+		pitchDiv = 4;
+		break;
+	}
+	t->offset = offset;
+	t->pitch_reg = pitch;
+	t->pitch_reg /= pitchDiv;
+	t->pitch_reg--;
+	t->format = tx_table_le[idx].format;
+	t->filter |= tx_table_le[idx].filter;
+}
+
+static GLboolean r300UpdateTextureUnit(GLcontext * ctx, int unit)
+{
+	/* Validate the target really enabled on `unit` (rect, then 1D/2D, 3D,
+	 * cube) and rebind it through r300UpdateTexture.  A disabled unit
+	 * succeeds trivially; any other enabled target fails. */
+	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+	GLboolean ready;
+
+	if (!texUnit->_ReallyEnabled)
+		return GL_TRUE;
+	if (texUnit->_ReallyEnabled & TEXTURE_RECT_BIT)
+		ready = r300EnableTextureRect(ctx, unit);
+	else if (texUnit->_ReallyEnabled & (TEXTURE_1D_BIT | TEXTURE_2D_BIT))
+		ready = r300EnableTexture2D(ctx, unit);
+	else if (texUnit->_ReallyEnabled & TEXTURE_3D_BIT)
+		ready = r300EnableTexture3D(ctx, unit);
+	else if (texUnit->_ReallyEnabled & TEXTURE_CUBE_BIT)
+		ready = r300EnableTextureCube(ctx, unit);
+	else
+		return GL_FALSE;
+	return ready && r300UpdateTexture(ctx, unit);
+}
+
+void r300UpdateTextureState(GLcontext * ctx)
+{
+	/* Validate texture units 0..7.  A failing unit only produces a Mesa
+	 * warning; the remaining units are still processed. */
+	int unit;
+
+	for (unit = 0; unit < 8; unit++) {
+		if (r300UpdateTextureUnit(ctx, unit))
+			continue;
+		_mesa_warning(ctx, "failed to update texture state for unit %d.\n", unit);
+	}
+}
diff --git a/r300/r300_vertprog.c b/r300/r300_vertprog.c
new file mode 100644
index 0000000..1d90ade
--- /dev/null
+++ b/r300/r300_vertprog.c
@@ -0,0 +1,1305 @@
+/**************************************************************************
+
+Copyright (C) 2005 Aapo Tahkola.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Aapo Tahkola <aet@rasterburn.org>
+ */
+
+#include "glheader.h"
+#include "macros.h"
+#include "enums.h"
+#include "program.h"
+#include "shader/prog_instruction.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_statevars.h"
+#include "tnl/tnl.h"
+
+#include "r300_context.h"
+
+#if SWIZZLE_X != VSF_IN_COMPONENT_X || \
+    SWIZZLE_Y != VSF_IN_COMPONENT_Y || \
+    SWIZZLE_Z != VSF_IN_COMPONENT_Z || \
+    SWIZZLE_W != VSF_IN_COMPONENT_W || \
+    SWIZZLE_ZERO != VSF_IN_COMPONENT_ZERO || \
+    SWIZZLE_ONE != VSF_IN_COMPONENT_ONE || \
+    WRITEMASK_X != VSF_FLAG_X || \
+    WRITEMASK_Y != VSF_FLAG_Y || \
+    WRITEMASK_Z != VSF_FLAG_Z || \
+    WRITEMASK_W != VSF_FLAG_W
+#error Cannot change these!
+#endif
+
+#define SCALAR_FLAG (1UL<<31)	/* op_names[].ip flag: sources are scalar (1UL: 1<<31 overflows int) */
+#define FLAG_MASK (1UL<<31)
+#define OP_MASK	(0xf)		/* we are unlikely to have more than 15 */
+#define OPN(operator, ip) {#operator, OPCODE_##operator, ip}
+/* Per-opcode operand description; op_operands() searches it by opcode. */
+static struct {
+	char *name;
+	int opcode;
+	unsigned long ip;	/* number of input operands and flags */
+} op_names[] = {
+	/* *INDENT-OFF* */
+	OPN(ABS, 1),
+	OPN(ADD, 2),
+	OPN(ARL, 1 | SCALAR_FLAG),
+	OPN(DP3, 2),
+	OPN(DP4, 2),
+	OPN(DPH, 2),
+	OPN(DST, 2),
+	OPN(EX2, 1 | SCALAR_FLAG),
+	OPN(EXP, 1 | SCALAR_FLAG),
+	OPN(FLR, 1),
+	OPN(FRC, 1),
+	OPN(LG2, 1 | SCALAR_FLAG),
+	OPN(LIT, 1),
+	OPN(LOG, 1 | SCALAR_FLAG),
+	OPN(MAD, 3),
+	OPN(MAX, 2),
+	OPN(MIN, 2),
+	OPN(MOV, 1),
+	OPN(MUL, 2),
+	OPN(POW, 2 | SCALAR_FLAG),
+	OPN(RCP, 1 | SCALAR_FLAG),
+	OPN(RSQ, 1 | SCALAR_FLAG),
+	OPN(SGE, 2),
+	OPN(SLT, 2),
+	OPN(SUB, 2),
+	OPN(SWZ, 1),
+	OPN(XPD, 2),
+	OPN(RCC, 0),	//extra
+	OPN(PRINT, 0),
+	OPN(END, 0)
+	/* *INDENT-ON* */
+};
+
+#undef OPN
+
+int r300VertexProgUpdateParams(GLcontext * ctx,
+			       struct r300_vertex_program_cont *vp, float *dst)
+{
+	/* Copy the vertex program constants into `dst`, four floats per
+	 * parameter, and return the number of floats written.  NV programs
+	 * copy all MAX_NV_VERTEX_PROGRAM_PARAMS context parameters; other
+	 * programs copy their own parameter list and exit the process if it
+	 * needs more than VSF_MAX_FRAGMENT_LENGTH floats.  t_src_index()
+	 * returns parameter indices unchanged, so slot N must hold parameter
+	 * N: an unexpected parameter type is reported and zero-filled rather
+	 * than skipped, which would shift every later constant. */
+	struct gl_vertex_program *mesa_vp = &vp->mesa_program;
+	struct gl_program_parameter_list *paramList;
+	float *dst_o = dst;
+	int pi, c;
+
+	if (mesa_vp->IsNVProgram) {
+		_mesa_load_tracked_matrices(ctx);
+		for (pi = 0; pi < MAX_NV_VERTEX_PROGRAM_PARAMS; pi++)
+			for (c = 0; c < 4; c++)
+				*dst++ = ctx->VertexProgram.Parameters[pi][c];
+		return dst - dst_o;
+	}
+
+	assert(mesa_vp->Base.Parameters);
+	_mesa_load_state_parameters(ctx, mesa_vp->Base.Parameters);
+	paramList = mesa_vp->Base.Parameters;
+
+	if (paramList->NumParameters * 4 > VSF_MAX_FRAGMENT_LENGTH) {
+		fprintf(stderr, "%s:Params exhausted\n", __FUNCTION__);
+		_mesa_exit(-1);
+	}
+
+	for (pi = 0; pi < paramList->NumParameters; pi++) {
+		switch (paramList->Parameters[pi].Type) {
+		case PROGRAM_STATE_VAR:
+		case PROGRAM_NAMED_PARAM:
+		case PROGRAM_CONSTANT:
+			for (c = 0; c < 4; c++)
+				*dst++ = paramList->ParameterValues[pi][c];
+			break;
+		default:
+			_mesa_problem(NULL, "Bad param type in %s",
+				      __FUNCTION__);
+			for (c = 0; c < 4; c++)
+				*dst++ = 0.0f;
+			break;
+		}
+	}
+
+	return dst - dst_o;
+}
+
+static unsigned long t_dst_mask(GLuint mask)
+{
+	/* Keep only the VSF_FLAG_* bits; WRITEMASK_* uses the same encoding. */
+	return VSF_FLAG_ALL & mask;
+}
+
+static unsigned long t_dst_class(enum register_file file)
+{
+	/* Map a Mesa destination register file onto the matching
+	 * VSF_OUT_CLASS_* value.  Only temporaries, outputs and the address
+	 * register are handled; any other file prints a diagnostic and
+	 * calls _mesa_exit(-1). */
+	if (file == PROGRAM_TEMPORARY)
+		return VSF_OUT_CLASS_TMP;
+	if (file == PROGRAM_OUTPUT)
+		return VSF_OUT_CLASS_RESULT;
+	if (file == PROGRAM_ADDRESS)
+		return VSF_OUT_CLASS_ADDR;
+
+	/* Register files with no destination class here:
+	 *   PROGRAM_INPUT
+	 *   PROGRAM_LOCAL_PARAM
+	 *   PROGRAM_ENV_PARAM
+	 *   PROGRAM_NAMED_PARAM
+	 *   PROGRAM_STATE_VAR
+	 *   PROGRAM_WRITE_ONLY
+	 */
+	fprintf(stderr, "problem in %s", __FUNCTION__);
+	_mesa_exit(-1);
+	return -1;
+}
+
+static unsigned long t_dst_index(struct r300_vertex_program *vp,
+				 struct prog_dst_register *dst)
+{
+	/* Only outputs are remapped, through the vp->outputs[] table. */
+	if (dst->File != PROGRAM_OUTPUT)
+		return dst->Index;
+	return vp->outputs[dst->Index];
+}
+
+static unsigned long t_src_class(enum register_file file)
+{
+	/* Map a Mesa source register file onto the matching VSF_IN_CLASS_*
+	 * value; unsupported files print a diagnostic and call _mesa_exit(-1).
+	 * PROGRAM_CONSTANT is classed as a parameter as well, because
+	 * r300VertexProgUpdateParams() uploads constants in the same
+	 * parameter block as state variables and named parameters. */
+	switch (file) {
+	case PROGRAM_TEMPORARY:
+		return VSF_IN_CLASS_TMP;
+	case PROGRAM_INPUT:
+		return VSF_IN_CLASS_ATTR;
+	case PROGRAM_LOCAL_PARAM:
+	case PROGRAM_ENV_PARAM:
+	case PROGRAM_NAMED_PARAM:
+	case PROGRAM_STATE_VAR:
+	case PROGRAM_CONSTANT:
+		return VSF_IN_CLASS_PARAM;
+	default:
+		/* PROGRAM_OUTPUT, PROGRAM_WRITE_ONLY and PROGRAM_ADDRESS
+		 * have no source class here. */
+		fprintf(stderr, "problem in %s", __FUNCTION__);
+		_mesa_exit(-1);
+		return -1;
+	}
+}
+
+static __inline unsigned long t_swizzle(GLubyte swizzle)
+{
+	/* No-op: Mesa SWIZZLE_* values equal VSF_IN_COMPONENT_* (enforced by the #if/#error check earlier in this file). */
+	return swizzle;
+}
+
+#if 0				/* debugging aid, compiled out */
+static void vp_dump_inputs(struct r300_vertex_program *vp, char *caller)
+{
+	int i;
+
+	if (vp == NULL) {
+		fprintf(stderr, "vp null in call to %s from %s\n", __FUNCTION__,
+			caller);
+		return;
+	}
+	/* Print every vp->inputs[] entry on one line, prefixed by `caller`. */
+	fprintf(stderr, "%s:<", caller);
+	for (i = 0; i < VERT_ATTRIB_MAX; i++)
+		fprintf(stderr, "%d ", vp->inputs[i]);
+	fprintf(stderr, ">\n");
+
+}
+#endif
+
+static unsigned long t_src_index(struct r300_vertex_program *vp,
+				 struct prog_src_register *src)
+{
+	/* Hardware register index for a source operand.  Vertex inputs are
+	 * numbered lazily: the first use of an attribute takes the slot after
+	 * the highest one recorded in vp->inputs[] and remembers it there.
+	 * Other register files keep their Mesa index; negative ones are
+	 * reported and replaced by 0. */
+	int slot, highest = -1;
+
+	if (src->File != PROGRAM_INPUT) {
+		if (src->Index >= 0)
+			return src->Index;
+		fprintf(stderr,
+			"negative offsets for indirect addressing do not work.\n");
+		return 0;
+	}
+
+	if (vp->inputs[src->Index] == -1) {
+		for (slot = 0; slot < VERT_ATTRIB_MAX; slot++)
+			if (highest < vp->inputs[slot])
+				highest = vp->inputs[slot];
+
+		vp->inputs[src->Index] = highest + 1;
+	}
+
+	return vp->inputs[src->Index];
+}
+
+static unsigned long t_src(struct r300_vertex_program *vp,
+			   struct prog_src_register *src)
+{
+	/* Encode a vector operand (MAKE_VSF_SOURCE | RelAddr << 4).  NegateBase
+	 * holds Mesa NEGATE_ flags, equal to our VSF_FLAGS_ values: pass as is.
+	 */
+	return MAKE_VSF_SOURCE(t_src_index(vp, src),
+			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
+			       t_swizzle(GET_SWZ(src->Swizzle, 1)),
+			       t_swizzle(GET_SWZ(src->Swizzle, 2)),
+			       t_swizzle(GET_SWZ(src->Swizzle, 3)),
+			       t_src_class(src->File),
+			       src->NegateBase) | (src->RelAddr << 4);
+}
+
+static unsigned long t_src_scalar(struct r300_vertex_program *vp,
+				  struct prog_src_register *src)
+{
+	/* Scalar operand: swizzle component 0 fills all four lanes; a set NegateBase passes VSF_FLAG_ALL as the negate mask. */
+	return MAKE_VSF_SOURCE(t_src_index(vp, src),
+			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
+			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
+			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
+			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
+			       t_src_class(src->File),
+			       src->
+			       NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+	    (src->RelAddr << 4);
+}
+
+static unsigned long t_opcode(enum prog_opcode opcode)
+{
+	/* Direct Mesa -> R300 opcode mapping; any other opcode is reported and ends in _mesa_exit(-1). */
+	switch (opcode) {
+	/* *INDENT-OFF* */
+	case OPCODE_ARL: return R300_VPI_OUT_OP_ARL;
+	case OPCODE_DP4: return R300_VPI_OUT_OP_DOT;
+	case OPCODE_DST: return R300_VPI_OUT_OP_DST;
+	case OPCODE_EX2: return R300_VPI_OUT_OP_EX2;
+	case OPCODE_EXP: return R300_VPI_OUT_OP_EXP;
+	case OPCODE_FRC: return R300_VPI_OUT_OP_FRC;
+	case OPCODE_LG2: return R300_VPI_OUT_OP_LG2;
+	case OPCODE_LOG: return R300_VPI_OUT_OP_LOG;
+	case OPCODE_MAX: return R300_VPI_OUT_OP_MAX;
+	case OPCODE_MIN: return R300_VPI_OUT_OP_MIN;
+	case OPCODE_MUL: return R300_VPI_OUT_OP_MUL;
+	case OPCODE_RCP: return R300_VPI_OUT_OP_RCP;
+	case OPCODE_RSQ: return R300_VPI_OUT_OP_RSQ;
+	case OPCODE_SGE: return R300_VPI_OUT_OP_SGE;
+	case OPCODE_SLT: return R300_VPI_OUT_OP_SLT;
+	/* *INDENT-ON* */
+	default:
+		break;
+	}
+	fprintf(stderr, "%s: Should not be called with opcode %d!",
+		__FUNCTION__, opcode);
+	_mesa_exit(-1);
+	return 0;
+}
+
+static unsigned long op_operands(enum prog_opcode opcode)
+{
+	/* Return the op_names[] ip word for `opcode` (linear search; table order is not trusted); unknown opcodes exit. */
+	unsigned long slot;
+
+	for (slot = 0; slot < sizeof(op_names) / sizeof(op_names[0]); slot++)
+		if (opcode == op_names[slot].opcode)
+			return op_names[slot].ip;
+
+	fprintf(stderr, "op %d not found in op_names\n", opcode);
+	_mesa_exit(-1);
+	return 0;
+}
+
+static GLboolean valid_dst(struct r300_vertex_program *vp,
+			   struct prog_dst_register *dst)
+{
+	/* An output without an assigned slot in vp->outputs[] is not a valid
+	 * destination; the address register is asserted to be index 0. */
+	if (dst->File == PROGRAM_OUTPUT)
+		return (vp->outputs[dst->Index] == -1) ? GL_FALSE : GL_TRUE;
+	if (dst->File == PROGRAM_ADDRESS)
+		assert(dst->Index == 0);
+	return GL_TRUE;
+}
+
+/* CMP_SRCS(a, b): non-zero if RelAddr differs, or the indices differ while both are PARAM or both ATTR class.  TODO: Get rid of t_src_class call */
+#define CMP_SRCS(a, b) (((a).RelAddr != (b).RelAddr) || ((a).Index != (b).Index && \
+		       ((t_src_class((a).File) == VSF_IN_CLASS_PARAM && \
+			 t_src_class((b).File) == VSF_IN_CLASS_PARAM) || \
+			(t_src_class((a).File) == VSF_IN_CLASS_ATTR && \
+			 t_src_class((b).File) == VSF_IN_CLASS_ATTR))))
+/* ZERO_SRC_n: operand on src[n]'s register with all four swizzles SWIZZLE_ZERO; expects `vp` and `src[]` in scope. */
+#define ZERO_SRC_0 (MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), \
+				    SWIZZLE_ZERO, SWIZZLE_ZERO, \
+				    SWIZZLE_ZERO, SWIZZLE_ZERO, \
+				    t_src_class(src[0].File), VSF_FLAG_NONE) | (src[0].RelAddr << 4))
+
+#define ZERO_SRC_1 (MAKE_VSF_SOURCE(t_src_index(vp, &src[1]), \
+				    SWIZZLE_ZERO, SWIZZLE_ZERO, \
+				    SWIZZLE_ZERO, SWIZZLE_ZERO, \
+				    t_src_class(src[1].File), VSF_FLAG_NONE) | (src[1].RelAddr << 4))
+
+#define ZERO_SRC_2 (MAKE_VSF_SOURCE(t_src_index(vp, &src[2]), \
+				    SWIZZLE_ZERO, SWIZZLE_ZERO, \
+				    SWIZZLE_ZERO, SWIZZLE_ZERO, \
+				    t_src_class(src[2].File), VSF_FLAG_NONE) | (src[2].RelAddr << 4))
+/* ONE_SRC_n: same as ZERO_SRC_n but with all four swizzles SWIZZLE_ONE. */
+#define ONE_SRC_0 (MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), \
+				    SWIZZLE_ONE, SWIZZLE_ONE, \
+				    SWIZZLE_ONE, SWIZZLE_ONE, \
+				    t_src_class(src[0].File), VSF_FLAG_NONE) | (src[0].RelAddr << 4))
+
+#define ONE_SRC_1 (MAKE_VSF_SOURCE(t_src_index(vp, &src[1]), \
+				    SWIZZLE_ONE, SWIZZLE_ONE, \
+				    SWIZZLE_ONE, SWIZZLE_ONE, \
+				    t_src_class(src[1].File), VSF_FLAG_NONE) | (src[1].RelAddr << 4))
+
+#define ONE_SRC_2 (MAKE_VSF_SOURCE(t_src_index(vp, &src[2]), \
+				    SWIZZLE_ONE, SWIZZLE_ONE, \
+				    SWIZZLE_ONE, SWIZZLE_ONE, \
+				    t_src_class(src[2].File), VSF_FLAG_NONE) | (src[2].RelAddr << 4))
+
+/* DP4 version seems to trigger some hw peculiarity */
+//#define PREFER_DP4
+/* FREE_TEMPS: reset u_temp_i to the last hw temp; if it had dropped below vp->num_temporaries, WARN_ONCE and clear vp->native. */
+#define FREE_TEMPS() \
+	do { \
+		if(u_temp_i < vp->num_temporaries) { \
+			WARN_ONCE("Ran out of temps, num temps %d, us %d\n", vp->num_temporaries, u_temp_i); \
+			vp->native = GL_FALSE; \
+		} \
+		u_temp_i=VSF_MAX_FRAGMENT_TEMPS-1; \
+	} while (0)
+
+static void r300TranslateVertexShader(struct r300_vertex_program *vp,
+				      struct prog_instruction *vpi)
+{
+	int i, cur_reg = 0;
+	VERTEX_SHADER_INSTRUCTION *o_inst;
+	unsigned long operands;
+	int are_srcs_scalar;
+	unsigned long hw_op;
+	/* Initial value should be last tmp reg that hw supports.
+	   Strangely enough r300 doesnt mind even though these would be out of range.
+	   Smart enough to realize that it doesnt need it? */
+	int u_temp_i = VSF_MAX_FRAGMENT_TEMPS - 1;
+	struct prog_src_register src[3];
+
+	vp->pos_end = 0;	/* Not supported yet */
+	vp->program.length = 0;
+	/*vp->num_temporaries=mesa_vp->Base.NumTemporaries; */
+
+	for (i = 0; i < VERT_ATTRIB_MAX; i++)
+		vp->inputs[i] = -1;
+
+	for (i = 0; i < VERT_RESULT_MAX; i++)
+		vp->outputs[i] = -1;
+
+	assert(vp->key.OutputsWritten & (1 << VERT_RESULT_HPOS));
+
+	/* Assign outputs */
+	if (vp->key.OutputsWritten & (1 << VERT_RESULT_HPOS))
+		vp->outputs[VERT_RESULT_HPOS] = cur_reg++;
+
+	if (vp->key.OutputsWritten & (1 << VERT_RESULT_PSIZ))
+		vp->outputs[VERT_RESULT_PSIZ] = cur_reg++;
+
+	if (vp->key.OutputsWritten & (1 << VERT_RESULT_COL0))
+		vp->outputs[VERT_RESULT_COL0] = cur_reg++;
+
+	if (vp->key.OutputsWritten & (1 << VERT_RESULT_COL1))
+		vp->outputs[VERT_RESULT_COL1] = cur_reg++;
+
+#if 0				/* Not supported yet */
+	if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC0))
+		vp->outputs[VERT_RESULT_BFC0] = cur_reg++;
+
+	if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC1))
+		vp->outputs[VERT_RESULT_BFC1] = cur_reg++;
+
+	if (vp->key.OutputsWritten & (1 << VERT_RESULT_FOGC))
+		vp->outputs[VERT_RESULT_FOGC] = cur_reg++;
+#endif
+
+	for (i = VERT_RESULT_TEX0; i <= VERT_RESULT_TEX7; i++)
+		if (vp->key.OutputsWritten & (1 << i))
+			vp->outputs[i] = cur_reg++;
+
+	vp->translated = GL_TRUE;
+	vp->native = GL_TRUE;
+
+	o_inst = vp->program.body.i;
+	for (; vpi->Opcode != OPCODE_END; vpi++, o_inst++) {
+		FREE_TEMPS();
+
+		if (!valid_dst(vp, &vpi->DstReg)) {
+			/* redirect result to unused temp */
+			vpi->DstReg.File = PROGRAM_TEMPORARY;
+			vpi->DstReg.Index = u_temp_i;
+		}
+
+		operands = op_operands(vpi->Opcode);
+		are_srcs_scalar = operands & SCALAR_FLAG;
+		operands &= OP_MASK;
+
+		for (i = 0; i < operands; i++)
+			src[i] = vpi->SrcReg[i];
+
+		if (operands == 3) {	/* TODO: scalars */
+			if (CMP_SRCS(src[1], src[2])
+			    || CMP_SRCS(src[0], src[2])) {
+				o_inst->op =
+				    MAKE_VSF_OP(R300_VPI_OUT_OP_ADD, u_temp_i,
+						VSF_FLAG_ALL,
+						VSF_OUT_CLASS_TMP);
+
+				o_inst->src[0] =
+				    MAKE_VSF_SOURCE(t_src_index(vp, &src[2]),
+						    SWIZZLE_X, SWIZZLE_Y,
+						    SWIZZLE_Z, SWIZZLE_W,
+						    t_src_class(src[2].File),
+						    VSF_FLAG_NONE) | (src[2].
+								      RelAddr <<
+								      4);
+
+				o_inst->src[1] = ZERO_SRC_2;
+				o_inst->src[2] = ZERO_SRC_2;
+				o_inst++;
+
+				src[2].File = PROGRAM_TEMPORARY;
+				src[2].Index = u_temp_i;
+				src[2].RelAddr = 0;
+				u_temp_i--;
+			}
+
+		}
+
+		if (operands >= 2) {
+			if (CMP_SRCS(src[1], src[0])) {
+				o_inst->op =
+				    MAKE_VSF_OP(R300_VPI_OUT_OP_ADD, u_temp_i,
+						VSF_FLAG_ALL,
+						VSF_OUT_CLASS_TMP);
+
+				o_inst->src[0] =
+				    MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+						    SWIZZLE_X, SWIZZLE_Y,
+						    SWIZZLE_Z, SWIZZLE_W,
+						    t_src_class(src[0].File),
+						    VSF_FLAG_NONE) | (src[0].
+								      RelAddr <<
+								      4);
+
+				o_inst->src[1] = ZERO_SRC_0;
+				o_inst->src[2] = ZERO_SRC_0;
+				o_inst++;
+
+				src[0].File = PROGRAM_TEMPORARY;
+				src[0].Index = u_temp_i;
+				src[0].RelAddr = 0;
+				u_temp_i--;
+			}
+		}
+
+		/* These ops need special handling. */
+		switch (vpi->Opcode) {
+		case OPCODE_POW:
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_POW,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+			o_inst->src[0] = t_src_scalar(vp, &src[0]);
+			o_inst->src[1] = ZERO_SRC_0;
+			o_inst->src[2] = t_src_scalar(vp, &src[1]);
+			goto next;
+
+		case OPCODE_MOV:	//ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{} {ZERO ZERO ZERO ZERO}
+		case OPCODE_SWZ:
+#if 1
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_ADD,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+			o_inst->src[0] = t_src(vp, &src[0]);
+			o_inst->src[1] = ZERO_SRC_0;
+			o_inst->src[2] = ZERO_SRC_0;
+#else
+			hw_op =
+			    (src[0].File ==
+			     PROGRAM_TEMPORARY) ? R300_VPI_OUT_OP_MAD_2 :
+			    R300_VPI_OUT_OP_MAD;
+
+			o_inst->op =
+			    MAKE_VSF_OP(hw_op, t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+			o_inst->src[0] = t_src(vp, &src[0]);
+			o_inst->src[1] = ONE_SRC_0;
+			o_inst->src[2] = ZERO_SRC_0;
+#endif
+
+			goto next;
+
+		case OPCODE_ADD:
+#if 1
+			hw_op = (src[0].File == PROGRAM_TEMPORARY &&
+				 src[1].File ==
+				 PROGRAM_TEMPORARY) ? R300_VPI_OUT_OP_MAD_2 :
+			    R300_VPI_OUT_OP_MAD;
+
+			o_inst->op =
+			    MAKE_VSF_OP(hw_op, t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+			o_inst->src[0] = ONE_SRC_0;
+			o_inst->src[1] = t_src(vp, &src[0]);
+			o_inst->src[2] = t_src(vp, &src[1]);
+#else
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_ADD,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+			o_inst->src[0] = t_src(vp, &src[0]);
+			o_inst->src[1] = t_src(vp, &src[1]);
+			o_inst->src[2] = ZERO_SRC_1;
+
+#endif
+			goto next;
+
+		case OPCODE_MAD:
+			hw_op = (src[0].File == PROGRAM_TEMPORARY &&
+				 src[1].File == PROGRAM_TEMPORARY &&
+				 src[2].File ==
+				 PROGRAM_TEMPORARY) ? R300_VPI_OUT_OP_MAD_2 :
+			    R300_VPI_OUT_OP_MAD;
+
+			o_inst->op =
+			    MAKE_VSF_OP(hw_op, t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+			o_inst->src[0] = t_src(vp, &src[0]);
+			o_inst->src[1] = t_src(vp, &src[1]);
+			o_inst->src[2] = t_src(vp, &src[2]);
+			goto next;
+
+		case OPCODE_MUL:	/* HW mul can take third arg but appears to have some other limitations. */
+			hw_op = (src[0].File == PROGRAM_TEMPORARY &&
+				 src[1].File ==
+				 PROGRAM_TEMPORARY) ? R300_VPI_OUT_OP_MAD_2 :
+			    R300_VPI_OUT_OP_MAD;
+
+			o_inst->op =
+			    MAKE_VSF_OP(hw_op, t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+			o_inst->src[0] = t_src(vp, &src[0]);
+			o_inst->src[1] = t_src(vp, &src[1]);
+
+			o_inst->src[2] = ZERO_SRC_1;
+			goto next;
+
+		case OPCODE_DP3:	//DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ZERO} PARAM 0{} {X Y Z ZERO}
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_DOT,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+
+			o_inst->src[0] =
+			    MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 0)),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 1)),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 2)),
+					    SWIZZLE_ZERO,
+					    t_src_class(src[0].File),
+					    src[0].
+					    NegateBase ? VSF_FLAG_XYZ :
+					    VSF_FLAG_NONE) | (src[0].
+							      RelAddr << 4);
+
+			o_inst->src[1] =
+			    MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 0)),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 1)),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 2)),
+					    SWIZZLE_ZERO,
+					    t_src_class(src[1].File),
+					    src[1].
+					    NegateBase ? VSF_FLAG_XYZ :
+					    VSF_FLAG_NONE) | (src[1].
+							      RelAddr << 4);
+
+			o_inst->src[2] = ZERO_SRC_1;
+			goto next;
+
+		case OPCODE_SUB:	//ADD RESULT 1.X Y Z W TMP 0{} {X Y Z W} PARAM 1{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
+#if 1
+			hw_op = (src[0].File == PROGRAM_TEMPORARY &&
+				 src[1].File ==
+				 PROGRAM_TEMPORARY) ? R300_VPI_OUT_OP_MAD_2 :
+			    R300_VPI_OUT_OP_MAD;
+
+			o_inst->op =
+			    MAKE_VSF_OP(hw_op, t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+			o_inst->src[0] = t_src(vp, &src[0]);
+			o_inst->src[1] = ONE_SRC_0;
+			o_inst->src[2] =
+			    MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 0)),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 1)),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 2)),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 3)),
+					    t_src_class(src[1].File),
+					    (!src[1].
+					     NegateBase) ? VSF_FLAG_ALL :
+					    VSF_FLAG_NONE) | (src[1].
+							      RelAddr << 4);
+#else
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_ADD,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+
+			o_inst->src[0] = t_src(vp, &src[0]);
+			o_inst->src[1] =
+			    MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 0)),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 1)),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 2)),
+					    t_swizzle(GET_SWZ
+						      (src[1].Swizzle, 3)),
+					    t_src_class(src[1].File),
+					    (!src[1].
+					     NegateBase) ? VSF_FLAG_ALL :
+					    VSF_FLAG_NONE) | (src[1].
+							      RelAddr << 4);
+			o_inst->src[2] = 0;
+#endif
+			goto next;
+
+		case OPCODE_ABS:	//MAX RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_MAX,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+
+			o_inst->src[0] = t_src(vp, &src[0]);
+			o_inst->src[1] =
+			    MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 0)),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 1)),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 2)),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 3)),
+					    t_src_class(src[0].File),
+					    (!src[0].
+					     NegateBase) ? VSF_FLAG_ALL :
+					    VSF_FLAG_NONE) | (src[0].
+							      RelAddr << 4);
+			o_inst->src[2] = 0;
+			goto next;
+
+		case OPCODE_FLR:
+			/* FRC TMP 0.X Y Z W PARAM 0{} {X Y Z W}
+			   ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} TMP 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W */
+
+			o_inst->op = MAKE_VSF_OP(R300_VPI_OUT_OP_FRC, u_temp_i,
+						 t_dst_mask(vpi->DstReg.
+							    WriteMask),
+						 VSF_OUT_CLASS_TMP);
+
+			o_inst->src[0] = t_src(vp, &src[0]);
+			o_inst->src[1] = ZERO_SRC_0;
+			o_inst->src[2] = ZERO_SRC_0;
+			o_inst++;
+
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_ADD,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+
+			o_inst->src[0] = t_src(vp, &src[0]);
+			o_inst->src[1] = MAKE_VSF_SOURCE(u_temp_i,
+							 VSF_IN_COMPONENT_X,
+							 VSF_IN_COMPONENT_Y,
+							 VSF_IN_COMPONENT_Z,
+							 VSF_IN_COMPONENT_W,
+							 VSF_IN_CLASS_TMP,
+							 /* Not 100% sure about this */
+							 (!src[0].
+							  NegateBase) ?
+							 VSF_FLAG_ALL :
+							 VSF_FLAG_NONE
+							 /*VSF_FLAG_ALL */ );
+
+			o_inst->src[2] = ZERO_SRC_0;
+			u_temp_i--;
+			goto next;
+
+		case OPCODE_LG2:	// LG2 RESULT 1.X Y Z W PARAM 0{} {X X X X}
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_LG2,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+
+			o_inst->src[0] =
+			    MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 0)),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 0)),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 0)),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 0)),
+					    t_src_class(src[0].File),
+					    src[0].
+					    NegateBase ? VSF_FLAG_ALL :
+					    VSF_FLAG_NONE) | (src[0].
+							      RelAddr << 4);
+			o_inst->src[1] = ZERO_SRC_0;
+			o_inst->src[2] = ZERO_SRC_0;
+			goto next;
+
+		case OPCODE_LIT:	//LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_LIT,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+			/* NOTE: Users swizzling might not work. */
+			o_inst->src[0] = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 0)),	// x
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 3)),	// w
+							 VSF_IN_COMPONENT_ZERO,	// z
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 1)),	// y
+							 t_src_class(src[0].
+								     File),
+							 src[0].
+							 NegateBase ?
+							 VSF_FLAG_ALL :
+							 VSF_FLAG_NONE) |
+			    (src[0].RelAddr << 4);
+			o_inst->src[1] = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 1)),	// y
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 3)),	// w
+							 VSF_IN_COMPONENT_ZERO,	// z
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),	// x
+							 t_src_class(src[0].
+								     File),
+							 src[0].
+							 NegateBase ?
+							 VSF_FLAG_ALL :
+							 VSF_FLAG_NONE) |
+			    (src[0].RelAddr << 4);
+			o_inst->src[2] = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 1)),	// y
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),	// x
+							 VSF_IN_COMPONENT_ZERO,	// z
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 3)),	// w
+							 t_src_class(src[0].
+								     File),
+							 src[0].
+							 NegateBase ?
+							 VSF_FLAG_ALL :
+							 VSF_FLAG_NONE) |
+			    (src[0].RelAddr << 4);
+			goto next;
+
+		case OPCODE_DPH:	//DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ONE} PARAM 0{} {X Y Z W}
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_DOT,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+
+			o_inst->src[0] =
+			    MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 0)),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 1)),
+					    t_swizzle(GET_SWZ
+						      (src[0].Swizzle, 2)),
+					    VSF_IN_COMPONENT_ONE,
+					    t_src_class(src[0].File),
+					    src[0].
+					    NegateBase ? VSF_FLAG_XYZ :
+					    VSF_FLAG_NONE) | (src[0].
+							      RelAddr << 4);
+			o_inst->src[1] = t_src(vp, &src[1]);
+			o_inst->src[2] = ZERO_SRC_1;
+			goto next;
+
+		case OPCODE_XPD:
+			/* mul r0, r1.yzxw, r2.zxyw
+			   mad r0, -r2.yzxw, r1.zxyw, r0
+			   NOTE: might need MAD_2
+			 */
+
+			o_inst->op = MAKE_VSF_OP(R300_VPI_OUT_OP_MAD, u_temp_i,
+						 t_dst_mask(vpi->DstReg.
+							    WriteMask),
+						 VSF_OUT_CLASS_TMP);
+
+			o_inst->src[0] = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 1)),	// y
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 2)),	// z
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),	// x
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 3)),	// w
+							 t_src_class(src[0].
+								     File),
+							 src[0].
+							 NegateBase ?
+							 VSF_FLAG_ALL :
+							 VSF_FLAG_NONE) |
+			    (src[0].RelAddr << 4);
+
+			o_inst->src[1] = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]), t_swizzle(GET_SWZ(src[1].Swizzle, 2)),	// z
+							 t_swizzle(GET_SWZ(src[1].Swizzle, 0)),	// x
+							 t_swizzle(GET_SWZ(src[1].Swizzle, 1)),	// y
+							 t_swizzle(GET_SWZ(src[1].Swizzle, 3)),	// w
+							 t_src_class(src[1].
+								     File),
+							 src[1].
+							 NegateBase ?
+							 VSF_FLAG_ALL :
+							 VSF_FLAG_NONE) |
+			    (src[1].RelAddr << 4);
+
+			o_inst->src[2] = ZERO_SRC_1;
+			o_inst++;
+			u_temp_i--;
+
+			o_inst->op =
+			    MAKE_VSF_OP(R300_VPI_OUT_OP_MAD,
+					t_dst_index(vp, &vpi->DstReg),
+					t_dst_mask(vpi->DstReg.WriteMask),
+					t_dst_class(vpi->DstReg.File));
+
+			o_inst->src[0] = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]), t_swizzle(GET_SWZ(src[1].Swizzle, 1)),	// y
+							 t_swizzle(GET_SWZ(src[1].Swizzle, 2)),	// z
+							 t_swizzle(GET_SWZ(src[1].Swizzle, 0)),	// x
+							 t_swizzle(GET_SWZ(src[1].Swizzle, 3)),	// w
+							 t_src_class(src[1].
+								     File),
+							 (!src[1].
+							  NegateBase) ?
+							 VSF_FLAG_ALL :
+							 VSF_FLAG_NONE) |
+			    (src[1].RelAddr << 4);
+
+			o_inst->src[1] = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 2)),	// z
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),	// x
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 1)),	// y
+							 t_swizzle(GET_SWZ(src[0].Swizzle, 3)),	// w
+							 t_src_class(src[0].
+								     File),
+							 src[0].
+							 NegateBase ?
+							 VSF_FLAG_ALL :
+							 VSF_FLAG_NONE) |
+			    (src[0].RelAddr << 4);
+
+			o_inst->src[2] = MAKE_VSF_SOURCE(u_temp_i + 1,
+							 VSF_IN_COMPONENT_X,
+							 VSF_IN_COMPONENT_Y,
+							 VSF_IN_COMPONENT_Z,
+							 VSF_IN_COMPONENT_W,
+							 VSF_IN_CLASS_TMP,
+							 VSF_FLAG_NONE);
+
+			goto next;
+
+		case OPCODE_RCC:
+			fprintf(stderr, "Dont know how to handle op %d yet\n",
+				vpi->Opcode);
+			_mesa_exit(-1);
+			break;
+		case OPCODE_END:
+			break;
+		default:
+			break;
+		}
+
+		o_inst->op =
+		    MAKE_VSF_OP(t_opcode(vpi->Opcode),
+				t_dst_index(vp, &vpi->DstReg),
+				t_dst_mask(vpi->DstReg.WriteMask),
+				t_dst_class(vpi->DstReg.File));
+
+		if (are_srcs_scalar) {
+			switch (operands) {
+			case 1:
+				o_inst->src[0] = t_src_scalar(vp, &src[0]);
+				o_inst->src[1] = ZERO_SRC_0;
+				o_inst->src[2] = ZERO_SRC_0;
+				break;
+
+			case 2:
+				o_inst->src[0] = t_src_scalar(vp, &src[0]);
+				o_inst->src[1] = t_src_scalar(vp, &src[1]);
+				o_inst->src[2] = ZERO_SRC_1;
+				break;
+
+			case 3:
+				o_inst->src[0] = t_src_scalar(vp, &src[0]);
+				o_inst->src[1] = t_src_scalar(vp, &src[1]);
+				o_inst->src[2] = t_src_scalar(vp, &src[2]);
+				break;
+
+			default:
+				fprintf(stderr,
+					"scalars and op RCC not handled yet");
+				_mesa_exit(-1);
+				break;
+			}
+		} else {
+			switch (operands) {
+			case 1:
+				o_inst->src[0] = t_src(vp, &src[0]);
+				o_inst->src[1] = ZERO_SRC_0;
+				o_inst->src[2] = ZERO_SRC_0;
+				break;
+
+			case 2:
+				o_inst->src[0] = t_src(vp, &src[0]);
+				o_inst->src[1] = t_src(vp, &src[1]);
+				o_inst->src[2] = ZERO_SRC_1;
+				break;
+
+			case 3:
+				o_inst->src[0] = t_src(vp, &src[0]);
+				o_inst->src[1] = t_src(vp, &src[1]);
+				o_inst->src[2] = t_src(vp, &src[2]);
+				break;
+
+			default:
+				fprintf(stderr,
+					"scalars and op RCC not handled yet");
+				_mesa_exit(-1);
+				break;
+			}
+		}
+	      next:;
+	}
+
+	/* Will most likely segfault before we get here... fix later. */
+	if (o_inst - vp->program.body.i >= VSF_MAX_FRAGMENT_LENGTH / 4) {
+		vp->program.length = 0;
+		vp->native = GL_FALSE;
+		return;
+	}
+	vp->program.length = (o_inst - vp->program.body.i) * 4;
+#if 0
+	fprintf(stderr, "hw program:\n");
+	for (i = 0; i < vp->program.length; i++)
+		fprintf(stderr, "%08x\n", vp->program.body.d[i]);
+#endif
+}
+
+static void position_invariant(struct gl_program *prog)
+{
+	struct prog_instruction *vpi;
+	struct gl_program_parameter_list *paramList;
+	int i;
+	/* Prepend four instructions that transform the input position by the MVP matrix. */
+	gl_state_index tokens[STATE_LENGTH] = { STATE_MVP_MATRIX, 0, 0, 0, 0 };
+
+	/* tokens[4] = matrix modifier */
+#ifdef PREFER_DP4
+	tokens[4] = 0;		/* not transposed or inverted */
+#else
+	tokens[4] = STATE_MATRIX_TRANSPOSE;
+#endif
+	paramList = prog->Parameters;
+
+	vpi = _mesa_alloc_instructions(prog->NumInstructions + 4);	/* NOTE(review): result is not NULL-checked */
+	_mesa_init_instructions(vpi, prog->NumInstructions + 4);
+
+	for (i = 0; i < 4; i++) {
+		GLint idx;
+		tokens[2] = tokens[3] = i;	/* matrix row[i]..row[i] */
+		idx = _mesa_add_state_reference(paramList, tokens);
+#ifdef PREFER_DP4
+		vpi[i].Opcode = OPCODE_DP4;	/* one dot product per output component */
+		vpi[i].StringPos = 0;
+		vpi[i].Data = 0;
+
+		vpi[i].DstReg.File = PROGRAM_OUTPUT;
+		vpi[i].DstReg.Index = VERT_RESULT_HPOS;
+		vpi[i].DstReg.WriteMask = 1 << i;	/* instruction i writes only component i */
+		vpi[i].DstReg.CondMask = COND_TR;
+
+		vpi[i].SrcReg[0].File = PROGRAM_STATE_VAR;
+		vpi[i].SrcReg[0].Index = idx;
+		vpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
+
+		vpi[i].SrcReg[1].File = PROGRAM_INPUT;
+		vpi[i].SrcReg[1].Index = VERT_ATTRIB_POS;
+		vpi[i].SrcReg[1].Swizzle = SWIZZLE_XYZW;
+#else
+		if (i == 0)	/* MUL starts the sum, the three MADs accumulate onto it */
+			vpi[i].Opcode = OPCODE_MUL;
+		else
+			vpi[i].Opcode = OPCODE_MAD;
+
+		vpi[i].StringPos = 0;
+		vpi[i].Data = 0;
+
+		if (i == 3)	/* only the last step writes a program output */
+			vpi[i].DstReg.File = PROGRAM_OUTPUT;
+		else
+			vpi[i].DstReg.File = PROGRAM_TEMPORARY;
+		vpi[i].DstReg.Index = 0;	/* temp 0, or output 0 on the last step (presumably VERT_RESULT_HPOS -- confirm) */
+		vpi[i].DstReg.WriteMask = 0xf;
+		vpi[i].DstReg.CondMask = COND_TR;
+
+		vpi[i].SrcReg[0].File = PROGRAM_STATE_VAR;
+		vpi[i].SrcReg[0].Index = idx;
+		vpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
+
+		vpi[i].SrcReg[1].File = PROGRAM_INPUT;
+		vpi[i].SrcReg[1].Index = VERT_ATTRIB_POS;
+		vpi[i].SrcReg[1].Swizzle = MAKE_SWIZZLE4(i, i, i, i);	/* broadcast component i of the position */
+
+		if (i > 0) {	/* MAD only: add the running sum held in temp 0 */
+			vpi[i].SrcReg[2].File = PROGRAM_TEMPORARY;
+			vpi[i].SrcReg[2].Index = 0;
+			vpi[i].SrcReg[2].Swizzle = SWIZZLE_XYZW;
+		}
+#endif
+	}
+
+	_mesa_copy_instructions(&vpi[i], prog->Instructions,	/* i == 4: original program follows the new code */
+				prog->NumInstructions);
+
+	free(prog->Instructions);	/* the old array is replaced by the enlarged copy */
+
+	prog->Instructions = vpi;
+
+	prog->NumInstructions += 4;
+	vpi = &prog->Instructions[prog->NumInstructions - 1];
+
+	assert(vpi->Opcode == OPCODE_END);	/* the copied program must still end with END */
+}
+
+static void insert_wpos(struct r300_vertex_program *vp,
+			struct gl_program *prog, GLuint temp_index)
+{
+	struct prog_instruction *vpi;
+	struct prog_instruction *vpi_insert;
+	int i = 0;
+	/* Insert two MOVs before END: temp -> HPOS and temp -> texcoord vp->wpos_idx. */
+	vpi = _mesa_alloc_instructions(prog->NumInstructions + 2);	/* NOTE(review): result is not NULL-checked */
+	_mesa_init_instructions(vpi, prog->NumInstructions + 2);
+	/* all but END */
+	_mesa_copy_instructions(vpi, prog->Instructions,
+				prog->NumInstructions - 1);
+	/* END */
+	_mesa_copy_instructions(&vpi[prog->NumInstructions + 1],
+				&prog->Instructions[prog->NumInstructions - 1],
+				1);
+	vpi_insert = &vpi[prog->NumInstructions - 1];	/* first of the two slots left free before END */
+
+	vpi_insert[i].Opcode = OPCODE_MOV;	/* MOV output HPOS <- temporary temp_index */
+
+	vpi_insert[i].DstReg.File = PROGRAM_OUTPUT;
+	vpi_insert[i].DstReg.Index = VERT_RESULT_HPOS;
+	vpi_insert[i].DstReg.WriteMask = WRITEMASK_XYZW;
+	vpi_insert[i].DstReg.CondMask = COND_TR;
+
+	vpi_insert[i].SrcReg[0].File = PROGRAM_TEMPORARY;
+	vpi_insert[i].SrcReg[0].Index = temp_index;
+	vpi_insert[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
+	i++;
+
+	vpi_insert[i].Opcode = OPCODE_MOV;	/* MOV output TEX0 + wpos_idx <- the same temporary */
+
+	vpi_insert[i].DstReg.File = PROGRAM_OUTPUT;
+	vpi_insert[i].DstReg.Index = VERT_RESULT_TEX0 + vp->wpos_idx;
+	vpi_insert[i].DstReg.WriteMask = WRITEMASK_XYZW;
+	vpi_insert[i].DstReg.CondMask = COND_TR;
+
+	vpi_insert[i].SrcReg[0].File = PROGRAM_TEMPORARY;
+	vpi_insert[i].SrcReg[0].Index = temp_index;
+	vpi_insert[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
+	i++;
+
+	free(prog->Instructions);	/* the old array is replaced by the enlarged copy */
+
+	prog->Instructions = vpi;
+
+	prog->NumInstructions += i;	/* i == 2: the two MOVs added above */
+	vpi = &prog->Instructions[prog->NumInstructions - 1];
+
+	assert(vpi->Opcode == OPCODE_END);	/* END must still be the last instruction */
+}
+
+static void pos_as_texcoord(struct r300_vertex_program *vp,
+			    struct gl_program *prog)
+{
+	/* Claim a fresh temporary; nothing checks that one is actually left. */
+	const GLuint pos_temp = prog->NumTemporaries++;
+	struct prog_instruction *inst = prog->Instructions;
+
+	while (inst->Opcode != OPCODE_END) {	/* retarget every HPOS write to the temporary */
+		if (inst->DstReg.File == PROGRAM_OUTPUT &&
+		    inst->DstReg.Index == VERT_RESULT_HPOS) {
+			inst->DstReg.File = PROGRAM_TEMPORARY;
+			inst->DstReg.Index = pos_temp;
+		}
+		inst++;
+	}
+	insert_wpos(vp, prog, pos_temp);	/* copy the temporary to HPOS and to the WPOS texcoord */
+}
+
+static struct r300_vertex_program *build_program(struct r300_vertex_program_key
+						 *wanted_key, struct gl_vertex_program
+						 *mesa_vp, GLint wpos_idx)
+{
+	struct r300_vertex_program *vp;
+	/* Build a hardware program for wanted_key; note that this rewrites mesa_vp->Base in place. */
+	vp = _mesa_calloc(sizeof(*vp));	/* NOTE(review): result is not NULL-checked */
+	_mesa_memcpy(&vp->key, wanted_key, sizeof(vp->key));
+
+	vp->wpos_idx = wpos_idx;
+
+	if (mesa_vp->IsPositionInvariant) {
+		position_invariant(&mesa_vp->Base);	/* prepends the MVP transform to the Mesa program */
+	}
+
+	if (wpos_idx > -1)	/* -1 means the fragment program does not read WPOS */
+		pos_as_texcoord(vp, &mesa_vp->Base);
+
+	assert(mesa_vp->Base.NumInstructions);
+
+	vp->num_temporaries = mesa_vp->Base.NumTemporaries;
+
+	r300TranslateVertexShader(vp, mesa_vp->Base.Instructions);
+
+	return vp;	/* the caller links vp into its program list */
+}
+
+/* Select the hardware vertex program matching the current vertex and fragment
+ * programs, translating and caching a new variant when no cached key matches. */
+void r300SelectVertexShader(r300ContextPtr r300)
+{
+	GLcontext *ctx = r300->radeon.glCtx;
+	GLuint InputsRead;
+	struct r300_vertex_program_key wanted_key = { 0 };
+	GLint i;
+	struct r300_vertex_program_cont *vpc;
+	struct r300_vertex_program *vp;
+	GLint wpos_idx;
+
+	vpc = (struct r300_vertex_program_cont *)ctx->VertexProgram._Current;
+	InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
+
+	wanted_key.OutputsWritten |= 1 << VERT_RESULT_HPOS;	/* position is always written */
+
+	wpos_idx = -1;
+	if (InputsRead & FRAG_BIT_WPOS) {	/* WPOS is routed through the first unused texcoord */
+		for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
+			if (!(InputsRead & (FRAG_BIT_TEX0 << i)))
+				break;
+
+		if (i == ctx->Const.MaxTextureUnits) {
+			fprintf(stderr, "\tno free texcoord found\n");
+			_mesa_exit(-1);
+		}
+
+		InputsRead |= (FRAG_BIT_TEX0 << i);
+		wpos_idx = i;
+	}
+
+	if (InputsRead & FRAG_BIT_COL0)
+		wanted_key.OutputsWritten |= 1 << VERT_RESULT_COL0;
+
+	if (InputsRead & FRAG_BIT_COL1)	/* || (InputsRead & FRAG_BIT_FOGC) */
+		wanted_key.OutputsWritten |= 1 << VERT_RESULT_COL1;
+
+	for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
+		if (InputsRead & (FRAG_BIT_TEX0 << i))
+			wanted_key.OutputsWritten |=
+			    1 << (VERT_RESULT_TEX0 + i);
+
+	wanted_key.InputsRead = vpc->mesa_program.Base.InputsRead;
+	if (vpc->mesa_program.IsPositionInvariant) {
+		/* position-invariant programs always read the vertex position */
+		wanted_key.InputsRead |= (1 << VERT_ATTRIB_POS);
+	}
+
+	for (vp = vpc->progs; vp; vp = vp->next)	/* reuse a cached variant with an identical key */
+		if (_mesa_memcmp(&vp->key, &wanted_key, sizeof(wanted_key)) ==
+		    0) {
+			r300->selected_vp = vp;
+			return;
+		}
+
+	vp = build_program(&wanted_key, &vpc->mesa_program, wpos_idx);	/* cache miss */
+	vp->next = vpc->progs;	/* push the new variant on the front of the list */
+	vpc->progs = vp;
+	r300->selected_vp = vp;
+}
diff --git a/r300/r300_vertprog.h b/r300/r300_vertprog.h
new file mode 100644
index 0000000..252d5a9
--- /dev/null
+++ b/r300/r300_vertprog.h
@@ -0,0 +1,89 @@
+#ifndef __R300_VERTPROG_H_
+#define __R300_VERTPROG_H_
+
+#include "r300_reg.h"
+
+typedef struct {
+	GLuint op;	/* opcode/destination dword, built with MAKE_VSF_OP */
+	GLuint src[3];	/* three source operand dwords, built with MAKE_VSF_SOURCE */
+} VERTEX_SHADER_INSTRUCTION;
+
+#define VSF_FLAG_X	1
+#define VSF_FLAG_Y	2
+#define VSF_FLAG_Z	4
+#define VSF_FLAG_W	8
+#define VSF_FLAG_XYZ	(VSF_FLAG_X | VSF_FLAG_Y | VSF_FLAG_Z)
+#define VSF_FLAG_ALL  0xf
+#define VSF_FLAG_NONE  0
+
+#define VSF_OUT_CLASS_TMP	0
+#define VSF_OUT_CLASS_ADDR	1
+#define VSF_OUT_CLASS_RESULT	2
+
+/* first DWORD of an instruction */
+
+/* possible operations: 
+    DOT, MUL, ADD, MAD, FRC, MAX, MIN, SGE, SLT, EXP, LOG, LIT, POW, RCP, RSQ, EX2,
+    LG2, MAD_2 */
+
+#define MAKE_VSF_OP(op, out_reg_index, out_reg_fields, class) \
+   ((op)  \
+  	| ((out_reg_index) << R300_VPI_OUT_REG_INDEX_SHIFT) 	\
+ 	 | ((out_reg_fields) << 20) 	\
+  	| ( (class) << 8 ) )	/* out_reg_fields: VSF_FLAG_* write mask at bit 20; class: VSF_OUT_CLASS_* at bit 8 */
+
+#define EASY_VSF_OP(op, out_reg_index, out_reg_fields, class) \
+	MAKE_VSF_OP(R300_VPI_OUT_OP_##op, out_reg_index, VSF_FLAG_##out_reg_fields, VSF_OUT_CLASS_##class)	/* op, fields and class are suffixes pasted onto these prefixes */
+
+/* according to Nikolai, the subsequent 3 DWORDs are sources, use same define for each */
+
+#define VSF_IN_CLASS_TMP	0
+#define VSF_IN_CLASS_ATTR	1
+#define VSF_IN_CLASS_PARAM	2
+#define VSF_IN_CLASS_NONE	9
+
+#define VSF_IN_COMPONENT_X	0
+#define VSF_IN_COMPONENT_Y	1
+#define VSF_IN_COMPONENT_Z	2
+#define VSF_IN_COMPONENT_W	3
+#define VSF_IN_COMPONENT_ZERO	4
+#define VSF_IN_COMPONENT_ONE	5
+
+#define MAKE_VSF_SOURCE(in_reg_index, comp_x, comp_y, comp_z, comp_w, class, negate) \
+	( ((in_reg_index)<<R300_VPI_IN_REG_INDEX_SHIFT) \
+	   | ((comp_x)<<R300_VPI_IN_X_SHIFT) \
+	   | ((comp_y)<<R300_VPI_IN_Y_SHIFT) \
+	   | ((comp_z)<<R300_VPI_IN_Z_SHIFT) \
+	   | ((comp_w)<<R300_VPI_IN_W_SHIFT) \
+	   | ((negate)<<25) | ((class)))	/* comp_*: VSF_IN_COMPONENT_*; negate: VSF_FLAG_* mask at bit 25; class: VSF_IN_CLASS_* */
+
+#define EASY_VSF_SOURCE(in_reg_index, comp_x, comp_y, comp_z, comp_w, class, negate) \
+	MAKE_VSF_SOURCE(in_reg_index, \
+		VSF_IN_COMPONENT_##comp_x, \
+		VSF_IN_COMPONENT_##comp_y, \
+		VSF_IN_COMPONENT_##comp_z, \
+		VSF_IN_COMPONENT_##comp_w, \
+		VSF_IN_CLASS_##class, VSF_FLAG_##negate)	/* all but the index are suffixes pasted onto these prefixes */
+
+/* special sources, all built with EASY_VSF_SOURCE and no negation: */
+
+/* (1.0,1.0,1.0,1.0) vector: every component selects ONE (class ATTR or class NONE) */
+#define VSF_ATTR_UNITY(reg) 	EASY_VSF_SOURCE(reg, ONE, ONE, ONE, ONE, ATTR, NONE)
+#define VSF_UNITY(reg) 	EASY_VSF_SOURCE(reg, ONE, ONE, ONE, ONE, NONE, NONE)
+
+/* contents of unmodified attribute (class ATTR) register */
+#define VSF_REG(reg) 	EASY_VSF_SOURCE(reg, X, Y, Z, W, ATTR, NONE)
+
+/* contents of unmodified parameter (class PARAM) */
+#define VSF_PARAM(reg) 	EASY_VSF_SOURCE(reg, X, Y, Z, W, PARAM, NONE)
+
+/* contents of unmodified temporary (class TMP) register */
+#define VSF_TMP(reg) 	EASY_VSF_SOURCE(reg, X, Y, Z, W, TMP, NONE)
+
+/* a single component of an ATTR register, replicated to all four channels */
+#define VSF_ATTR_X(reg) EASY_VSF_SOURCE(reg, X, X, X, X, ATTR, NONE)
+#define VSF_ATTR_Y(reg) EASY_VSF_SOURCE(reg, Y, Y, Y, Y, ATTR, NONE)
+#define VSF_ATTR_Z(reg) EASY_VSF_SOURCE(reg, Z, Z, Z, Z, ATTR, NONE)
+#define VSF_ATTR_W(reg) EASY_VSF_SOURCE(reg, W, W, W, W, ATTR, NONE)
+
+#endif
diff --git a/r300/radeon_context.c b/r300/radeon_context.c
new file mode 100644
index 0000000..e9634b4
--- /dev/null
+++ b/r300/radeon_context.c
@@ -0,0 +1,327 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file radeon_context.c
+ * Common context initialization.
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include <dlfcn.h>
+
+#include "glheader.h"
+#include "imports.h"
+#include "context.h"
+#include "state.h"
+#include "matrix.h"
+#include "framebuffer.h"
+
+#include "drivers/common/driverfuncs.h"
+#include "swrast/swrast.h"
+
+#include "radeon_screen.h"
+#include "radeon_ioctl.h"
+#include "radeon_macros.h"
+#include "radeon_reg.h"
+
+#include "radeon_state.h"
+#include "r300_state.h"
+
+#include "utils.h"
+#include "vblank.h"
+#include "xmlpool.h"		/* for symbolic values of enum-type options */
+
+#define DRIVER_DATE "20060815"
+
+
+/* Return various strings for glGetString().
+ */
+static const GLubyte *radeonGetString(GLcontext * ctx, GLenum name)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	static char buffer[128];	/* static: a pointer to it is returned to the caller */
+
+	switch (name) {
+	case GL_VENDOR:
+		if (IS_R300_CLASS(radeon->radeonScreen))
+			return (GLubyte *) "DRI R300 Project";
+		else
+			return (GLubyte *) "Tungsten Graphics, Inc.";
+
+	case GL_RENDERER:
+	{
+		unsigned offset;
+		GLuint agp_mode = (radeon->radeonScreen->card_type==RADEON_CARD_PCI) ? 0 :
+			radeon->radeonScreen->AGPMode;	/* PCI cards report AGP mode 0 */
+		const char* chipname;
+
+		if (IS_R300_CLASS(radeon->radeonScreen))
+			chipname = "R300";
+		else
+			chipname = "R200";
+
+		offset = driGetRendererString(buffer, chipname, DRIVER_DATE,
+					      agp_mode);	/* offset is where the TCL suffix gets appended */
+
+		if (IS_R300_CLASS(radeon->radeonScreen)) {	/* R300: TCL support comes from the chip flags */
+		sprintf(&buffer[offset], " %sTCL",
+			(radeon->radeonScreen->chip_flags & RADEON_CHIPSET_TCL)
+			? "" : "NO-");
+		} else {	/* otherwise: TCL unless the TCL_DISABLE fallback bit is set */
+			sprintf(&buffer[offset], " %sTCL",
+			!(radeon->TclFallback & RADEON_TCL_FALLBACK_TCL_DISABLE)
+			? "" : "NO-");
+		}
+
+		return (GLubyte *) buffer;
+	}
+
+	default:
+		return NULL;	/* any other name is not answered here */
+	}
+}
+
+/* Initialize the driver's misc functions.
+ */
+static void radeonInitDriverFuncs(struct dd_function_table *functions)
+{
+	functions->GetString = radeonGetString;	/* the only hook installed by this function */
+}
+
+
+/**
+ * Create and initialize all common fields of the context,
+ * including the Mesa context itself.
+ */
+GLboolean radeonInitContext(radeonContextPtr radeon,
+			    struct dd_function_table* functions,
+			    const __GLcontextModes * glVisual,
+			    __DRIcontextPrivate * driContextPriv,
+			    void *sharedContextPrivate)
+{
+	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
+	GLcontext* ctx;
+	GLcontext* shareCtx;
+	int fthrottle_mode;
+
+	/* Fill in additional standard functions. */
+	radeonInitDriverFuncs(functions);
+
+	/* Allocate and initialize the Mesa context */
+	if (sharedContextPrivate)
+		shareCtx = ((radeonContextPtr)sharedContextPrivate)->glCtx;
+	else
+		shareCtx = NULL;
+	radeon->glCtx = _mesa_create_context(glVisual, shareCtx,
+					    functions, (void *)radeon);
+	if (!radeon->glCtx)
+		return GL_FALSE;	/* the only failure path; nothing else has been set up yet */
+
+	ctx = radeon->glCtx;	/* NOTE(review): ctx is assigned but not used later in this function */
+	driContextPriv->driverPrivate = radeon;
+
+	/* DRI fields */
+	radeon->dri.context = driContextPriv;
+	radeon->dri.screen = sPriv;
+	radeon->dri.drawable = NULL;	/* drawables are bound later, in radeonMakeCurrent() */
+	radeon->dri.readable = NULL;
+	radeon->dri.hwContext = driContextPriv->hHWContext;
+	radeon->dri.hwLock = &sPriv->pSAREA->lock;
+	radeon->dri.fd = sPriv->fd;
+	radeon->dri.drmMinor = sPriv->drmMinor;
+
+	radeon->radeonScreen = screen;
+	radeon->sarea = (drm_radeon_sarea_t *) ((GLubyte *) sPriv->pSAREA +
+					       screen->sarea_priv_offset);
+
+	/* Setup IRQs */
+	fthrottle_mode = driQueryOptioni(&radeon->optionCache, "fthrottle_mode");	/* NOTE(review): optionCache is not set up here -- presumably by the caller, confirm */
+	radeon->iw.irq_seq = -1;
+	radeon->irqsEmitted = 0;
+	radeon->do_irqs = (fthrottle_mode == DRI_CONF_FTHROTTLE_IRQS &&
+			  radeon->radeonScreen->irq);	/* IRQ throttling needs both the option and a screen IRQ */
+
+	radeon->do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
+
+	if (!radeon->do_irqs)
+		fprintf(stderr,
+			"IRQ's not enabled, falling back to %s: %d %d\n",
+			radeon->do_usleeps ? "usleeps" : "busy waits",
+			fthrottle_mode, radeon->radeonScreen->irq);
+
+	radeon->vblank_flags = (radeon->radeonScreen->irq != 0)
+	    ? driGetDefaultVBlankFlags(&radeon->optionCache) : VBLANK_FLAG_NO_IRQ;
+
+	(*dri_interface->getUST) (&radeon->swap_ust);	/* stores a UST value into swap_ust */
+
+	return GL_TRUE;
+}
+
+
+/**
+ * Cleanup common context fields.
+ * Called by r200DestroyContext/r300DestroyContext
+ */
+void radeonCleanupContext(radeonContextPtr radeon)
+{
+	/* _mesa_destroy_context() might result in calls to functions that
+	 * depend on the DriverCtx, so don't set it to NULL before.
+	 *
+	 * radeon->glCtx->DriverCtx = NULL;
+	 */
+
+	/* free the Mesa context */
+	_mesa_destroy_context(radeon->glCtx);
+
+	if (radeon->state.scissor.pClipRects) {	/* release the scissor cliprect array, if any */
+		FREE(radeon->state.scissor.pClipRects);
+		radeon->state.scissor.pClipRects = 0;	/* leave no dangling pointer behind */
+	}
+}
+
+
+/**
+ * Swap front and back buffer.
+ */
+void radeonSwapBuffers(__DRIdrawablePrivate * dPriv)
+{
+	if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
+		radeonContextPtr radeon;
+		GLcontext *ctx;
+
+		radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+		ctx = radeon->glCtx;
+
+		if (ctx->Visual.doubleBufferMode) {	/* single-buffered visuals have nothing to swap */
+			_mesa_notifySwapBuffers(ctx);	/* flush pending rendering commands */
+			if (radeon->doPageFlip) {
+				radeonPageFlip(dPriv);
+			} else {
+			    radeonCopyBuffer(dPriv, NULL);	/* NULL: no sub-rectangle given */
+			}
+		}
+	} else {
+		/* XXX this shouldn't be an error but we can't handle it for now */
+		_mesa_problem(NULL, "%s: drawable has no context!",
+			      __FUNCTION__);
+	}
+}
+
+void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
+			 int x, int y, int w, int h )
+{
+    if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
+	radeonContextPtr radeon;
+	GLcontext *ctx;
+
+	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+	ctx = radeon->glCtx;
+
+	if (ctx->Visual.doubleBufferMode) {
+	    drm_clip_rect_t rect;
+	    rect.x1 = x + dPriv->x;	/* offset by the drawable's position */
+	    rect.y1 = (dPriv->h - y - h) + dPriv->y;	/* y is flipped within the drawable height */
+	    rect.x2 = rect.x1 + w;
+	    rect.y2 = rect.y1 + h;
+	    _mesa_notifySwapBuffers(ctx);	/* flush pending rendering commands */
+	    radeonCopyBuffer(dPriv, &rect);	/* copy only the computed rectangle */
+	}
+    } else {
+	/* XXX this shouldn't be an error but we can't handle it for now */
+	_mesa_problem(NULL, "%s: drawable has no context!",
+		      __FUNCTION__);
+    }
+}
+
+/* Make driContextPriv the current context and bind driDrawPriv and driReadPriv
+ * as its draw and read buffers; a NULL context releases the current one.
+ */
+GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
+			    __DRIdrawablePrivate * driDrawPriv,
+			    __DRIdrawablePrivate * driReadPriv)
+{
+	if (driContextPriv) {	/* bind; the else branch handles a NULL context */
+		radeonContextPtr radeon =
+			(radeonContextPtr) driContextPriv->driverPrivate;
+
+		if (RADEON_DEBUG & DEBUG_DRI)
+			fprintf(stderr, "%s ctx %p\n", __FUNCTION__,
+				radeon->glCtx);
+
+		if (radeon->dri.drawable != driDrawPriv) {	/* drawable changed: initialise its vblank state */
+			driDrawableInitVBlank(driDrawPriv,
+					      radeon->vblank_flags,
+					      &radeon->vbl_seq);
+		}
+
+		radeon->dri.readable = driReadPriv;
+
+		if (radeon->dri.drawable != driDrawPriv ||
+		    radeon->lastStamp != driDrawPriv->lastStamp) {	/* new drawable or changed stamp */
+			radeon->dri.drawable = driDrawPriv;
+
+			radeonSetCliprects(radeon);
+			r300UpdateViewportOffset(radeon->glCtx);
+		}
+
+		_mesa_make_current(radeon->glCtx,
+				    (GLframebuffer *) driDrawPriv->
+				    driverPrivate,
+				    (GLframebuffer *) driReadPriv->
+				    driverPrivate);
+
+		_mesa_update_state(radeon->glCtx);		/* update Mesa state after the binding change */
+
+		radeonUpdatePageFlipping(radeon);
+	} else {
+		if (RADEON_DEBUG & DEBUG_DRI)
+			fprintf(stderr, "%s ctx is null\n", __FUNCTION__);
+		_mesa_make_current(0, 0, 0);	/* no context: release the current one */
+	}
+
+	if (RADEON_DEBUG & DEBUG_DRI)
+		fprintf(stderr, "End %s\n", __FUNCTION__);
+	return GL_TRUE;	/* always reports success */
+}
+
+/* Unbind the context driContextPriv; this only emits a debug trace.
+ */
+GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv)
+{
+	radeonContextPtr radeon = (radeonContextPtr) driContextPriv->driverPrivate;
+
+	if (RADEON_DEBUG & DEBUG_DRI)	/* the trace is the only work done here */
+		fprintf(stderr, "%s ctx %p\n", __FUNCTION__,
+			radeon->glCtx);
+
+	return GL_TRUE;	/* always reports success */
+}
+
diff --git a/r300/radeon_context.h b/r300/radeon_context.h
new file mode 100644
index 0000000..2f23941
--- /dev/null
+++ b/r300/radeon_context.h
@@ -0,0 +1,246 @@
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Gareth Hughes <gareth@valinux.com>
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *   Kevin E. Martin <martin@valinux.com>
+ *   Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __RADEON_CONTEXT_H__
+#define __RADEON_CONTEXT_H__
+
+#include "mtypes.h"
+#include "radeon_screen.h"
+#include "drm.h"
+#include "dri_util.h"
+#include "colormac.h"
+
+struct radeon_context;
+typedef struct radeon_context radeonContextRec;
+typedef struct radeon_context *radeonContextPtr;
+
+#define TEX_0   0x1
+#define TEX_1   0x2
+#define TEX_2	0x4
+#define TEX_3	0x8
+#define TEX_4	0x10
+#define TEX_5	0x20
+#define TEX_6	0x40
+#define TEX_7	0x80
+#define TEX_ALL 0xff
+
+/* Rasterizing fallbacks */
+/* See corresponding strings in r200_swtcl.c */
+#define RADEON_FALLBACK_TEXTURE		0x0001
+#define RADEON_FALLBACK_DRAW_BUFFER	0x0002
+#define RADEON_FALLBACK_STENCIL		0x0004
+#define RADEON_FALLBACK_RENDER_MODE	0x0008
+#define RADEON_FALLBACK_BLEND_EQ	0x0010
+#define RADEON_FALLBACK_BLEND_FUNC	0x0020
+#define RADEON_FALLBACK_DISABLE		0x0040
+#define RADEON_FALLBACK_BORDER_MODE	0x0080
+
+#if R200_MERGED
+extern void radeonFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
+
+#define FALLBACK( radeon, bit, mode ) do {			\
+   if ( 0 ) fprintf( stderr, "FALLBACK in %s: #%d=%d\n",	\
+		     __FUNCTION__, bit, mode );			\
+   radeonFallback( (radeon)->glCtx, bit, mode );		\
+} while (0)
+#else	/* no radeonFallback() here: only report the call site as file:line */
+#define FALLBACK( radeon, bit, mode ) fprintf(stderr, "%s:%d\n", __FILE__, __LINE__);
+#endif	/* R200_MERGED */
+
+/* TCL fallbacks */
+extern void radeonTclFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
+
+#define RADEON_TCL_FALLBACK_RASTER		0x0001	/* rasterization */
+#define RADEON_TCL_FALLBACK_UNFILLED		0x0002	/* unfilled tris */
+#define RADEON_TCL_FALLBACK_LIGHT_TWOSIDE	0x0004	/* twoside tris */
+#define RADEON_TCL_FALLBACK_MATERIAL		0x0008	/* material in vb */
+#define RADEON_TCL_FALLBACK_TEXGEN_0		0x0010	/* texgen, unit 0 */
+#define RADEON_TCL_FALLBACK_TEXGEN_1		0x0020	/* texgen, unit 1 */
+#define RADEON_TCL_FALLBACK_TEXGEN_2		0x0040	/* texgen, unit 2 */
+#define RADEON_TCL_FALLBACK_TEXGEN_3		0x0080	/* texgen, unit 3 */
+#define RADEON_TCL_FALLBACK_TEXGEN_4		0x0100	/* texgen, unit 4 */
+#define RADEON_TCL_FALLBACK_TEXGEN_5		0x0200	/* texgen, unit 5 */
+#define RADEON_TCL_FALLBACK_TCL_DISABLE		0x0400	/* user disable */
+#define RADEON_TCL_FALLBACK_BITMAP		0x0800	/* draw bitmap with points */
+#define RADEON_TCL_FALLBACK_VERTEX_PROGRAM	0x1000	/* vertex program active */
+
+#if R200_MERGED
+#define TCL_FALLBACK( ctx, bit, mode )	radeonTclFallback( ctx, bit, mode )
+#else	/* not merged with R200: TCL_FALLBACK expands to an empty statement */
+#define TCL_FALLBACK( ctx, bit, mode )	;
+#endif	/* R200_MERGED */
+
+struct radeon_dri_mirror {
+	__DRIcontextPrivate *context;	/* DRI context */
+	__DRIscreenPrivate *screen;	/* DRI screen */
+	/**
+	 * DRI drawable bound to this context for drawing.
+	 */
+	__DRIdrawablePrivate *drawable;
+
+	/**
+	 * DRI drawable bound to this context for reading.
+	 */
+	__DRIdrawablePrivate *readable;
+
+	drm_context_t hwContext;	/* copy of the DRI context's hHWContext */
+	drm_hw_lock_t *hwLock;	/* points at the lock inside the screen's SAREA */
+	int fd;	/* copy of the DRI screen's fd */
+	int drmMinor;	/* copy of the DRI screen's drmMinor */
+};
+
+/**
+ * Derived state for internal purposes.
+ */
+struct radeon_scissor_state {
+	drm_clip_rect_t rect;	/* presumably the scissor rectangle itself -- TODO confirm */
+	GLboolean enabled;
+
+	GLuint numClipRects;	/* Cliprects active */
+	GLuint numAllocedClipRects;	/* Cliprects available */
+	drm_clip_rect_t *pClipRects;	/* heap array, released in radeonCleanupContext() */
+};
+
+struct radeon_colorbuffer_state {
+	GLuint clear;	/* presumably the packed clear colour -- TODO confirm */
+	GLint drawOffset, drawPitch;	/* presumably offset and pitch of the draw buffer -- TODO confirm */
+};
+
+struct radeon_state {
+	struct radeon_colorbuffer_state color;	/* colour-buffer part of the derived state */
+	struct radeon_scissor_state scissor;	/* scissor and cliprect part of the derived state */
+};
+
+/**
+ * Common per-context variables shared by R200 and R300.
+ * R200- and R300-specific code "derive" their own context from this
+ * structure.
+ */
+struct radeon_context {
+	GLcontext *glCtx;	/* Mesa context */
+	radeonScreenPtr radeonScreen;	/* Screen private DRI data */
+
+	/* Fallback state */
+	GLuint Fallback;	/* presumably RADEON_FALLBACK_* bits -- TODO confirm */
+	GLuint TclFallback;	/* RADEON_TCL_FALLBACK_* bits */
+
+	/* Page flipping */
+	GLuint doPageFlip;	/* non-zero: swaps use radeonPageFlip() instead of a copy */
+
+	/* Drawable, cliprect and scissor information */
+	GLuint numClipRects;	/* Cliprects for the draw buffer */
+	drm_clip_rect_t *pClipRects;
+	unsigned int lastStamp;	/* compared with the drawable's lastStamp in radeonMakeCurrent() */
+	GLboolean lost_context;
+	drm_radeon_sarea_t *sarea;	/* Private SAREA data */
+
+	/* Mirrors of some DRI state */
+	struct radeon_dri_mirror dri;
+
+	/* Busy waiting */
+	GLuint do_usleeps;	/* set when fthrottle_mode selects usleeps */
+	GLuint do_irqs;	/* set when fthrottle_mode selects IRQs and the screen has one */
+	GLuint irqsEmitted;
+	drm_radeon_irq_wait_t iw;
+
+	/* VBI / buffer swap */
+	GLuint vbl_seq;
+	GLuint vblank_flags;	/* VBLANK_FLAG_NO_IRQ when the screen has no IRQ */
+
+	int64_t swap_ust;	/* first filled via getUST() in radeonInitContext() */
+	int64_t swap_missed_ust;
+
+	GLuint swap_count;
+	GLuint swap_missed_count;
+
+	/* Derived state */
+	struct radeon_state state;
+
+	/* Configuration cache
+	 */
+	driOptionCache optionCache;
+};
+
+#define RADEON_CONTEXT(glctx) ((radeonContextPtr)((glctx)->DriverCtx))	/* driver context stored in a GLcontext */
+
+extern void radeonSwapBuffers(__DRIdrawablePrivate * dPriv);
+extern void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
+				int x, int y, int w, int h);
+extern GLboolean radeonInitContext(radeonContextPtr radeon,
+				   struct dd_function_table *functions,
+				   const __GLcontextModes * glVisual,
+				   __DRIcontextPrivate * driContextPriv,
+				   void *sharedContextPrivate);
+extern void radeonCleanupContext(radeonContextPtr radeon);
+extern GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
+				   __DRIdrawablePrivate * driDrawPriv,
+				   __DRIdrawablePrivate * driReadPriv);
+extern GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv);
+
+/* ================================================================
+ * Debugging:
+ */
+#define DO_DEBUG		1
+
+#if DO_DEBUG
+extern int RADEON_DEBUG;
+#else
+#define RADEON_DEBUG		0
+#endif
+
+#define DEBUG_TEXTURE	0x0001
+#define DEBUG_STATE	0x0002
+#define DEBUG_IOCTL	0x0004
+#define DEBUG_PRIMS	0x0008
+#define DEBUG_VERTS	0x0010
+#define DEBUG_FALLBACKS	0x0020
+#define DEBUG_VFMT	0x0040
+#define DEBUG_CODEGEN	0x0080
+#define DEBUG_VERBOSE	0x0100
+#define DEBUG_DRI       0x0200
+#define DEBUG_DMA       0x0400
+#define DEBUG_SANITY    0x0800
+#define DEBUG_SYNC      0x1000
+#define DEBUG_PIXEL     0x2000
+#define DEBUG_MEMORY    0x4000
+
+#endif				/* __RADEON_CONTEXT_H__ */
diff --git a/r300/radeon_ioctl.c b/r300/radeon_ioctl.c
new file mode 100644
index 0000000..0b8656b
--- /dev/null
+++ b/r300/radeon_ioctl.c
@@ -0,0 +1,394 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include <sched.h>
+#include <errno.h>
+
+#include "glheader.h"
+#include "imports.h"
+#include "macros.h"
+#include "context.h"
+#include "swrast/swrast.h"
+#include "r300_context.h"
+#include "radeon_ioctl.h"
+#include "r300_ioctl.h"
+#include "r300_state.h"
+#include "radeon_reg.h"
+
+#include "drirenderbuffer.h"
+#include "vblank.h"
+
+static void radeonWaitForIdle(radeonContextPtr radeon);
+
+/* ================================================================
+ * SwapBuffers with client-side throttling
+ */
+
+/* Ask the DRM for RADEON_PARAM_LAST_FRAME; exits the process on failure. */
+static uint32_t radeonGetLastFrame(radeonContextPtr radeon)
+{
+	uint32_t last_frame;
+	drm_radeon_getparam_t query;
+	int err;
+
+	query.param = RADEON_PARAM_LAST_FRAME;
+	query.value = (int *)&last_frame;	/* result pointer handed to the ioctl */
+	err = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_GETPARAM,
+				  &query, sizeof(query));
+	if (err != 0) {
+		/* Any non-zero return is treated as fatal. */
+		fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__, err);
+		exit(1);
+	}
+	return last_frame;
+}
+
+/* Query RADEON_PARAM_LAST_CLEAR from the DRM; exits the process on failure. */
+uint32_t radeonGetAge(radeonContextPtr radeon)
+{
+	uint32_t clear_age;
+	drm_radeon_getparam_t query;
+	int err;
+
+	query.param = RADEON_PARAM_LAST_CLEAR;
+	query.value = (int *)&clear_age;	/* result pointer handed to the ioctl */
+	err = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_GETPARAM,
+				  &query, sizeof(query));
+	if (err != 0) {
+		/* Any non-zero return is treated as fatal. */
+		fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__, err);
+		exit(1);
+	}
+	return clear_age;
+}
+
+/* Emit an IRQ via DRM_RADEON_IRQ_EMIT, pointing it at radeon->iw.irq_seq. */
+static void radeonEmitIrqLocked(radeonContextPtr radeon)
+{
+	drm_radeon_irq_emit_t emit;
+	int err;
+
+	emit.irq_seq = &radeon->iw.irq_seq;
+	err = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_IRQ_EMIT,
+				  &emit, sizeof(emit));
+	if (err != 0) {
+		fprintf(stderr, "%s: drmRadeonIrqEmit: %d\n", __FUNCTION__, err);
+		exit(1);
+	}
+}
+
+/* Block in DRM_RADEON_IRQ_WAIT on radeon->iw, retrying on EINTR/EBUSY. */
+static void radeonWaitIrq(radeonContextPtr radeon)
+{
+	int err;
+
+	while ((err = drmCommandWrite(radeon->dri.fd, DRM_RADEON_IRQ_WAIT,
+				      &radeon->iw, sizeof(radeon->iw))) != 0
+	       && (errno == EINTR || errno == EBUSY))
+		;	/* interrupted or busy: issue the wait again */
+
+	if (err != 0) {
+		fprintf(stderr, "%s: drmRadeonIrqWait: %d\n", __FUNCTION__, err);
+		exit(1);
+	}
+}
+
+static void radeonWaitForFrameCompletion(radeonContextPtr radeon)
+{
+	drm_radeon_sarea_t *sarea = radeon->sarea;
+	/* Wait until radeonGetLastFrame() has reached sarea->last_frame. */
+	if (radeon->do_irqs) {
+		if (radeonGetLastFrame(radeon) < sarea->last_frame) {
+			if (!radeon->irqsEmitted) {
+				while (radeonGetLastFrame(radeon) <
+				       sarea->last_frame) ;	/* no IRQ budget: spin */
+			} else {
+				UNLOCK_HARDWARE(radeon);	/* wait on the IRQ unlocked */
+				radeonWaitIrq(radeon);
+				LOCK_HARDWARE(radeon);
+			}
+			radeon->irqsEmitted = 10;	/* refill the IRQ budget */
+		}
+		/* While the budget lasts, emit one IRQ per call. */
+		if (radeon->irqsEmitted) {
+			radeonEmitIrqLocked(radeon);
+			radeon->irqsEmitted--;
+		}
+	} else {
+		while (radeonGetLastFrame(radeon) < sarea->last_frame) {
+			UNLOCK_HARDWARE(radeon);	/* lock is dropped between polls */
+			if (radeon->do_usleeps)
+				DO_USLEEP(1);
+			LOCK_HARDWARE(radeon);
+		}
+	}
+}
+
+/* Copy the back color buffer to the front color buffer.  A NULL rect is a
+ * full swap (vblank wait, swap statistics); otherwise only the part of
+ * each cliprect that lies inside rect is copied.
+ */
+void radeonCopyBuffer(const __DRIdrawablePrivate * dPriv,
+		      const drm_clip_rect_t	 * rect)
+{
+	radeonContextPtr radeon;
+	GLint nbox, i, ret;
+	GLboolean missed_target = GL_FALSE;
+	int64_t ust;
+
+	assert(dPriv);
+	assert(dPriv->driContextPriv);
+	assert(dPriv->driContextPriv->driverPrivate);
+
+	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+
+	if (RADEON_DEBUG & DEBUG_IOCTL) {
+		fprintf(stderr, "\n%s( %p )\n\n", __FUNCTION__,
+			(void *)radeon->glCtx);
+	}
+
+	r300Flush(radeon->glCtx);
+
+	LOCK_HARDWARE(radeon);
+
+	/* Throttle the frame rate -- only allow one pending swap buffers
+	 * request at a time.
+	 */
+	radeonWaitForFrameCompletion(radeon);
+	if (!rect) {
+		UNLOCK_HARDWARE(radeon);	/* wait for vblank unlocked */
+		driWaitForVBlank(dPriv, &radeon->vbl_seq, radeon->vblank_flags,
+				 &missed_target);
+		LOCK_HARDWARE(radeon);
+	}
+
+	nbox = dPriv->numClipRects;	/* must be in locked region */
+
+	for (i = 0; i < nbox;) {
+		GLint nr = MIN2(i + RADEON_NR_SAREA_CLIPRECTS, nbox);
+		drm_clip_rect_t *box = dPriv->pClipRects;
+		drm_clip_rect_t *b = radeon->sarea->boxes;
+		GLint n = 0;
+
+		for (; i < nr; i++) {
+			*b = box[i];
+
+			if (rect) {
+				if (rect->x1 > b->x1)
+					b->x1 = rect->x1;
+				if (rect->y1 > b->y1)
+					b->y1 = rect->y1;
+				if (rect->x2 < b->x2)
+					b->x2 = rect->x2;
+				if (rect->y2 < b->y2)
+					b->y2 = rect->y2;
+
+				/* Empty intersection: reuse the slot and do
+				 * not count it, so nbox == valid boxes. */
+				if (b->x1 >= b->x2 || b->y1 >= b->y2)
+					continue;
+			}
+
+			b++;
+			n++;
+		}
+		radeon->sarea->nbox = n;
+
+		if (!n)
+			continue;	/* nothing left to copy in this batch */
+
+		ret = drmCommandNone(radeon->dri.fd, DRM_RADEON_SWAP);
+		if (ret) {
+			fprintf(stderr, "DRM_RADEON_SWAP: return = %d\n", ret);
+			UNLOCK_HARDWARE(radeon);
+			exit(1);
+		}
+	}
+
+	UNLOCK_HARDWARE(radeon);
+	if (!rect) {
+		((r300ContextPtr)radeon)->hw.all_dirty = GL_TRUE;
+
+		radeon->swap_count++;
+		(*dri_interface->getUST) (&ust);
+		if (missed_target) {
+			radeon->swap_missed_count++;
+			radeon->swap_missed_ust = ust - radeon->swap_ust;
+		}
+
+		radeon->swap_ust = ust;
+
+		sched_yield();
+	}
+}
+
+void radeonPageFlip(const __DRIdrawablePrivate * dPriv)
+{
+	radeonContextPtr radeon;
+	GLint ret;
+	GLboolean missed_target;
+	/* Swap by page flip (DRM_RADEON_FLIP), then retarget the draw buffer. */
+	assert(dPriv);
+	assert(dPriv->driContextPriv);
+	assert(dPriv->driContextPriv->driverPrivate);
+
+	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+
+	if (RADEON_DEBUG & DEBUG_IOCTL) {
+		fprintf(stderr, "%s: pfCurrentPage: %d\n", __FUNCTION__,
+			radeon->sarea->pfCurrentPage);
+	}
+
+	r300Flush(radeon->glCtx);
+	LOCK_HARDWARE(radeon);
+
+	if (!dPriv->numClipRects) {
+		UNLOCK_HARDWARE(radeon);
+		usleep(10000);	/* throttle invisible client 10ms */
+		return;
+	}
+
+	/* Need to do this for the perf box placement:
+	 */
+	{
+		drm_clip_rect_t *box = dPriv->pClipRects;
+		drm_clip_rect_t *b = radeon->sarea->boxes;
+		b[0] = box[0];
+		radeon->sarea->nbox = 1;
+	}
+
+	/* Throttle the frame rate -- only allow a few pending swap buffers
+	 * request at a time.
+	 */
+	radeonWaitForFrameCompletion(radeon);
+	UNLOCK_HARDWARE(radeon);	/* the vblank wait runs unlocked */
+	driWaitForVBlank(dPriv, &radeon->vbl_seq, radeon->vblank_flags,
+			 &missed_target);
+	if (missed_target) {
+		radeon->swap_missed_count++;
+		(void)(*dri_interface->getUST) (&radeon->swap_missed_ust);
+	}
+	LOCK_HARDWARE(radeon);
+
+	ret = drmCommandNone(radeon->dri.fd, DRM_RADEON_FLIP);
+
+	UNLOCK_HARDWARE(radeon);
+
+	if (ret) {
+		fprintf(stderr, "DRM_RADEON_FLIP: return = %d\n", ret);
+		exit(1);
+	}
+
+	radeon->swap_count++;
+	(void)(*dri_interface->getUST) (&radeon->swap_ust);
+	/* Hand the sarea's current page number to the renderbuffers. */
+        driFlipRenderbuffers(radeon->glCtx->WinSysDrawBuffer, 
+                             radeon->sarea->pfCurrentPage);
+	/* Page 1 current: draw to the front offsets; otherwise to the back. */
+	if (radeon->sarea->pfCurrentPage == 1) {
+		radeon->state.color.drawOffset = radeon->radeonScreen->frontOffset;
+		radeon->state.color.drawPitch = radeon->radeonScreen->frontPitch;
+	} else {
+		radeon->state.color.drawOffset = radeon->radeonScreen->backOffset;
+		radeon->state.color.drawPitch = radeon->radeonScreen->backPitch;
+	}
+	/* Mirror the new offset/pitch into the R300 "cb" state atom. */
+	if (IS_R300_CLASS(radeon->radeonScreen)) {
+		r300ContextPtr r300 = (r300ContextPtr)radeon;
+		R300_STATECHANGE(r300, cb);
+		r300->hw.cb.cmd[R300_CB_OFFSET] = r300->radeon.state.color.drawOffset + 
+						r300->radeon.radeonScreen->fbLocation;
+		r300->hw.cb.cmd[R300_CB_PITCH] = r300->radeon.state.color.drawPitch;
+		
+		if (r300->radeon.radeonScreen->cpp == 4)
+			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_ARGB8888;
+		else
+			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_RGB565;
+	
+		if (r300->radeon.sarea->tiling_enabled)
+			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
+	}
+}
+
+/* Poll DRM_RADEON_CP_IDLE up to 100 times; a negative result is fatal. */
+void radeonWaitForIdleLocked(radeonContextPtr radeon)
+{
+	int attempt, status = 0;
+
+	for (attempt = 0; attempt < 100; attempt++) {
+		status = drmCommandNone(radeon->dri.fd, DRM_RADEON_CP_IDLE);
+		if (status == 0)
+			break;
+		DO_USLEEP(1);	/* not idle yet: pause before retrying */
+	}
+	if (status < 0) {
+		UNLOCK_HARDWARE(radeon);
+		fprintf(stderr, "Error: R300 timed out... exiting\n");
+		exit(-1);
+	}
+}
+
+static void radeonWaitForIdle(radeonContextPtr radeon)
+{
+	LOCK_HARDWARE(radeon);	/* the ...Locked variant runs with the lock held */
+	radeonWaitForIdleLocked(radeon);
+	UNLOCK_HARDWARE(radeon);
+}
+
+void radeonFlush(GLcontext * ctx)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	/* Flushing is delegated to r300Flush(); non-R300 screens do nothing. */
+	if (IS_R300_CLASS(radeon->radeonScreen))
+		r300Flush(ctx);
+}
+
+
+/* Flush everything queued for the hardware, then block until it has been
+ * processed: via an emitted IRQ when do_irqs is set, else by idle polling.
+ */
+void radeonFinish(GLcontext * ctx)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+
+	radeonFlush(ctx);
+	if (!rmesa->do_irqs) {
+		radeonWaitForIdle(rmesa);
+		return;
+	}
+	LOCK_HARDWARE(rmesa);
+	radeonEmitIrqLocked(rmesa);
+	UNLOCK_HARDWARE(rmesa);	/* the IRQ wait itself runs unlocked */
+	radeonWaitIrq(rmesa);
+}
diff --git a/r300/radeon_ioctl.h b/r300/radeon_ioctl.h
new file mode 100644
index 0000000..3a80d36
--- /dev/null
+++ b/r300/radeon_ioctl.h
@@ -0,0 +1,57 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef __RADEON_IOCTL_H__
+#define __RADEON_IOCTL_H__
+
+#include "simple_list.h"
+#include "radeon_dri.h"
+#include "radeon_lock.h"
+
+#include "xf86drm.h"
+#include "drm.h"
+#if 0
+#include "r200_context.h"
+#endif
+#include "radeon_drm.h"
+
+extern void radeonCopyBuffer(const __DRIdrawablePrivate * drawable,
+			     const drm_clip_rect_t	* rect);
+extern void radeonPageFlip(const __DRIdrawablePrivate * drawable);
+extern void radeonFlush(GLcontext * ctx);
+extern void radeonFinish(GLcontext * ctx);
+extern void radeonWaitForIdleLocked(radeonContextPtr radeon);
+extern uint32_t radeonGetAge(radeonContextPtr radeon);
+
+#endif				/* __RADEON_IOCTL_H__ */
diff --git a/r300/radeon_lock.c b/r300/radeon_lock.c
new file mode 100644
index 0000000..bc3c2d6
--- /dev/null
+++ b/r300/radeon_lock.c
@@ -0,0 +1,137 @@
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Gareth Hughes <gareth@valinux.com>
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *   Kevin E. Martin <martin@valinux.com>
+ */
+
+#include "radeon_lock.h"
+#include "radeon_ioctl.h"
+#include "radeon_state.h"
+#include "r300_context.h"
+#include "r300_state.h"
+
+#include "framebuffer.h"
+
+#include "drirenderbuffer.h"
+
+#if DEBUG_LOCKING
+char *prevLockFile = NULL;	/* __FILE__ recorded by DEBUG_LOCK(), else NULL */
+int prevLockLine = 0;		/* __LINE__ recorded by DEBUG_LOCK(), else 0 */
+#endif
+
+/* Turn on/off page flipping according to the flags in the sarea, then pick
+ * the draw offset/pitch: front and back swap roles while page 1 is current.
+ */
+void radeonUpdatePageFlipping(radeonContextPtr rmesa)
+{
+	GLcontext *gl = rmesa->glCtx;
+	int draws_to_back = 1;	/* used when there is no DrawBuffer */
+	int page_is_flipped;
+
+	rmesa->doPageFlip = rmesa->sarea->pfState;
+	if (gl->WinSysDrawBuffer) {
+		driFlipRenderbuffers(gl->WinSysDrawBuffer,
+				     rmesa->sarea->pfCurrentPage);
+		r300UpdateDrawBuffer(gl);
+	}
+
+	if (gl->DrawBuffer)
+		draws_to_back = (gl->DrawBuffer->_ColorDrawBufferMask[0] ==
+				 BUFFER_BIT_BACK_LEFT);
+	page_is_flipped = (rmesa->sarea->pfCurrentPage == 1);
+
+	if (draws_to_back != page_is_flipped) {
+		rmesa->state.color.drawOffset = rmesa->radeonScreen->backOffset;
+		rmesa->state.color.drawPitch = rmesa->radeonScreen->backPitch;
+	} else {
+		rmesa->state.color.drawOffset = rmesa->radeonScreen->frontOffset;
+		rmesa->state.color.drawPitch = rmesa->radeonScreen->frontPitch;
+	}
+}
+
+/* Update the hardware state.  This is called if another context has
+ * grabbed the hardware lock, which includes the X server.  This
+ * function also updates the driver's window state after the X server
+ * moves, resizes or restacks a window -- the change will be reflected
+ * in the drawable position and clip rects.  Since the X server grabs
+ * the hardware lock when it changes the window state, this routine will
+ * automatically be called after such a change.
+ */
+void radeonGetLock(radeonContextPtr rmesa, GLuint flags)
+{
+	__DRIdrawablePrivate *const drawable = rmesa->dri.drawable;
+	__DRIdrawablePrivate *const readable = rmesa->dri.readable;
+	__DRIscreenPrivate *sPriv = rmesa->dri.screen;
+	drm_radeon_sarea_t *sarea = rmesa->sarea;
+	r300ContextPtr r300 = (r300ContextPtr) rmesa;
+
+	assert(drawable != NULL);
+
+	drmGetLock(rmesa->dri.fd, rmesa->dri.hwContext, flags);
+
+	/* The window might have moved, so we might need to get new clip
+	 * rects.
+	 *
+	 * NOTE: This releases and regrabs the hw lock to allow the X server
+	 * to respond to the DRI protocol request for new drawable info.
+	 * Since the hardware state depends on having the latest drawable
+	 * clip rects, all state checking must be done _after_ this call.
+	 */
+	DRI_VALIDATE_DRAWABLE_INFO(sPriv, drawable);
+	if (drawable != readable) {
+		DRI_VALIDATE_DRAWABLE_INFO(sPriv, readable);
+	}
+	/* Drawable stamp differs from ours: refresh state derived from it. */
+	if (rmesa->lastStamp != drawable->lastStamp) {
+		radeonUpdatePageFlipping(rmesa);
+		radeonSetCliprects(rmesa);
+		r300UpdateViewportOffset(rmesa->glCtx);
+		driUpdateFramebufferSize(rmesa->glCtx, drawable);
+	}
+	/* Another context owned the hardware: claim it and age our heaps. */
+	if (sarea->ctx_owner != rmesa->dri.hwContext) {
+		int i;
+
+		sarea->ctx_owner = rmesa->dri.hwContext;
+		for (i = 0; i < r300->nr_heaps; i++) {
+			DRI_AGE_TEXTURES(r300->texture_heaps[i]);
+		}
+	}
+	/* Set unconditionally on every pass through this function. */
+	rmesa->lost_context = GL_TRUE;
+}
diff --git a/r300/radeon_lock.h b/r300/radeon_lock.h
new file mode 100644
index 0000000..c47adc9
--- /dev/null
+++ b/r300/radeon_lock.h
@@ -0,0 +1,118 @@
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Gareth Hughes <gareth@valinux.com>
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *   Kevin E. Martin <martin@valinux.com>
+ */
+
+#ifndef __RADEON_LOCK_H__
+#define __RADEON_LOCK_H__
+
+#if 0
+#include "r200_ioctl.h"
+#endif
+#include "radeon_context.h"
+
+extern void radeonGetLock(radeonContextPtr rmesa, GLuint flags);
+extern void radeonUpdatePageFlipping(radeonContextPtr rmesa);
+
+/* Turn DEBUG_LOCKING on to find locking conflicts: DEBUG_CHECK_LOCK() then
+ * prints the recorded and the current call site and exits. */
+#define DEBUG_LOCKING	0
+
+#if DEBUG_LOCKING
+extern char *prevLockFile;
+extern int prevLockLine;
+/* Record the call site of the lock being taken. */
+#define DEBUG_LOCK()							\
+   do {									\
+      prevLockFile = (__FILE__);					\
+      prevLockLine = (__LINE__);					\
+   } while (0)
+/* Clear the recorded call site. */
+#define DEBUG_RESET()							\
+   do {									\
+      prevLockFile = 0;							\
+      prevLockLine = 0;							\
+   } while (0)
+/* Report and exit(1) if a call site is still recorded. */
+#define DEBUG_CHECK_LOCK()						\
+   do {									\
+      if (prevLockFile) {						\
+	 fprintf(stderr,						\
+		  "LOCK SET!\n\tPrevious %s:%d\n\tCurrent: %s:%d\n",	\
+		  prevLockFile, prevLockLine, __FILE__, __LINE__);	\
+	 exit(1);							\
+      }									\
+   } while (0)
+
+#else
+/* DEBUG_LOCKING off: the hooks expand to nothing. */
+#define DEBUG_LOCK()
+#define DEBUG_RESET()
+#define DEBUG_CHECK_LOCK()
+
+#endif
+
+/*
+ * !!! We may want to separate plain locks from locks with validation.
+ * This could be used to improve performance for those commands that
+ * do not do any drawing !!!
+ */
+
+/* Lock the hardware and validate our state: try DRM_CAS first and call
+ * radeonGetLock() only when it leaves __ret non-zero. */
+#define LOCK_HARDWARE( rmesa )						\
+	do {								\
+		char __ret = 0;						\
+		DEBUG_CHECK_LOCK();					\
+		DRM_CAS((rmesa)->dri.hwLock, (rmesa)->dri.hwContext,	\
+			(DRM_LOCK_HELD | (rmesa)->dri.hwContext), __ret); \
+		if (__ret)						\
+			radeonGetLock((rmesa), 0);			\
+		DEBUG_LOCK();						\
+	} while (0)
+/* Release the hardware lock and clear the DEBUG_LOCKING bookkeeping. */
+#define UNLOCK_HARDWARE( rmesa )					\
+	do {								\
+		DRM_UNLOCK((rmesa)->dri.fd,				\
+			(rmesa)->dri.hwLock,				\
+			(rmesa)->dri.hwContext);			\
+		DEBUG_RESET();						\
+	} while (0)
+
+#endif				/* __RADEON_LOCK_H__ */
diff --git a/r300/radeon_span.c b/r300/radeon_span.c
new file mode 100644
index 0000000..eae09d6
--- /dev/null
+++ b/r300/radeon_span.c
@@ -0,0 +1,321 @@
+/**************************************************************************
+
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Kevin E. Martin <martin@valinux.com>
+ *   Gareth Hughes <gareth@valinux.com>
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *
+ */
+
+#include "glheader.h"
+#include "swrast/swrast.h"
+
+#include "r300_state.h"
+#include "radeon_ioctl.h"
+#include "r300_ioctl.h"
+#include "radeon_span.h"
+
+#include "drirenderbuffer.h"
+
+#define DBG 0
+
+/*
+ * Note that all information needed to access pixels in a renderbuffer
+ * should be obtained through the gl_renderbuffer parameter, not per-context
+ * information.
+ */
+#define LOCAL_VARS						\
+   driRenderbuffer *drb = (driRenderbuffer *) rb;		\
+   const __DRIdrawablePrivate *dPriv = drb->dPriv;		\
+   const GLuint bottom = dPriv->h - 1;				\
+   GLubyte *buf = (GLubyte *) drb->flippedData			\
+      + (dPriv->y * drb->flippedPitch + dPriv->x) * drb->cpp;	\
+   GLuint p;							\
+   (void) p;
+
+#define LOCAL_DEPTH_VARS				\
+   driRenderbuffer *drb = (driRenderbuffer *) rb;	\
+   const __DRIdrawablePrivate *dPriv = drb->dPriv;	\
+   const GLuint bottom = dPriv->h - 1;			\
+   GLuint xo = dPriv->x;				\
+   GLuint yo = dPriv->y;				\
+   GLubyte *buf = (GLubyte *) drb->Base.Data;
+
+#define LOCAL_STENCIL_VARS LOCAL_DEPTH_VARS
+/* Mirror a row index about the drawable's last row (bottom = h - 1). */
+#define Y_FLIP(Y) (bottom - (Y))
+/* Per-span locking is empty here; see radeonSpanRenderStart/Finish. */
+#define HW_LOCK()
+
+#define HW_UNLOCK()
+
+/* ================================================================
+ * Color buffer
+ */
+
+/* 16 bit, RGB565 color spanline and pixel functions
+ */
+#define SPANTMP_PIXEL_FMT GL_RGB
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
+/* GET_PTR: address of pixel (X,Y), 2 bytes per pixel. */
+#define TAG(x)    radeon##x##_RGB565
+#define TAG2(x,y) radeon##x##_RGB565##y
+#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 2)
+#include "spantmp2.h"
+
+/* 32 bit, ARGB8888 color spanline and pixel functions
+ */
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
+/* GET_PTR: address of pixel (X,Y), 4 bytes per pixel. */
+#define TAG(x)    radeon##x##_ARGB8888
+#define TAG2(x,y) radeon##x##_ARGB8888##y
+#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 4)
+#include "spantmp2.h"
+
+/* ================================================================
+ * Depth buffer
+ */
+
+/* The Radeon family has depth tiling on all the time, so we have to convert
+ * the x,y coordinates into the memory bus address (mba) in the same
+ * manner as the engine.  In each case, the linear block address (ba)
+ * is calculated, and then wired with x and y to produce the final
+ * memory address.
+ * The chip will do address translation on its own if the surface registers
+ * are set up correctly. It is not quite enough to get it working with hyperz
+ * too...
+ */
+
+static GLuint radeon_mba_z32(const driRenderbuffer * drb, GLint x, GLint y)
+{
+	GLuint pitch = drb->pitch;
+	if (drb->depthHasSurface) {
+		return 4 * (x + y * pitch);	/* linear, 4 bytes per pixel */
+	} else {
+		GLuint ba, address = 0;	/* a[0..1] = 0           */
+		/* ba: linear block address, 8x8 blocks on R300, else 16x16. */
+#ifdef COMPILE_R300
+		ba = (y / 8) * (pitch / 8) + (x / 8);
+#else
+		ba = (y / 16) * (pitch / 16) + (x / 16);
+#endif
+		/* Wire x, y and ba into the byte offset, bit by bit. */
+		address |= (x & 0x7) << 2;	/* a[2..4] = x[0..2]     */
+		address |= (y & 0x3) << 5;	/* a[5..6] = y[0..1]     */
+		address |= (((x & 0x10) >> 2) ^ (y & 0x4)) << 5;	/* a[7]    = x[4] ^ y[2] */
+		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
+
+		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
+		address |= (((x & 0x8) << 1) ^ (y & 0x10)) << 7;	/* a[11]   = x[3] ^ y[4] */
+		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
+
+		return address;
+	}
+}
+
+static INLINE GLuint
+radeon_mba_z16(const driRenderbuffer * drb, GLint x, GLint y)
+{
+	GLuint pitch = drb->pitch;
+	if (drb->depthHasSurface) {
+		return 2 * (x + y * pitch);	/* linear, 2 bytes per pixel */
+	} else {
+		GLuint ba, address = 0;	/* a[0]    = 0           */
+		/* ba: linear block address for blocks 32 wide and 16 high. */
+		ba = (y / 16) * (pitch / 32) + (x / 32);
+		/* Wire x, y and ba into the byte offset, bit by bit. */
+		address |= (x & 0x7) << 1;	/* a[1..3] = x[0..2]     */
+		address |= (y & 0x7) << 4;	/* a[4..6] = y[0..2]     */
+		address |= (x & 0x8) << 4;	/* a[7]    = x[3]        */
+		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
+		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
+		address |= ((x & 0x10) ^ (y & 0x10)) << 7;	/* a[11]   = x[4] ^ y[4] */
+		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
+
+		return address;
+	}
+}
+
+/* 16-bit depth buffer functions: one GLushort per pixel, addressed by
+ * radeon_mba_z16() after adding the drawable origin (xo, yo). */
+#define WRITE_DEPTH( _x, _y, d )					\
+   *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo )) = d;
+/* NOTE: both macros already end in ';'. */
+#define READ_DEPTH( d, _x, _y )						\
+   d = *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo ));
+
+#define TAG(x) radeon##x##_z16
+#include "depthtmp.h"
+
+/* 24 bit depth, 8 bit stencil depthbuffer functions.  Depth writes are
+ * read-modify-write so the stencil byte sharing the dword is preserved.
+ * Careful: It looks like the R300 uses ZZZS byte order while the R200
+ * uses SZZZ for 24 bit depth, 8 bit stencil mode.
+ */
+#ifdef COMPILE_R300
+#define WRITE_DEPTH( _x, _y, d )					\
+do {									\
+   GLuint offset = radeon_mba_z32( drb, (_x) + xo, (_y) + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   tmp &= 0x000000ff;							\
+   tmp |= (((d) << 8) & 0xffffff00);					\
+   *(GLuint *)(buf + offset) = tmp;					\
+} while (0)
+#else
+#define WRITE_DEPTH( _x, _y, d )					\
+do {									\
+   GLuint offset = radeon_mba_z32( drb, (_x) + xo, (_y) + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   tmp &= 0xff000000;							\
+   tmp |= ((d) & 0x00ffffff);						\
+   *(GLuint *)(buf + offset) = tmp;					\
+} while (0)
+#endif
+
+#ifdef COMPILE_R300
+#define READ_DEPTH( d, _x, _y )						\
+  do { \
+    d = (*(GLuint *)(buf + radeon_mba_z32( drb, (_x) + xo,		\
+					 (_y) + yo )) & 0xffffff00) >> 8; \
+  }while(0)
+#else
+#define READ_DEPTH( d, _x, _y )						\
+   d = *(GLuint *)(buf + radeon_mba_z32( drb, (_x) + xo,		\
+					 (_y) + yo )) & 0x00ffffff;
+#endif
+
+#define TAG(x) radeon##x##_z24_s8
+#include "depthtmp.h"
+
+/* ================================================================
+ * Stencil buffer
+ */
+
+/* 24 bit depth, 8 bit stencil depthbuffer functions: stencil is the low
+ * byte of each dword with COMPILE_R300 and the high byte otherwise. */
+#ifdef COMPILE_R300
+#define WRITE_STENCIL( _x, _y, d )					\
+do {									\
+   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   tmp &= 0xffffff00;							\
+   tmp |= (d) & 0xff;							\
+   *(GLuint *)(buf + offset) = tmp;					\
+} while (0)
+#else
+#define WRITE_STENCIL( _x, _y, d )					\
+do {									\
+   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   tmp &= 0x00ffffff;							\
+   tmp |= (((d) & 0xff) << 24);						\
+   *(GLuint *)(buf + offset) = tmp;					\
+} while (0)
+#endif
+
+#ifdef COMPILE_R300
+#define READ_STENCIL( d, _x, _y )					\
+do {									\
+   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   d = tmp & 0x000000ff;						\
+} while (0)
+#else
+#define READ_STENCIL( d, _x, _y )					\
+do {									\
+   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint tmp = *(GLuint *)(buf + offset);				\
+   d = (tmp & 0xff000000) >> 24;					\
+} while (0)
+#endif
+
+#define TAG(x) radeon##x##_z24_s8
+#include "stenciltmp.h"
+
+/* Move locking out to get reasonable span performance (10x better
+ * than doing this in HW_LOCK above).  WaitForIdle() is the main
+ * culprit.
+ */
+
+static void radeonSpanRenderStart(GLcontext * ctx)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+#ifdef COMPILE_R300
+	r300ContextPtr r300 = (r300ContextPtr) rmesa;
+	R300_FIREVERTICES(r300);	/* presumably submits queued vertices -- confirm */
+#else
+	RADEON_FIREVERTICES(rmesa);
+#endif
+	LOCK_HARDWARE(rmesa);	/* released in radeonSpanRenderFinish() */
+	radeonWaitForIdleLocked(rmesa);	/* idle wait before the span code runs */
+}
+
+static void radeonSpanRenderFinish(GLcontext * ctx)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	_swrast_flush(ctx);	/* flush swrast while the lock is still held */
+	UNLOCK_HARDWARE(rmesa);	/* pairs with LOCK_HARDWARE in radeonSpanRenderStart() */
+}
+
+void radeonInitSpanFuncs(GLcontext * ctx)
+{
+	/* Install the span start/finish hooks in swrast's driver table. */
+	struct swrast_device_driver *hooks = _swrast_GetDeviceDriverReference(ctx);
+	hooks->SpanRenderFinish = radeonSpanRenderFinish;
+	hooks->SpanRenderStart = radeonSpanRenderStart;
+}
+
+/** Plug in the Get/Put routines for the given driRenderbuffer. */
+void radeonSetSpanFunctions(driRenderbuffer * drb, const GLvisual * vis)
+{
+	switch (drb->Base.InternalFormat) {
+	case GL_RGBA:
+		if (vis->redBits == 5 && vis->greenBits == 6 && vis->blueBits == 5)
+			radeonInitPointers_RGB565(&drb->Base);
+		else
+			radeonInitPointers_ARGB8888(&drb->Base);
+		break;
+	case GL_DEPTH_COMPONENT16:
+		radeonInitDepthPointers_z16(&drb->Base);
+		break;
+	case GL_DEPTH_COMPONENT24:
+		radeonInitDepthPointers_z24_s8(&drb->Base);
+		break;
+	case GL_STENCIL_INDEX8_EXT:
+		radeonInitStencilPointers_z24_s8(&drb->Base);
+	}
+}
diff --git a/r300/radeon_state.c b/r300/radeon_state.c
new file mode 100644
index 0000000..82bfd95
--- /dev/null
+++ b/r300/radeon_state.c
@@ -0,0 +1,243 @@
+/**************************************************************************
+
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "imports.h"
+#include "api_arrayelt.h"
+#include "enums.h"
+#include "colormac.h"
+#include "light.h"
+
+#include "swrast/swrast.h"
+#include "vbo/vbo.h"
+#include "tnl/tnl.h"
+#include "tnl/t_pipeline.h"
+#include "swrast_setup/swrast_setup.h"
+
+#include "radeon_ioctl.h"
+#include "radeon_state.h"
+#include "r300_ioctl.h"
+#include "framebuffer.h"
+
+/* =============================================================
+ * Scissoring
+ */
+
+/* Clip rectangle a against b and store the overlap in *out.  Returns GL_TRUE
+ * only for a non-empty overlap (x1 < x2 and y1 < y2); on GL_FALSE *out holds
+ * the clamped but degenerate coordinates.
+ */
+static GLboolean intersect_rect(drm_clip_rect_t * out,
+				drm_clip_rect_t * a, drm_clip_rect_t * b)
+{
+	/* Seed with a, then pull in each edge where b is tighter. */
+	*out = *a;
+	out->x1 = (b->x1 > out->x1) ? b->x1 : out->x1;
+	out->y1 = (b->y1 > out->y1) ? b->y1 : out->y1;
+	out->x2 = (b->x2 < out->x2) ? b->x2 : out->x2;
+	out->y2 = (b->y2 < out->y2) ? b->y2 : out->y2;
+
+	/* x2/y2 act as exclusive bounds here: touching edges count as empty. */
+	return (out->x1 < out->x2 && out->y1 < out->y2) ? GL_TRUE : GL_FALSE;
+}
+
+
+/**
+ * Rebuild the scissored cliprect list (each cliprect clipped to scissor.rect,
+ * empties dropped).  If the store cannot be regrown the list is left empty.
+ */
+void radeonRecalcScissorRects(radeonContextPtr radeon)
+{
+	drm_clip_rect_t *out;
+	int i;
+
+	/* Clear the count first: if the regrow below fails, the old array is
+	 * already freed and a stale count would send users through NULL. */
+	radeon->state.scissor.numClipRects = 0;
+	/* Grow cliprect store? */
+	if (radeon->state.scissor.numAllocedClipRects < radeon->numClipRects) {
+		while (radeon->state.scissor.numAllocedClipRects <
+		       radeon->numClipRects) {
+			radeon->state.scissor.numAllocedClipRects += 1;	/* zero case */
+			radeon->state.scissor.numAllocedClipRects *= 2;
+		}
+		if (radeon->state.scissor.pClipRects)
+			FREE(radeon->state.scissor.pClipRects);
+		radeon->state.scissor.pClipRects =
+		    MALLOC(radeon->state.scissor.numAllocedClipRects *
+			   sizeof(drm_clip_rect_t));
+		if (radeon->state.scissor.pClipRects == NULL) {
+			radeon->state.scissor.numAllocedClipRects = 0;
+			return;
+		}
+	}
+
+	out = radeon->state.scissor.pClipRects;
+	for (i = 0; i < radeon->numClipRects; i++) {
+		if (intersect_rect(out, &radeon->pClipRects[i],
+				   &radeon->state.scissor.rect)) {
+			radeon->state.scissor.numClipRects++;
+			out++;
+		}
+	}
+}
+
+/* Rebuild scissor.rect from ctx->Scissor: Y flipped, offset by drawable x/y.
+ * NOTE(review): the -1 makes x2/y2 inclusive; intersect_rect assumes not? */
+void radeonUpdateScissor(GLcontext* ctx)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	__DRIdrawablePrivate *dPriv = radeon->dri.drawable;
+	int left, top;
+	if (!dPriv)
+		return;
+	left = dPriv->x + ctx->Scissor.X;
+	top = dPriv->y + dPriv->h - (ctx->Scissor.Y + ctx->Scissor.Height);
+	radeon->state.scissor.rect.x1 = left;
+	radeon->state.scissor.rect.y1 = top;
+	radeon->state.scissor.rect.x2 = left + ctx->Scissor.Width - 1;
+	radeon->state.scissor.rect.y2 = top + ctx->Scissor.Height - 1;
+	radeonRecalcScissorRects(radeon);
+}
+
+static void radeonScissor(GLcontext* ctx, GLint x, GLint y, GLsizei w, GLsizei h)
+{
+	/* x/y/w/h are unused: radeonUpdateScissor re-reads ctx->Scissor. */
+	if (!ctx->Scissor.Enabled)
+		return;
+	r300Flush(ctx);		/* We don't pipeline cliprect changes */
+	radeonUpdateScissor(ctx);
+}
+
+
+/**
+ * Update cliprects and scissors.
+ *
+ * Selects the cliprect list for the current draw buffer, resizes the draw
+ * and read framebuffers to their drawables when the sizes differ, re-clips
+ * the scissor rects if scissoring is enabled, and records the drawable stamp.
+ */
+void radeonSetCliprects(radeonContextPtr radeon)
+{
+	__DRIdrawablePrivate *const drawable = radeon->dri.drawable;
+	__DRIdrawablePrivate *const readable = radeon->dri.readable;
+	GLframebuffer *const draw_fb = (GLframebuffer*)drawable->driverPrivate;
+	GLframebuffer *const read_fb = (GLframebuffer*)readable->driverPrivate;
+
+	/* Back cliprects apply only when drawing to the back-left buffer
+	 * alone, the drawable has some, and neither doPageFlip nor sarea
+	 * page 1 is set (can't ignore 2d windows if we are page flipping).
+	 * Front buffer, no buffer, or multiple buffers use the front list.
+	 */
+	if (draw_fb->_ColorDrawBufferMask[0] == BUFFER_BIT_BACK_LEFT &&
+	    drawable->numBackClipRects != 0 && !radeon->doPageFlip &&
+	    radeon->sarea->pfCurrentPage != 1) {
+		radeon->numClipRects = drawable->numBackClipRects;
+		radeon->pClipRects = drawable->pBackClipRects;
+	} else {
+		radeon->numClipRects = drawable->numClipRects;
+		radeon->pClipRects = drawable->pClipRects;
+	}
+
+	if (draw_fb->Width != drawable->w || draw_fb->Height != drawable->h) {
+		_mesa_resize_framebuffer(radeon->glCtx, draw_fb,
+					 drawable->w, drawable->h);
+		draw_fb->Initialized = GL_TRUE;
+	}
+
+	if (drawable != readable &&
+	    (read_fb->Width != readable->w || read_fb->Height != readable->h)) {
+		_mesa_resize_framebuffer(radeon->glCtx, read_fb,
+					 readable->w, readable->h);
+		read_fb->Initialized = GL_TRUE;
+	}
+
+	if (radeon->state.scissor.enabled)
+		radeonRecalcScissorRects(radeon);
+
+	radeon->lastStamp = drawable->lastStamp;
+}
+
+
+/**
+ * Handle common enable bits.
+ * Called as a fallback by r200Enable/r300Enable.
+ *
+ * Only GL_SCISSOR_TEST is acted on (flush, record the flag, refresh the
+ * scissor rects); any other cap returns without further action.
+ */
+void radeonEnable(GLcontext* ctx, GLenum cap, GLboolean state)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+
+	if (cap != GL_SCISSOR_TEST)
+		return;
+
+	/* We don't pipeline cliprect & scissor changes */
+	r300Flush(ctx);
+
+	/* Store the flag, then refresh the scissor rect and clipped list. */
+	radeon->state.scissor.enabled = state;
+	radeonUpdateScissor(ctx);
+}
+
+
+/**
+ * Initialize default state: clear Fallback and aim color drawing at the back
+ * buffer (double-buffered visual, sarea page 0) or else the front buffer.
+ * Called once at context init time from r200InitState/r300InitState.
+ */
+void radeonInitState(radeonContextPtr radeon)
+{
+	radeon->Fallback = 0;
+
+	if (!radeon->glCtx->Visual.doubleBufferMode || radeon->sarea->pfCurrentPage != 0) {
+		radeon->state.color.drawOffset = radeon->radeonScreen->frontOffset;
+		radeon->state.color.drawPitch = radeon->radeonScreen->frontPitch;
+	} else {
+		radeon->state.color.drawOffset = radeon->radeonScreen->backOffset;
+		radeon->state.color.drawPitch = radeon->radeonScreen->backPitch;
+	}
+}
+
+
+/**
+ * Initialize common state functions: hooks radeonScissor up as ->Scissor.
+ * Called by r200InitStateFuncs/r300InitStateFuncs
+ */
+void radeonInitStateFuncs(struct dd_function_table *functions)
+{
+	functions->Scissor = radeonScissor;
+}
diff --git a/r300/radeon_state.h b/r300/radeon_state.h
new file mode 100644
index 0000000..821cb40
--- /dev/null
+++ b/r300/radeon_state.h
@@ -0,0 +1,43 @@
+/*
+Copyright (C) 2004 Nicolai Haehnle.  All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __RADEON_STATE_H__
+#define __RADEON_STATE_H__
+/* Cliprect and scissor bookkeeping (types come from the including file) */
+extern void radeonRecalcScissorRects(radeonContextPtr radeon);
+extern void radeonSetCliprects(radeonContextPtr radeon);
+extern void radeonUpdateScissor(GLcontext* ctx);
+/* Common enable bits; fallback for r200Enable/r300Enable */
+extern void radeonEnable(GLcontext* ctx, GLenum cap, GLboolean state);
+/* Context-init helpers */
+extern void radeonInitState(radeonContextPtr radeon);
+extern void radeonInitStateFuncs(struct dd_function_table* functions);
+
+#endif