Diffstat (limited to 'src/arch')
-rw-r--r--  src/arch/sparc/norm.S                     605
-rw-r--r--  src/arch/sparc/sparc.c                    142
-rw-r--r--  src/arch/sparc/sparc.h                     36
-rw-r--r--  src/arch/sparc/sparc_clip.S               233
-rw-r--r--  src/arch/sparc/sparc_matrix.h             170
-rw-r--r--  src/arch/sparc/xform.S                   1392
-rw-r--r--  src/arch/x86-64/Makefile.am                40
-rw-r--r--  src/arch/x86-64/calling_convention.txt     50
-rw-r--r--  src/arch/x86-64/x86-64.c                  119
-rw-r--r--  src/arch/x86-64/x86-64.h                   31
-rw-r--r--  src/arch/x86-64/xform4.S                  483
-rw-r--r--  src/arch/x86/3dnow.c                       91
-rw-r--r--  src/arch/x86/3dnow.h                       36
-rw-r--r--  src/arch/x86/3dnow_normal.S               852
-rw-r--r--  src/arch/x86/3dnow_xform1.S               437
-rw-r--r--  src/arch/x86/3dnow_xform2.S               477
-rw-r--r--  src/arch/x86/3dnow_xform3.S               561
-rw-r--r--  src/arch/x86/3dnow_xform4.S               570
-rw-r--r--  src/arch/x86/Makefile.am                   40
-rw-r--r--  src/arch/x86/assyntax.h                  1747
-rw-r--r--  src/arch/x86/clip_args.h                   59
-rw-r--r--  src/arch/x86/common_x86.c                 336
-rw-r--r--  src/arch/x86/common_x86_asm.S             220
-rw-r--r--  src/arch/x86/common_x86_asm.h              53
-rw-r--r--  src/arch/x86/common_x86_features.h         67
-rw-r--r--  src/arch/x86/gen_matypes.c                240
-rw-r--r--  src/arch/x86/mmx.h                         59
-rw-r--r--  src/arch/x86/mmx_blend.S                  402
-rw-r--r--  src/arch/x86/mmx_blendtmp.h               114
-rw-r--r--  src/arch/x86/norm_args.h                   57
-rw-r--r--  src/arch/x86/read_rgba_span_x86.S         686
-rw-r--r--  src/arch/x86/read_rgba_span_x86.h          56
-rw-r--r--  src/arch/x86/rtasm/x86sse.c              1203
-rw-r--r--  src/arch/x86/rtasm/x86sse.h               256
-rw-r--r--  src/arch/x86/sse.c                        123
-rw-r--r--  src/arch/x86/sse.h                         36
-rw-r--r--  src/arch/x86/sse_normal.S                 261
-rw-r--r--  src/arch/x86/sse_xform1.S                 446
-rw-r--r--  src/arch/x86/sse_xform2.S                 466
-rw-r--r--  src/arch/x86/sse_xform3.S                 512
-rw-r--r--  src/arch/x86/sse_xform4.S                 235
-rw-r--r--  src/arch/x86/x86_cliptest.S               407
-rw-r--r--  src/arch/x86/x86_xform.c                  126
-rw-r--r--  src/arch/x86/x86_xform.h                  106
-rw-r--r--  src/arch/x86/x86_xform2.S                 574
-rw-r--r--  src/arch/x86/x86_xform3.S                 644
-rw-r--r--  src/arch/x86/x86_xform4.S                 677
-rw-r--r--  src/arch/x86/xform_args.h                  51
48 files changed, 16584 insertions, 0 deletions
diff --git a/src/arch/sparc/norm.S b/src/arch/sparc/norm.S
new file mode 100644
index 0000000..117d36f
--- /dev/null
+++ b/src/arch/sparc/norm.S
@@ -0,0 +1,605 @@
+
+#include "sparc_matrix.h"
+
+ .register %g2, #scratch
+ .register %g3, #scratch
+
+ .text
+
+#ifdef __arch64__
+#define STACK_VAR_OFF (2047 + (8 * 16))
+#else
+#define STACK_VAR_OFF (4 * 16)
+#endif
+
+ /* Newton-Raphson approximation turns out to be slower
+ * (and less accurate) than direct fsqrts/fdivs.
+ */
+#define ONE_DOT_ZERO 0x3f800000
+
+ .globl _mesa_sparc_transform_normalize_normals
+_mesa_sparc_transform_normalize_normals:
+ /* o0=mat o1=scale o2=in o3=lengths o4=dest */
+
+ sethi %hi(ONE_DOT_ZERO), %g2
+ sub %sp, 16, %sp
+ st %g2, [%sp + STACK_VAR_OFF+0x0]
+ st %o1, [%sp + STACK_VAR_OFF+0x4]
+ ld [%sp + STACK_VAR_OFF+0x0], %f12 ! f12 = 1.0f
+ ld [%sp + STACK_VAR_OFF+0x4], %f15 ! f15 = scale
+ add %sp, 16, %sp
+
+ LDPTR [%o0 + MAT_INV], %o0 ! o0 = mat->inv
+ LDPTR [%o2 + V4F_START], %o5 ! o5 = 'from' in->start
+ ld [%o2 + V4F_COUNT], %g1 ! g1 = in->count
+ ld [%o2 + V4F_STRIDE], %g2 ! g2 = in->stride
+ LDPTR [%o4 + V4F_START], %g3 ! g3 = 'out' dest->start
+
+ LDMATRIX_0_1_2_4_5_6_8_9_10(%o0)
+
+ /* dest->count = in->count */
+ st %g1, [%o4 + V4F_COUNT]
+
+ cmp %g1, 1
+ bl 7f
+ cmp %o3, 0
+ bne 4f
+ clr %o4 ! 'i' for STRIDE_LOOP
+
+1: /* LENGTHS == NULL */
+ ld [%o5 + 0x00], %f0 ! ux = from[0]
+ ld [%o5 + 0x04], %f1 ! uy = from[1]
+ ld [%o5 + 0x08], %f2 ! uz = from[2]
+ add %o5, %g2, %o5 ! STRIDE_F(from, stride)
+ add %o4, 1, %o4 ! i++
+
+ /* tx (f3) = (ux * m0) + (uy * m1) + (uz * m2)
+ * ty (f5) = (ux * m4) + (uy * m5) + (uz * m6)
+ * tz (f7) = (ux * m8) + (uy * m9) + (uz * m10)
+ */
+ fmuls %f0, M0, %f3 ! FGM Group
+ fmuls %f1, M1, %f4 ! FGM Group
+ fmuls %f0, M4, %f5 ! FGM Group
+ fmuls %f1, M5, %f6 ! FGM Group
+ fmuls %f0, M8, %f7 ! FGM Group f3 available
+ fmuls %f1, M9, %f8 ! FGM Group f4 available
+ fadds %f3, %f4, %f3 ! FGA
+ fmuls %f2, M2, %f10 ! FGM Group f5 available
+ fmuls %f2, M6, %f0 ! FGM Group f6 available
+ fadds %f5, %f6, %f5 ! FGA
+ fmuls %f2, M10, %f4 ! FGM Group f7 available
+ fadds %f7, %f8, %f7 ! FGA Group f8,f3 available
+ fadds %f3, %f10, %f3 ! FGA Group f10 available
+ fadds %f5, %f0, %f5 ! FGA Group stall f0,f5 available
+ fadds %f7, %f4, %f7 ! FGA Group stall f4,f7 available
+
+ /* f3=tx, f5=ty, f7=tz */
+
+ /* len (f6) = (tx * tx) + (ty * ty) + (tz * tz) */
+ fmuls %f3, %f3, %f6 ! FGM Group f3 available
+ fmuls %f5, %f5, %f8 ! FGM Group f5 available
+ fmuls %f7, %f7, %f10 ! FGM Group f7 available
+ fadds %f6, %f8, %f6 ! FGA Group 2cyc stall f6,f8 available
+ fadds %f6, %f10, %f6 ! FGA Group 4cyc stall f6,f10 available
+
+ /* scale (f6) = 1.0 / sqrt(len) */
+ fsqrts %f6, %f6 ! FDIV 20 cycles
+ fdivs %f12, %f6, %f6 ! FDIV 14 cycles
+
+ fmuls %f3, %f6, %f3
+ st %f3, [%g3 + 0x00] ! out[i][0] = tx * scale
+ fmuls %f5, %f6, %f5
+ st %f5, [%g3 + 0x04] ! out[i][1] = ty * scale
+ fmuls %f7, %f6, %f7
+ st %f7, [%g3 + 0x08] ! out[i][2] = tz * scale
+
+ cmp %o4, %g1 ! continue if (i < count)
+ bl 1b
+ add %g3, 0x10, %g3 ! advance out vector pointer
+
+ ba 7f
+ nop
+
+4: /* LENGTHS != NULL */
+ fmuls M0, %f15, M0
+ fmuls M1, %f15, M1
+ fmuls M2, %f15, M2
+ fmuls M4, %f15, M4
+ fmuls M5, %f15, M5
+ fmuls M6, %f15, M6
+ fmuls M8, %f15, M8
+ fmuls M9, %f15, M9
+ fmuls M10, %f15, M10
+
+5:
+ ld [%o5 + 0x00], %f0 ! ux = from[0]
+ ld [%o5 + 0x04], %f1 ! uy = from[1]
+ ld [%o5 + 0x08], %f2 ! uz = from[2]
+ add %o5, %g2, %o5 ! STRIDE_F(from, stride)
+ add %o4, 1, %o4 ! i++
+
+ /* tx (f3) = (ux * m0) + (uy * m1) + (uz * m2)
+ * ty (f5) = (ux * m4) + (uy * m5) + (uz * m6)
+ * tz (f7) = (ux * m8) + (uy * m9) + (uz * m10)
+ */
+ fmuls %f0, M0, %f3 ! FGM Group
+ fmuls %f1, M1, %f4 ! FGM Group
+ fmuls %f0, M4, %f5 ! FGM Group
+ fmuls %f1, M5, %f6 ! FGM Group
+ fmuls %f0, M8, %f7 ! FGM Group f3 available
+ fmuls %f1, M9, %f8 ! FGM Group f4 available
+ fadds %f3, %f4, %f3 ! FGA
+ fmuls %f2, M2, %f10 ! FGM Group f5 available
+ fmuls %f2, M6, %f0 ! FGM Group f6 available
+ fadds %f5, %f6, %f5 ! FGA
+ fmuls %f2, M10, %f4 ! FGM Group f7 available
+ fadds %f7, %f8, %f7 ! FGA Group f8,f3 available
+ fadds %f3, %f10, %f3 ! FGA Group f10 available
+ ld [%o3], %f13 ! LSU
+ fadds %f5, %f0, %f5 ! FGA Group stall f0,f5 available
+ add %o3, 4, %o3 ! IEU0
+ fadds %f7, %f4, %f7 ! FGA Group stall f4,f7 available
+
+ /* f3=tx, f5=ty, f7=tz, f13=lengths[i] */
+
+ fmuls %f3, %f13, %f3
+ st %f3, [%g3 + 0x00] ! out[i][0] = tx * len
+ fmuls %f5, %f13, %f5
+ st %f5, [%g3 + 0x04] ! out[i][1] = ty * len
+ fmuls %f7, %f13, %f7
+ st %f7, [%g3 + 0x08] ! out[i][2] = tz * len
+
+ cmp %o4, %g1 ! continue if (i < count)
+ bl 5b
+ add %g3, 0x10, %g3 ! advance out vector pointer
+
+7: retl
+ nop
+
+ .globl _mesa_sparc_transform_normalize_normals_no_rot
+_mesa_sparc_transform_normalize_normals_no_rot:
+ /* o0=mat o1=scale o2=in o3=lengths o4=dest */
+
+ sethi %hi(ONE_DOT_ZERO), %g2
+ sub %sp, 16, %sp
+ st %g2, [%sp + STACK_VAR_OFF+0x0]
+ st %o1, [%sp + STACK_VAR_OFF+0x4]
+ ld [%sp + STACK_VAR_OFF+0x0], %f12 ! f12 = 1.0f
+ ld [%sp + STACK_VAR_OFF+0x4], %f15 ! f15 = scale
+ add %sp, 16, %sp
+
+ LDPTR [%o0 + MAT_INV], %o0 ! o0 = mat->inv
+ LDPTR [%o2 + V4F_START], %o5 ! o5 = 'from' in->start
+ ld [%o2 + V4F_COUNT], %g1 ! g1 = in->count
+ ld [%o2 + V4F_STRIDE], %g2 ! g2 = in->stride
+ LDPTR [%o4 + V4F_START], %g3 ! g3 = 'out' dest->start
+
+ LDMATRIX_0_5_10(%o0)
+
+ /* dest->count = in->count */
+ st %g1, [%o4 + V4F_COUNT]
+
+ cmp %g1, 1
+ bl 7f
+ cmp %o3, 0
+ bne 4f
+ clr %o4 ! 'i' for STRIDE_LOOP
+
+1: /* LENGTHS == NULL */
+ ld [%o5 + 0x00], %f0 ! ux = from[0]
+ ld [%o5 + 0x04], %f1 ! uy = from[1]
+ ld [%o5 + 0x08], %f2 ! uz = from[2]
+ add %o5, %g2, %o5 ! STRIDE_F(from, stride)
+ add %o4, 1, %o4 ! i++
+
+ /* tx (f3) = (ux * m0)
+ * ty (f5) = (uy * m5)
+ * tz (f7) = (uz * m10)
+ */
+ fmuls %f0, M0, %f3 ! FGM Group
+ fmuls %f1, M5, %f5 ! FGM Group
+ fmuls %f2, M10, %f7 ! FGM Group
+
+ /* f3=tx, f5=ty, f7=tz */
+
+ /* len (f6) = (tx * tx) + (ty * ty) + (tz * tz) */
+ fmuls %f3, %f3, %f6 ! FGM Group stall, f3 available
+ fmuls %f5, %f5, %f8 ! FGM Group f5 available
+ fmuls %f7, %f7, %f10 ! FGM Group f7 available
+ fadds %f6, %f8, %f6 ! FGA Group 2cyc stall f6,f8 available
+ fadds %f6, %f10, %f6 ! FGA Group 4cyc stall f6,f10 available
+
+ /* scale (f6) = 1.0 / sqrt(len) */
+ fsqrts %f6, %f6 ! FDIV 20 cycles
+ fdivs %f12, %f6, %f6 ! FDIV 14 cycles
+
+ fmuls %f3, %f6, %f3
+ st %f3, [%g3 + 0x00] ! out[i][0] = tx * scale
+ fmuls %f5, %f6, %f5
+ st %f5, [%g3 + 0x04] ! out[i][1] = ty * scale
+ fmuls %f7, %f6, %f7
+ st %f7, [%g3 + 0x08] ! out[i][2] = tz * scale
+
+ cmp %o4, %g1 ! continue if (i < count)
+ bl 1b
+ add %g3, 0x10, %g3 ! advance out vector pointer
+
+ ba 7f
+ nop
+
+4: /* LENGTHS != NULL */
+ fmuls M0, %f15, M0
+ fmuls M5, %f15, M5
+ fmuls M10, %f15, M10
+
+5:
+ ld [%o5 + 0x00], %f0 ! ux = from[0]
+ ld [%o5 + 0x04], %f1 ! uy = from[1]
+ ld [%o5 + 0x08], %f2 ! uz = from[2]
+ add %o5, %g2, %o5 ! STRIDE_F(from, stride)
+ add %o4, 1, %o4 ! i++
+
+ /* tx (f3) = (ux * m0)
+ * ty (f5) = (uy * m5)
+ * tz (f7) = (uz * m10)
+ */
+ fmuls %f0, M0, %f3 ! FGM Group
+ ld [%o3], %f13 ! LSU
+ fmuls %f1, M5, %f5 ! FGM Group
+ add %o3, 4, %o3 ! IEU0
+ fmuls %f2, M10, %f7 ! FGM Group
+
+ /* f3=tx, f5=ty, f7=tz, f13=lengths[i] */
+
+ fmuls %f3, %f13, %f3
+ st %f3, [%g3 + 0x00] ! out[i][0] = tx * len
+ fmuls %f5, %f13, %f5
+ st %f5, [%g3 + 0x04] ! out[i][1] = ty * len
+ fmuls %f7, %f13, %f7
+ st %f7, [%g3 + 0x08] ! out[i][2] = tz * len
+
+ cmp %o4, %g1 ! continue if (i < count)
+ bl 5b
+ add %g3, 0x10, %g3 ! advance out vector pointer
+
+7: retl
+ nop
+
+ .globl _mesa_sparc_transform_rescale_normals_no_rot
+_mesa_sparc_transform_rescale_normals_no_rot:
+ /* o0=mat o1=scale o2=in o3=lengths o4=dest */
+ sub %sp, 16, %sp
+ st %o1, [%sp + STACK_VAR_OFF+0x0]
+ ld [%sp + STACK_VAR_OFF+0x0], %f15 ! f15 = scale
+ add %sp, 16, %sp
+
+ LDPTR [%o0 + MAT_INV], %o0 ! o0 = mat->inv
+ LDPTR [%o2 + V4F_START], %o5 ! o5 = 'from' in->start
+ ld [%o2 + V4F_COUNT], %g1 ! g1 = in->count
+ ld [%o2 + V4F_STRIDE], %g2 ! g2 = in->stride
+ LDPTR [%o4 + V4F_START], %g3 ! g3 = 'out' dest->start
+
+ LDMATRIX_0_5_10(%o0)
+
+ /* dest->count = in->count */
+ st %g1, [%o4 + V4F_COUNT]
+
+ cmp %g1, 1
+ bl 7f
+ clr %o4 ! 'i' for STRIDE_LOOP
+
+ fmuls M0, %f15, M0
+ fmuls M5, %f15, M5
+ fmuls M10, %f15, M10
+
+1: ld [%o5 + 0x00], %f0 ! ux = from[0]
+ ld [%o5 + 0x04], %f1 ! uy = from[1]
+ ld [%o5 + 0x08], %f2 ! uz = from[2]
+ add %o5, %g2, %o5 ! STRIDE_F(from, stride)
+ add %o4, 1, %o4 ! i++
+
+ /* tx (f3) = (ux * m0)
+ * ty (f5) = (uy * m5)
+ * tz (f7) = (uz * m10)
+ */
+ fmuls %f0, M0, %f3 ! FGM Group
+ st %f3, [%g3 + 0x00] ! LSU
+ fmuls %f1, M5, %f5 ! FGM Group
+ st %f5, [%g3 + 0x04] ! LSU
+ fmuls %f2, M10, %f7 ! FGM Group
+ st %f7, [%g3 + 0x08] ! LSU
+
+ cmp %o4, %g1 ! continue if (i < count)
+ bl 1b
+ add %g3, 0x10, %g3 ! advance out vector pointer
+
+7: retl
+ nop
+
+ .globl _mesa_sparc_transform_rescale_normals
+_mesa_sparc_transform_rescale_normals:
+ /* o0=mat o1=scale o2=in o3=lengths o4=dest */
+ sub %sp, 16, %sp
+ st %o1, [%sp + STACK_VAR_OFF+0x0]
+ ld [%sp + STACK_VAR_OFF+0x0], %f15 ! f15 = scale
+ add %sp, 16, %sp
+
+ LDPTR [%o0 + MAT_INV], %o0 ! o0 = mat->inv
+ LDPTR [%o2 + V4F_START], %o5 ! o5 = 'from' in->start
+ ld [%o2 + V4F_COUNT], %g1 ! g1 = in->count
+ ld [%o2 + V4F_STRIDE], %g2 ! g2 = in->stride
+ LDPTR [%o4 + V4F_START], %g3 ! g3 = 'out' dest->start
+
+ LDMATRIX_0_1_2_4_5_6_8_9_10(%o0)
+
+ /* dest->count = in->count */
+ st %g1, [%o4 + V4F_COUNT]
+
+ cmp %g1, 1
+ bl 7f
+ clr %o4 ! 'i' for STRIDE_LOOP
+
+ fmuls M0, %f15, M0
+ fmuls M1, %f15, M1
+ fmuls M2, %f15, M2
+ fmuls M4, %f15, M4
+ fmuls M5, %f15, M5
+ fmuls M6, %f15, M6
+ fmuls M8, %f15, M8
+ fmuls M9, %f15, M9
+ fmuls M10, %f15, M10
+
+1: ld [%o5 + 0x00], %f0 ! ux = from[0]
+ ld [%o5 + 0x04], %f1 ! uy = from[1]
+ ld [%o5 + 0x08], %f2 ! uz = from[2]
+ add %o5, %g2, %o5 ! STRIDE_F(from, stride)
+ add %o4, 1, %o4 ! i++
+
+ fmuls %f0, M0, %f3 ! FGM Group
+ fmuls %f1, M1, %f4 ! FGM Group
+ fmuls %f0, M4, %f5 ! FGM Group
+ fmuls %f1, M5, %f6 ! FGM Group
+ fmuls %f0, M8, %f7 ! FGM Group f3 available
+ fmuls %f1, M9, %f8 ! FGM Group f4 available
+ fadds %f3, %f4, %f3 ! FGA
+ fmuls %f2, M2, %f10 ! FGM Group f5 available
+ fmuls %f2, M6, %f0 ! FGM Group f6 available
+ fadds %f5, %f6, %f5 ! FGA
+ fmuls %f2, M10, %f4 ! FGM Group f7 available
+ fadds %f7, %f8, %f7 ! FGA Group f8,f3 available
+ fadds %f3, %f10, %f3 ! FGA Group f10 available
+ st %f3, [%g3 + 0x00] ! LSU
+ fadds %f5, %f0, %f5 ! FGA Group stall f0,f5 available
+ st %f5, [%g3 + 0x04] ! LSU
+ fadds %f7, %f4, %f7 ! FGA Group stall f4,f7 available
+ st %f7, [%g3 + 0x08] ! LSU
+
+ cmp %o4, %g1 ! continue if (i < count)
+ bl 1b
+ add %g3, 0x10, %g3 ! advance out vector pointer
+
+7: retl
+ nop
+
+ .globl _mesa_sparc_transform_normals_no_rot
+_mesa_sparc_transform_normals_no_rot:
+ /* o0=mat o1=scale o2=in o3=lengths o4=dest */
+ LDPTR [%o0 + MAT_INV], %o0 ! o0 = mat->inv
+ LDPTR [%o2 + V4F_START], %o5 ! o5 = 'from' in->start
+ ld [%o2 + V4F_COUNT], %g1 ! g1 = in->count
+ ld [%o2 + V4F_STRIDE], %g2 ! g2 = in->stride
+ LDPTR [%o4 + V4F_START], %g3 ! g3 = 'out' dest->start
+
+ LDMATRIX_0_5_10(%o0)
+
+ /* dest->count = in->count */
+ st %g1, [%o4 + V4F_COUNT]
+
+ cmp %g1, 1
+ bl 7f
+ clr %o4 ! 'i' for STRIDE_LOOP
+
+1: ld [%o5 + 0x00], %f0 ! ux = from[0]
+ ld [%o5 + 0x04], %f1 ! uy = from[1]
+ ld [%o5 + 0x08], %f2 ! uz = from[2]
+ add %o5, %g2, %o5 ! STRIDE_F(from, stride)
+ add %o4, 1, %o4 ! i++
+
+ /* tx (f3) = (ux * m0)
+ * ty (f5) = (uy * m5)
+ * tz (f7) = (uz * m10)
+ */
+ fmuls %f0, M0, %f3 ! FGM Group
+ st %f3, [%g3 + 0x00] ! LSU
+ fmuls %f1, M5, %f5 ! FGM Group
+ st %f5, [%g3 + 0x04] ! LSU
+ fmuls %f2, M10, %f7 ! FGM Group
+ st %f7, [%g3 + 0x08] ! LSU
+
+ cmp %o4, %g1 ! continue if (i < count)
+ bl 1b
+ add %g3, 0x10, %g3 ! advance out vector pointer
+
+7: retl
+ nop
+
+ .globl _mesa_sparc_transform_normals
+_mesa_sparc_transform_normals:
+ /* o0=mat o1=scale o2=in o3=lengths o4=dest */
+ LDPTR [%o0 + MAT_INV], %o0 ! o0 = mat->inv
+ LDPTR [%o2 + V4F_START], %o5 ! o5 = 'from' in->start
+ ld [%o2 + V4F_COUNT], %g1 ! g1 = in->count
+ ld [%o2 + V4F_STRIDE], %g2 ! g2 = in->stride
+ LDPTR [%o4 + V4F_START], %g3 ! g3 = 'out' dest->start
+
+ LDMATRIX_0_1_2_4_5_6_8_9_10(%o0)
+
+ /* dest->count = in->count */
+ st %g1, [%o4 + V4F_COUNT]
+
+ cmp %g1, 1
+ bl 7f
+ clr %o4 ! 'i' for STRIDE_LOOP
+
+1: ld [%o5 + 0x00], %f0 ! ux = from[0]
+ ld [%o5 + 0x04], %f1 ! uy = from[1]
+ ld [%o5 + 0x08], %f2 ! uz = from[2]
+ add %o5, %g2, %o5 ! STRIDE_F(from, stride)
+ add %o4, 1, %o4 ! i++
+
+ fmuls %f0, M0, %f3 ! FGM Group
+ fmuls %f1, M1, %f4 ! FGM Group
+ fmuls %f0, M4, %f5 ! FGM Group
+ fmuls %f1, M5, %f6 ! FGM Group
+ fmuls %f0, M8, %f7 ! FGM Group f3 available
+ fmuls %f1, M9, %f8 ! FGM Group f4 available
+ fadds %f3, %f4, %f3 ! FGA
+ fmuls %f2, M2, %f10 ! FGM Group f5 available
+ fmuls %f2, M6, %f0 ! FGM Group f6 available
+ fadds %f5, %f6, %f5 ! FGA
+ fmuls %f2, M10, %f4 ! FGM Group f7 available
+ fadds %f7, %f8, %f7 ! FGA Group f8,f3 available
+ fadds %f3, %f10, %f3 ! FGA Group f10 available
+ st %f3, [%g3 + 0x00] ! LSU
+ fadds %f5, %f0, %f5 ! FGA Group stall f0,f5 available
+ st %f5, [%g3 + 0x04] ! LSU
+ fadds %f7, %f4, %f7 ! FGA Group stall f4,f7 available
+ st %f7, [%g3 + 0x08] ! LSU
+
+ cmp %o4, %g1 ! continue if (i < count)
+ bl 1b
+ add %g3, 0x10, %g3 ! advance out vector pointer
+
+7: retl
+ nop
+
+ .globl _mesa_sparc_normalize_normals
+_mesa_sparc_normalize_normals:
+ /* o0=mat o1=scale o2=in o3=lengths o4=dest */
+
+ sethi %hi(ONE_DOT_ZERO), %g2
+ sub %sp, 16, %sp
+ st %g2, [%sp + STACK_VAR_OFF+0x0]
+ ld [%sp + STACK_VAR_OFF+0x0], %f12 ! f12 = 1.0f
+ add %sp, 16, %sp
+
+ LDPTR [%o2 + V4F_START], %o5 ! o5 = 'from' in->start
+ ld [%o2 + V4F_COUNT], %g1 ! g1 = in->count
+ ld [%o2 + V4F_STRIDE], %g2 ! g2 = in->stride
+ LDPTR [%o4 + V4F_START], %g3 ! g3 = 'out' dest->start
+
+ /* dest->count = in->count */
+ st %g1, [%o4 + V4F_COUNT]
+
+ cmp %g1, 1
+ bl 7f
+ cmp %o3, 0
+ bne 4f
+ clr %o4 ! 'i' for STRIDE_LOOP
+
+1: /* LENGTHS == NULL */
+ ld [%o5 + 0x00], %f3 ! ux = from[0]
+ ld [%o5 + 0x04], %f5 ! uy = from[1]
+ ld [%o5 + 0x08], %f7 ! uz = from[2]
+ add %o5, %g2, %o5 ! STRIDE_F(from, stride)
+ add %o4, 1, %o4 ! i++
+
+ /* f3=tx, f5=ty, f7=tz */
+
+ /* len (f6) = (tx * tx) + (ty * ty) + (tz * tz) */
+ fmuls %f3, %f3, %f6 ! FGM Group f3 available
+ fmuls %f5, %f5, %f8 ! FGM Group f5 available
+ fmuls %f7, %f7, %f10 ! FGM Group f7 available
+ fadds %f6, %f8, %f6 ! FGA Group 2cyc stall f6,f8 available
+ fadds %f6, %f10, %f6 ! FGA Group 4cyc stall f6,f10 available
+
+ /* scale (f6) = 1.0 / sqrt(len) */
+ fsqrts %f6, %f6 ! FDIV 20 cycles
+ fdivs %f12, %f6, %f6 ! FDIV 14 cycles
+
+ fmuls %f3, %f6, %f3
+ st %f3, [%g3 + 0x00] ! out[i][0] = tx * scale
+ fmuls %f5, %f6, %f5
+ st %f5, [%g3 + 0x04] ! out[i][1] = ty * scale
+ fmuls %f7, %f6, %f7
+ st %f7, [%g3 + 0x08] ! out[i][2] = tz * scale
+
+ cmp %o4, %g1 ! continue if (i < count)
+ bl 1b
+ add %g3, 0x10, %g3 ! advance out vector pointer
+
+ ba 7f
+ nop
+
+4: /* LENGTHS != NULL */
+
+5:
+ ld [%o5 + 0x00], %f3 ! ux = from[0]
+ ld [%o5 + 0x04], %f5 ! uy = from[1]
+ ld [%o5 + 0x08], %f7 ! uz = from[2]
+ add %o5, %g2, %o5 ! STRIDE_F(from, stride)
+ add %o4, 1, %o4 ! i++
+
+ ld [%o3], %f13 ! LSU
+ add %o3, 4, %o3 ! IEU0
+
+ /* f3=tx, f5=ty, f7=tz, f13=lengths[i] */
+
+ fmuls %f3, %f13, %f3
+ st %f3, [%g3 + 0x00] ! out[i][0] = tx * len
+ fmuls %f5, %f13, %f5
+ st %f5, [%g3 + 0x04] ! out[i][1] = ty * len
+ fmuls %f7, %f13, %f7
+ st %f7, [%g3 + 0x08] ! out[i][2] = tz * len
+
+ cmp %o4, %g1 ! continue if (i < count)
+ bl 5b
+ add %g3, 0x10, %g3 ! advance out vector pointer
+
+7: retl
+ nop
+
+ .globl _mesa_sparc_rescale_normals
+_mesa_sparc_rescale_normals:
+ /* o0=mat o1=scale o2=in o3=lengths o4=dest */
+
+ sethi %hi(ONE_DOT_ZERO), %g2
+ sub %sp, 16, %sp
+ st %o1, [%sp + STACK_VAR_OFF+0x0]
+ ld [%sp + STACK_VAR_OFF+0x0], %f15 ! f15 = scale
+ add %sp, 16, %sp
+
+ LDPTR [%o2 + V4F_START], %o5 ! o5 = 'from' in->start
+ ld [%o2 + V4F_COUNT], %g1 ! g1 = in->count
+ ld [%o2 + V4F_STRIDE], %g2 ! g2 = in->stride
+ LDPTR [%o4 + V4F_START], %g3 ! g3 = 'out' dest->start
+
+ /* dest->count = in->count */
+ st %g1, [%o4 + V4F_COUNT]
+
+ cmp %g1, 1
+ bl 7f
+ clr %o4 ! 'i' for STRIDE_LOOP
+
+1:
+ ld [%o5 + 0x00], %f3 ! ux = from[0]
+ ld [%o5 + 0x04], %f5 ! uy = from[1]
+ ld [%o5 + 0x08], %f7 ! uz = from[2]
+ add %o5, %g2, %o5 ! STRIDE_F(from, stride)
+ add %o4, 1, %o4 ! i++
+
+ /* f3=tx, f5=ty, f7=tz */
+
+ fmuls %f3, %f15, %f3
+ st %f3, [%g3 + 0x00] ! out[i][0] = tx * scale
+ fmuls %f5, %f15, %f5
+ st %f5, [%g3 + 0x04] ! out[i][1] = ty * scale
+ fmuls %f7, %f15, %f7
+ st %f7, [%g3 + 0x08] ! out[i][2] = tz * scale
+
+ cmp %o4, %g1 ! continue if (i < count)
+ bl 1b
+ add %g3, 0x10, %g3 ! advance out vector pointer
+
+7: retl
+ nop
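
For reference, here is a minimal C sketch of what the normal-transform entry points above compute; it is illustrative only (the function and parameter names are assumptions, not Mesa's source). Each 3-component normal is multiplied by the upper 3x3 of mat->inv; when no per-vertex lengths are supplied the result is normalized with the fsqrts/fdivs pair, otherwise the matrix has already been pre-scaled (the fmuls M0,%f15,M0 block at label 4) and lengths[i] finishes the job.

    #include <math.h>

    /* Hedged sketch of _mesa_sparc_transform_normalize_normals. */
    static void transform_normalize_normals_ref(const float m[16],
                                                const float *from,
                                                int stride_bytes, int count,
                                                const float *lengths,
                                                float (*out)[4])
    {
       for (int i = 0; i < count; i++) {
          const float *u = (const float *)((const char *)from + i * stride_bytes);
          float tx = u[0]*m[0] + u[1]*m[1] + u[2]*m[2];
          float ty = u[0]*m[4] + u[1]*m[5] + u[2]*m[6];
          float tz = u[0]*m[8] + u[1]*m[9] + u[2]*m[10];
          if (lengths) {          /* label 5: matrix pre-scaled by 'scale' */
             out[i][0] = tx * lengths[i];
             out[i][1] = ty * lengths[i];
             out[i][2] = tz * lengths[i];
          } else {                /* label 1: fsqrts then fdivs */
             float s = 1.0f / sqrtf(tx*tx + ty*ty + tz*tz);
             out[i][0] = tx * s;
             out[i][1] = ty * s;
             out[i][2] = tz * s;
          }
       }
    }
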
diff --git a/src/arch/sparc/sparc.c b/src/arch/sparc/sparc.c
new file mode 100644
index 0000000..cea0c7c
--- /dev/null
+++ b/src/arch/sparc/sparc.c
@@ -0,0 +1,142 @@
+/*
+ * Mesa 3-D graphics library
+ * Version: 6.3
+ *
+ * Copyright (C) 1999-2003 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Sparc assembly code by David S. Miller
+ */
+
+
+#include "sparc.h"
+
+#ifdef USE_SPARC_ASM
+
+#include "main/context.h"
+#include "math/m_xform.h"
+#include "tnl/t_context.h"
+
+#ifdef DEBUG_MATH
+#include "math/m_debug.h"
+#endif
+
+#define XFORM_ARGS GLvector4f *to_vec, \
+ const GLfloat m[16], \
+ const GLvector4f *from_vec
+
+#define DECLARE_XFORM_GROUP(pfx, sz) \
+ extern void _mesa_##pfx##_transform_points##sz##_general(XFORM_ARGS); \
+ extern void _mesa_##pfx##_transform_points##sz##_identity(XFORM_ARGS); \
+ extern void _mesa_##pfx##_transform_points##sz##_3d_no_rot(XFORM_ARGS); \
+ extern void _mesa_##pfx##_transform_points##sz##_perspective(XFORM_ARGS); \
+ extern void _mesa_##pfx##_transform_points##sz##_2d(XFORM_ARGS); \
+ extern void _mesa_##pfx##_transform_points##sz##_2d_no_rot(XFORM_ARGS); \
+ extern void _mesa_##pfx##_transform_points##sz##_3d(XFORM_ARGS);
+
+#define ASSIGN_XFORM_GROUP(pfx, sz) \
+ _mesa_transform_tab[sz][MATRIX_GENERAL] = \
+ _mesa_##pfx##_transform_points##sz##_general; \
+ _mesa_transform_tab[sz][MATRIX_IDENTITY] = \
+ _mesa_##pfx##_transform_points##sz##_identity; \
+ _mesa_transform_tab[sz][MATRIX_3D_NO_ROT] = \
+ _mesa_##pfx##_transform_points##sz##_3d_no_rot; \
+ _mesa_transform_tab[sz][MATRIX_PERSPECTIVE] = \
+ _mesa_##pfx##_transform_points##sz##_perspective; \
+ _mesa_transform_tab[sz][MATRIX_2D] = \
+ _mesa_##pfx##_transform_points##sz##_2d; \
+ _mesa_transform_tab[sz][MATRIX_2D_NO_ROT] = \
+ _mesa_##pfx##_transform_points##sz##_2d_no_rot; \
+ _mesa_transform_tab[sz][MATRIX_3D] = \
+ _mesa_##pfx##_transform_points##sz##_3d;
+
+
+DECLARE_XFORM_GROUP(sparc, 1)
+DECLARE_XFORM_GROUP(sparc, 2)
+DECLARE_XFORM_GROUP(sparc, 3)
+DECLARE_XFORM_GROUP(sparc, 4)
+
+extern GLvector4f *_mesa_sparc_cliptest_points4(GLvector4f *clip_vec,
+ GLvector4f *proj_vec,
+ GLubyte clipMask[],
+ GLubyte *orMask,
+ GLubyte *andMask,
+ GLboolean viewport_z_clip);
+
+extern GLvector4f *_mesa_sparc_cliptest_points4_np(GLvector4f *clip_vec,
+ GLvector4f *proj_vec,
+ GLubyte clipMask[],
+ GLubyte *orMask,
+ GLubyte *andMask,
+ GLboolean viewport_z_clip);
+
+#define NORM_ARGS const GLmatrix *mat, \
+ GLfloat scale, \
+ const GLvector4f *in, \
+ const GLfloat *lengths, \
+ GLvector4f *dest
+
+extern void _mesa_sparc_transform_normalize_normals(NORM_ARGS);
+extern void _mesa_sparc_transform_normalize_normals_no_rot(NORM_ARGS);
+extern void _mesa_sparc_transform_rescale_normals_no_rot(NORM_ARGS);
+extern void _mesa_sparc_transform_rescale_normals(NORM_ARGS);
+extern void _mesa_sparc_transform_normals_no_rot(NORM_ARGS);
+extern void _mesa_sparc_transform_normals(NORM_ARGS);
+extern void _mesa_sparc_normalize_normals(NORM_ARGS);
+extern void _mesa_sparc_rescale_normals(NORM_ARGS);
+
+
+
+void _mesa_init_all_sparc_transform_asm(void)
+{
+ ASSIGN_XFORM_GROUP(sparc, 1)
+ ASSIGN_XFORM_GROUP(sparc, 2)
+ ASSIGN_XFORM_GROUP(sparc, 3)
+ ASSIGN_XFORM_GROUP(sparc, 4)
+
+ _mesa_clip_tab[4] = _mesa_sparc_cliptest_points4;
+ _mesa_clip_np_tab[4] = _mesa_sparc_cliptest_points4_np;
+
+ _mesa_normal_tab[NORM_TRANSFORM | NORM_NORMALIZE] =
+ _mesa_sparc_transform_normalize_normals;
+ _mesa_normal_tab[NORM_TRANSFORM_NO_ROT | NORM_NORMALIZE] =
+ _mesa_sparc_transform_normalize_normals_no_rot;
+ _mesa_normal_tab[NORM_TRANSFORM_NO_ROT | NORM_RESCALE] =
+ _mesa_sparc_transform_rescale_normals_no_rot;
+ _mesa_normal_tab[NORM_TRANSFORM | NORM_RESCALE] =
+ _mesa_sparc_transform_rescale_normals;
+ _mesa_normal_tab[NORM_TRANSFORM_NO_ROT] =
+ _mesa_sparc_transform_normals_no_rot;
+ _mesa_normal_tab[NORM_TRANSFORM] =
+ _mesa_sparc_transform_normals;
+ _mesa_normal_tab[NORM_NORMALIZE] =
+ _mesa_sparc_normalize_normals;
+ _mesa_normal_tab[NORM_RESCALE] =
+ _mesa_sparc_rescale_normals;
+
+#ifdef DEBUG_MATH
+ _math_test_all_transform_functions("sparc");
+ _math_test_all_cliptest_functions("sparc");
+ _math_test_all_normal_transform_functions("sparc");
+#endif
+}
+
+#endif /* USE_SPARC_ASM */
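
To make the macro machinery concrete, DECLARE_XFORM_GROUP(sparc, 2) and ASSIGN_XFORM_GROUP(sparc, 2) expand to declarations and table assignments of the following shape (abridged here to two of the seven matrix types; the assignments live inside _mesa_init_all_sparc_transform_asm):

    /* From DECLARE_XFORM_GROUP(sparc, 2): */
    extern void _mesa_sparc_transform_points2_general(GLvector4f *to_vec,
                                                      const GLfloat m[16],
                                                      const GLvector4f *from_vec);
    extern void _mesa_sparc_transform_points2_identity(GLvector4f *to_vec,
                                                       const GLfloat m[16],
                                                       const GLvector4f *from_vec);

    /* From ASSIGN_XFORM_GROUP(sparc, 2): */
    _mesa_transform_tab[2][MATRIX_GENERAL] =
       _mesa_sparc_transform_points2_general;
    _mesa_transform_tab[2][MATRIX_IDENTITY] =
       _mesa_sparc_transform_points2_identity;
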
diff --git a/src/arch/sparc/sparc.h b/src/arch/sparc/sparc.h
new file mode 100644
index 0000000..b9ea336
--- /dev/null
+++ b/src/arch/sparc/sparc.h
@@ -0,0 +1,36 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.1
+ *
+ * Copyright (C) 1999 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Sparc assembly code by David S. Miller
+ */
+
+
+#ifndef SPARC_H
+#define SPARC_H
+
+extern void _mesa_init_all_sparc_transform_asm(void);
+
+#endif /* !(SPARC_H) */
diff --git a/src/arch/sparc/sparc_clip.S b/src/arch/sparc/sparc_clip.S
new file mode 100644
index 0000000..dc23917
--- /dev/null
+++ b/src/arch/sparc/sparc_clip.S
@@ -0,0 +1,233 @@
+/*
+ * Clip testing in SPARC assembly
+ */
+
+#ifdef __arch64__
+#define LDPTR ldx
+#define V4F_DATA 0x00
+#define V4F_START 0x08
+#define V4F_COUNT 0x10
+#define V4F_STRIDE 0x14
+#define V4F_SIZE 0x18
+#define V4F_FLAGS 0x1c
+#else
+#define LDPTR ld
+#define V4F_DATA 0x00
+#define V4F_START 0x04
+#define V4F_COUNT 0x08
+#define V4F_STRIDE 0x0c
+#define V4F_SIZE 0x10
+#define V4F_FLAGS 0x14
+#endif
+
+#define VEC_SIZE_1 1
+#define VEC_SIZE_2 3
+#define VEC_SIZE_3 7
+#define VEC_SIZE_4 15
+
+ .register %g2, #scratch
+ .register %g3, #scratch
+
+ .text
+ .align 64
+
+one_dot_zero:
+ .word 0x3f800000 /* 1.0f */
+
+ /* This trick is shamelessly stolen from the x86
+ * Mesa asm. Very clever, and we can do it too
+ * since we have the necessary add with carry
+ * instructions on Sparc.
+ */
+clip_table:
+ .byte 0, 1, 0, 2, 4, 5, 4, 6
+ .byte 0, 1, 0, 2, 8, 9, 8, 10
+ .byte 32, 33, 32, 34, 36, 37, 36, 38
+ .byte 32, 33, 32, 34, 40, 41, 40, 42
+ .byte 0, 1, 0, 2, 4, 5, 4, 6
+ .byte 0, 1, 0, 2, 8, 9, 8, 10
+ .byte 16, 17, 16, 18, 20, 21, 20, 22
+ .byte 16, 17, 16, 18, 24, 25, 24, 26
+ .byte 63, 61, 63, 62, 55, 53, 55, 54
+ .byte 63, 61, 63, 62, 59, 57, 59, 58
+ .byte 47, 45, 47, 46, 39, 37, 39, 38
+ .byte 47, 45, 47, 46, 43, 41, 43, 42
+ .byte 63, 61, 63, 62, 55, 53, 55, 54
+ .byte 63, 61, 63, 62, 59, 57, 59, 58
+ .byte 31, 29, 31, 30, 23, 21, 23, 22
+ .byte 31, 29, 31, 30, 27, 25, 27, 26
+
+/* GLvector4f *clip_vec, GLvector4f *proj_vec,
+ GLubyte clipMask[], GLubyte *orMask, GLubyte *andMask,
+   GLboolean viewport_z_clip */
+
+ .align 64
+__pc_tramp:
+ retl
+ nop
+
+ .globl _mesa_sparc_cliptest_points4
+_mesa_sparc_cliptest_points4:
+ save %sp, -64, %sp
+ call __pc_tramp
+ sub %o7, (. - one_dot_zero - 4), %g1
+ ld [%g1 + 0x0], %f4
+ add %g1, 0x4, %g1
+
+ ld [%i0 + V4F_STRIDE], %l1
+ ld [%i0 + V4F_COUNT], %l3
+ LDPTR [%i0 + V4F_START], %i0
+ LDPTR [%i1 + V4F_START], %i5
+ ldub [%i3], %g2
+ ldub [%i4], %g3
+ sll %g3, 8, %g3
+ or %g2, %g3, %g2
+
+ ld [%i1 + V4F_FLAGS], %g3
+ or %g3, VEC_SIZE_4, %g3
+ st %g3, [%i1 + V4F_FLAGS]
+ mov 3, %g3
+ st %g3, [%i1 + V4F_SIZE]
+ st %l3, [%i1 + V4F_COUNT]
+ clr %l2
+ clr %l0
+
+ /* l0: i
+ * l3: count
+ * l1: stride
+ * l2: c
+ * g2: (tmpAndMask << 8) | tmpOrMask
+ * g1: clip_table
+ * i0: from[stride][i]
+ * i2: clipMask
+ * i5: vProj[4][i]
+ */
+
+1: ld [%i0 + 0x0c], %f3 ! LSU Group
+ ld [%i0 + 0x0c], %g5 ! LSU Group
+ ld [%i0 + 0x08], %g4 ! LSU Group
+ fdivs %f4, %f3, %f8 ! FGM
+ addcc %g5, %g5, %g5 ! IEU1 Group
+ addx %g0, 0x0, %g3 ! IEU1 Group
+ addcc %g4, %g4, %g4 ! IEU1 Group
+ addx %g3, %g3, %g3 ! IEU1 Group
+ subcc %g5, %g4, %g0 ! IEU1 Group
+ ld [%i0 + 0x04], %g4 ! LSU Group
+ addx %g3, %g3, %g3 ! IEU1 Group
+ addcc %g4, %g4, %g4 ! IEU1 Group
+ addx %g3, %g3, %g3 ! IEU1 Group
+ subcc %g5, %g4, %g0 ! IEU1 Group
+ ld [%i0 + 0x00], %g4 ! LSU Group
+ addx %g3, %g3, %g3 ! IEU1 Group
+ addcc %g4, %g4, %g4 ! IEU1 Group
+ addx %g3, %g3, %g3 ! IEU1 Group
+ subcc %g5, %g4, %g0 ! IEU1 Group
+ addx %g3, %g3, %g3 ! IEU1 Group
+ ldub [%g1 + %g3], %g3 ! LSU Group
+ cmp %g3, 0 ! IEU1 Group, stall
+ be 2f ! CTI
+ stb %g3, [%i2] ! LSU
+ sll %g3, 8, %g4 ! IEU1 Group
+ add %l2, 1, %l2 ! IEU0
+ st %g0, [%i5 + 0x00] ! LSU
+ or %g4, 0xff, %g4 ! IEU0 Group
+ or %g2, %g3, %g2 ! IEU1
+ st %g0, [%i5 + 0x04] ! LSU
+ and %g2, %g4, %g2 ! IEU0 Group
+ st %g0, [%i5 + 0x08] ! LSU
+ b 3f ! CTI
+ st %f4, [%i5 + 0x0c] ! LSU Group
+2: ld [%i0 + 0x00], %f0 ! LSU Group
+ ld [%i0 + 0x04], %f1 ! LSU Group
+ ld [%i0 + 0x08], %f2 ! LSU Group
+ fmuls %f0, %f8, %f0 ! FGM
+ st %f0, [%i5 + 0x00] ! LSU Group
+ fmuls %f1, %f8, %f1 ! FGM
+ st %f1, [%i5 + 0x04] ! LSU Group
+ fmuls %f2, %f8, %f2 ! FGM
+ st %f2, [%i5 + 0x08] ! LSU Group
+ st %f8, [%i5 + 0x0c] ! LSU Group
+3: add %i5, 0x10, %i5 ! IEU1
+ add %l0, 1, %l0 ! IEU0 Group
+ add %i2, 1, %i2 ! IEU0 Group
+ cmp %l0, %l3 ! IEU1 Group
+ bne 1b ! CTI
+ add %i0, %l1, %i0 ! IEU0 Group
+ stb %g2, [%i3] ! LSU
+ srl %g2, 8, %g3 ! IEU0 Group
+ cmp %l2, %l3 ! IEU1 Group
+ bl,a 1f ! CTI
+ clr %g3 ! IEU0
+1: stb %g3, [%i4] ! LSU Group
+ ret ! CTI Group
+ restore %i1, 0x0, %o0
+
+ .globl _mesa_sparc_cliptest_points4_np
+_mesa_sparc_cliptest_points4_np:
+ save %sp, -64, %sp
+
+ call __pc_tramp
+ sub %o7, (. - one_dot_zero - 4), %g1
+ add %g1, 0x4, %g1
+
+ ld [%i0 + V4F_STRIDE], %l1
+ ld [%i0 + V4F_COUNT], %l3
+ LDPTR [%i0 + V4F_START], %i0
+ ldub [%i3], %g2
+ ldub [%i4], %g3
+ sll %g3, 8, %g3
+ or %g2, %g3, %g2
+
+ clr %l2
+ clr %l0
+
+ /* l0: i
+ * l3: count
+ * l1: stride
+ * l2: c
+ * g2: (tmpAndMask << 8) | tmpOrMask
+ * g1: clip_table
+ * i0: from[stride][i]
+ * i2: clipMask
+ */
+
+1: ld [%i0 + 0x0c], %g5 ! LSU Group
+ ld [%i0 + 0x08], %g4 ! LSU Group
+ addcc %g5, %g5, %g5 ! IEU1 Group
+ addx %g0, 0x0, %g3 ! IEU1 Group
+ addcc %g4, %g4, %g4 ! IEU1 Group
+ addx %g3, %g3, %g3 ! IEU1 Group
+ subcc %g5, %g4, %g0 ! IEU1 Group
+ ld [%i0 + 0x04], %g4 ! LSU Group
+ addx %g3, %g3, %g3 ! IEU1 Group
+ addcc %g4, %g4, %g4 ! IEU1 Group
+ addx %g3, %g3, %g3 ! IEU1 Group
+ subcc %g5, %g4, %g0 ! IEU1 Group
+ ld [%i0 + 0x00], %g4 ! LSU Group
+ addx %g3, %g3, %g3 ! IEU1 Group
+ addcc %g4, %g4, %g4 ! IEU1 Group
+ addx %g3, %g3, %g3 ! IEU1 Group
+ subcc %g5, %g4, %g0 ! IEU1 Group
+ addx %g3, %g3, %g3 ! IEU1 Group
+ ldub [%g1 + %g3], %g3 ! LSU Group
+ cmp %g3, 0 ! IEU1 Group, stall
+ be 2f ! CTI
+ stb %g3, [%i2] ! LSU
+ sll %g3, 8, %g4 ! IEU1 Group
+ add %l2, 1, %l2 ! IEU0
+ or %g4, 0xff, %g4 ! IEU0 Group
+ or %g2, %g3, %g2 ! IEU1
+ and %g2, %g4, %g2 ! IEU0 Group
+2: add %l0, 1, %l0 ! IEU0 Group
+ add %i2, 1, %i2 ! IEU0 Group
+ cmp %l0, %l3 ! IEU1 Group
+ bne 1b ! CTI
+ add %i0, %l1, %i0 ! IEU0 Group
+ stb %g2, [%i3] ! LSU
+ srl %g2, 8, %g3 ! IEU0 Group
+ cmp %l2, %l3 ! IEU1 Group
+ bl,a 1f ! CTI
+ clr %g3 ! IEU0
+1: stb %g3, [%i4] ! LSU Group
+ ret ! CTI Group
+ restore %i1, 0x0, %o0
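
The add-with-carry sequence above is a branch-free encoding of the standard view-volume outcode test. The sketch below shows the per-vertex semantics in C; it is illustrative only (the bit assignments are assumptions, and the assembly instead accumulates carry bits into an index and reads the finished outcode from clip_table):

    /* Hedged C equivalent of one iteration of _mesa_sparc_cliptest_points4. */
    static unsigned char clip_outcode(const float c[4])   /* c = (x, y, z, w) */
    {
       unsigned char mask = 0;
       if (c[0] >  c[3]) mask |= 0x01;   /* right  */
       if (c[0] < -c[3]) mask |= 0x02;   /* left   */
       if (c[1] >  c[3]) mask |= 0x04;   /* top    */
       if (c[1] < -c[3]) mask |= 0x08;   /* bottom */
       if (c[2] >  c[3]) mask |= 0x10;   /* far    */
       if (c[2] < -c[3]) mask |= 0x20;   /* near   */
       return mask;
    }

A zero outcode means the vertex is inside: it is projected by multiplying x, y, z with 1/w (computed once via fdivs). A non-zero outcode writes (0, 0, 0, 1) to the projected slot and only updates the running OR/AND masks.
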
diff --git a/src/arch/sparc/sparc_matrix.h b/src/arch/sparc/sparc_matrix.h
new file mode 100644
index 0000000..f677d9b
--- /dev/null
+++ b/src/arch/sparc/sparc_matrix.h
@@ -0,0 +1,170 @@
+/*
+ * SPARC assembly matrix code.
+ */
+
+#ifndef _SPARC_MATRIX_H
+#define _SPARC_MATRIX_H
+
+#ifdef __arch64__
+#define LDPTR ldx
+#define MAT_M 0x00
+#define MAT_INV 0x08
+#define V4F_DATA 0x00
+#define V4F_START 0x08
+#define V4F_COUNT 0x10
+#define V4F_STRIDE 0x14
+#define V4F_SIZE 0x18
+#define V4F_FLAGS 0x1c
+#else
+#define LDPTR ld
+#define MAT_M 0x00
+#define MAT_INV 0x04
+#define V4F_DATA 0x00
+#define V4F_START 0x04
+#define V4F_COUNT 0x08
+#define V4F_STRIDE 0x0c
+#define V4F_SIZE 0x10
+#define V4F_FLAGS 0x14
+#endif
+
+#define VEC_SIZE_1 1
+#define VEC_SIZE_2 3
+#define VEC_SIZE_3 7
+#define VEC_SIZE_4 15
+
+#define M0 %f16
+#define M1 %f17
+#define M2 %f18
+#define M3 %f19
+#define M4 %f20
+#define M5 %f21
+#define M6 %f22
+#define M7 %f23
+#define M8 %f24
+#define M9 %f25
+#define M10 %f26
+#define M11 %f27
+#define M12 %f28
+#define M13 %f29
+#define M14 %f30
+#define M15 %f31
+
+#define LDMATRIX_0_1_2_3_12_13_14_15(BASE) \
+ ldd [BASE + ( 0 * 0x4)], M0; \
+ ldd [BASE + ( 2 * 0x4)], M2; \
+ ldd [BASE + (12 * 0x4)], M12; \
+ ldd [BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_1_12_13(BASE) \
+ ldd [BASE + ( 0 * 0x4)], M0; \
+ ldd [BASE + (12 * 0x4)], M12
+
+#define LDMATRIX_0_12_13(BASE) \
+ ld [BASE + ( 0 * 0x4)], M0; \
+ ldd [BASE + (12 * 0x4)], M12
+
+#define LDMATRIX_0_1_2_12_13_14(BASE) \
+ ldd [BASE + ( 0 * 0x4)], M0; \
+ ld [BASE + ( 2 * 0x4)], M2; \
+ ldd [BASE + (12 * 0x4)], M12; \
+ ld [BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_12_13_14(BASE) \
+ ld [BASE + ( 0 * 0x4)], M0; \
+ ldd [BASE + (12 * 0x4)], M12; \
+ ld [BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_14(BASE) \
+ ld [BASE + ( 0 * 0x4)], M0; \
+ ld [BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_1_2_3_4_5_6_7_12_13_14_15(BASE) \
+ ldd [BASE + ( 0 * 0x4)], M0; \
+ ldd [BASE + ( 2 * 0x4)], M2; \
+ ldd [BASE + ( 4 * 0x4)], M4; \
+ ldd [BASE + ( 6 * 0x4)], M6; \
+ ldd [BASE + (12 * 0x4)], M12; \
+ ldd [BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_5_12_13(BASE) \
+ ld [BASE + ( 0 * 0x4)], M0; \
+ ld [BASE + ( 5 * 0x4)], M5; \
+ ldd [BASE + (12 * 0x4)], M12
+
+#define LDMATRIX_0_1_2_3_4_5_6_12_13_14(BASE) \
+ ldd [BASE + ( 0 * 0x4)], M0; \
+ ldd [BASE + ( 2 * 0x4)], M2; \
+ ldd [BASE + ( 4 * 0x4)], M4; \
+ ld [BASE + ( 6 * 0x4)], M6; \
+ ldd [BASE + (12 * 0x4)], M12; \
+ ld [BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_5_12_13_14(BASE) \
+ ld [BASE + ( 0 * 0x4)], M0; \
+ ld [BASE + ( 5 * 0x4)], M5; \
+ ldd [BASE + (12 * 0x4)], M12; \
+ ld [BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_5_14(BASE) \
+ ld [BASE + ( 0 * 0x4)], M0; \
+ ld [BASE + ( 5 * 0x4)], M5; \
+ ld [BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15(BASE) \
+ ldd [BASE + ( 0 * 0x4)], M0; \
+ ldd [BASE + ( 2 * 0x4)], M2; \
+ ldd [BASE + ( 4 * 0x4)], M4; \
+ ldd [BASE + ( 6 * 0x4)], M6; \
+ ldd [BASE + ( 8 * 0x4)], M8; \
+ ldd [BASE + (10 * 0x4)], M10; \
+ ldd [BASE + (12 * 0x4)], M12; \
+ ldd [BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_1_4_5_12_13(BASE) \
+ ldd [BASE + ( 0 * 0x4)], M0; \
+ ldd [BASE + ( 4 * 0x4)], M4; \
+ ldd [BASE + (12 * 0x4)], M12
+
+#define LDMATRIX_0_5_12_13(BASE) \
+ ld [BASE + ( 0 * 0x4)], M0; \
+ ld [BASE + ( 5 * 0x4)], M5; \
+ ldd [BASE + (12 * 0x4)], M12
+
+#define LDMATRIX_0_1_2_4_5_6_8_9_10(BASE) \
+ ldd [BASE + ( 0 * 0x4)], M0; \
+ ld [BASE + ( 2 * 0x4)], M2; \
+ ldd [BASE + ( 4 * 0x4)], M4; \
+ ld [BASE + ( 6 * 0x4)], M6; \
+ ldd [BASE + ( 8 * 0x4)], M8; \
+ ld [BASE + (10 * 0x4)], M10
+
+#define LDMATRIX_0_1_2_4_5_6_8_9_10_12_13_14(BASE) \
+ ldd [BASE + ( 0 * 0x4)], M0; \
+ ld [BASE + ( 2 * 0x4)], M2; \
+ ldd [BASE + ( 4 * 0x4)], M4; \
+ ld [BASE + ( 6 * 0x4)], M6; \
+ ldd [BASE + ( 8 * 0x4)], M8; \
+ ld [BASE + (10 * 0x4)], M10; \
+ ldd [BASE + (12 * 0x4)], M12; \
+ ld [BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_5_10(BASE) \
+	ld	[BASE + ( 0 * 0x4)], M0; \
+	ld	[BASE + ( 5 * 0x4)], M5; \
+	ld	[BASE + (10 * 0x4)], M10
+
+#define LDMATRIX_0_5_10_12_13_14(BASE) \
+ ld [BASE + ( 0 * 0x4)], M0; \
+ ld [BASE + ( 5 * 0x4)], M5; \
+ ld [BASE + (10 * 0x4)], M10; \
+ ldd [BASE + (12 * 0x4)], M12; \
+ ld [BASE + (14 * 0x4)], M14
+
+#define LDMATRIX_0_5_8_9_10_14(BASE) \
+ ld [BASE + ( 0 * 0x4)], M0; \
+ ld [BASE + ( 5 * 0x4)], M5; \
+ ldd [BASE + ( 8 * 0x4)], M8; \
+ ld [BASE + (10 * 0x4)], M10; \
+ ld [BASE + (14 * 0x4)], M14
+
+#endif /* !(_SPARC_MATRIX_H) */
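
A point worth noting about the macros above: pairs starting at even indices (M0/M1, M12/M13, ...) are fetched with ldd, a single 64-bit load for two adjacent floats, which is only sound because the matrix is a flat, contiguous float[16]. A small hedged C illustration of that layout assumption (GCC-style alignment attribute assumed):

    #include <stdio.h>
    #include <string.h>

    /* Sketch only: mimics what 'ldd [BASE + (12 * 0x4)], M12' relies on --
     * elements 12 and 13 of a flat float[16] occupy one naturally aligned
     * 64-bit chunk that a single load can fetch. */
    int main(void)
    {
       float m[16] __attribute__((aligned(8)));
       for (int i = 0; i < 16; i++)
          m[i] = (float) i;

       double pair;                          /* stand-in for the %f28/%f29 pair */
       memcpy(&pair, &m[12], sizeof pair);   /* one 64-bit access, like ldd */
       (void) pair;
       printf("fetched m[12] and m[13] as one 8-byte chunk\n");
       return 0;
    }
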
diff --git a/src/arch/sparc/xform.S b/src/arch/sparc/xform.S
new file mode 100644
index 0000000..2a7cce4
--- /dev/null
+++ b/src/arch/sparc/xform.S
@@ -0,0 +1,1392 @@
+
+ /* TODO
+ *
+ * 1) It would be nice if load/store double could be used
+ * at least for the matrix parts. I think for the matrices
+ * it is safe, but for the vertices it probably is not due to
+ * things like glInterleavedArrays etc.
+ *
+ * UPDATE: Trying this now in sparc_matrix.h -DaveM_990624
+ *
+	 * 2) One extremely slick trick would be to enclose groups of
+	 *    xform calls on the same vertices so that the matrix is
+	 *    loaded into f16-->f31 once before the calls and the
+	 *    per-call matrix loads here could be skipped.  This may
+	 *    be tricky and not much of a gain though.
+ */
+
+#include "sparc_matrix.h"
+
+#if defined(SVR4) || defined(__SVR4) || defined(__svr4__) || defined(__arch64__)
+ /* Solaris requires this for 64-bit. */
+ .register %g2, #scratch
+ .register %g3, #scratch
+#endif
+
+ .text
+ .align 64
+
+__set_v4f_1:
+ ld [%o0 + V4F_FLAGS], %g2
+ mov 1, %g1
+ st %g1, [%o0 + V4F_SIZE]
+ or %g2, VEC_SIZE_1, %g2
+ retl
+ st %g2, [%o0 + V4F_FLAGS]
+__set_v4f_2:
+ ld [%o0 + V4F_FLAGS], %g2
+ mov 2, %g1
+ st %g1, [%o0 + V4F_SIZE]
+ or %g2, VEC_SIZE_2, %g2
+ retl
+ st %g2, [%o0 + V4F_FLAGS]
+__set_v4f_3:
+ ld [%o0 + V4F_FLAGS], %g2
+ mov 3, %g1
+ st %g1, [%o0 + V4F_SIZE]
+ or %g2, VEC_SIZE_3, %g2
+ retl
+ st %g2, [%o0 + V4F_FLAGS]
+__set_v4f_4:
+ ld [%o0 + V4F_FLAGS], %g2
+ mov 4, %g1
+ st %g1, [%o0 + V4F_SIZE]
+ or %g2, VEC_SIZE_4, %g2
+ retl
+ st %g2, [%o0 + V4F_FLAGS]
+
+ /* First the raw versions. */
+
+ .globl _mesa_sparc_transform_points1_general
+_mesa_sparc_transform_points1_general:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_1_2_3_12_13_14_15(%o1)
+
+ cmp %g3, 1
+ st %g3, [%o0 + V4F_COUNT]
+ bl 3f
+ clr %o1
+
+ be 2f
+ andn %g3, 1, %o2
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ add %g1, %o5, %g1 ! IEU0
+ ld [%g1 + 0x00], %f8 ! LSU Group
+ add %o1, 2, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ fmuls %f0, M0, %f1 ! FGM Group 1-cycle stall on %f0
+ fmuls %f0, M1, %f2 ! FGM Group
+ fmuls %f0, M2, %f3 ! FGM Group
+ fmuls %f0, M3, %f4 ! FGM Group
+ fmuls %f8, M0, %f9 ! FGM Group f1 available
+ fadds %f1, M12, %f1 ! FGA
+ st %f1, [%g2 + 0x00] ! LSU
+ fmuls %f8, M1, %f10 ! FGM Group f2 available
+ fadds %f2, M13, %f2 ! FGA
+ st %f2, [%g2 + 0x04] ! LSU
+ fmuls %f8, M2, %f11 ! FGM Group f3 available
+ fadds %f3, M14, %f3 ! FGA
+ st %f3, [%g2 + 0x08] ! LSU
+ fmuls %f8, M3, %f12 ! FGM Group f4 available
+ fadds %f4, M15, %f4 ! FGA
+ st %f4, [%g2 + 0x0c] ! LSU
+ fadds %f9, M12, %f9 ! FGA Group f9 available
+ st %f9, [%g2 + 0x10] ! LSU
+ fadds %f10, M13, %f10 ! FGA Group f10 available
+ st %f10, [%g2 + 0x14] ! LSU
+ fadds %f11, M14, %f11 ! FGA Group f11 available
+ st %f11, [%g2 + 0x18] ! LSU
+ fadds %f12, M15, %f12 ! FGA Group f12 available
+ st %f12, [%g2 + 0x1c] ! LSU
+ cmp %o1, %o2 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x20, %g2 ! IEU0 Group
+
+ cmp %o1, %g3
+ be 3f
+ nop
+
+2: ld [%g1 + 0x00], %f0 ! LSU Group
+ fmuls %f0, M0, %f1 ! FGM Group 1-cycle stall on %f0
+ fmuls %f0, M1, %f2 ! FGM Group
+ fmuls %f0, M2, %f3 ! FGM Group
+ fmuls %f0, M3, %f4 ! FGM Group
+ fadds %f1, M12, %f1 ! FGA Group
+ st %f1, [%g2 + 0x00] ! LSU
+ fadds %f2, M13, %f2 ! FGA Group
+ st %f2, [%g2 + 0x04] ! LSU
+ fadds %f3, M14, %f3 ! FGA Group
+ st %f3, [%g2 + 0x08] ! LSU
+ fadds %f4, M15, %f4 ! FGA Group
+ st %f4, [%g2 + 0x0c] ! LSU
+
+3:
+ ba __set_v4f_4
+ nop
+
+ .globl _mesa_sparc_transform_points1_identity
+_mesa_sparc_transform_points1_identity:
+ cmp %o0, %o2
+ be 4f
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ cmp %g3, 1
+ st %g3, [%o0 + V4F_COUNT]
+ bl 3f
+ clr %o1
+
+ be 2f
+ andn %g3, 1, %o2
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ add %g1, %o5, %g1 ! IEU0
+ ld [%g1 + 0x00], %f1 ! LSU Group
+ add %o1, 2, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ st %f0, [%g2 + 0x00] ! LSU Group
+ cmp %o1, %o2 ! IEU1
+ st %f1, [%g2 + 0x10] ! LSU Group
+ bne 1b ! CTI
+ add %g2, 0x20, %g2 ! IEU0
+
+ cmp %o1, %g3
+ be 3f
+ nop
+
+2: ld [%g1 + 0x00], %f0
+ addx %g0, %g0, %g0
+ st %f0, [%g2 + 0x00]
+
+3:
+ ba __set_v4f_1
+ nop
+
+4: retl
+ nop
+
+ .globl _mesa_sparc_transform_points1_2d
+_mesa_sparc_transform_points1_2d:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_1_12_13(%o1)
+
+ cmp %g3, 1
+ st %g3, [%o0 + V4F_COUNT]
+ bl 3f
+ clr %o1
+
+ be 2f
+ andn %g3, 1, %o2
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ add %g1, %o5, %g1 ! IEU0
+ ld [%g1 + 0x00], %f8 ! LSU Group
+ add %o1, 2, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ fmuls %f0, M0, %f1 ! FGM Group
+ fmuls %f0, M1, %f2 ! FGM Group
+ fmuls %f8, M0, %f9 ! FGM Group
+ fmuls %f8, M1, %f10 ! FGM Group
+ fadds %f1, M12, %f3 ! FGA Group f1 available
+ st %f3, [%g2 + 0x00] ! LSU
+ fadds %f2, M13, %f4 ! FGA Group f2 available
+ st %f4, [%g2 + 0x04] ! LSU
+ fadds %f9, M12, %f11 ! FGA Group f9 available
+ st %f11, [%g2 + 0x10] ! LSU
+ fadds %f10, M13, %f12 ! FGA Group f10 available
+ st %f12, [%g2 + 0x14] ! LSU
+ cmp %o1, %o2 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x20, %g2 ! IEU0 Group
+
+ cmp %o1, %g3
+ be 3f
+ nop
+
+2: ld [%g1 + 0x00], %f0
+ fmuls %f0, M0, %f1
+ fmuls %f0, M1, %f2
+ fadds %f1, M12, %f3
+ st %f3, [%g2 + 0x00]
+ fadds %f2, M13, %f4
+ st %f4, [%g2 + 0x04]
+
+3:
+ ba __set_v4f_2
+ nop
+
+ .globl _mesa_sparc_transform_points1_2d_no_rot
+_mesa_sparc_transform_points1_2d_no_rot:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_12_13(%o1)
+
+ cmp %g3, 1
+ st %g3, [%o0 + V4F_COUNT]
+ bl 3f
+ clr %o1
+
+ be 2f
+ andn %g3, 1, %o2
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ add %g1, %o5, %g1 ! IEU0
+ ld [%g1 + 0x00], %f4 ! LSU Group
+ add %o1, 2, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ fmuls %f0, M0, %f1 ! FGM Group
+ fmuls %f4, M0, %f5 ! FGM Group
+ fadds %f1, M12, %f3 ! FGA Group, 2 cycle stall, f1 available
+ st %f3, [%g2 + 0x00] ! LSU
+ st M13, [%g2 + 0x04] ! LSU Group, f5 available
+ fadds %f5, M12, %f6 ! FGA
+ st %f6, [%g2 + 0x10] ! LSU Group
+ st M13, [%g2 + 0x14] ! LSU Group
+ cmp %o1, %o2 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x20, %g2 ! IEU0 Group
+
+ cmp %o1, %g3
+ be 3f
+ nop
+
+2: ld [%g1 + 0x00], %f0
+ fmuls %f0, M0, %f1
+ fadds %f1, M12, %f3
+ st %f3, [%g2 + 0x00]
+ st M13, [%g2 + 0x04]
+
+3:
+ ba __set_v4f_2
+ nop
+
+ .globl _mesa_sparc_transform_points1_3d
+_mesa_sparc_transform_points1_3d:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_1_2_12_13_14(%o1)
+
+ cmp %g3, 1
+ st %g3, [%o0 + V4F_COUNT]
+ bl 3f
+ clr %o1
+
+ be 2f
+ andn %g3, 1, %o2
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ add %g1, %o5, %g1 ! IEU0
+ ld [%g1 + 0x00], %f4 ! LSU Group
+ add %o1, 2, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ fmuls %f0, M0, %f1 ! FGM Group
+ fmuls %f0, M1, %f2 ! FGM Group
+ fmuls %f0, M2, %f3 ! FGM Group
+ fmuls %f4, M0, %f5 ! FGM Group
+ fadds %f1, M12, %f1 ! FGA Group, f1 available
+ st %f1, [%g2 + 0x00] ! LSU
+ fmuls %f4, M1, %f6 ! FGM
+ fadds %f2, M13, %f2 ! FGA Group, f2 available
+ st %f2, [%g2 + 0x04] ! LSU
+ fmuls %f4, M2, %f7 ! FGM
+ fadds %f3, M14, %f3 ! FGA Group, f3 available
+ st %f3, [%g2 + 0x08] ! LSU
+ fadds %f5, M12, %f5 ! FGA Group, f5 available
+ st %f5, [%g2 + 0x10] ! LSU
+ fadds %f6, M13, %f6 ! FGA Group, f6 available
+ st %f6, [%g2 + 0x14] ! LSU
+ fadds %f7, M14, %f7 ! FGA Group, f7 available
+ st %f7, [%g2 + 0x18] ! LSU
+ cmp %o1, %o2 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x20, %g2 ! IEU0 Group
+
+ cmp %o1, %g3
+ be 3f
+ nop
+
+2: ld [%g1 + 0x00], %f0
+ fmuls %f0, M0, %f1
+ fmuls %f0, M1, %f2
+ fmuls %f0, M2, %f3
+ fadds %f1, M12, %f1
+ st %f1, [%g2 + 0x00]
+ fadds %f2, M13, %f2
+ st %f2, [%g2 + 0x04]
+ fadds %f3, M14, %f3
+ st %f3, [%g2 + 0x08]
+
+3:
+ ba __set_v4f_3
+ nop
+
+ .globl _mesa_sparc_transform_points1_3d_no_rot
+_mesa_sparc_transform_points1_3d_no_rot:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_12_13_14(%o1)
+
+ cmp %g3, 1
+ st %g3, [%o0 + V4F_COUNT]
+ bl 3f
+ clr %o1
+
+ be 2f
+ andn %g3, 1, %o2
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ add %g1, %o5, %g1 ! IEU0
+ ld [%g1 + 0x00], %f2 ! LSU Group
+ add %o1, 2, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ fmuls %f0, M0, %f1 ! FGM Group
+ fmuls %f2, M0, %f3 ! FGM Group
+ fadds %f1, M12, %f1 ! FGA Group, 2 cycle stall, f1 available
+ st %f1, [%g2 + 0x00] ! LSU
+ fadds %f3, M12, %f3 ! FGA Group, f3 available
+ st M13, [%g2 + 0x04] ! LSU
+ st M14, [%g2 + 0x08] ! LSU Group
+ st %f3, [%g2 + 0x10] ! LSU Group
+ st M13, [%g2 + 0x14] ! LSU Group
+ st M14, [%g2 + 0x18] ! LSU Group
+ cmp %o1, %o2 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x20, %g2 ! IEU0 Group
+
+ cmp %o1, %g3
+ be 3f
+ nop
+
+2: ld [%g1 + 0x00], %f0
+ fmuls %f0, M0, %f1
+ fadds %f1, M12, %f1
+ st %f1, [%g2 + 0x00]
+ st M13, [%g2 + 0x04]
+ st M14, [%g2 + 0x08]
+
+3:
+ ba __set_v4f_3
+ nop
+
+ .globl _mesa_sparc_transform_points1_perspective
+_mesa_sparc_transform_points1_perspective:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_14(%o1)
+
+ cmp %g3, 1
+ st %g3, [%o0 + V4F_COUNT]
+ bl 3f
+ clr %o1
+
+ be 2f
+ andn %g3, 1, %o2
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ add %g1, %o5, %g1 ! IEU0
+ ld [%g1 + 0x00], %f2 ! LSU Group
+ add %o1, 2, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ fmuls %f0, M0, %f1 ! FGM Group
+ st %f1, [%g2 + 0x00] ! LSU
+ fmuls %f2, M0, %f3 ! FGM Group
+ st %g0, [%g2 + 0x04] ! LSU
+ st M14, [%g2 + 0x08] ! LSU Group
+ st %g0, [%g2 + 0x0c] ! LSU Group
+ st %f3, [%g2 + 0x10] ! LSU Group
+ st %g0, [%g2 + 0x14] ! LSU Group
+ st M14, [%g2 + 0x18] ! LSU Group
+ st %g0, [%g2 + 0x1c] ! LSU Group
+ cmp %o1, %o2 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x20, %g2 ! IEU0 Group
+
+ cmp %o1, %g3
+ be 3f
+ nop
+
+2: ld [%g1 + 0x00], %f0
+ fmuls %f0, M0, %f1
+ st %f1, [%g2 + 0x00]
+ st %g0, [%g2 + 0x04]
+ st M14, [%g2 + 0x08]
+ st %g0, [%g2 + 0x0c]
+
+3:
+ ba __set_v4f_4
+ nop
+
+ .globl _mesa_sparc_transform_points2_general
+_mesa_sparc_transform_points2_general:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_1_2_3_4_5_6_7_12_13_14_15(%o1)
+
+ cmp %g3, 0
+ st %g3, [%o0 + V4F_COUNT]
+ be 2f
+ clr %o1
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ add %o1, 1, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ fmuls %f0, M0, %f2 ! FGM Group
+ fmuls %f0, M1, %f3 ! FGM Group
+ fmuls %f0, M2, %f4 ! FGM Group
+ fmuls %f0, M3, %f5 ! FGM Group
+ fadds %f2, M12, %f2 ! FGA Group f2 available
+ fmuls %f1, M4, %f6 ! FGM
+ fadds %f3, M13, %f3 ! FGA Group f3 available
+ fmuls %f1, M5, %f7 ! FGM
+ fadds %f4, M14, %f4 ! FGA Group f4 available
+ fmuls %f1, M6, %f8 ! FGM
+ fadds %f5, M15, %f5 ! FGA Group f5 available
+ fmuls %f1, M7, %f9 ! FGM
+ fadds %f2, %f6, %f2 ! FGA Group f6 available
+ st %f2, [%g2 + 0x00] ! LSU
+ fadds %f3, %f7, %f3 ! FGA Group f7 available
+ st %f3, [%g2 + 0x04] ! LSU
+ fadds %f4, %f8, %f4 ! FGA Group f8 available
+ st %f4, [%g2 + 0x08] ! LSU
+ fadds %f5, %f9, %f5 ! FGA Group f9 available
+ st %f5, [%g2 + 0x0c] ! LSU
+ cmp %o1, %g3 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x10, %g2 ! IEU0 Group
+2:
+ ba __set_v4f_4
+ nop
+
+ .globl _mesa_sparc_transform_points2_identity
+_mesa_sparc_transform_points2_identity:
+ cmp %o2, %o0
+ be 3f
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ cmp %g3, 0
+ st %g3, [%o0 + V4F_COUNT]
+ be 2f
+ clr %o1
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ add %o1, 1, %o1 ! IEU0
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ add %g1, %o5, %g1 ! IEU0
+ cmp %o1, %g3 ! IEU1
+ st %f0, [%g2 + 0x00] ! LSU Group
+ st %f1, [%g2 + 0x04] ! LSU Group
+ bne 1b ! CTI
+ add %g2, 0x10, %g2 ! IEU0
+2:
+ ba __set_v4f_2
+ nop
+
+3: retl
+ nop
+
+ .globl _mesa_sparc_transform_points2_2d
+_mesa_sparc_transform_points2_2d:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_1_4_5_12_13(%o1)
+
+ cmp %g3, 1
+ st %g3, [%o0 + V4F_COUNT]
+ bl 3f
+ clr %o1
+
+ be 2f
+ andn %g3, 1, %o2
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ add %o1, 2, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ fmuls %f0, M0, %f2 ! FGM
+ ld [%g1 + 0x00], %f8 ! LSU Group
+ fmuls %f0, M1, %f3 ! FGM
+ ld [%g1 + 0x04], %f9 ! LSU Group
+ fmuls %f1, M4, %f6 ! FGM
+ fmuls %f1, M5, %f7 ! FGM Group
+ add %g1, %o5, %g1 ! IEU0
+ fmuls %f8, M0, %f10 ! FGM Group f2 available
+ fadds %f2, M12, %f2 ! FGA
+ fmuls %f8, M1, %f11 ! FGM Group f3 available
+ fadds %f3, M13, %f3 ! FGA
+ fmuls %f9, M4, %f12 ! FGM Group
+ fmuls %f9, M5, %f13 ! FGM Group
+ fadds %f10, M12, %f10 ! FGA Group f2, f10 available
+ fadds %f2, %f6, %f2 ! FGA Group f3, f11 available
+ st %f2, [%g2 + 0x00] ! LSU
+ fadds %f11, M13, %f11 ! FGA Group f12 available
+ fadds %f3, %f7, %f3 ! FGA Group f13 available
+ st %f3, [%g2 + 0x04] ! LSU
+ fadds %f10, %f12, %f10 ! FGA Group f10 available
+ st %f10, [%g2 + 0x10] ! LSU
+ fadds %f11, %f13, %f11 ! FGA Group f11 available
+ st %f11, [%g2 + 0x14] ! LSU
+ cmp %o1, %o2 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x20, %g2 ! IEU0 Group
+
+ cmp %o1, %g3
+ be 3f
+ nop
+
+2: ld [%g1 + 0x00], %f0 ! LSU Group
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ fmuls %f0, M0, %f2 ! FGM Group
+ fmuls %f0, M1, %f3 ! FGM Group
+ fmuls %f1, M4, %f6 ! FGM Group
+ fmuls %f1, M5, %f7 ! FGM Group
+ fadds %f2, M12, %f2 ! FGA Group f2 available
+ fadds %f3, M13, %f3 ! FGA Group f3 available
+ fadds %f2, %f6, %f2 ! FGA Group 2 cycle stall, f2 available
+ st %f2, [%g2 + 0x00] ! LSU
+ fadds %f3, %f7, %f3 ! FGA Group f3 available
+ st %f3, [%g2 + 0x04] ! LSU
+
+3:
+ ba __set_v4f_2
+ nop
+
+ .globl _mesa_sparc_transform_points2_2d_no_rot
+_mesa_sparc_transform_points2_2d_no_rot:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_5_12_13(%o1)
+
+ cmp %g3, 1
+ st %g3, [%o0 + V4F_COUNT]
+ bl 3f
+ clr %o1
+
+ be 2f
+ andn %g3, 1, %o2
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ add %o1, 2, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ ld [%g1 + 0x00], %f4 ! LSU Group
+ fmuls %f0, M0, %f2 ! FGM
+ ld [%g1 + 0x04], %f5 ! LSU Group
+ fmuls %f1, M5, %f3 ! FGM
+ fmuls %f4, M0, %f6 ! FGM Group
+ add %g1, %o5, %g1 ! IEU0
+ fmuls %f5, M5, %f7 ! FGM Group
+ fadds %f2, M12, %f2 ! FGA Group f2 available
+ st %f2, [%g2 + 0x00] ! LSU
+ fadds %f3, M13, %f3 ! FGA Group f3 available
+ st %f3, [%g2 + 0x04] ! LSU
+ fadds %f6, M12, %f6 ! FGA Group f6 available
+ st %f6, [%g2 + 0x10] ! LSU
+ fadds %f7, M13, %f7 ! FGA Group f7 available
+ st %f7, [%g2 + 0x14] ! LSU
+ cmp %o1, %o2 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x20, %g2 ! IEU0 Group
+
+ cmp %o1, %g3
+ be 3f
+ nop
+
+2: ld [%g1 + 0x00], %f0 ! LSU Group
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ fmuls %f0, M0, %f2 ! FGM Group
+ fmuls %f1, M5, %f3 ! FGM Group
+ fadds %f2, M12, %f2 ! FGA Group, 2 cycle stall, f2 available
+ st %f2, [%g2 + 0x00] ! LSU
+ fadds %f3, M13, %f3 ! FGA Group f3 available
+ st %f3, [%g2 + 0x04] ! LSU
+
+3:
+ ba __set_v4f_2
+ nop
+
+ /* orig: 12 cycles */
+ .globl _mesa_sparc_transform_points2_3d
+_mesa_sparc_transform_points2_3d:
+ ld [%o2 + V4F_STRIDE], %o5
+	LDPTR	[%o2 + V4F_START], %g1
+	LDPTR	[%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_1_2_3_4_5_6_12_13_14(%o1)
+
+ cmp %g3, 1
+ st %g3, [%o0 + V4F_COUNT]
+ bl 3f
+ clr %o1
+
+ be 2f
+ andn %g3, 1, %o2
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ add %o1, 2, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ ld [%g1 + 0x00], %f9 ! LSU Group
+ fmuls %f0, M0, %f2 ! FGM
+ ld [%g1 + 0x04], %f10 ! LSU Group
+ fmuls %f0, M1, %f3 ! FGM
+ fmuls %f0, M2, %f4 ! FGM Group
+ add %g1, %o5, %g1 ! IEU0
+ fmuls %f1, M4, %f6 ! FGM Group
+ fmuls %f1, M5, %f7 ! FGM Group f2 available
+ fadds %f2, M12, %f2 ! FGA
+ fmuls %f1, M6, %f8 ! FGM Group f3 available
+ fadds %f3, M13, %f3 ! FGA
+ fmuls %f9, M0, %f11 ! FGM Group f4 available
+ fadds %f4, M14, %f4 ! FGA
+ fmuls %f9, M1, %f12 ! FGM Group f6 available
+ fmuls %f9, M2, %f13 ! FGM Group f2, f7 available
+ fadds %f2, %f6, %f2 ! FGA
+ st %f2, [%g2 + 0x00] ! LSU
+ fmuls %f10, M4, %f14 ! FGM Group f3, f8 available
+ fadds %f3, %f7, %f3 ! FGA
+ st %f3, [%g2 + 0x04] ! LSU
+ fmuls %f10, M5, %f15 ! FGM Group f4, f11 available
+ fadds %f11, M12, %f11 ! FGA
+ fmuls %f10, M6, %f0 ! FGM Group f12 available
+ fadds %f12, M13, %f12 ! FGA
+ fadds %f13, M14, %f13 ! FGA Group f13 available
+ fadds %f4, %f8, %f4 ! FGA Group f14 available
+ st %f4, [%g2 + 0x08] ! LSU
+ fadds %f11, %f14, %f11 ! FGA Group f15, f11 available
+ st %f11, [%g2 + 0x10] ! LSU
+ fadds %f12, %f15, %f12 ! FGA Group f0, f12 available
+ st %f12, [%g2 + 0x14] ! LSU
+ fadds %f13, %f0, %f13 ! FGA Group f13 available
+ st %f13, [%g2 + 0x18] ! LSU
+
+ cmp %o1, %o2 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x20, %g2 ! IEU0 Group
+
+ cmp %o1, %g3
+ be 3f
+ nop
+
+2: ld [%g1 + 0x00], %f0 ! LSU Group
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ fmuls %f0, M0, %f2 ! FGM Group
+ fmuls %f0, M1, %f3 ! FGM Group
+ fmuls %f0, M2, %f4 ! FGM Group
+ fmuls %f1, M4, %f6 ! FGM Group
+ fmuls %f1, M5, %f7 ! FGM Group f2 available
+ fadds %f2, M12, %f2 ! FGA
+ fmuls %f1, M6, %f8 ! FGM Group f3 available
+ fadds %f3, M13, %f3 ! FGA
+ fadds %f4, M14, %f4 ! FGA Group f4 available
+ fadds %f2, %f6, %f2 ! FGA Group stall, f2, f6, f7 available
+ st %f2, [%g2 + 0x00] ! LSU
+ fadds %f3, %f7, %f3 ! FGA Group f3, f8 available
+ st %f3, [%g2 + 0x04] ! LSU
+ fadds %f4, %f8, %f4 ! FGA Group f4 available
+ st %f4, [%g2 + 0x08] ! LSU
+
+3:
+ ba __set_v4f_3
+ nop
+
+ .globl _mesa_sparc_transform_points2_3d_no_rot
+_mesa_sparc_transform_points2_3d_no_rot:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_5_12_13_14(%o1)
+
+ cmp %g3, 1
+ st %g3, [%o0 + V4F_COUNT]
+ bl 3f
+ clr %o3
+
+ be 2f
+ andn %g3, 1, %o2
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ add %o3, 2, %o3 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ ld [%g1 + 0x00], %f4 ! LSU Group
+ fmuls %f0, M0, %f2 ! FGM
+ ld [%g1 + 0x04], %f5 ! LSU Group
+ fmuls %f1, M5, %f3 ! FGM
+ fmuls %f4, M0, %f6 ! FGM Group
+ add %g1, %o5, %g1 ! IEU0
+ fmuls %f5, M5, %f7 ! FGM Group
+ fadds %f2, M12, %f2 ! FGA Group f2 available
+ st %f2, [%g2 + 0x00] ! LSU
+ fadds %f3, M13, %f3 ! FGA Group f3 available
+ st %f3, [%g2 + 0x04] ! LSU
+ fadds %f6, M12, %f6 ! FGA Group f6 available
+ st M14, [%g2 + 0x08] ! LSU
+ fadds %f7, M13, %f7 ! FGA Group f7 available
+ st %f6, [%g2 + 0x10] ! LSU
+ st %f7, [%g2 + 0x14] ! LSU Group
+ st M14, [%g2 + 0x18] ! LSU Group
+ cmp %o3, %o2 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x20, %g2 ! IEU0 Group
+
+ cmp %o3, %g3
+ be 3f
+ nop
+
+2: ld [%g1 + 0x00], %f0 ! LSU Group
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ fmuls %f0, M0, %f2 ! FGM Group
+ fmuls %f1, M5, %f3 ! FGM Group
+ fadds %f2, M12, %f2 ! FGA Group, 2 cycle stall, f2 available
+ st %f2, [%g2 + 0x00] ! LSU
+ fadds %f3, M13, %f3 ! FGA Group f3 available
+ st %f3, [%g2 + 0x04] ! LSU
+ st M14, [%g2 + 0x08] ! LSU Group
+
+3: ld [%o1 + (14 * 0x4)], %g3
+ cmp %g3, 0
+ bne __set_v4f_3
+ nop
+ ba __set_v4f_2
+ nop
+
+ .globl _mesa_sparc_transform_points2_perspective
+_mesa_sparc_transform_points2_perspective:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_5_14(%o1)
+
+ cmp %g3, 0
+ st %g3, [%o0 + V4F_COUNT]
+ be 2f
+ clr %o1
+
+1: ld [%g1 + 0x00], %f0
+ ld [%g1 + 0x04], %f1
+ add %o1, 1, %o1
+ add %g1, %o5, %g1
+ fmuls %f0, M0, %f2
+ st %f2, [%g2 + 0x00]
+ fmuls %f1, M5, %f3
+ st %f3, [%g2 + 0x04]
+ st M14, [%g2 + 0x08]
+ st %g0, [%g2 + 0x0c]
+ cmp %o1, %g3
+ bne 1b
+ add %g2, 0x10, %g2
+2:
+ ba __set_v4f_4
+ nop
+
+ .globl _mesa_sparc_transform_points3_general
+_mesa_sparc_transform_points3_general:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15(%o1)
+
+ cmp %g3, 0
+ st %g3, [%o0 + V4F_COUNT]
+ be 2f
+ clr %o1
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ ld [%g1 + 0x08], %f2 ! LSU Group
+ add %o1, 1, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ fmuls %f0, M0, %f3 ! FGM
+ fmuls %f1, M4, %f7 ! FGM Group
+ fmuls %f0, M1, %f4 ! FGM Group
+ fmuls %f1, M5, %f8 ! FGM Group
+ fmuls %f0, M2, %f5 ! FGM Group f3 available
+ fmuls %f1, M6, %f9 ! FGM Group f7 available
+ fadds %f3, %f7, %f3 ! FGA
+ fmuls %f0, M3, %f6 ! FGM Group f4 available
+ fmuls %f1, M7, %f10 ! FGM Group f8 available
+ fadds %f4, %f8, %f4 ! FGA
+ fmuls %f2, M8, %f7 ! FGM Group f5 available
+ fmuls %f2, M9, %f8 ! FGM Group f9,f3 available
+ fadds %f5, %f9, %f5 ! FGA
+ fmuls %f2, M10, %f9 ! FGM Group f6 available
+ fadds %f6, %f10, %f6 ! FGA Group f10,f4 available
+ fmuls %f2, M11, %f10 ! FGM
+ fadds %f3, M12, %f3 ! FGA Group f7 available
+ fadds %f4, M13, %f4 ! FGA Group f8,f5 available
+ fadds %f5, M14, %f5 ! FGA Group f9 available
+ fadds %f6, M15, %f6 ! FGA Group f10,f6 available
+ fadds %f3, %f7, %f3 ! FGA Group f3 available
+ st %f3, [%g2 + 0x00] ! LSU
+ fadds %f4, %f8, %f4 ! FGA Group f4 available
+ st %f4, [%g2 + 0x04] ! LSU
+ fadds %f5, %f9, %f5 ! FGA Group f5 available
+ st %f5, [%g2 + 0x08] ! LSU
+ fadds %f6, %f10, %f6 ! FGA Group f6 available
+ st %f6, [%g2 + 0x0c] ! LSU
+ cmp %o1, %g3 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x10, %g2 ! IEU0 Group
+2:
+ ba __set_v4f_4
+ nop
+
+ .globl _mesa_sparc_transform_points3_identity
+_mesa_sparc_transform_points3_identity:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ cmp %g3, 0
+ st %g3, [%o0 + V4F_COUNT]
+ be 2f
+ clr %o1
+
+1: ld [%g1 + 0x00], %f0
+ ld [%g1 + 0x04], %f1
+ ld [%g1 + 0x08], %f2
+ add %o1, 1, %o1
+ add %g1, %o5, %g1
+ cmp %o1, %g3
+ st %f0, [%g2 + 0x00]
+ st %f1, [%g2 + 0x04]
+ st %f2, [%g2 + 0x08]
+ bne 1b
+ add %g2, 0x10, %g2
+2:
+ ba __set_v4f_3
+ nop
+
+ .globl _mesa_sparc_transform_points3_2d
+_mesa_sparc_transform_points3_2d:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_1_4_5_12_13(%o1)
+
+ cmp %g3, 0
+ st %g3, [%o0 + V4F_COUNT]
+ be 2f
+ clr %o1
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ ld [%g1 + 0x08], %f2 ! LSU Group
+ add %o1, 1, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ fmuls %f0, M0, %f3 ! FGM
+ fmuls %f0, M1, %f4 ! FGM Group
+ fmuls %f1, M4, %f6 ! FGM Group
+ fmuls %f1, M5, %f7 ! FGM Group
+ fadds %f3, M12, %f3 ! FGA Group f3 available
+ fadds %f4, M13, %f4 ! FGA Group f4 available
+ fadds %f3, %f6, %f3 ! FGA Group f6 available
+ st %f3, [%g2 + 0x00] ! LSU
+ fadds %f4, %f7, %f4 ! FGA Group f7 available
+ st %f4, [%g2 + 0x04] ! LSU
+ st %f2, [%g2 + 0x08] ! LSU Group
+ cmp %o1, %g3 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x10, %g2 ! IEU0 Group
+2:
+ ba __set_v4f_3
+ nop
+
+ .globl _mesa_sparc_transform_points3_2d_no_rot
+_mesa_sparc_transform_points3_2d_no_rot:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_5_12_13(%o1)
+
+ cmp %g3, 0
+ st %g3, [%o0 + V4F_COUNT]
+ be 2f
+ clr %o1
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ ld [%g1 + 0x08], %f2 ! LSU Group
+ add %o1, 1, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ fmuls %f0, M0, %f3 ! FGM
+ fmuls %f1, M5, %f4 ! FGM Group
+ st %f2, [%g2 + 0x08] ! LSU
+ fadds %f3, M12, %f3 ! FGA Group
+ st %f3, [%g2 + 0x00] ! LSU
+ fadds %f4, M13, %f4 ! FGA Group
+ st %f4, [%g2 + 0x04] ! LSU
+ cmp %o1, %g3 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x10, %g2 ! IEU0 Group
+2:
+ ba __set_v4f_3
+ nop
+
+ .globl _mesa_sparc_transform_points3_3d
+_mesa_sparc_transform_points3_3d:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_1_2_4_5_6_8_9_10_12_13_14(%o1)
+
+ cmp %g3, 0
+ st %g3, [%o0 + V4F_COUNT]
+ be 2f
+ clr %o1
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ ld [%g1 + 0x08], %f2 ! LSU Group
+ add %o1, 1, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ fmuls %f0, M0, %f3 ! FGM
+ fmuls %f1, M4, %f6 ! FGM Group
+ fmuls %f0, M1, %f4 ! FGM Group
+ fmuls %f1, M5, %f7 ! FGM Group
+ fmuls %f0, M2, %f5 ! FGM Group f3 available
+ fmuls %f1, M6, %f8 ! FGM Group f6 available
+ fadds %f3, %f6, %f3 ! FGA
+ fmuls %f2, M8, %f9 ! FGM Group f4 available
+ fmuls %f2, M9, %f10 ! FGM Group f7 available
+ fadds %f4, %f7, %f4 ! FGA
+ fmuls %f2, M10, %f11 ! FGM Group f5 available
+ fadds %f5, %f8, %f5 ! FGA Group f8, f3 available
+ fadds %f3, %f9, %f3 ! FGA Group f9 available
+ fadds %f4, %f10, %f4 ! FGA Group f10, f4 available
+ fadds %f5, %f11, %f5 ! FGA Group stall, f11, f5 available
+ fadds %f3, M12, %f3 ! FGA Group f3 available
+ st %f3, [%g2 + 0x00] ! LSU
+ fadds %f4, M13, %f4 ! FGA Group f4 available
+ st %f4, [%g2 + 0x04] ! LSU
+ fadds %f5, M14, %f5 ! FGA Group f5 available
+ st %f5, [%g2 + 0x08] ! LSU
+ cmp %o1, %g3 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x10, %g2 ! IEU0 Group
+2:
+ ba __set_v4f_3
+ nop
+
+ .globl _mesa_sparc_transform_points3_3d_no_rot
+_mesa_sparc_transform_points3_3d_no_rot:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_5_10_12_13_14(%o1)
+
+ cmp %g3, 0
+ st %g3, [%o0 + V4F_COUNT]
+ be 2f
+ clr %o1
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ ld [%g1 + 0x08], %f2 ! LSU Group
+ add %o1, 1, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ cmp %o1, %g3 ! IEU1 Group
+ fmuls %f0, M0, %f3 ! FGM
+ fmuls %f1, M5, %f4 ! FGM Group
+ fmuls %f2, M10, %f5 ! FGM Group
+ fadds %f3, M12, %f3 ! FGA Group, stall, f3 available
+ st %f3, [%g2 + 0x00] ! LSU
+ fadds %f4, M13, %f4 ! FGA Group, f4 available
+ st %f4, [%g2 + 0x04] ! LSU
+ fadds %f5, M14, %f5 ! FGA Group, f5 available
+	st	%f5, [%g2 + 0x08]	! LSU
+ bne 1b ! CTI
+ add %g2, 0x10, %g2 ! IEU0 Group
+2:
+ ba __set_v4f_3
+ nop
+
+ .globl _mesa_sparc_transform_points3_perspective
+_mesa_sparc_transform_points3_perspective:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_5_8_9_10_14(%o1)
+
+ cmp %g3, 0
+ st %g3, [%o0 + V4F_COUNT]
+ be 2f
+ clr %o1
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ ld [%g1 + 0x08], %f2 ! LSU Group
+ add %o1, 1, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ fmuls %f0, M0, %f3 ! FGM
+ fmuls %f2, M8, %f6 ! FGM Group
+ fmuls %f1, M5, %f4 ! FGM Group
+ fmuls %f2, M9, %f7 ! FGM Group
+ fmuls %f2, M10, %f5 ! FGM Group f3 available
+ fadds %f3, %f6, %f3 ! FGA Group f6 available
+ st %f3, [%g2 + 0x00] ! LSU
+ fadds %f4, %f7, %f4 ! FGA Group stall, f4, f7 available
+ st %f4, [%g2 + 0x04] ! LSU
+ fadds %f5, M14, %f5 ! FGA Group
+ st %f5, [%g2 + 0x08] ! LSU
+ fnegs %f2, %f6 ! FGA Group
+ st %f6, [%g2 + 0x0c] ! LSU
+ cmp %o1, %g3 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x10, %g2 ! IEU0 Group
+2:
+ ba __set_v4f_4
+ nop
+
+ .globl _mesa_sparc_transform_points4_general
+_mesa_sparc_transform_points4_general:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15(%o1)
+
+ cmp %g3, 0
+ st %g3, [%o0 + V4F_COUNT]
+ be 2f
+ clr %o1
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ ld [%g1 + 0x08], %f2 ! LSU Group
+ ld [%g1 + 0x0c], %f3 ! LSU Group
+ add %o1, 1, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ fmuls %f0, M0, %f4 ! FGM Group
+ fmuls %f1, M4, %f8 ! FGM Group
+ fmuls %f0, M1, %f5 ! FGM Group
+ fmuls %f1, M5, %f9 ! FGM Group
+ fmuls %f0, M2, %f6 ! FGM Group f4 available
+ fmuls %f1, M6, %f10 ! FGM Group f8 available
+ fadds %f4, %f8, %f4 ! FGA
+ fmuls %f0, M3, %f7 ! FGM Group f5 available
+ fmuls %f1, M7, %f11 ! FGM Group f9 available
+ fadds %f5, %f9, %f5 ! FGA
+ fmuls %f2, M8, %f12 ! FGM Group f6 available
+ fmuls %f2, M9, %f13 ! FGM Group f10, f4 available
+ fadds %f6, %f10, %f6 ! FGA
+ fmuls %f2, M10, %f14 ! FGM Group f7 available
+ fmuls %f2, M11, %f15 ! FGM Group f11, f5 available
+ fadds %f7, %f11, %f7 ! FGA
+ fmuls %f3, M12, %f8 ! FGM Group f12 available
+ fadds %f4, %f12, %f4 ! FGA
+ fmuls %f3, M13, %f9 ! FGM Group f13, f6 available
+ fadds %f5, %f13, %f5 ! FGA
+ fmuls %f3, M14, %f10 ! FGM Group f14 available
+ fadds %f6, %f14, %f6 ! FGA
+ fmuls %f3, M15, %f11 ! FGM Group f15, f7 available
+ fadds %f7, %f15, %f7 ! FGA
+ fadds %f4, %f8, %f4 ! FGA Group f8, f4 available
+ st %f4, [%g2 + 0x00] ! LSU
+ fadds %f5, %f9, %f5 ! FGA Group f9, f5 available
+ st %f5, [%g2 + 0x04] ! LSU
+ fadds %f6, %f10, %f6 ! FGA Group f10, f6 available
+ st %f6, [%g2 + 0x08] ! LSU
+ fadds %f7, %f11, %f7 ! FGA Group f11, f7 available
+ st %f7, [%g2 + 0x0c] ! LSU
+ cmp %o1, %g3 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x10, %g2 ! IEU0 Group
+2:
+ ba __set_v4f_4
+ nop
+
+ .globl _mesa_sparc_transform_points4_identity
+_mesa_sparc_transform_points4_identity:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ cmp %g3, 0
+ st %g3, [%o0 + V4F_COUNT]
+ be 2f
+ clr %o1
+
+1: ld [%g1 + 0x00], %f0
+ ld [%g1 + 0x04], %f1
+ ld [%g1 + 0x08], %f2
+ add %o1, 1, %o1
+ ld [%g1 + 0x0c], %f3
+ add %g1, %o5, %g1
+ st %f0, [%g2 + 0x00]
+ st %f1, [%g2 + 0x04]
+ st %f2, [%g2 + 0x08]
+ cmp %o1, %g3
+ st %f3, [%g2 + 0x0c]
+ bne 1b
+ add %g2, 0x10, %g2
+2:
+ ba __set_v4f_4
+ nop
+
+ .globl _mesa_sparc_transform_points4_2d
+_mesa_sparc_transform_points4_2d:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_1_4_5_12_13(%o1)
+
+ cmp %g3, 0
+ st %g3, [%o0 + V4F_COUNT]
+ be 2f
+ clr %o1
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ ld [%g1 + 0x08], %f2 ! LSU Group
+ ld [%g1 + 0x0c], %f3 ! LSU Group
+ add %o1, 1, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ fmuls %f0, M0, %f4 ! FGM
+ fmuls %f1, M4, %f8 ! FGM Group
+ fmuls %f0, M1, %f5 ! FGM Group
+ fmuls %f1, M5, %f9 ! FGM Group f4 available
+ fmuls %f3, M12, %f12 ! FGM Group
+ fmuls %f3, M13, %f13 ! FGM Group f8 available
+ fadds %f4, %f8, %f4 ! FGA
+ fadds %f5, %f9, %f5 ! FGA Group stall, f5, f9 available
+ fadds %f4, %f12, %f4 ! FGA Group 2 cycle stall, f4, f12, f13 avail
+ st %f4, [%g2 + 0x00] ! LSU
+ fadds %f5, %f13, %f5 ! FGA Group f5 available
+ st %f5, [%g2 + 0x04] ! LSU
+ st %f2, [%g2 + 0x08] ! LSU Group
+ st %f3, [%g2 + 0x0c] ! LSU Group
+ cmp %o1, %g3 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x10, %g2 ! IEU0 Group
+2:
+ ba __set_v4f_4
+ nop
+
+ .globl _mesa_sparc_transform_points4_2d_no_rot
+_mesa_sparc_transform_points4_2d_no_rot:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_1_4_5_12_13(%o1)
+
+ cmp %g3, 0
+ st %g3, [%o0 + V4F_COUNT]
+ be 2f
+ clr %o1
+
+1: ld [%g1 + 0x00], %f0
+ ld [%g1 + 0x04], %f1
+ ld [%g1 + 0x08], %f2
+ ld [%g1 + 0x0c], %f3
+ add %o1, 1, %o1
+ add %g1, %o5, %g1
+ fmuls %f0, M0, %f4
+ fmuls %f3, M12, %f8
+ fmuls %f1, M5, %f5
+ fmuls %f3, M13, %f9
+ fadds %f4, %f8, %f4
+ st %f4, [%g2 + 0x00]
+ fadds %f5, %f9, %f5
+ st %f5, [%g2 + 0x04]
+ st %f2, [%g2 + 0x08]
+ st %f3, [%g2 + 0x0c]
+ cmp %o1, %g3
+ bne 1b
+ add %g2, 0x10, %g2
+2:
+ ba __set_v4f_4
+ nop
+
+ .globl _mesa_sparc_transform_points4_3d
+_mesa_sparc_transform_points4_3d:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_1_2_4_5_6_8_9_10_12_13_14(%o1)
+
+ cmp %g3, 0
+ st %g3, [%o0 + V4F_COUNT]
+ be 2f
+ clr %o1
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ ld [%g1 + 0x08], %f2 ! LSU Group
+ ld [%g1 + 0x0c], %f3 ! LSU Group
+ add %o1, 1, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ fmuls %f0, M0, %f4 ! FGM
+ fmuls %f1, M4, %f7 ! FGM Group
+ fmuls %f0, M1, %f5 ! FGM Group
+ fmuls %f1, M5, %f8 ! FGM Group
+ fmuls %f0, M2, %f6 ! FGM Group f4 available
+ fmuls %f1, M6, %f9 ! FGM Group f7 available
+ fadds %f4, %f7, %f4 ! FGA
+ fmuls %f2, M8, %f10 ! FGM Group f5 available
+ fmuls %f2, M9, %f11 ! FGM Group f8 available
+ fadds %f5, %f8, %f5 ! FGA
+ fmuls %f2, M10, %f12 ! FGM Group f6 available
+ fmuls %f3, M12, %f13 ! FGM Group f9, f4 available
+ fadds %f6, %f9, %f6 ! FGA
+ fmuls %f3, M13, %f14 ! FGM Group f10 available
+ fadds %f4, %f10, %f4 ! FGA
+ fmuls %f3, M14, %f15 ! FGM Group f11, f5 available
+ fadds %f5, %f11, %f5 ! FGA
+ fadds %f6, %f12, %f6 ! FGA Group stall, f12, f13, f6 available
+ fadds %f4, %f13, %f4 ! FGA Group f14, f4 available
+ st %f4, [%g2 + 0x00] ! LSU
+ fadds %f5, %f14, %f5 ! FGA Group f15, f5 available
+ st %f5, [%g2 + 0x04] ! LSU
+ fadds %f6, %f15, %f6 ! FGA Group f6 available
+ st %f6, [%g2 + 0x08] ! LSU
+ st %f3, [%g2 + 0x0c] ! LSU Group
+ cmp %o1, %g3 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x10, %g2 ! IEU0 Group
+2:
+ ba __set_v4f_4
+ nop
+
+ .globl _mesa_sparc_transform_points4_3d_no_rot
+_mesa_sparc_transform_points4_3d_no_rot:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_5_10_12_13_14(%o1)
+
+ cmp %g3, 0
+ st %g3, [%o0 + V4F_COUNT]
+ be 2f
+ clr %o1
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ ld [%g1 + 0x08], %f2 ! LSU Group
+ ld [%g1 + 0x0c], %f3 ! LSU Group
+ add %o1, 1, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ fmuls %f0, M0, %f4 ! FGM
+ fmuls %f3, M12, %f7 ! FGM Group
+ fmuls %f1, M5, %f5 ! FGM Group
+ fmuls %f3, M13, %f8 ! FGM Group
+ fmuls %f2, M10, %f6 ! FGM Group f4 available
+ fmuls %f3, M14, %f9 ! FGM Group f7 available
+ fadds %f4, %f7, %f4 ! FGA
+ st %f4, [%g2 + 0x00] ! LSU
+ fadds %f5, %f8, %f5 ! FGA Group stall, f5, f8 available
+ st %f5, [%g2 + 0x04] ! LSU
+ fadds %f6, %f9, %f6 ! FGA Group stall, f6, f9 available
+ st %f6, [%g2 + 0x08] ! LSU
+ st %f3, [%g2 + 0x0c] ! LSU Group
+ cmp %o1, %g3 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x10, %g2 ! IEU0 Group
+2:
+ ba __set_v4f_4
+ nop
+
+ .globl _mesa_sparc_transform_points4_perspective
+_mesa_sparc_transform_points4_perspective:
+ ld [%o2 + V4F_STRIDE], %o5
+ LDPTR [%o2 + V4F_START], %g1
+ LDPTR [%o0 + V4F_START], %g2
+ ld [%o2 + V4F_COUNT], %g3
+
+ LDMATRIX_0_5_8_9_10_14(%o1)
+
+ cmp %g3, 0
+ st %g3, [%o0 + V4F_COUNT]
+ be 2f
+ clr %o1
+
+1: ld [%g1 + 0x00], %f0 ! LSU Group
+ ld [%g1 + 0x04], %f1 ! LSU Group
+ ld [%g1 + 0x08], %f2 ! LSU Group
+ ld [%g1 + 0x0c], %f3 ! LSU Group
+ add %o1, 1, %o1 ! IEU0
+ add %g1, %o5, %g1 ! IEU1
+ fmuls %f0, M0, %f4 ! FGM
+ fmuls %f2, M8, %f7 ! FGM Group
+ fmuls %f1, M5, %f5 ! FGM Group
+ fmuls %f2, M9, %f8 ! FGM Group
+ fmuls %f2, M10, %f6 ! FGM Group f4 available
+ fmuls %f3, M14, %f9 ! FGM Group f7 available
+ fadds %f4, %f7, %f4 ! FGA
+ st %f4, [%g2 + 0x00] ! LSU
+ fadds %f5, %f8, %f5 ! FGA Group stall, f5, f8 available
+ st %f5, [%g2 + 0x04] ! LSU
+ fadds %f6, %f9, %f6 ! FGA Group stall, f6, f9 available
+ st %f6, [%g2 + 0x08] ! LSU
+ fnegs %f2, %f7 ! FGA Group
+ st %f7, [%g2 + 0x0c] ! LSU
+ cmp %o1, %g3 ! IEU1
+ bne 1b ! CTI
+ add %g2, 0x10, %g2 ! IEU0 Group
+2:
+ ba __set_v4f_4
+ nop
diff --git a/src/arch/x86-64/Makefile.am b/src/arch/x86-64/Makefile.am
new file mode 100644
index 0000000..ad4c4c8
--- /dev/null
+++ b/src/arch/x86-64/Makefile.am
@@ -0,0 +1,40 @@
+# Copyright © 2012 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+if HAVE_X86_64_ASM
+
+AM_CPPFLAGS = \
+ -I$(top_srcdir)/include \
+ -I$(top_srcdir)/src/mesa \
+ -I$(top_srcdir)/src/GLdispatch/mapi \
+ $(API_DEFINES) \
+ $(DEFINES)
+
+noinst_PROGRAMS = gen_matypes
+
+gen_matypes_SOURCES = ../x86/gen_matypes.c
+BUILT_SOURCES = matypes.h
+CLEANFILES = matypes.h
+
+matypes.h: gen_matypes
+ $(AM_V_GEN)./gen_matypes > $@
+
+endif
diff --git a/src/arch/x86-64/calling_convention.txt b/src/arch/x86-64/calling_convention.txt
new file mode 100644
index 0000000..4147f7e
--- /dev/null
+++ b/src/arch/x86-64/calling_convention.txt
@@ -0,0 +1,50 @@
+Register Usage
+rax temporary register; with variable arguments passes information
+ about the number of SSE registers used; 1st return register
+
+rbx* callee-saved register; optionally used as base pointer
+
+rcx used to pass 4th integer argument to functions
+
+rdx	 used to pass 3rd argument to functions; 2nd return register
+
+rsp* stack pointer
+
+rbp* callee-saved register; optionally used as frame pointer
+
+rsi used to pass 2nd argument to functions
+
+rdi used to pass 1st argument to functions
+
+r8 used to pass 5th argument to functions
+
+r9 used to pass 6th argument to functions
+
+r10 temporary register, used for passing a function's static chain pointer
+
+r11 temporary register
+
+r12-15* callee-saved registers
+
+xmm0-1	 used to pass and return floating point arguments
+
+xmm2-7	 used to pass floating point arguments
+
+xmm8-15	 temporary registers
+
+mmx0-7	 temporary registers
+
+st0	 temporary register; used to return long double arguments
+
+st1	 temporary register; used to return long double arguments
+
+st2-7	 temporary registers
+
+fs	 reserved for system use (as the thread-specific data register)
+
+
+
+*) must be preserved across function calls
+
+Integer arguments from list: rdi,rsi,rdx,rcx,r8,r9,stack
+Floating point arguments from list: xmm0-xmm7 \ No newline at end of file
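
To make the mapping concrete: the transform entry points in xform4.S below
take three pointer arguments, so they arrive in rdi, rsi and rdx in that
order.  A minimal sketch, assuming Mesa's XFORM_ARGS convention from
math/m_xform.h (destination vector, 16-float matrix, source vector):

    /* sketch only: types and argument order assumed from math/m_xform.h */
    void _mesa_x86_64_transform_points4_general(
        GLvector4f *dest,        /* 1st integer/pointer arg -> rdi */
        const GLfloat m[16],     /* 2nd -> rsi */
        const GLvector4f *src);  /* 3rd -> rdx */

Since all three arguments are pointers, no xmm argument registers are
involved and rax does not need to carry an SSE-register count (that only
applies to variadic calls).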
diff --git a/src/arch/x86-64/x86-64.c b/src/arch/x86-64/x86-64.c
new file mode 100644
index 0000000..10564d9
--- /dev/null
+++ b/src/arch/x86-64/x86-64.c
@@ -0,0 +1,119 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 6.3
+ *
+ * Copyright (C) 1999-2003 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * x86-64 optimizations shamelessly converted from x86/sse/3dnow assembly by
+ * Mikko Tiihonen
+ */
+
+#ifdef USE_X86_64_ASM
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "math/m_xform.h"
+#include "tnl/t_context.h"
+#include "x86-64.h"
+#include "../x86/x86_xform.h"
+
+#ifdef DEBUG_MATH
+#include "math/m_debug.h"
+#endif
+
+extern void _mesa_x86_64_cpuid(unsigned int *regs);
+
+DECLARE_XFORM_GROUP( x86_64, 4 )
+DECLARE_XFORM_GROUP( 3dnow, 4 )
+
+#else
+/* just to silence warning below */
+#include "x86-64.h"
+#endif
+
+/*
+extern void _mesa_x86_64_transform_points4_general( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_identity( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_perspective( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_3d( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_3d_no_rot( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_2d_no_rot( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_2d( XFORM_ARGS );
+*/
+
+#ifdef USE_X86_64_ASM
+static void message( const char *msg )
+{
+ if (_mesa_getenv("MESA_DEBUG")) {
+ _mesa_debug( NULL, "%s", msg );
+ }
+}
+#endif
+
+
+void _mesa_init_all_x86_64_transform_asm(void)
+{
+#ifdef USE_X86_64_ASM
+ unsigned int regs[4];
+
+ if ( _mesa_getenv( "MESA_NO_ASM" ) ) {
+ return;
+ }
+
+ message("Initializing x86-64 optimizations\n");
+
+
+ _mesa_transform_tab[4][MATRIX_GENERAL] =
+ _mesa_x86_64_transform_points4_general;
+ _mesa_transform_tab[4][MATRIX_IDENTITY] =
+ _mesa_x86_64_transform_points4_identity;
+ _mesa_transform_tab[4][MATRIX_3D] =
+ _mesa_x86_64_transform_points4_3d;
+
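+   /* CPUID leaf 0x80000001 (AMD extended features): EDX bit 31
+    * advertises 3DNow! support. */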
+ regs[0] = 0x80000001;
+ regs[1] = 0x00000000;
+ regs[2] = 0x00000000;
+ regs[3] = 0x00000000;
+ _mesa_x86_64_cpuid(regs);
+ if (regs[3] & (1U << 31)) {
+ message("3Dnow! detected\n");
+ _mesa_transform_tab[4][MATRIX_3D_NO_ROT] =
+ _mesa_3dnow_transform_points4_3d_no_rot;
+ _mesa_transform_tab[4][MATRIX_PERSPECTIVE] =
+ _mesa_3dnow_transform_points4_perspective;
+ _mesa_transform_tab[4][MATRIX_2D_NO_ROT] =
+ _mesa_3dnow_transform_points4_2d_no_rot;
+ _mesa_transform_tab[4][MATRIX_2D] =
+ _mesa_3dnow_transform_points4_2d;
+
+ }
+
+
+#ifdef DEBUG_MATH
+ _math_test_all_transform_functions("x86_64");
+ _math_test_all_cliptest_functions("x86_64");
+ _math_test_all_normal_transform_functions("x86_64");
+#endif
+
+#endif
+}
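
A minimal sketch of how a caller consumes the table populated above; the
transform_func typedef and the GLmatrix field names (m, type) are assumed
from math/m_xform.h and math/m_matrix.h, so treat them as illustrative:

    /* sketch only: dispatch one transform through the table filled in
     * by _mesa_init_all_x86_64_transform_asm() */
    static void transform_positions(GLvector4f *dest, const GLmatrix *mat,
                                    const GLvector4f *src)
    {
       /* index by vector size (4 components) and matrix classification */
       transform_func fn = _mesa_transform_tab[4][mat->type];
       fn(dest, mat->m, src);
    }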
diff --git a/src/arch/x86-64/x86-64.h b/src/arch/x86-64/x86-64.h
new file mode 100644
index 0000000..1d931fa
--- /dev/null
+++ b/src/arch/x86-64/x86-64.h
@@ -0,0 +1,31 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __X86_64_ASM_H__
+#define __X86_64_ASM_H__
+
+extern void _mesa_init_all_x86_64_transform_asm( void );
+
+#endif
diff --git a/src/arch/x86-64/xform4.S b/src/arch/x86-64/xform4.S
new file mode 100644
index 0000000..5abd5a2
--- /dev/null
+++ b/src/arch/x86-64/xform4.S
@@ -0,0 +1,483 @@
+/*
+ * Mesa 3-D graphics library
+ * Version: 7.1
+ *
+ * Copyright (C) 1999-2007 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef USE_X86_64_ASM
+
+#include "matypes.h"
+
+.text
+
+.align 16
+.globl _mesa_x86_64_cpuid
+.hidden _mesa_x86_64_cpuid
+_mesa_x86_64_cpuid:
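+	/* regs[4] is both input and output: eax <- regs[0] and
+	 * ecx <- regs[2] before CPUID; afterwards regs[0..3] receive
+	 * eax, ebx, ecx, edx. */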
+ pushq %rbx
+ movl (%rdi), %eax
+ movl 8(%rdi), %ecx
+
+ cpuid
+
+ movl %ebx, 4(%rdi)
+ movl %eax, (%rdi)
+ movl %ecx, 8(%rdi)
+ movl %edx, 12(%rdi)
+ popq %rbx
+ ret
+
+.align 16
+.globl _mesa_x86_64_transform_points4_general
+.hidden _mesa_x86_64_transform_points4_general
+_mesa_x86_64_transform_points4_general:
+/*
+ * rdi = dest
+ * rsi = matrix
+ * rdx = source
+ */
+ movl V4F_COUNT(%rdx), %ecx /* count */
+ movzbl V4F_STRIDE(%rdx), %eax /* stride */
+
+ movl %ecx, V4F_COUNT(%rdi) /* set dest count */
+ movl $4, V4F_SIZE(%rdi) /* set dest size */
+	.byte 0x66, 0x66, 0x66, 0x90	/* manual align += 4 */
+ orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+
+ testl %ecx, %ecx /* verify non-zero count */
+ prefetchnta 64(%rsi)
+ jz p4_general_done
+
+ movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
+ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
+
+ prefetch 16(%rdx)
+
+ movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
+ movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
+ .byte 0x66, 0x66, 0x90 /* manual align += 3 */
+ movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
+ movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
+
+p4_general_loop:
+
+ movups (%rdx), %xmm8 /* ox | oy | oz | ow */
+ prefetchw 16(%rdi)
+
+ pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
+ addq %rax, %rdx
+ pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
+ mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+	pshufd	$0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
+ mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
+ pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
+ mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
+ addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
+ mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
+ addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
+ prefetch 16(%rdx)
+ addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
+
+ movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
+ addq $16, %rdi
+
+ decl %ecx
+ jnz p4_general_loop
+
+p4_general_done:
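+	/* the 0xf3 prefix makes this a 2-byte "rep ret", avoiding a
+	 * branch-prediction penalty for a plain ret on AMD K8 parts */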
+ .byte 0xf3
+ ret
+
+.section .rodata
+
+.align 16
+p4_constants:
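+/* andps mask: keeps the low three floats of each matrix column and
+ * clears the fourth (m3/m7/m11/m15) */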
+.byte 0xff, 0xff, 0xff, 0xff
+.byte 0xff, 0xff, 0xff, 0xff
+.byte 0xff, 0xff, 0xff, 0xff
+.byte 0x00, 0x00, 0x00, 0x00
+
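+/* orps operand: writes 1.0 into the fourth float of the last column,
+ * i.e. m15 */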
+.byte 0x00, 0x00, 0x00, 0x00
+.byte 0x00, 0x00, 0x00, 0x00
+.byte 0x00, 0x00, 0x00, 0x00
+.float 1.0
+
+.text
+.align 16
+.globl _mesa_x86_64_transform_points4_3d
+.hidden _mesa_x86_64_transform_points4_3d
+/*
+ * this is slower than _mesa_x86_64_transform_points4_general
+ * because it first masks the matrix so that the bottom row of the
+ * column-major matrix is 0,0,0,1 (m3 = m7 = m11 = 0.0, m15 = 1.0)
+ */
+_mesa_x86_64_transform_points4_3d:
+
+ leaq p4_constants(%rip), %rax
+
+ prefetchnta 64(%rsi)
+
+ movaps (%rax), %xmm9
+ movaps 16(%rax), %xmm10
+
+ movl V4F_COUNT(%rdx), %ecx /* count */
+ movzbl V4F_STRIDE(%rdx), %eax /* stride */
+
+ movl %ecx, V4F_COUNT(%rdi) /* set dest count */
+ movl $4, V4F_SIZE(%rdi) /* set dest size */
+ orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+
+ testl %ecx, %ecx /* verify non-zero count */
+ jz p4_3d_done
+
+ movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
+ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
+
+ prefetch 16(%rdx)
+
+ movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
+ movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
+ andps %xmm9, %xmm4 /* 0.0 | m2 | m1 | m0 */
+ movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
+ andps %xmm9, %xmm5 /* 0.0 | m6 | m5 | m4 */
+ movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
+ andps %xmm9, %xmm6 /* 0.0 | m10 | m9 | m8 */
+ andps %xmm9, %xmm7 /* 0.0 | m14 | m13 | m12 */
+ .byte 0x66, 0x66, 0x90 /* manual align += 3 */
+ orps %xmm10, %xmm7 /* 1.0 | m14 | m13 | m12 */
+
+p4_3d_loop:
+
+ movups (%rdx), %xmm8 /* ox | oy | oz | ow */
+ prefetchw 16(%rdi)
+
+ pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
+ addq %rax, %rdx
+ pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
+ mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+	pshufd	$0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
+ mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
+ pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
+ mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
+ addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
+ mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
+ addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
+ prefetch 16(%rdx)
+ addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
+
+ movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
+ addq $16, %rdi
+
+ dec %ecx
+ jnz p4_3d_loop
+
+p4_3d_done:
+ .byte 0xf3
+ ret
+
+
+.align 16
+.globl _mesa_x86_64_transform_points4_identity
+.hidden _mesa_x86_64_transform_points4_identity
+_mesa_x86_64_transform_points4_identity:
+
+ movl V4F_COUNT(%rdx), %ecx /* count */
+ movzbl V4F_STRIDE(%rdx), %eax /* stride */
+
+ movl %ecx, V4F_COUNT(%rdi) /* set dest count */
+ movl $4, V4F_SIZE(%rdi) /* set dest size */
+ orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+
+ test %ecx, %ecx
+ jz p4_identity_done
+
+ movq V4F_START(%rdx), %rsi /* ptr to first src vertex */
+ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
+ prefetch 64(%rsi)
+ prefetchw 64(%rdi)
+
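+	/* each 4-float vertex is 16 bytes, i.e. two qwords, so copy
+	 * 2*count qwords */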
+ add %ecx, %ecx
+
+ rep movsq
+
+p4_identity_done:
+ .byte 0xf3
+ ret
+
+
+.align 16
+.globl _mesa_3dnow_transform_points4_3d_no_rot
+.hidden _mesa_3dnow_transform_points4_3d_no_rot
+_mesa_3dnow_transform_points4_3d_no_rot:
+
+ movl V4F_COUNT(%rdx), %ecx /* count */
+ movzbl V4F_STRIDE(%rdx), %eax /* stride */
+
+ movl %ecx, V4F_COUNT(%rdi) /* set dest count */
+ movl $4, V4F_SIZE(%rdi) /* set dest size */
+ .byte 0x66, 0x66, 0x90 /* manual align += 3 */
+ orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+
+ test %ecx, %ecx
+ .byte 0x66, 0x66, 0x90 /* manual align += 3 */
+ jz p4_3d_no_rot_done
+
+ movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
+ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
+
+ prefetch (%rdx)
+
+ movd (%rsi), %mm0 /* | m00 */
+ .byte 0x66, 0x66, 0x90 /* manual align += 3 */
+ punpckldq 20(%rsi), %mm0 /* m11 | m00 */
+
+ movd 40(%rsi), %mm2 /* | m22 */
+ movq 48(%rsi), %mm1 /* m31 | m30 */
+
+	punpckldq 56(%rsi), %mm2	/* m32 | m22 */
+
+p4_3d_no_rot_loop:
+
+ prefetchw 32(%rdi)
+
+ movq (%rdx), %mm4 /* x1 | x0 */
+ movq 8(%rdx), %mm5 /* x3 | x2 */
+ movd 12(%rdx), %mm7 /* | x3 */
+
+ movq %mm5, %mm6 /* x3 | x2 */
+ pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
+
+ punpckhdq %mm6, %mm6 /* x3 | x3 */
+ pfmul %mm2, %mm5 /* x3*m32 | x2*m22 */
+
+ pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
+ pfacc %mm7, %mm5 /* x3 | x2*m22+x3*m32 */
+
+ pfadd %mm6, %mm4 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
+
+ addq %rax, %rdx
+ movq %mm4, (%rdi) /* write r0, r1 */
+ movq %mm5, 8(%rdi) /* write r2, r3 */
+
+ addq $16, %rdi
+
+ decl %ecx
+ prefetch 32(%rdx)
+ jnz p4_3d_no_rot_loop
+
+p4_3d_no_rot_done:
+ femms
+ ret
+
+
+.align 16
+.globl _mesa_3dnow_transform_points4_perspective
+.hidden _mesa_3dnow_transform_points4_perspective
+_mesa_3dnow_transform_points4_perspective:
+
+ movl V4F_COUNT(%rdx), %ecx /* count */
+ movzbl V4F_STRIDE(%rdx), %eax /* stride */
+
+ movl %ecx, V4F_COUNT(%rdi) /* set dest count */
+ movl $4, V4F_SIZE(%rdi) /* set dest size */
+ orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+
+ test %ecx, %ecx
+ .byte 0x66, 0x66, 0x90 /* manual align += 3 */
+ jz p4_perspective_done
+
+ movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
+ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
+
+ movd (%rsi), %mm0 /* | m00 */
+ pxor %mm7, %mm7 /* 0 | 0 */
+ punpckldq 20(%rsi), %mm0 /* m11 | m00 */
+
+ movq 32(%rsi), %mm2 /* m21 | m20 */
+ prefetch (%rdx)
+
+ movd 40(%rsi), %mm1 /* | m22 */
+
+ .byte 0x66, 0x66, 0x90 /* manual align += 3 */
+ punpckldq 56(%rsi), %mm1 /* m32 | m22 */
+
+
+p4_perspective_loop:
+
+ prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
+
+ movq (%rdx), %mm4 /* x1 | x0 */
+ movq 8(%rdx), %mm5 /* x3 | x2 */
+ movd 8(%rdx), %mm3 /* | x2 */
+
+ movq %mm5, %mm6 /* x3 | x2 */
+ pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
+
+ punpckldq %mm5, %mm5 /* x2 | x2 */
+
+ pfmul %mm2, %mm5 /* x2*m21 | x2*m20 */
+ pfsubr %mm7, %mm3 /* | -x2 */
+
+ pfmul %mm1, %mm6 /* x3*m32 | x2*m22 */
+ pfadd %mm4, %mm5 /* x1*m11+x2*m21 | x0*m00+x2*m20 */
+
+ pfacc %mm3, %mm6 /* -x2 | x2*m22+x3*m32 */
+
+ movq %mm5, (%rdi) /* write r0, r1 */
+ addq %rax, %rdx
+ movq %mm6, 8(%rdi) /* write r2, r3 */
+
+ addq $16, %rdi
+
+ decl %ecx
+ prefetch 32(%rdx) /* hopefully stride is zero */
+ jnz p4_perspective_loop
+
+p4_perspective_done:
+ femms
+ ret
+
+.align 16
+.globl _mesa_3dnow_transform_points4_2d_no_rot
+.hidden _mesa_3dnow_transform_points4_2d_no_rot
+_mesa_3dnow_transform_points4_2d_no_rot:
+
+ movl V4F_COUNT(%rdx), %ecx /* count */
+ movzbl V4F_STRIDE(%rdx), %eax /* stride */
+
+ movl %ecx, V4F_COUNT(%rdi) /* set dest count */
+ movl $4, V4F_SIZE(%rdi) /* set dest size */
+ orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+
+ test %ecx, %ecx
+ .byte 0x90 /* manual align += 1 */
+ jz p4_2d_no_rot_done
+
+ movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
+ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
+
+ movd (%rsi), %mm0 /* | m00 */
+ prefetch (%rdx)
+ punpckldq 20(%rsi), %mm0 /* m11 | m00 */
+
+ movq 48(%rsi), %mm1 /* m31 | m30 */
+
+p4_2d_no_rot_loop:
+
+ prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
+
+ movq (%rdx), %mm4 /* x1 | x0 */
+ movq 8(%rdx), %mm5 /* x3 | x2 */
+
+ pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
+ movq %mm5, %mm6 /* x3 | x2 */
+
+ punpckhdq %mm6, %mm6 /* x3 | x3 */
+
+ addq %rax, %rdx
+ pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
+
+ prefetch 32(%rdx) /* hopefully stride is zero */
+ pfadd %mm4, %mm6 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
+
+ movq %mm6, (%rdi) /* write r0, r1 */
+ movq %mm5, 8(%rdi) /* write r2, r3 */
+
+ addq $16, %rdi
+
+ decl %ecx
+ jnz p4_2d_no_rot_loop
+
+p4_2d_no_rot_done:
+ femms
+ ret
+
+
+.align 16
+.globl _mesa_3dnow_transform_points4_2d
+.hidden _mesa_3dnow_transform_points4_2d
+_mesa_3dnow_transform_points4_2d:
+
+ movl V4F_COUNT(%rdx), %ecx /* count */
+ movzbl V4F_STRIDE(%rdx), %eax /* stride */
+
+ movl %ecx, V4F_COUNT(%rdi) /* set dest count */
+ movl $4, V4F_SIZE(%rdi) /* set dest size */
+	.byte 0x66, 0x66, 0x90	/* manual align += 3 */
+ orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+
+ test %ecx, %ecx
+	.byte 0x66, 0x66, 0x90	/* manual align += 3 */
+ jz p4_2d_done
+
+ movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
+ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
+
+ movd (%rsi), %mm0 /* | m00 */
+ movd 4(%rsi), %mm1 /* | m01 */
+
+ prefetch (%rdx)
+
+ punpckldq 16(%rsi), %mm0 /* m10 | m00 */
+	.byte 0x66, 0x66, 0x90	/* manual align += 3 */
+ punpckldq 20(%rsi), %mm1 /* m11 | m01 */
+
+ movq 48(%rsi), %mm2 /* m31 | m30 */
+
+p4_2d_loop:
+
+ prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
+
+ movq (%rdx), %mm3 /* x1 | x0 */
+ movq 8(%rdx), %mm5 /* x3 | x2 */
+
+ movq %mm3, %mm4 /* x1 | x0 */
+ movq %mm5, %mm6 /* x3 | x2 */
+
+ pfmul %mm1, %mm4 /* x1*m11 | x0*m01 */
+ punpckhdq %mm6, %mm6 /* x3 | x3 */
+
+ pfmul %mm0, %mm3 /* x1*m10 | x0*m00 */
+
+ addq %rax, %rdx
+ pfacc %mm4, %mm3 /* x0*m01+x1*m11 | x0*m00+x1*m10 */
+
+ pfmul %mm2, %mm6 /* x3*m31 | x3*m30 */
+ prefetch 32(%rdx) /* hopefully stride is zero */
+
+ pfadd %mm6, %mm3 /* r1 | r0 */
+
+ movq %mm3, (%rdi) /* write r0, r1 */
+ movq %mm5, 8(%rdi) /* write r2, r3 */
+
+ addq $16, %rdi
+
+ decl %ecx
+ jnz p4_2d_loop
+
+p4_2d_done:
+ femms
+ ret
+
+#endif
+
+#if defined (__ELF__) && defined (__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/arch/x86/3dnow.c b/src/arch/x86/3dnow.c
new file mode 100644
index 0000000..de2fb1e
--- /dev/null
+++ b/src/arch/x86/3dnow.c
@@ -0,0 +1,91 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 5.0.1
+ *
+ * Copyright (C) 1999-2003 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * 3DNow! optimizations contributed by
+ * Holger Waechtler <holger@akaflieg.extern.tu-berlin.de>
+ */
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "math/m_xform.h"
+#include "tnl/t_context.h"
+
+#include "3dnow.h"
+#include "x86_xform.h"
+
+#ifdef DEBUG_MATH
+#include "math/m_debug.h"
+#endif
+
+
+#ifdef USE_3DNOW_ASM
+DECLARE_XFORM_GROUP( 3dnow, 2 )
+DECLARE_XFORM_GROUP( 3dnow, 3 )
+DECLARE_XFORM_GROUP( 3dnow, 4 )
+
+DECLARE_NORM_GROUP( 3dnow )
+
+
+extern void _ASMAPI
+_mesa_v16_3dnow_general_xform( GLfloat *first_vert,
+ const GLfloat *m,
+ const GLfloat *src,
+ GLuint src_stride,
+ GLuint count );
+
+extern void _ASMAPI
+_mesa_3dnow_project_vertices( GLfloat *first,
+ GLfloat *last,
+ const GLfloat *m,
+ GLuint stride );
+
+extern void _ASMAPI
+_mesa_3dnow_project_clipped_vertices( GLfloat *first,
+ GLfloat *last,
+ const GLfloat *m,
+ GLuint stride,
+ const GLubyte *clipmask );
+#endif
+
+
+void _mesa_init_3dnow_transform_asm( void )
+{
+#ifdef USE_3DNOW_ASM
+ ASSIGN_XFORM_GROUP( 3dnow, 2 );
+ ASSIGN_XFORM_GROUP( 3dnow, 3 );
+ ASSIGN_XFORM_GROUP( 3dnow, 4 );
+
+ /* There's a bug somewhere in the 3dnow_normal.S file that causes
+ * bad shading. Disable for now.
+ ASSIGN_NORM_GROUP( 3dnow );
+ */
+
+#ifdef DEBUG_MATH
+ _math_test_all_transform_functions( "3DNow!" );
+ _math_test_all_normal_transform_functions( "3DNow!" );
+#endif
+#endif
+}
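
The DECLARE_XFORM_GROUP / ASSIGN_XFORM_GROUP pair comes from x86_xform.h.
Roughly, the assignment macro expands to one dispatch-table store per matrix
type, along the lines of this sketch (names follow the pattern visible in
this patch, but the real macro body may differ):

    /* sketch of the effect of ASSIGN_XFORM_GROUP( 3dnow, 2 ) */
    _mesa_transform_tab[2][MATRIX_GENERAL] =
       _mesa_3dnow_transform_points2_general;
    _mesa_transform_tab[2][MATRIX_IDENTITY] =
       _mesa_3dnow_transform_points2_identity;
    /* ... and likewise for MATRIX_2D, MATRIX_2D_NO_ROT, MATRIX_3D,
     * MATRIX_3D_NO_ROT and MATRIX_PERSPECTIVE */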
diff --git a/src/arch/x86/3dnow.h b/src/arch/x86/3dnow.h
new file mode 100644
index 0000000..1c1fedc
--- /dev/null
+++ b/src/arch/x86/3dnow.h
@@ -0,0 +1,36 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * 3DNow! optimizations contributed by
+ * Holger Waechtler <holger@akaflieg.extern.tu-berlin.de>
+ */
+
+#ifndef __3DNOW_H__
+#define __3DNOW_H__
+
+void _mesa_init_3dnow_transform_asm( void );
+
+#endif
diff --git a/src/arch/x86/3dnow_normal.S b/src/arch/x86/3dnow_normal.S
new file mode 100644
index 0000000..7f5f6b3
--- /dev/null
+++ b/src/arch/x86/3dnow_normal.S
@@ -0,0 +1,852 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 5.1
+ *
+ * Copyright (C) 1999-2003 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * 3Dnow assembly code by Holger Waechtler
+ */
+
+#ifdef USE_3DNOW_ASM
+
+#include "assyntax.h"
+#include "matypes.h"
+#include "norm_args.h"
+
+ SEG_TEXT
+
+#define M(i) REGOFF(i * 4, ECX)
+#define STRIDE REGOFF(12, ESI)
+
+
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_transform_normalize_normals)
+HIDDEN(_mesa_3dnow_transform_normalize_normals)
+GLNAME(_mesa_3dnow_transform_normalize_normals):
+
+#define FRAME_OFFSET 12
+
+ PUSH_L ( EDI )
+ PUSH_L ( ESI )
+ PUSH_L ( EBP )
+
+ MOV_L ( ARG_LENGTHS, EDI )
+ MOV_L ( ARG_IN, ESI )
+ MOV_L ( ARG_DEST, EAX )
+ MOV_L ( REGOFF(V4F_COUNT, ESI), EBP ) /* dest->count = in->count */
+ MOV_L ( EBP, REGOFF(V4F_COUNT, EAX) )
+ MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */
+ MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( ARG_MAT, ECX )
+ MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
+
+ CMP_L ( CONST(0), EBP ) /* count > 0 ?? */
+ JE ( LLBL (G3TN_end) )
+
+ MOV_L ( REGOFF (V4F_COUNT, ESI), EBP )
+ FEMMS
+
+ PUSH_L ( EBP )
+ PUSH_L ( EAX )
+ PUSH_L ( EDX ) /* save counter & pointer for */
+ /* the normalize pass */
+#undef FRAME_OFFSET
+#define FRAME_OFFSET 24
+
+ MOVQ ( M(0), MM3 ) /* m1 | m0 */
+ MOVQ ( M(4), MM4 ) /* m5 | m4 */
+
+ MOVD ( M(2), MM5 ) /* | m2 */
+ PUNPCKLDQ ( M(6), MM5 ) /* m6 | m2 */
+
+ MOVQ ( M(8), MM6 ) /* m9 | m8 */
+ MOVQ ( M(10), MM7 ) /* | m10 */
+
+ CMP_L ( CONST(0), EDI ) /* lengths == 0 ? */
+ JNE ( LLBL (G3TN_scale_end ) )
+
+ MOVD ( ARG_SCALE, MM0 ) /* | scale */
+ PUNPCKLDQ ( MM0, MM0 ) /* scale | scale */
+
+ PFMUL ( MM0, MM3 ) /* scale * m1 | scale * m0 */
+ PFMUL ( MM0, MM4 ) /* scale * m5 | scale * m4 */
+ PFMUL ( MM0, MM5 ) /* scale * m6 | scale * m2 */
+ PFMUL ( MM0, MM6 ) /* scale * m9 | scale * m8 */
+ PFMUL ( MM0, MM7 ) /* | scale * m10 */
+
+ALIGNTEXT32
+LLBL (G3TN_scale_end):
+LLBL (G3TN_transform):
+ MOVQ ( REGIND (EDX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF (8, EDX), MM2 ) /* | x2 */
+
+ MOVQ ( MM0, MM1 ) /* x1 | x0 */
+ PUNPCKLDQ ( MM2, MM2 ) /* x2 | x2 */
+
+ PFMUL ( MM3, MM0 ) /* x1*m1 | x0*m0 */
+ ADD_L ( CONST(16), EAX ) /* next r */
+
+ PREFETCHW ( REGIND(EAX) )
+
+ PFMUL ( MM4, MM1 ) /* x1*m5 | x0*m4 */
+ PFACC ( MM1, MM0 ) /* x0*m4+x1*m5 | x0*m0+x1*m1 */
+
+ PFMUL ( MM5, MM2 ) /* x2*m6 | x2*m2 */
+ PFADD ( MM2, MM0 ) /* x0*m4+x1*m5+x2*m6| x0*m0+...+x2**/
+
+ MOVQ ( REGIND (EDX), MM1 ) /* x1 | x0 */
+ MOVQ ( MM0, REGOFF(-16, EAX) ) /* write r0, r1 */
+
+ PFMUL ( MM6, MM1 ) /* x1*m9 | x0*m8 */
+ MOVD ( REGOFF (8, EDX), MM2 ) /* | x2 */
+
+ PFMUL ( MM7, MM2 ) /* | x2*m10 */
+ PFACC ( MM1, MM1 ) /* *not used* | x0*m8+x1*m9 */
+
+ PFADD ( MM2, MM1 ) /* *not used* | x0*m8+x1*m9+x2*m*/
+ ADD_L ( STRIDE, EDX ) /* next normal */
+
+ PREFETCH ( REGIND(EDX) )
+
+ MOVD ( MM1, REGOFF(-8, EAX) ) /* write r2 */
+ SUB_L ( CONST(1), EBP ) /* decrement normal counter */
+ JNZ ( LLBL (G3TN_transform) )
+
+
+ POP_L ( EDX ) /* end of transform --- */
+ POP_L ( EAX ) /* now normalizing ... */
+ POP_L ( EBP )
+
+ CMP_L ( CONST(0), EDI ) /* lengths == 0 ? */
+ JE ( LLBL (G3TN_norm ) ) /* calculate lengths */
+
+
+ALIGNTEXT32
+LLBL (G3TN_norm_w_lengths):
+
+ PREFETCHW ( REGOFF(12,EAX) )
+
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */
+
+ MOVD ( REGIND (EDI), MM3 ) /* | length (x) */
+ PFMUL ( MM3, MM1 ) /* | x2 (normalize*/
+
+ PUNPCKLDQ ( MM3, MM3 ) /* length (x) | length (x) */
+ PFMUL ( MM3, MM0 ) /* x1 (normalized) | x0 (normalize*/
+
+ ADD_L ( STRIDE, EDX ) /* next normal */
+ ADD_L ( CONST(4), EDI ) /* next length */
+
+ PREFETCH ( REGIND(EDI) )
+
+ MOVQ ( MM0, REGIND(EAX) ) /* write new x0, x1 */
+ MOVD ( MM1, REGOFF(8, EAX) ) /* write new x2 */
+
+ ADD_L ( CONST(16), EAX ) /* next r */
+ SUB_L ( CONST(1), EBP ) /* decrement normal counter */
+
+ JNZ ( LLBL (G3TN_norm_w_lengths) )
+ JMP ( LLBL (G3TN_exit_3dnow) )
+
+ALIGNTEXT32
+LLBL (G3TN_norm):
+
+ PREFETCHW ( REGIND(EAX) )
+
+ MOVQ ( REGIND (EAX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */
+
+ MOVQ ( MM0, MM3 ) /* x1 | x0 */
+ MOVQ ( MM1, MM4 ) /* | x2 */
+
+ PFMUL ( MM0, MM3 ) /* x1*x1 | x0*x0 */
+ ADD_L ( CONST(16), EAX ) /* next r */
+
+ PFMUL ( MM1, MM4 ) /* | x2*x2 */
+ PFADD ( MM4, MM3 ) /* | x0*x0+x2*x2 */
+
+ PFACC ( MM3, MM3 ) /* **not used** | x0*x0+x1*x1+x2**/
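+	/* PFRSQRT returns a ~15-bit 1/sqrt estimate; PFRSQIT1 plus
+	 * PFRCPIT2 below refine it with one Newton-Raphson step to
+	 * full single precision. */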
+ PFRSQRT ( MM3, MM5 ) /* 1/sqrt (x0*x0+x1*x1+x2*x2) */
+
+ MOVQ ( MM5, MM4 )
+ PUNPCKLDQ ( MM3, MM3 )
+
+ SUB_L ( CONST(1), EBP ) /* decrement normal counter */
+ PFMUL ( MM5, MM5 )
+
+ PFRSQIT1 ( MM3, MM5 )
+ PFRCPIT2 ( MM4, MM5 )
+
+ PFMUL ( MM5, MM0 ) /* x1 (normalized) | x0 (normalize*/
+
+ MOVQ ( MM0, REGOFF(-16, EAX) ) /* write new x0, x1 */
+ PFMUL ( MM5, MM1 ) /* | x2 (normalize*/
+
+ MOVD ( MM1, REGOFF(-8, EAX) ) /* write new x2 */
+ JNZ ( LLBL (G3TN_norm) )
+
+LLBL (G3TN_exit_3dnow):
+ FEMMS
+
+LLBL (G3TN_end):
+ POP_L ( EBP )
+ POP_L ( ESI )
+ POP_L ( EDI )
+ RET
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_transform_normalize_normals_no_rot)
+HIDDEN(_mesa_3dnow_transform_normalize_normals_no_rot)
+GLNAME(_mesa_3dnow_transform_normalize_normals_no_rot):
+
+#undef FRAME_OFFSET
+#define FRAME_OFFSET 12
+
+ PUSH_L ( EDI )
+ PUSH_L ( ESI )
+ PUSH_L ( EBP )
+
+ MOV_L ( ARG_LENGTHS, EDI )
+ MOV_L ( ARG_IN, ESI )
+ MOV_L ( ARG_DEST, EAX )
+ MOV_L ( REGOFF(V4F_COUNT, ESI), EBP ) /* dest->count = in->count */
+ MOV_L ( EBP, REGOFF(V4F_COUNT, EAX) )
+ MOV_L ( ARG_MAT, ECX )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
+ MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */
+
+ CMP_L ( CONST(0), EBP ) /* count > 0 ?? */
+ JE ( LLBL (G3TNNR_end) )
+
+ FEMMS
+
+ MOVD ( M(0), MM0 ) /* | m0 */
+ PUNPCKLDQ ( M(5), MM0 ) /* m5 | m0 */
+
+ MOVD ( M(10), MM2 ) /* | m10 */
+ PUNPCKLDQ ( MM2, MM2 ) /* m10 | m10 */
+
+ CMP_L ( CONST(0), EDI ) /* lengths == 0 ? */
+ JNE ( LLBL (G3TNNR_scale_end ) )
+
+ MOVD ( ARG_SCALE, MM7 ) /* | scale */
+ PUNPCKLDQ ( MM7, MM7 ) /* scale | scale */
+
+ PFMUL ( MM7, MM0 ) /* scale * m5 | scale * m0 */
+ PFMUL ( MM7, MM2 ) /* scale * m10 | scale * m10 */
+
+ALIGNTEXT32
+LLBL (G3TNNR_scale_end):
+ CMP_L ( CONST(0), EDI ) /* lengths == 0 ? */
+ JE ( LLBL (G3TNNR_norm) ) /* need to calculate lengths */
+
+ MOVD ( REGIND(EDI), MM3 ) /* | length (x) */
+
+
+ALIGNTEXT32
+LLBL (G3TNNR_norm_w_lengths): /* use precalculated lengths */
+
+ PREFETCHW ( REGIND(EAX) )
+
+ MOVQ ( REGIND(EDX), MM6 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EDX), MM7 ) /* | x2 */
+
+ PFMUL ( MM0, MM6 ) /* x1*m5 | x0*m0 */
+ ADD_L ( STRIDE, EDX ) /* next normal */
+
+ PREFETCH ( REGIND(EDX) )
+
+ PFMUL ( MM2, MM7 ) /* | x2*m10 */
+ ADD_L ( CONST(16), EAX ) /* next r */
+
+ PFMUL ( MM3, MM7 ) /* | x2 (normalized) */
+ PUNPCKLDQ ( MM3, MM3 ) /* length (x) | length (x) */
+
+ ADD_L ( CONST(4), EDI ) /* next length */
+ PFMUL ( MM3, MM6 ) /* x1 (normalized) | x0 (normalized) */
+
+ SUB_L ( CONST(1), EBP ) /* decrement normal counter */
+ MOVQ ( MM6, REGOFF(-16, EAX) ) /* write r0, r1 */
+
+ MOVD ( MM7, REGOFF(-8, EAX) ) /* write r2 */
+ MOVD ( REGIND(EDI), MM3 ) /* | length (x) */
+
+ JNZ ( LLBL (G3TNNR_norm_w_lengths) )
+ JMP ( LLBL (G3TNNR_exit_3dnow) )
+
+ALIGNTEXT32
+LLBL (G3TNNR_norm): /* need to calculate lengths */
+
+ PREFETCHW ( REGIND(EAX) )
+
+ MOVQ ( REGIND(EDX), MM6 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EDX), MM7 ) /* | x2 */
+
+ PFMUL ( MM0, MM6 ) /* x1*m5 | x0*m0 */
+ ADD_L ( CONST(16), EAX ) /* next r */
+
+ PFMUL ( MM2, MM7 ) /* | x2*m10 */
+ MOVQ ( MM6, MM3 ) /* x1 (transformed)| x0 (transformed) */
+
+ MOVQ ( MM7, MM4 ) /* | x2 (transformed) */
+ PFMUL ( MM6, MM3 ) /* x1*x1 | x0*x0 */
+
+
+ PFMUL ( MM7, MM4 ) /* | x2*x2 */
+ PFACC ( MM3, MM3 ) /* **not used** | x0*x0+x1*x1 */
+
+ PFADD ( MM4, MM3 ) /* | x0*x0+x1*x1+x2*x2*/
+ ADD_L ( STRIDE, EDX ) /* next normal */
+
+ PREFETCH ( REGIND(EDX) )
+
+ PFRSQRT ( MM3, MM5 ) /* 1/sqrt (x0*x0+x1*x1+x2*x2) */
+ MOVQ ( MM5, MM4 )
+
+ PUNPCKLDQ ( MM3, MM3 )
+ PFMUL ( MM5, MM5 )
+
+ PFRSQIT1 ( MM3, MM5 )
+ SUB_L ( CONST(1), EBP ) /* decrement normal counter */
+
+ PFRCPIT2 ( MM4, MM5 )
+ PFMUL ( MM5, MM6 ) /* x1 (normalized) | x0 (normalized) */
+
+ MOVQ ( MM6, REGOFF(-16, EAX) ) /* write r0, r1 */
+ PFMUL ( MM5, MM7 ) /* | x2 (normalized) */
+
+ MOVD ( MM7, REGOFF(-8, EAX) ) /* write r2 */
+ JNZ ( LLBL (G3TNNR_norm) )
+
+
+LLBL (G3TNNR_exit_3dnow):
+ FEMMS
+
+LLBL (G3TNNR_end):
+ POP_L ( EBP )
+ POP_L ( ESI )
+ POP_L ( EDI )
+ RET
+
+
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_transform_rescale_normals_no_rot)
+HIDDEN(_mesa_3dnow_transform_rescale_normals_no_rot)
+GLNAME(_mesa_3dnow_transform_rescale_normals_no_rot):
+
+#undef FRAME_OFFSET
+#define FRAME_OFFSET 12
+
+ PUSH_L ( EDI )
+ PUSH_L ( ESI )
+ PUSH_L ( EBP )
+
+ MOV_L ( ARG_IN, EAX )
+ MOV_L ( ARG_DEST, EDX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EBP ) /* dest->count = in->count */
+ MOV_L ( EBP, REGOFF(V4F_COUNT, EDX) )
+ MOV_L ( ARG_IN, ESI )
+ MOV_L ( ARG_MAT, ECX )
+ MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
+ MOV_L ( REGOFF(V4F_START, EDX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */
+
+ CMP_L ( CONST(0), EBP )
+ JE ( LLBL (G3TRNR_end) )
+
+ FEMMS
+
+ MOVD ( ARG_SCALE, MM6 ) /* | scale */
+ PUNPCKLDQ ( MM6, MM6 ) /* scale | scale */
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m0 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m5 | m0 */
+
+ PFMUL ( MM6, MM0 ) /* scale*m5 | scale*m0 */
+ MOVD ( REGOFF(40, ECX), MM2 ) /* | m10 */
+
+ PFMUL ( MM6, MM2 ) /* | scale*m10 */
+
+ALIGNTEXT32
+LLBL (G3TRNR_rescale):
+
+ PREFETCHW ( REGIND(EAX) )
+
+ MOVQ ( REGIND(EDX), MM4 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EDX), MM5 ) /* | x2 */
+
+ PFMUL ( MM0, MM4 ) /* x1*m5 | x0*m0 */
+ ADD_L ( STRIDE, EDX ) /* next normal */
+
+ PREFETCH ( REGIND(EDX) )
+
+ PFMUL ( MM2, MM5 ) /* | x2*m10 */
+ ADD_L ( CONST(16), EAX ) /* next r */
+
+ SUB_L ( CONST(1), EBP ) /* decrement normal counter */
+ MOVQ ( MM4, REGOFF(-16, EAX) ) /* write r0, r1 */
+
+ MOVD ( MM5, REGOFF(-8, EAX) ) /* write r2 */
+ JNZ ( LLBL (G3TRNR_rescale) ) /* cnt > 0 ? -> process next normal */
+
+ FEMMS
+
+LLBL (G3TRNR_end):
+ POP_L ( EBP )
+ POP_L ( ESI )
+ POP_L ( EDI )
+ RET
+
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_transform_rescale_normals)
+HIDDEN(_mesa_3dnow_transform_rescale_normals)
+GLNAME(_mesa_3dnow_transform_rescale_normals):
+
+#undef FRAME_OFFSET
+#define FRAME_OFFSET 8
+
+ PUSH_L ( EDI )
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_IN, ESI )
+ MOV_L ( ARG_DEST, EAX )
+ MOV_L ( ARG_MAT, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, ESI), EDI ) /* dest->count = in->count */
+ MOV_L ( EDI, REGOFF(V4F_COUNT, EAX) )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */
+ MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
+
+ CMP_L ( CONST(0), EDI )
+ JE ( LLBL (G3TR_end) )
+
+ FEMMS
+
+ MOVQ ( REGIND(ECX), MM3 ) /* m1 | m0 */
+
+ MOVQ ( REGOFF(16,ECX), MM4 ) /* m5 | m4 */
+ MOVD ( ARG_SCALE, MM0 ) /* scale */
+
+ MOVD ( REGOFF(8,ECX), MM5 ) /* | m2 */
+ PUNPCKLDQ ( MM0, MM0 ) /* scale | scale */
+
+ PUNPCKLDQ ( REGOFF(24, ECX), MM5 )
+ PFMUL ( MM0, MM3 ) /* scale*m1 | scale*m0 */
+
+ MOVQ ( REGOFF(32, ECX), MM6 ) /* m9 | m8*/
+ PFMUL ( MM0, MM4 ) /* scale*m5 | scale*m4 */
+
+ MOVD ( REGOFF(40, ECX), MM7 ) /* | m10 */
+ PFMUL ( MM0, MM5 ) /* scale*m6 | scale*m2 */
+
+ PFMUL ( MM0, MM6 ) /* scale*m9 | scale*m8 */
+
+ PFMUL ( MM0, MM7 ) /* | scale*m10 */
+
+ALIGNTEXT32
+LLBL (G3TR_rescale):
+
+ PREFETCHW ( REGIND(EAX) )
+
+ MOVQ ( REGIND(EDX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
+
+ MOVQ ( MM0, MM1 ) /* x1 | x0 */
+ PUNPCKLDQ ( MM2, MM2 ) /* x2 | x2 */
+
+ PFMUL ( MM3, MM0 ) /* x1*m1 | x0*m0 */
+ ADD_L ( CONST(16), EAX ) /* next r */
+
+ PFMUL ( MM4, MM1 ) /* x1*m5 | x0*m4 */
+ PFACC ( MM1, MM0 ) /* x0*m4+x1*m5 | x0*m0+x1*m1 */
+
+ MOVQ ( REGIND(EDX), MM1 ) /* x1 | x0 */
+
+ PFMUL ( MM5, MM2 ) /* x2*m6 | x2*m2 */
+ PFADD ( MM2, MM0 ) /* x0*m4...+x2*m6| x0*m0+x1*m1+x2*m2 */
+
+ MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
+ ADD_L ( STRIDE, EDX ) /* next normal */
+
+ PREFETCH ( REGIND(EDX) )
+
+ MOVQ ( MM0, REGOFF(-16, EAX) ) /* write r0, r1 */
+ PFMUL ( MM6, MM1 ) /* x1*m9 | x0*m8 */
+
+ PFMUL ( MM7, MM2 ) /* | x2*m10 */
+ PFACC ( MM1, MM1 ) /* *not used* | x0*m8+x1*m9 */
+
+ PFADD ( MM2, MM1 ) /* *not used* | x0*m8+x1*m9+x2*m10 */
+ MOVD ( MM1, REGOFF(-8, EAX) ) /* write r2 */
+
+ SUB_L ( CONST(1), EDI ) /* decrement normal counter */
+ JNZ ( LLBL (G3TR_rescale) )
+
+ FEMMS
+
+LLBL (G3TR_end):
+ POP_L ( ESI )
+ POP_L ( EDI )
+ RET
+
+
+
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_transform_normals_no_rot)
+HIDDEN(_mesa_3dnow_transform_normals_no_rot)
+GLNAME(_mesa_3dnow_transform_normals_no_rot):
+
+#undef FRAME_OFFSET
+#define FRAME_OFFSET 8
+
+ PUSH_L ( EDI )
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_IN, ESI )
+ MOV_L ( ARG_DEST, EAX )
+ MOV_L ( ARG_MAT, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, ESI), EDI ) /* dest->count = in->count */
+ MOV_L ( EDI, REGOFF(V4F_COUNT, EAX) )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */
+ MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
+
+ CMP_L ( CONST(0), EDI )
+ JE ( LLBL (G3TNR_end) )
+
+ FEMMS
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m0 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m5 | m0 */
+
+ MOVD ( REGOFF(40, ECX), MM2 ) /* | m10 */
+ PUNPCKLDQ ( MM2, MM2 ) /* m10 | m10 */
+
+ALIGNTEXT32
+LLBL (G3TNR_transform):
+
+ PREFETCHW ( REGIND(EAX) )
+
+ MOVQ ( REGIND(EDX), MM4 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EDX), MM5 ) /* | x2 */
+
+ PFMUL ( MM0, MM4 ) /* x1*m5 | x0*m0 */
+ ADD_L ( STRIDE, EDX) /* next normal */
+
+ PREFETCH ( REGIND(EDX) )
+
+ PFMUL ( MM2, MM5 ) /* | x2*m10 */
+ ADD_L ( CONST(16), EAX ) /* next r */
+
+ SUB_L ( CONST(1), EDI ) /* decrement normal counter */
+ MOVQ ( MM4, REGOFF(-16, EAX) ) /* write r0, r1 */
+
+ MOVD ( MM5, REGOFF(-8, EAX) ) /* write r2 */
+ JNZ ( LLBL (G3TNR_transform) )
+
+ FEMMS
+
+LLBL (G3TNR_end):
+ POP_L ( ESI )
+ POP_L ( EDI )
+ RET
+
+
+
+
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_transform_normals)
+HIDDEN(_mesa_3dnow_transform_normals)
+GLNAME(_mesa_3dnow_transform_normals):
+
+#undef FRAME_OFFSET
+#define FRAME_OFFSET 8
+
+ PUSH_L ( EDI )
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_IN, ESI )
+ MOV_L ( ARG_DEST, EAX )
+ MOV_L ( ARG_MAT, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, ESI), EDI ) /* dest->count = in->count */
+ MOV_L ( EDI, REGOFF(V4F_COUNT, EAX) )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(V4F_START, ESI), EDX ) /* in->start */
+ MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
+
+ CMP_L ( CONST(0), EDI ) /* count > 0 ?? */
+ JE ( LLBL (G3T_end) )
+
+ FEMMS
+
+ MOVQ ( REGIND(ECX), MM3 ) /* m1 | m0 */
+ MOVQ ( REGOFF(16, ECX), MM4 ) /* m5 | m4 */
+
+ MOVD ( REGOFF(8, ECX), MM5 ) /* | m2 */
+ PUNPCKLDQ ( REGOFF(24, ECX), MM5 ) /* m6 | m2 */
+
+ MOVQ ( REGOFF(32, ECX), MM6 ) /* m9 | m8 */
+ MOVD ( REGOFF(40, ECX), MM7 ) /* | m10 */
+
+ALIGNTEXT32
+LLBL (G3T_transform):
+
+ PREFETCHW ( REGIND(EAX) )
+
+ MOVQ ( REGIND(EDX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
+
+ MOVQ ( MM0, MM1 ) /* x1 | x0 */
+ PUNPCKLDQ ( MM2, MM2 ) /* x2 | x2 */
+
+ PFMUL ( MM3, MM0 ) /* x1*m1 | x0*m0 */
+ ADD_L ( CONST(16), EAX ) /* next r */
+
+ PFMUL ( MM4, MM1 ) /* x1*m5 | x0*m4 */
+ PFACC ( MM1, MM0 ) /* x0*m4+x1*m5 | x0*m0+x1*m1 */
+
+ PFMUL ( MM5, MM2 ) /* x2*m6 | x2*m2 */
+ PFADD ( MM2, MM0 ) /* x0*m4...+x2*m6| x0*m0+x1*m1+x2*m2 */
+
+ MOVQ ( REGIND(EDX), MM1 ) /* x1 | x0 */
+ MOVQ ( MM0, REGOFF(-16, EAX) ) /* write r0, r1 */
+
+ PFMUL ( MM6, MM1 ) /* x1*m9 | x0*m8 */
+ MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
+
+ PFMUL ( MM7, MM2 ) /* | x2*m10 */
+ ADD_L ( STRIDE, EDX ) /* next normal */
+
+ PREFETCH ( REGIND(EDX) )
+
+ PFACC ( MM1, MM1 ) /* *not used* | x0*m8+x1*m9 */
+ PFADD ( MM2, MM1 ) /* *not used* | x0*m8+x1*m9+x2*m10 */
+
+ MOVD ( MM1, REGOFF(-8, EAX) ) /* write r2 */
+ SUB_L ( CONST(1), EDI ) /* decrement normal counter */
+
+ JNZ ( LLBL (G3T_transform) )
+
+ FEMMS
+
+LLBL (G3T_end):
+ POP_L ( ESI )
+ POP_L ( EDI )
+ RET
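+
+/*
+ * C sketch of the loop above (illustration only, not part of the Mesa
+ * sources): each normal u is multiplied by the transpose of the upper
+ * 3x3 block of mat->inv (column-major), i.e. by the inverse transpose
+ * of the transformation matrix:
+ *
+ *    r[0] = u[0]*m[0] + u[1]*m[1] + u[2]*m[2];
+ *    r[1] = u[0]*m[4] + u[1]*m[5] + u[2]*m[6];
+ *    r[2] = u[0]*m[8] + u[1]*m[9] + u[2]*m[10];
+ */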
+
+
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_normalize_normals)
+HIDDEN(_mesa_3dnow_normalize_normals)
+GLNAME(_mesa_3dnow_normalize_normals):
+
+#undef FRAME_OFFSET
+#define FRAME_OFFSET 12
+
+ PUSH_L ( EDI )
+ PUSH_L ( ESI )
+ PUSH_L ( EBP )
+
+ MOV_L ( ARG_IN, ESI )
+ MOV_L ( ARG_DEST, EAX )
+ MOV_L ( REGOFF(V4F_COUNT, ESI), EBP ) /* dest->count = in->count */
+ MOV_L ( EBP, REGOFF(V4F_COUNT, EAX) )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(V4F_START, ESI), ECX ) /* in->start */
+ MOV_L ( ARG_LENGTHS, EDX )
+
+ CMP_L ( CONST(0), EBP ) /* count > 0 ?? */
+ JE ( LLBL (G3N_end) )
+
+ FEMMS
+
+ CMP_L ( CONST(0), EDX ) /* lengths == 0 ? */
+ JE ( LLBL (G3N_norm2) ) /* calculate lengths */
+
+ALIGNTEXT32
+LLBL (G3N_norm1): /* use precalculated lengths */
+
+ PREFETCH ( REGIND(EAX) )
+
+ MOVQ ( REGIND(ECX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, ECX), MM1 ) /* | x2 */
+
+ MOVD ( REGIND(EDX), MM3 ) /* | length (x) */
+ PFMUL ( MM3, MM1 ) /* | x2 (normalized) */
+
+ PUNPCKLDQ ( MM3, MM3 ) /* length (x) | length (x) */
+ ADD_L ( STRIDE, ECX ) /* next normal */
+
+ PREFETCH ( REGIND(ECX) )
+
+ PFMUL ( MM3, MM0 ) /* x1 (normalized) | x0 (normalized) */
+ MOVQ ( MM0, REGIND(EAX) ) /* write new x0, x1 */
+
+ MOVD ( MM1, REGOFF(8, EAX) ) /* write new x2 */
+ ADD_L ( CONST(16), EAX ) /* next r */
+
+ ADD_L ( CONST(4), EDX ) /* next length */
+ SUB_L ( CONST(1), EBP ) /* decrement normal counter */
+
+ JNZ ( LLBL (G3N_norm1) )
+
+ JMP ( LLBL (G3N_end1) )
+
+ALIGNTEXT32
+LLBL (G3N_norm2): /* need to calculate lengths */
+
+ PREFETCHW ( REGIND(EAX) )
+
+ PREFETCH ( REGIND(ECX) )
+
+ MOVQ ( REGIND(ECX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, ECX), MM1 ) /* | x2 */
+
+ MOVQ ( MM0, MM3 ) /* x1 | x0 */
+ ADD_L ( STRIDE, ECX ) /* next normal */
+
+ PFMUL ( MM0, MM3 ) /* x1*x1 | x0*x0 */
+ MOVQ ( MM1, MM4 ) /* | x2 */
+
+ ADD_L ( CONST(16), EAX ) /* next r */
+ PFMUL ( MM1, MM4 ) /* | x2*x2 */
+
+	PFADD ( MM4, MM3 )		/* x1*x1 | x0*x0+x2*x2 */
+	PFACC ( MM3, MM3 )		/* x0*x0+x1*x1+x2*x2 | x0*x0+x1*x1+x2*x2 */
+
+	PFRSQRT ( MM3, MM5 )		/* ~15-bit estimate of 1/sqrt(a), a = x0*x0+x1*x1+x2*x2 */
+	MOVQ ( MM5, MM4 )		/* save the estimate y0 */
+
+	PUNPCKLDQ ( MM3, MM3 )		/* a | a */
+	PFMUL ( MM5, MM5 )		/* y0*y0 | y0*y0 */
+
+	PFRSQIT1 ( MM3, MM5 )		/* first Newton-Raphson step */
+	SUB_L ( CONST(1), EBP )		/* decrement normal counter */
+
+	PFRCPIT2 ( MM4, MM5 )		/* second step: 1/sqrt(a) to 24 bits */
+
+ PFMUL ( MM5, MM0 ) /* x1 (normalized) | x0 (normalized) */
+ MOVQ ( MM0, REGOFF(-16, EAX) ) /* write new x0, x1 */
+
+ PFMUL ( MM5, MM1 ) /* | x2 (normalized) */
+ MOVD ( MM1, REGOFF(-8, EAX) ) /* write new x2 */
+
+ JNZ ( LLBL (G3N_norm2) )
+
+LLBL (G3N_end1):
+ FEMMS
+
+LLBL (G3N_end):
+ POP_L ( EBP )
+ POP_L ( ESI )
+ POP_L ( EDI )
+ RET
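+
+/*
+ * The PFRSQRT/PFRSQIT1/PFRCPIT2 triple in G3N_norm2 is AMD's documented
+ * Newton-Raphson sequence for a full-precision reciprocal square root.
+ * A scalar C model (illustrative sketch; pfrsqrt() stands in for the
+ * ~15-bit hardware estimate):
+ *
+ *    float a  = x*x + y*y + z*z;
+ *    float y0 = pfrsqrt(a);                    /~15-bit estimate
+ *    float y1 = 0.5f * y0 * (3.0f - a*y0*y0);  /one N-R refinement
+ *    r[0] = x*y1;  r[1] = y*y1;  r[2] = z*y1;
+ */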
+
+
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_rescale_normals)
+HIDDEN(_mesa_3dnow_rescale_normals)
+GLNAME(_mesa_3dnow_rescale_normals):
+
+#undef FRAME_OFFSET
+#define FRAME_OFFSET 8
+ PUSH_L ( EDI )
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_IN, ESI )
+ MOV_L ( ARG_DEST, EAX )
+ MOV_L ( REGOFF(V4F_COUNT, ESI), EDX ) /* dest->count = in->count */
+ MOV_L ( EDX, REGOFF(V4F_COUNT, EAX) )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(V4F_START, ESI), ECX ) /* in->start */
+
+ CMP_L ( CONST(0), EDX )
+ JE ( LLBL (G3R_end) )
+
+ FEMMS
+
+ MOVD ( ARG_SCALE, MM0 ) /* scale */
+ PUNPCKLDQ ( MM0, MM0 )
+
+ALIGNTEXT32
+LLBL (G3R_rescale):
+
+ PREFETCHW ( REGIND(EAX) )
+
+ MOVQ ( REGIND(ECX), MM1 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, ECX), MM2 ) /* | x2 */
+
+ PFMUL ( MM0, MM1 ) /* x1*scale | x0*scale */
+ ADD_L ( STRIDE, ECX ) /* next normal */
+
+ PREFETCH ( REGIND(ECX) )
+
+ PFMUL ( MM0, MM2 ) /* | x2*scale */
+ ADD_L ( CONST(16), EAX ) /* next r */
+
+ MOVQ ( MM1, REGOFF(-16, EAX) ) /* write r0, r1 */
+ MOVD ( MM2, REGOFF(-8, EAX) ) /* write r2 */
+
+ SUB_L ( CONST(1), EDX ) /* decrement normal counter */
+ JNZ ( LLBL (G3R_rescale) )
+
+ FEMMS
+
+LLBL (G3R_end):
+ POP_L ( ESI )
+ POP_L ( EDI )
+ RET
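+
+/*
+ * C sketch of the loop above (illustration only): a plain uniform
+ * rescale, r[i] = x[i]*scale for i = 0..2.
+ */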
+
+#endif
+
+#if defined (__ELF__) && defined (__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/arch/x86/3dnow_xform1.S b/src/arch/x86/3dnow_xform1.S
new file mode 100644
index 0000000..a73301a
--- /dev/null
+++ b/src/arch/x86/3dnow_xform1.S
@@ -0,0 +1,437 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef USE_3DNOW_ASM
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define FRAME_OFFSET 4
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_general )
+HIDDEN(_mesa_3dnow_transform_points1_general)
+GLNAME( _mesa_3dnow_transform_points1_general ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+	MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPGR_3 ) )
+
+ MOVQ ( REGIND(ECX), MM0 ) /* m01 | m00 */
+ MOVQ ( REGOFF(8, ECX), MM1 ) /* m03 | m02 */
+
+ MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
+ MOVQ ( REGOFF(56, ECX), MM3 ) /* m33 | m32 */
+
+ALIGNTEXT16
+LLBL( G3TPGR_2 ):
+
+ MOVD ( REGIND(EAX), MM4 ) /* | x0 */
+ PUNPCKLDQ ( MM4, MM4 ) /* x0 | x0 */
+
+ MOVQ ( MM4, MM5 ) /* x0 | x0 */
+ PFMUL ( MM0, MM4 ) /* x0*m01 | x0*m00 */
+
+ PFMUL ( MM1, MM5 ) /* x0*m03 | x0*m02 */
+ PFADD ( MM2, MM4 ) /* x0*m01+m31 | x0*m00+m30 */
+
+ PFADD ( MM3, MM5 ) /* x0*m03+m33 | x0*m02+m32 */
+ MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
+
+ MOVQ ( MM5, REGOFF(8, EDX) ) /* write r3, r2 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ DEC_L ( ESI ) /* decrement vertex counter */
+
+ JNZ ( LLBL( G3TPGR_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPGR_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
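+
+/*
+ * Illustrative C sketch of the loop above (not part of the Mesa
+ * sources): m is the column-major 4x4 matrix, so the translation row
+ * m30..m33 lives at m[12..15]; a size-1 vector supplies only x:
+ *
+ *    r[0] = x*m[0] + m[12];
+ *    r[1] = x*m[1] + m[13];
+ *    r[2] = x*m[2] + m[14];
+ *    r[3] = x*m[3] + m[15];
+ */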
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_identity )
+HIDDEN(_mesa_3dnow_transform_points1_identity)
+GLNAME( _mesa_3dnow_transform_points1_identity ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(1), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_1), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+	MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+	JZ ( LLBL( G3TPIR_4 ) )
+
+ALIGNTEXT16
+LLBL( G3TPIR_3 ):
+
+ MOVD ( REGIND(EAX), MM0 ) /* | x0 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ MOVD ( MM0, REGIND(EDX) ) /* | r0 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TPIR_3 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPIR_4 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_3d_no_rot )
+HIDDEN(_mesa_3dnow_transform_points1_3d_no_rot)
+GLNAME( _mesa_3dnow_transform_points1_3d_no_rot ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+	MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP3NRR_3 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
+
+ MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
+
+ALIGNTEXT16
+LLBL( G3TP3NRR_2 ):
+
+ MOVD ( REGIND(EAX), MM4 ) /* | x0 */
+ PFMUL ( MM0, MM4 ) /* | x0*m00 */
+
+ PFADD ( MM2, MM4 ) /* m31 | x0*m00+m30 */
+ MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
+
+ MOVD ( MM3, REGOFF(8, EDX) ) /* write r2 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ DEC_L ( ESI ) /* decrement vertex counter */
+
+ JNZ ( LLBL( G3TP3NRR_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP3NRR_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_perspective )
+HIDDEN(_mesa_3dnow_transform_points1_perspective)
+GLNAME( _mesa_3dnow_transform_points1_perspective ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+	MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPPR_3 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
+
+ALIGNTEXT16
+LLBL( G3TPPR_2 ):
+
+ MOVD ( REGIND(EAX), MM4 ) /* 0 | x0 */
+ PFMUL ( MM0, MM4 ) /* 0 | x0*m00 */
+
+ MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
+ MOVQ ( MM3, REGOFF(8, EDX) ) /* write r2 (=m32), r3 (=0) */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TPPR_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPPR_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_2d )
+HIDDEN(_mesa_3dnow_transform_points1_2d)
+GLNAME( _mesa_3dnow_transform_points1_2d ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(2), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+	MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP2R_3 ) )
+
+ MOVQ ( REGIND(ECX), MM0 ) /* m01 | m00 */
+ MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
+
+ALIGNTEXT16
+LLBL( G3TP2R_2 ):
+
+ MOVD ( REGIND(EAX), MM4 ) /* | x0 */
+ PUNPCKLDQ ( MM4, MM4 ) /* x0 | x0 */
+
+ PFMUL ( MM0, MM4 ) /* x0*m01 | x0*m00 */
+ PFADD ( MM2, MM4 ) /* x0*m01+m31 | x0*m00+m30 */
+
+ MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ DEC_L ( ESI ) /* decrement vertex counter */
+
+ JNZ ( LLBL( G3TP2R_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP2R_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_2d_no_rot )
+HIDDEN(_mesa_3dnow_transform_points1_2d_no_rot)
+GLNAME( _mesa_3dnow_transform_points1_2d_no_rot ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(2), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+	MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP2NRR_3 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
+
+ALIGNTEXT16
+LLBL( G3TP2NRR_2 ):
+
+ MOVD ( REGIND(EAX), MM4 ) /* | x0 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ PFMUL ( MM0, MM4 ) /* | x0*m00 */
+ PFADD ( MM2, MM4 ) /* m31 | x0*m00+m30 */
+
+ MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TP2NRR_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP2NRR_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_3d )
+HIDDEN(_mesa_3dnow_transform_points1_3d)
+GLNAME( _mesa_3dnow_transform_points1_3d ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+	MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP3R_3 ) )
+
+ MOVQ ( REGIND(ECX), MM0 ) /* m01 | m00 */
+ MOVD ( REGOFF(8, ECX), MM1 ) /* | m02 */
+
+ MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
+ MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
+
+ALIGNTEXT16
+LLBL( G3TP3R_2 ):
+
+ MOVD ( REGIND(EAX), MM4 ) /* | x0 */
+ PUNPCKLDQ ( MM4, MM4 ) /* x0 | x0 */
+
+	MOVQ ( MM4, MM5 )		/* x0 | x0 */
+ PFMUL ( MM0, MM4 ) /* x0*m01 | x0*m00 */
+
+ PFMUL ( MM1, MM5 ) /* | x0*m02 */
+ PFADD ( MM2, MM4 ) /* x0*m01+m31 | x0*m00+m30 */
+
+ PFADD ( MM3, MM5 ) /* | x0*m02+m32 */
+ MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
+
+ MOVD ( MM5, REGOFF(8, EDX) ) /* write r2 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ DEC_L ( ESI ) /* decrement vertex counter */
+
+ JNZ ( LLBL( G3TP3R_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP3R_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+#endif
+
+#if defined (__ELF__) && defined (__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/arch/x86/3dnow_xform2.S b/src/arch/x86/3dnow_xform2.S
new file mode 100644
index 0000000..2988fb7
--- /dev/null
+++ b/src/arch/x86/3dnow_xform2.S
@@ -0,0 +1,477 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef USE_3DNOW_ASM
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define FRAME_OFFSET 4
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_general )
+HIDDEN(_mesa_3dnow_transform_points2_general)
+GLNAME( _mesa_3dnow_transform_points2_general ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPGR_3 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(16, ECX), MM0 ) /* m10 | m00 */
+
+ MOVD ( REGOFF(4, ECX), MM1 ) /* | m01 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM1 ) /* m11 | m01 */
+
+ MOVD ( REGOFF(8, ECX), MM2 ) /* | m02 */
+ PUNPCKLDQ ( REGOFF(24, ECX), MM2 ) /* m12 | m02 */
+
+ MOVD ( REGOFF(12, ECX), MM3 ) /* | m03 */
+ PUNPCKLDQ ( REGOFF(28, ECX), MM3 ) /* m13 | m03 */
+
+ MOVQ ( REGOFF(48, ECX), MM4 ) /* m31 | m30 */
+ MOVQ ( REGOFF(56, ECX), MM5 ) /* m33 | m32 */
+
+ALIGNTEXT16
+LLBL( G3TPGR_2 ):
+
+ MOVQ ( REGIND(EAX), MM6 ) /* x1 | x0 */
+ MOVQ ( MM6, MM7 ) /* x1 | x0 */
+
+ PFMUL ( MM0, MM6 ) /* x1*m10 | x0*m00 */
+ PFMUL ( MM1, MM7 ) /* x1*m11 | x0*m01 */
+
+	PFACC ( MM7, MM6 )		/* x0*m01+x1*m11 | x0*m00+x1*m10 */
+	PFADD ( MM4, MM6 )		/* x0*m01+x1*m11+m31 | x0*m00+x1*m10+m30 */
+
+ MOVQ ( MM6, REGIND(EDX) ) /* write r1, r0 */
+ MOVQ ( REGIND(EAX), MM6 ) /* x1 | x0 */
+
+ MOVQ ( MM6, MM7 ) /* x1 | x0 */
+ PFMUL ( MM2, MM6 ) /* x1*m12 | x0*m02 */
+
+ PFMUL ( MM3, MM7 ) /* x1*m13 | x0*m03 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+	PFACC ( MM7, MM6 )		/* x0*m03+x1*m13 | x0*m02+x1*m12 */
+	PFADD ( MM5, MM6 )		/* x0*m03+x1*m13+m33 | x0*m02+x1*m12+m32 */
+
+ MOVQ ( MM6, REGOFF(8, EDX) ) /* write r3, r2 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TPGR_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPGR_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
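+
+/*
+ * Illustrative C sketch of the loop above (not part of the Mesa
+ * sources; column-major m, input (x, y)):
+ *
+ *    r[0] = x*m[0] + y*m[4] + m[12];
+ *    r[1] = x*m[1] + y*m[5] + m[13];
+ *    r[2] = x*m[2] + y*m[6] + m[14];
+ *    r[3] = x*m[3] + y*m[7] + m[15];
+ */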
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_perspective )
+HIDDEN(_mesa_3dnow_transform_points2_perspective)
+GLNAME( _mesa_3dnow_transform_points2_perspective ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPPR_3 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
+
+ MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
+
+ALIGNTEXT16
+LLBL( G3TPPR_2 ):
+
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+ PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
+
+ MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
+ MOVQ ( MM3, REGOFF(8, EDX) ) /* write r2 (=m32), r3 (=0) */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TPPR_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPPR_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_3d )
+HIDDEN(_mesa_3dnow_transform_points2_3d)
+GLNAME( _mesa_3dnow_transform_points2_3d ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+	OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP3R_3 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(16, ECX), MM0 ) /* m10 | m00 */
+
+ MOVD ( REGOFF(4, ECX), MM1 ) /* | m01 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM1 ) /* m11 | m01 */
+
+ MOVD ( REGOFF(8, ECX), MM2 ) /* | m02 */
+ PUNPCKLDQ ( REGOFF(24, ECX), MM2 ) /* m12 | m02 */
+
+ MOVQ ( REGOFF(48, ECX), MM4 ) /* m31 | m30 */
+ MOVD ( REGOFF(56, ECX), MM5 ) /* | m32 */
+
+ALIGNTEXT16
+LLBL( G3TP3R_2 ):
+
+ MOVQ ( REGIND(EAX), MM6 ) /* x1 | x0 */
+ MOVQ ( MM6, MM7 ) /* x1 | x0 */
+
+ PFMUL ( MM0, MM6 ) /* x1*m10 | x0*m00 */
+ PFMUL ( MM1, MM7 ) /* x1*m11 | x0*m01 */
+
+	PFACC ( MM7, MM6 )		/* x0*m01+x1*m11 | x0*m00+x1*m10 */
+	PFADD ( MM4, MM6 )		/* x0*m01+x1*m11+m31 | x0*m00+x1*m10+m30 */
+
+ MOVQ ( MM6, REGIND(EDX) ) /* write r1, r0 */
+ MOVQ ( REGIND(EAX), MM6 ) /* x1 | x0 */
+
+ MOVQ ( MM6, MM7 ) /* x1 | x0 */
+ PFMUL ( MM2, MM6 ) /* x1*m12 | x0*m02 */
+
+	PFACC ( MM7, MM6 )		/* ***trash*** | x0*m02+x1*m12 */
+	PFADD ( MM5, MM6 )		/* ***trash*** | x0*m02+x1*m12+m32 */
+
+ MOVD ( MM6, REGOFF(8, EDX) ) /* write r2 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ DEC_L ( ESI ) /* decrement vertex counter */
+
+ JNZ ( LLBL( G3TP3R_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP3R_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_3d_no_rot )
+HIDDEN(_mesa_3dnow_transform_points2_3d_no_rot)
+GLNAME( _mesa_3dnow_transform_points2_3d_no_rot ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+	OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP3NRR_3 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
+
+ MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
+ MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
+
+ALIGNTEXT16
+LLBL( G3TP3NRR_2 ):
+
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+ PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
+
+ PFADD ( MM2, MM4 ) /* x1*m11+m31 | x0*m00+m30 */
+ MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
+
+ MOVD ( MM3, REGOFF(8, EDX) ) /* write r2 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ DEC_L ( ESI ) /* decrement vertex counter */
+
+ JNZ ( LLBL( G3TP3NRR_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP3NRR_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_2d )
+HIDDEN(_mesa_3dnow_transform_points2_2d)
+GLNAME( _mesa_3dnow_transform_points2_2d ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(2), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP2R_3 ) )
+
+ MOVQ ( REGIND(ECX), MM0 ) /* m01 | m00 */
+ MOVQ ( REGOFF(16, ECX), MM1 ) /* m11 | m10 */
+
+ MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
+
+ALIGNTEXT16
+LLBL( G3TP2R_2 ):
+
+ MOVD ( REGIND(EAX), MM4 ) /* | x0 */
+ MOVD ( REGOFF(4, EAX), MM5 ) /* | x1 */
+
+ PUNPCKLDQ ( MM4, MM4 ) /* x0 | x0 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ PFMUL ( MM0, MM4 ) /* x0*m01 | x0*m00 */
+ PUNPCKLDQ ( MM5, MM5 ) /* x1 | x1 */
+
+ PFMUL ( MM1, MM5 ) /* x1*m11 | x1*m10 */
+	PFADD ( MM2, MM4 )		/* x0*m01+m31 | x0*m00+m30 */
+
+	PFADD ( MM5, MM4 )		/* x0*m01+x1*m11+m31 | x0*m00+x1*m10+m30 */
+ MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ DEC_L ( ESI ) /* decrement vertex counter */
+
+ JNZ ( LLBL( G3TP2R_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP2R_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_2d_no_rot )
+HIDDEN(_mesa_3dnow_transform_points2_2d_no_rot)
+GLNAME( _mesa_3dnow_transform_points2_2d_no_rot ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(2), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP2NRR_3 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
+
+ MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
+
+ALIGNTEXT16
+LLBL( G3TP2NRR_2 ):
+
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
+ PFADD ( MM2, MM4 ) /* m31 | x0*m00+m30 */
+
+ MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TP2NRR_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP2NRR_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_identity )
+HIDDEN(_mesa_3dnow_transform_points2_identity)
+GLNAME( _mesa_3dnow_transform_points2_identity ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(2), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+	JZ ( LLBL( G3TPIR_4 ) )
+
+ALIGNTEXT16
+LLBL( G3TPIR_3 ):
+
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ MOVQ ( MM0, REGIND(EDX) ) /* r1 | r0 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TPIR_3 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPIR_4 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#endif
+
+#if defined (__ELF__) && defined (__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/arch/x86/3dnow_xform3.S b/src/arch/x86/3dnow_xform3.S
new file mode 100644
index 0000000..a356aae
--- /dev/null
+++ b/src/arch/x86/3dnow_xform3.S
@@ -0,0 +1,561 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef USE_3DNOW_ASM
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define FRAME_OFFSET 4
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_general )
+HIDDEN(_mesa_3dnow_transform_points3_general)
+GLNAME( _mesa_3dnow_transform_points3_general ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPGR_2 ) )
+
+ PREFETCHW ( REGIND(EDX) )
+
+ALIGNTEXT16
+LLBL( G3TPGR_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EAX), MM2 ) /* | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ PREFETCH ( REGIND(EAX) )
+
+ MOVQ ( MM0, MM1 ) /* x1 | x0 */
+ PUNPCKLDQ ( MM2, MM2 ) /* x2 | x2 */
+
+ PUNPCKLDQ ( MM0, MM0 ) /* x0 | x0 */
+ MOVQ ( MM2, MM5 ) /* x2 | x2 */
+
+ PUNPCKHDQ ( MM1, MM1 ) /* x1 | x1 */
+ PFMUL ( REGOFF(32, ECX), MM2 ) /* x2*m9 | x2*m8 */
+
+ MOVQ ( MM0, MM3 ) /* x0 | x0 */
+ PFMUL ( REGOFF(40, ECX), MM5 ) /* x2*m11 | x2*m10 */
+
+ MOVQ ( MM1, MM4 ) /* x1 | x1 */
+ PFMUL ( REGIND(ECX), MM0 ) /* x0*m1 | x0*m0 */
+
+ PFADD ( REGOFF(48, ECX), MM2 ) /* x2*m9+m13 | x2*m8+m12 */
+ PFMUL ( REGOFF(16, ECX), MM1 ) /* x1*m5 | x1*m4 */
+
+ PFADD ( REGOFF(56, ECX), MM5 ) /* x2*m11+m15 | x2*m10+m14 */
+ PFADD ( MM0, MM1 ) /* x0*m1+x1*m5 | x0*m0+x1*m4 */
+
+ PFMUL ( REGOFF(8, ECX), MM3 ) /* x0*m3 | x0*m2 */
+ PFADD ( MM1, MM2 ) /* r1 | r0 */
+
+ PFMUL ( REGOFF(24, ECX), MM4 ) /* x1*m7 | x1*m6 */
+ ADD_L ( CONST(16), EDX ) /* next output vertex */
+
+ PFADD ( MM3, MM4 ) /* x0*m3+x1*m7 | x0*m2+x1*m6 */
+ MOVQ ( MM2, REGOFF(-16, EDX) ) /* write r0, r1 */
+
+ PFADD ( MM4, MM5 ) /* r3 | r2 */
+ MOVQ ( MM5, REGOFF(-8, EDX) ) /* write r2, r3 */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TPGR_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPGR_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
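+
+/*
+ * Illustrative C sketch of the loop above (not part of the Mesa
+ * sources; column-major m, input (x, y, z)):
+ *
+ *    r[0] = x*m[0] + y*m[4] + z*m[8]  + m[12];
+ *    r[1] = x*m[1] + y*m[5] + z*m[9]  + m[13];
+ *    r[2] = x*m[2] + y*m[6] + z*m[10] + m[14];
+ *    r[3] = x*m[3] + y*m[7] + z*m[11] + m[15];
+ */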
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_perspective )
+HIDDEN(_mesa_3dnow_transform_points3_perspective)
+GLNAME( _mesa_3dnow_transform_points3_perspective ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPPR_2 ) )
+
+ PREFETCH ( REGIND(EAX) )
+ PREFETCHW ( REGIND(EDX) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
+
+ MOVQ ( REGOFF(32, ECX), MM1 ) /* m21 | m20 */
+ MOVD ( REGOFF(40, ECX), MM2 ) /* | m22 */
+
+ MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
+
+ALIGNTEXT16
+LLBL( G3TPPR_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+
+ MOVD ( REGOFF(8, EAX), MM5 ) /* | x2 */
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ PREFETCH ( REGIND(EAX) )
+
+ PXOR ( MM7, MM7 ) /* 0 | 0 */
+ MOVQ ( MM5, MM6 ) /* | x2 */
+
+ PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
+ PFSUB ( MM5, MM7 ) /* | -x2 */
+
+ PFMUL ( MM2, MM6 ) /* | x2*m22 */
+ PUNPCKLDQ ( MM5, MM5 ) /* x2 | x2 */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ PFMUL ( MM1, MM5 ) /* x2*m21 | x2*m20 */
+
+ PFADD ( MM3, MM6 ) /* | x2*m22+m32 */
+ PFADD ( MM4, MM5 ) /* x1*m11+x2*m21 | x0*m00+x2*m20 */
+
+ MOVQ ( MM5, REGOFF(-16, EDX) ) /* write r0, r1 */
+ MOVD ( MM6, REGOFF(-8, EDX) ) /* write r2 */
+
+ MOVD ( MM7, REGOFF(-4, EDX) ) /* write r3 */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TPPR_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPPR_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
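+
+/*
+ * C sketch of the loop above (illustration only): for a standard
+ * perspective matrix only m[0], m[5], m[8], m[9], m[10] and m[14] are
+ * non-zero and m[11] == -1, hence
+ *
+ *    r[0] = x*m[0] + z*m[8];
+ *    r[1] = y*m[5] + z*m[9];
+ *    r[2] = z*m[10] + m[14];
+ *    r[3] = -z;
+ */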
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_3d )
+HIDDEN(_mesa_3dnow_transform_points3_3d)
+GLNAME( _mesa_3dnow_transform_points3_3d ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP3R_2 ) )
+
+ PREFETCH ( REGIND(EAX) )
+ PREFETCH ( REGIND(EDX) )
+
+ MOVD ( REGOFF(8, ECX), MM7 ) /* | m2 */
+ PUNPCKLDQ ( REGOFF(24, ECX), MM7 ) /* m6 | m2 */
+
+
+ALIGNTEXT16
+LLBL( G3TP3R_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ PREFETCH ( REGIND(EAX) )
+
+ MOVQ ( MM0, MM2 ) /* x1 | x0 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ PUNPCKLDQ ( MM2, MM2 ) /* x0 | x0 */
+ MOVQ ( MM0, MM3 ) /* x1 | x0 */
+
+ PFMUL ( REGIND(ECX), MM2 ) /* x0*m1 | x0*m0 */
+ PUNPCKHDQ ( MM3, MM3 ) /* x1 | x1 */
+
+ MOVQ ( MM1, MM4 ) /* | x2 */
+ PFMUL ( REGOFF(16, ECX), MM3 ) /* x1*m5 | x1*m4 */
+
+ PUNPCKLDQ ( MM4, MM4 ) /* x2 | x2 */
+ PFADD ( MM2, MM3 ) /* x0*m1+x1*m5 | x0*m0+x1*m4 */
+
+ PFMUL ( REGOFF(32, ECX), MM4 ) /* x2*m9 | x2*m8 */
+ PFADD ( REGOFF(48, ECX), MM3 ) /* x0*m1+...+m11 | x0*m0+x1*m4+m12 */
+
+ PFMUL ( MM7, MM0 ) /* x1*m6 | x0*m2 */
+ PFADD ( MM4, MM3 ) /* r1 | r0 */
+
+ PFMUL ( REGOFF(40, ECX), MM1 ) /* | x2*m10 */
+ PUNPCKLDQ ( REGOFF(56, ECX), MM1 ) /* m14 | x2*m10 */
+
+	PFACC ( MM0, MM1 )		/* x0*m2+x1*m6 | x2*m10+m14 */
+
+ MOVQ ( MM3, REGOFF(-16, EDX) ) /* write r0, r1 */
+ PFACC ( MM1, MM1 ) /* | r2 */
+
+ MOVD ( MM1, REGOFF(-8, EDX) ) /* write r2 */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TP3R_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP3R_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_3d_no_rot )
+HIDDEN(_mesa_3dnow_transform_points3_3d_no_rot)
+GLNAME( _mesa_3dnow_transform_points3_3d_no_rot ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP3NRR_2 ) )
+
+ PREFETCH ( REGIND(EAX) )
+ PREFETCHW ( REGIND(EDX) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
+
+ MOVD ( REGOFF(40, ECX), MM2 ) /* | m22 */
+ PUNPCKLDQ ( MM2, MM2 ) /* m22 | m22 */
+
+ MOVQ ( REGOFF(48, ECX), MM1 ) /* m31 | m30 */
+ MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
+
+ PUNPCKLDQ ( MM3, MM3 ) /* m32 | m32 */
+
+
+ALIGNTEXT16
+LLBL( G3TP3NRR_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EAX), MM5 ) /* | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ PREFETCHW ( REGIND(EAX) )
+
+ PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
+
+ PFADD ( MM1, MM4 ) /* x1*m11+m31 | x0*m00+m30 */
+ PFMUL ( MM2, MM5 ) /* | x2*m22 */
+
+ PFADD ( MM3, MM5 ) /* | x2*m22+m32 */
+ MOVQ ( MM4, REGIND(EDX) ) /* write r0, r1 */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ DEC_L ( ESI ) /* decrement vertex counter */
+
+ MOVD ( MM5, REGOFF(-8, EDX) ) /* write r2 */
+ JNZ ( LLBL( G3TP3NRR_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP3NRR_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_2d )
+HIDDEN(_mesa_3dnow_transform_points3_2d)
+GLNAME( _mesa_3dnow_transform_points3_2d ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+	JZ ( LLBL( G3TP2R_3 ) )
+
+ PREFETCH ( REGIND(EAX) )
+ PREFETCHW ( REGIND(EDX) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(16, ECX), MM0 ) /* m10 | m00 */
+
+ MOVD ( REGOFF(4, ECX), MM1 ) /* | m01 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM1 ) /* m11 | m01 */
+
+ MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
+
+ALIGNTEXT16
+LLBL( G3TP2R_2 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+
+ MOVQ ( REGIND(EAX), MM3 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EAX), MM5 ) /* | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ PREFETCH ( REGIND(EAX) )
+
+ MOVQ ( MM3, MM4 ) /* x1 | x0 */
+ PFMUL ( MM0, MM3 ) /* x1*m10 | x0*m00 */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ PFMUL ( MM1, MM4 ) /* x1*m11 | x0*m01 */
+
+	PFACC ( MM4, MM3 )		/* x0*m01+x1*m11 | x0*m00+x1*m10 */
+	MOVD ( MM5, REGOFF(-8, EDX) )	/* write r2 (=x2) */
+
+	PFADD ( MM2, MM3 )		/* x0*m01+x1*m11+m31 | x0*m00+x1*m10+m30 */
+ MOVQ ( MM3, REGOFF(-16, EDX) ) /* write r0, r1 */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TP2R_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP2R_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_2d_no_rot )
+HIDDEN(_mesa_3dnow_transform_points3_2d_no_rot)
+GLNAME( _mesa_3dnow_transform_points3_2d_no_rot ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP2NRR_2 ) )
+
+ PREFETCH ( REGIND(EAX) )
+ PREFETCHW ( REGIND(EDX) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
+
+ MOVQ ( REGOFF(48, ECX), MM1 ) /* m31 | m30 */
+
+
+ALIGNTEXT16
+LLBL( G3TP2NRR_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EAX), MM5 ) /* | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ PREFETCH ( REGIND(EAX) )
+
+ PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ PFADD ( MM1, MM4 ) /* x1*m11+m31 | x0*m00+m30 */
+
+ MOVQ ( MM4, REGOFF(-16, EDX) ) /* write r0, r1 */
+ MOVD ( MM5, REGOFF(-8, EDX) ) /* write r2 (=x2) */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TP2NRR_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP2NRR_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_identity )
+HIDDEN(_mesa_3dnow_transform_points3_identity)
+GLNAME( _mesa_3dnow_transform_points3_identity ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPIR_2 ) )
+
+ PREFETCHW ( REGIND(EDX) )
+
+ALIGNTEXT16
+LLBL( G3TPIR_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) )
+
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ MOVQ ( MM0, REGOFF(-16, EDX) ) /* r1 | r0 */
+
+ MOVD ( MM1, REGOFF(-8, EDX) ) /* | r2 */
+ JNZ ( LLBL( G3TPIR_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPIR_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#endif
+
+#if defined (__ELF__) && defined (__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/arch/x86/3dnow_xform4.S b/src/arch/x86/3dnow_xform4.S
new file mode 100644
index 0000000..b2b7c64
--- /dev/null
+++ b/src/arch/x86/3dnow_xform4.S
@@ -0,0 +1,570 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef USE_3DNOW_ASM
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define FRAME_OFFSET 4
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_general )
+HIDDEN(_mesa_3dnow_transform_points4_general)
+GLNAME( _mesa_3dnow_transform_points4_general ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPGR_2 ) )
+
+ PREFETCHW ( REGIND(EDX) )
+
+ALIGNTEXT16
+LLBL( G3TPGR_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+ MOVQ ( REGOFF(8, EAX), MM4 ) /* x3 | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ PREFETCH ( REGIND(EAX) )
+
+ MOVQ ( MM0, MM2 ) /* x1 | x0 */
+ MOVQ ( MM4, MM6 ) /* x3 | x2 */
+
+ PUNPCKLDQ ( MM0, MM0 ) /* x0 | x0 */
+ PUNPCKHDQ ( MM2, MM2 ) /* x1 | x1 */
+
+ MOVQ ( MM0, MM1 ) /* x0 | x0 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ PFMUL ( REGIND(ECX), MM0 ) /* x0*m1 | x0*m0 */
+ MOVQ ( MM2, MM3 ) /* x1 | x1 */
+
+ PFMUL ( REGOFF(8, ECX), MM1 ) /* x0*m3 | x0*m2 */
+ PUNPCKLDQ ( MM4, MM4 ) /* x2 | x2 */
+
+ PFMUL ( REGOFF(16, ECX), MM2 ) /* x1*m5 | x1*m4 */
+ MOVQ ( MM4, MM5 ) /* x2 | x2 */
+
+ PFMUL ( REGOFF(24, ECX), MM3 ) /* x1*m7 | x1*m6 */
+ PUNPCKHDQ ( MM6, MM6 ) /* x3 | x3 */
+
+ PFMUL ( REGOFF(32, ECX), MM4 ) /* x2*m9 | x2*m8 */
+ MOVQ ( MM6, MM7 ) /* x3 | x3 */
+
+ PFMUL ( REGOFF(40, ECX), MM5 ) /* x2*m11 | x2*m10 */
+	PFADD ( MM0, MM2 )		/* x0*m1+x1*m5 | x0*m0+x1*m4 */
+
+	PFMUL ( REGOFF(48, ECX), MM6 )	/* x3*m13 | x3*m12 */
+	PFADD ( MM1, MM3 )		/* x0*m3+x1*m7 | x0*m2+x1*m6 */
+
+	PFMUL ( REGOFF(56, ECX), MM7 )	/* x3*m15 | x3*m14 */
+	PFADD ( MM4, MM6 )		/* x2*m9+x3*m13 | x2*m8+x3*m12 */
+
+	PFADD ( MM5, MM7 )		/* x2*m11+x3*m15 | x2*m10+x3*m14 */
+	PFADD ( MM2, MM6 )		/* r1 | r0 */
+
+	PFADD ( MM3, MM7 )		/* r3 | r2 */
+	MOVQ ( MM6, REGOFF(-16, EDX) )	/* write r0, r1 */
+
+	MOVQ ( MM7, REGOFF(-8, EDX) )	/* write r2, r3 */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TPGR_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPGR_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
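+
+/*
+ * C sketch of the loop above (illustration only): the full 4x4 product
+ * with column-major m,
+ *
+ *    r[j] = x*m[j] + y*m[4+j] + z*m[8+j] + w*m[12+j],  j = 0..3
+ */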
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_perspective )
+HIDDEN(_mesa_3dnow_transform_points4_perspective)
+GLNAME( _mesa_3dnow_transform_points4_perspective ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPPR_2 ) )
+
+ PREFETCH ( REGIND(EAX) )
+ PREFETCHW ( REGIND(EDX) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
+
+ MOVD ( REGOFF(40, ECX), MM1 ) /* | m22 */
+ PUNPCKLDQ ( REGOFF(56, ECX), MM1 ) /* m32 | m22 */
+
+ MOVQ ( REGOFF(32, ECX), MM2 ) /* m21 | m20 */
+ PXOR ( MM7, MM7 ) /* 0 | 0 */
+
+ALIGNTEXT16
+LLBL( G3TPPR_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+ MOVQ ( REGOFF(8, EAX), MM5 ) /* x3 | x2 */
+ MOVD ( REGOFF(8, EAX), MM3 ) /* | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+	PREFETCH ( REGOFF(32, EAX) )	/* hopefully the array is tightly packed */
+
+ MOVQ ( MM5, MM6 ) /* x3 | x2 */
+ PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
+
+ PUNPCKLDQ ( MM5, MM5 ) /* x2 | x2 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ PFMUL ( MM2, MM5 ) /* x2*m21 | x2*m20 */
+ PFSUBR ( MM7, MM3 ) /* | -x2 */
+
+ PFMUL ( MM1, MM6 ) /* x3*m32 | x2*m22 */
+ PFADD ( MM4, MM5 ) /* x1*m11+x2*m21 | x0*m00+x2*m20 */
+
+ PFACC ( MM3, MM6 ) /* -x2 | x2*m22+x3*m32 */
+ MOVQ ( MM5, REGOFF(-16, EDX) ) /* write r0, r1 */
+
+ MOVQ ( MM6, REGOFF(-8, EDX) ) /* write r2, r3 */
+ DEC_L ( ESI ) /* decrement vertex counter */
+
+ JNZ ( LLBL( G3TPPR_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPPR_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_3d )
+HIDDEN(_mesa_3dnow_transform_points4_3d)
+GLNAME( _mesa_3dnow_transform_points4_3d ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP3R_2 ) )
+
+ MOVD ( REGOFF(8, ECX), MM6 ) /* | m2 */
+ PUNPCKLDQ ( REGOFF(24, ECX), MM6 ) /* m6 | m2 */
+
+ MOVD ( REGOFF(40, ECX), MM7 ) /* | m10 */
+ PUNPCKLDQ ( REGOFF(56, ECX), MM7 ) /* m14 | m10 */
+
+ALIGNTEXT16
+LLBL( G3TP3R_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+ PREFETCH ( REGOFF(32, EAX) ) /* hopefully array is tightly packed */
+
+ MOVQ ( REGIND(EAX), MM2 ) /* x1 | x0 */
+ MOVQ ( REGOFF(8, EAX), MM3 ) /* x3 | x2 */
+
+ MOVQ ( MM2, MM0 ) /* x1 | x0 */
+ MOVQ ( MM3, MM4 ) /* x3 | x2 */
+
+ MOVQ ( MM0, MM1 ) /* x1 | x0 */
+ MOVQ ( MM4, MM5 ) /* x3 | x2 */
+
+ PUNPCKLDQ ( MM0, MM0 ) /* x0 | x0 */
+ PUNPCKHDQ ( MM1, MM1 ) /* x1 | x1 */
+
+ PFMUL ( REGIND(ECX), MM0 ) /* x0*m1 | x0*m0 */
+ PUNPCKLDQ ( MM3, MM3 ) /* x2 | x2 */
+
+ PFMUL ( REGOFF(16, ECX), MM1 ) /* x1*m5 | x1*m4 */
+ PUNPCKHDQ ( MM4, MM4 ) /* x3 | x3 */
+
+ PFMUL ( MM6, MM2 ) /* x1*m6 | x0*m2 */
+ PFADD ( MM0, MM1 ) /* x0*m1+x1*m5 | x0*m0+x1*m4 */
+
+ PFMUL ( REGOFF(32, ECX), MM3 ) /* x2*m9 | x2*m8 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ PFMUL ( REGOFF(48, ECX), MM4 ) /* x3*m13 | x3*m12 */
+ PFADD ( MM1, MM3 ) /* x0*m1+..+x2*m9 | x0*m0+...+x2*m8 */
+
+ PFMUL ( MM7, MM5 ) /* x3*m14 | x2*m10 */
+ PFADD ( MM3, MM4 ) /* r1 | r0 */
+
+ PFACC ( MM2, MM5 ) /* x0*m2+x1*m6 | x2*m10+x3*m14 */
+ MOVD ( REGOFF(12, EAX), MM0 ) /* | x3 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ PFACC ( MM0, MM5 ) /* r3 | r2 */
+
+ MOVQ ( MM4, REGOFF(-16, EDX) ) /* write r0, r1 */
+ MOVQ ( MM5, REGOFF(-8, EDX) ) /* write r2, r3 */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TP3R_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP3R_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_3d_no_rot )
+HIDDEN(_mesa_3dnow_transform_points4_3d_no_rot)
+GLNAME( _mesa_3dnow_transform_points4_3d_no_rot ):
+
+ PUSH_L ( ESI )
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP3NRR_2 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
+
+ MOVD ( REGOFF(40, ECX), MM2 ) /* | m22 */
+ PUNPCKLDQ ( REGOFF(56, ECX), MM2 ) /* m32 | m22 */
+
+ MOVQ ( REGOFF(48, ECX), MM1 ) /* m31 | m30 */
+
+ALIGNTEXT16
+LLBL( G3TP3NRR_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+ MOVQ ( REGOFF(8, EAX), MM5 ) /* x3 | x2 */
+ MOVD ( REGOFF(12, EAX), MM7 ) /* | x3 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+	PREFETCH ( REGOFF(32, EAX) )	/* hopefully the array is tightly packed */
+
+ MOVQ ( MM5, MM6 ) /* x3 | x2 */
+ PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
+
+ PUNPCKHDQ ( MM6, MM6 ) /* x3 | x3 */
+ PFMUL ( MM2, MM5 ) /* x3*m32 | x2*m22 */
+
+ PFMUL ( MM1, MM6 ) /* x3*m31 | x3*m30 */
+ PFACC ( MM7, MM5 ) /* x3 | x2*m22+x3*m32 */
+
+ PFADD ( MM6, MM4 ) /* x1*m11+x3*m31 | x0*m00+x3*m30 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ MOVQ ( MM4, REGOFF(-16, EDX) ) /* write r0, r1 */
+ MOVQ ( MM5, REGOFF(-8, EDX) ) /* write r2, r3 */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TP3NRR_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP3NRR_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_2d )
+HIDDEN(_mesa_3dnow_transform_points4_2d)
+GLNAME( _mesa_3dnow_transform_points4_2d ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP2R_2 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(16, ECX), MM0 ) /* m10 | m00 */
+
+ MOVD ( REGOFF(4, ECX), MM1 ) /* | m01 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM1 ) /* m11 | m01 */
+
+ MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
+
+ALIGNTEXT16
+LLBL( G3TP2R_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+
+ MOVQ ( REGIND(EAX), MM3 ) /* x1 | x0 */
+ MOVQ ( REGOFF(8, EAX), MM5 ) /* x3 | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ PREFETCH ( REGIND(EAX) )
+
+ MOVQ ( MM3, MM4 ) /* x1 | x0 */
+ MOVQ ( MM5, MM6 ) /* x3 | x2 */
+
+ PFMUL ( MM1, MM4 ) /* x1*m11 | x0*m01 */
+ PUNPCKHDQ ( MM6, MM6 ) /* x3 | x3 */
+
+ PFMUL ( MM0, MM3 ) /* x1*m10 | x0*m00 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ PFACC ( MM4, MM3 ) /* x0*m01+x1*m11 | x0*m00+x1*m10 */
+ PFMUL ( MM2, MM6 ) /* x3*m31 | x3*m30 */
+
+ PFADD ( MM6, MM3 ) /* r1 | r0 */
+ MOVQ ( MM5, REGOFF(-8, EDX) ) /* write r2, r3 */
+
+ MOVQ ( MM3, REGOFF(-16, EDX) ) /* write r0, r1 */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TP2R_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP2R_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_2d_no_rot )
+HIDDEN(_mesa_3dnow_transform_points4_2d_no_rot)
+GLNAME( _mesa_3dnow_transform_points4_2d_no_rot ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP2NRR_3 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
+
+ MOVQ ( REGOFF(48, ECX), MM1 ) /* m31 | m30 */
+
+ALIGNTEXT16
+LLBL( G3TP2NRR_2 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+ MOVQ ( REGOFF(8, EAX), MM5 ) /* x3 | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ PREFETCH ( REGIND(EAX) )
+
+ PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
+ MOVQ ( MM5, MM6 ) /* x3 | x2 */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ PUNPCKHDQ ( MM6, MM6 ) /* x3 | x3 */
+
+ PFMUL ( MM1, MM6 ) /* x3*m31 | x3*m30 */
+ PFADD ( MM4, MM6 ) /* x1*m11+x3*m31 | x0*m00+x3*m30 */
+
+ MOVQ ( MM6, REGOFF(-16, EDX) ) /* write r0, r1 */
+ MOVQ ( MM5, REGOFF(-8, EDX) ) /* write r2, r3 */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+
+ JNZ ( LLBL( G3TP2NRR_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP2NRR_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_identity )
+HIDDEN(_mesa_3dnow_transform_points4_identity)
+GLNAME( _mesa_3dnow_transform_points4_identity ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPIR_2 ) )
+
+ALIGNTEXT16
+LLBL( G3TPIR_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+ MOVQ ( REGOFF(8, EAX), MM1 ) /* x3 | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ PREFETCH ( REGIND(EAX) )
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ MOVQ ( MM0, REGOFF(-16, EDX) ) /* r1 | r0 */
+
+ MOVQ ( MM1, REGOFF(-8, EDX) ) /* r3 | r2 */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TPIR_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPIR_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#endif
+
+#if defined (__ELF__) && defined (__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/arch/x86/Makefile.am b/src/arch/x86/Makefile.am
new file mode 100644
index 0000000..1343827
--- /dev/null
+++ b/src/arch/x86/Makefile.am
@@ -0,0 +1,40 @@
+# Copyright © 2012 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+if HAVE_X86_ASM
+
+AM_CPPFLAGS = \
+ -I$(top_srcdir)/include \
+ -I$(top_srcdir)/src/mesa \
+ -I$(top_srcdir)/src/GLdispatch/mapi \
+ $(API_DEFINES) \
+ $(DEFINES)
+
+noinst_PROGRAMS = gen_matypes
+
+gen_matypes_SOURCES = gen_matypes.c
+BUILT_SOURCES = matypes.h
+CLEANFILES = matypes.h
+
+matypes.h: gen_matypes
+ $(AM_V_GEN)./gen_matypes > $@
+
+endif
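
The matypes.h rule above regenerates struct offsets at build time, so the
assembly files and the C structs cannot drift apart. A minimal sketch of the
pattern (the struct layout here is a hypothetical stand-in; the real
definitions live in gen_matypes.c):

    #include <stdio.h>
    #include <stddef.h>

    /* Hypothetical vector layout, for illustration only. */
    struct vec4_example {
        float *start;
        unsigned count;
        unsigned stride;
        unsigned size;
        unsigned flags;
    };

    #define EMIT(name, type, field) \
        printf("#define %s %lu\n", name, \
               (unsigned long) offsetof(type, field))

    int main(void)
    {
        EMIT("V4F_START",  struct vec4_example, start);
        EMIT("V4F_COUNT",  struct vec4_example, count);
        EMIT("V4F_STRIDE", struct vec4_example, stride);
        EMIT("V4F_SIZE",   struct vec4_example, size);
        EMIT("V4F_FLAGS",  struct vec4_example, flags);
        return 0;
    }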
diff --git a/src/arch/x86/assyntax.h b/src/arch/x86/assyntax.h
new file mode 100644
index 0000000..4a41812
--- /dev/null
+++ b/src/arch/x86/assyntax.h
@@ -0,0 +1,1747 @@
+
+#ifndef __ASSYNTAX_H__
+#define __ASSYNTAX_H__
+
+/*
+ * Copyright 1992 Vrije Universiteit, The Netherlands
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation for any purpose and without fee is hereby granted, provided
+ * that the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the Vrije Universiteit not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. The Vrije Universiteit makes no
+ * representations about the suitability of this software for any purpose.
+ * It is provided "as is" without express or implied warranty.
+ *
+ * The Vrije Universiteit DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS,
+ * IN NO EVENT SHALL The Vrije Universiteit BE LIABLE FOR ANY SPECIAL,
+ * INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+ * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * assyntax.h
+ *
+ * Select the syntax appropriate to the 386 assembler being used
+ * To add support for more assemblers add more columns to the CHOICE
+ * macro. Note that register names must also have uppercase names
+ * to avoid macro recursion. e.g., #define ah %ah recurses!
+ *
+ * NB 1. Some of the macros for certain assemblers imply that the code is to
+ * run in protected mode!! Caveat emptor.
+ *
+ * NB 2. 486 specific instructions are not included. This is to discourage
+ * their accidental use in code that is intended to run on 386 and 486
+ * systems.
+ *
+ * Supported assemblers:
+ *
+ * (a) AT&T SysVr4 as(1): define ATT_ASSEMBLER
+ * (b) GNU Assembler gas: define GNU_ASSEMBLER (default)
+ * (c) Amsterdam Compiler kit: define ACK_ASSEMBLER
+ * (d) The Netwide Assembler: define NASM_ASSEMBLER
+ * (e) Microsoft Assembler: define MASM_ASSEMBLER (UNTESTED!)
+ *
+ * The following naming conventions have been used to identify the various
+ * data types:
+ * _SR = segment register version
+ * Integer:
+ * _Q = quadword = 64 bits
+ * _L = long = 32 bits
+ * _W = short = 16 bits
+ * _B = byte = 8 bits
+ * Floating-point:
+ * _X = m80real = 80 bits
+ * _D = double = 64 bits
+ * _S = single = 32 bits
+ *
+ * Author: Gregory J. Sharp, Sept 1992
+ * Vrije Universiteit, Amsterdam, The Netherlands
+ *
+ * [support for Intel syntax added by Josh Vanderhoof, 1999]
+ */
+
+#if !(defined(NASM_ASSEMBLER) || defined(MASM_ASSEMBLER))
+
+/* Default to ATT_ASSEMBLER when SVR4 or SYSV are defined */
+#if (defined(SVR4) || defined(SYSV)) && !defined(GNU_ASSEMBLER)
+#define ATT_ASSEMBLER
+#endif
+
+#if !defined(ATT_ASSEMBLER) && !defined(GNU_ASSEMBLER) && !defined(ACK_ASSEMBLER)
+#define GNU_ASSEMBLER
+#endif
+
+#if (defined(__STDC__) && !defined(UNIXCPP)) || (defined (sun) && defined (i386) && defined (SVR4) && defined (__STDC__) && !defined (__GNUC__))
+#define CONCAT(x, y) x ## y
+#define CONCAT3(x, y, z) x ## y ## z
+#else
+#define CONCAT(x, y) x/**/y
+#define CONCAT3(x, y, z) x/**/y/**/z
+#endif
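
Either branch pastes tokens the same way once the preprocessor has run; for
example (names here are illustrative):

    /* With an ANSI cpp:        CONCAT(_, init) -> _init  via ##
     * With a traditional cpp:  _/**/init       -> _init  because the
     * empty comment vanishes before the tokens are joined. */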
+
+#ifdef ACK_ASSEMBLER
+
+/* Assume we write code for 32-bit protected mode! */
+
+/* Redefine register names for the ACK assembler */
+#define AL al
+#define AH ah
+#define AX ax
+#define EAX ax
+#define BL bl
+#define BH bh
+#define BX bx
+#define EBX bx
+#define CL cl
+#define CH ch
+#define CX cx
+#define ECX cx
+#define DL dl
+#define DH dh
+#define DX dx
+#define EDX dx
+#define BP bp
+#define EBP bp
+#define SI si
+#define ESI si
+#define DI di
+#define EDI di
+#define SP sp
+#define ESP sp
+#define CS cs
+#define SS ss
+#define DS ds
+#define ES es
+#define FS fs
+#define GS gs
+/* Control Registers */
+#define CR0 cr0
+#define CR1 cr1
+#define CR2 cr2
+#define CR3 cr3
+/* Debug Registers */
+#define DR0 dr0
+#define DR1 dr1
+#define DR2 dr2
+#define DR3 dr3
+#define DR4 dr4
+#define DR5 dr5
+#define DR6 dr6
+#define DR7 dr7
+/* Floating-point Stack */
+#define ST st
+
+#define AS_BEGIN .sect .text; .sect .rom; .sect .data; .sect .bss; .sect .text
+
+
+#define _WTOG o16 /* word toggle for _W instructions */
+#define _LTOG /* long toggle for _L instructions */
+#define ADDR_TOGGLE a16
+#define OPSZ_TOGGLE o16
+#define USE16 .use16
+#define USE32 .use32
+
+#define CHOICE(a,b,c) c
+
+#else /* AT&T or GAS */
+
+/* Redefine register names for GAS & AT&T assemblers */
+#define AL %al
+#define AH %ah
+#define AX %ax
+#define EAX %eax
+#define BL %bl
+#define BH %bh
+#define BX %bx
+#define EBX %ebx
+#define CL %cl
+#define CH %ch
+#define CX %cx
+#define ECX %ecx
+#define DL %dl
+#define DH %dh
+#define DX %dx
+#define EDX %edx
+#define BP %bp
+#define EBP %ebp
+#define SI %si
+#define ESI %esi
+#define DI %di
+#define EDI %edi
+#define SP %sp
+#define ESP %esp
+#define CS %cs
+#define SS %ss
+#define DS %ds
+#define ES %es
+#define FS %fs
+#define GS %gs
+/* Control Registers */
+#define CR0 %cr0
+#define CR1 %cr1
+#define CR2 %cr2
+#define CR3 %cr3
+/* Debug Registers */
+#define DR0 %db0
+#define DR1 %db1
+#define DR2 %db2
+#define DR3 %db3
+#define DR4 %db4
+#define DR5 %db5
+#define DR6 %db6
+#define DR7 %db7
+/* Floating-point Stack */
+#define _STX0 %st(0)
+#define _STX1 %st(1)
+#define _STX2 %st(2)
+#define _STX3 %st(3)
+#define _STX4 %st(4)
+#define _STX5 %st(5)
+#define _STX6 %st(6)
+#define _STX7 %st(7)
+#define ST(x) CONCAT(_STX,x)
+#ifdef GNU_ASSEMBLER
+#define ST0 %st(0)
+#else
+#define ST0 %st
+#endif
+/* MMX Registers */
+#define MM0 %mm0
+#define MM1 %mm1
+#define MM2 %mm2
+#define MM3 %mm3
+#define MM4 %mm4
+#define MM5 %mm5
+#define MM6 %mm6
+#define MM7 %mm7
+/* SSE Registers */
+#define XMM0 %xmm0
+#define XMM1 %xmm1
+#define XMM2 %xmm2
+#define XMM3 %xmm3
+#define XMM4 %xmm4
+#define XMM5 %xmm5
+#define XMM6 %xmm6
+#define XMM7 %xmm7
+
+#define AS_BEGIN
+#define USE16
+#define USE32
+
+#ifdef GNU_ASSEMBLER
+
+#define ADDR_TOGGLE aword
+#define OPSZ_TOGGLE word
+
+#define CHOICE(a,b,c) b
+
+#else
+/*
+ * AT&T ASSEMBLER SYNTAX
+ * *********************
+ */
+#define CHOICE(a,b,c) a
+
+#define ADDR_TOGGLE addr16
+#define OPSZ_TOGGLE data16
+
+#endif /* GNU_ASSEMBLER */
+#endif /* ACK_ASSEMBLER */
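
The net effect is one source spelling per instruction, with CHOICE picking the
column for the configured assembler. Taking MOV_L (defined below) as an
example:

    /* MOV_L(CONST(4), REGOFF(V4F_SIZE, ECX)) expands to:
     *   AT&T / GNU:  movl $4, V4F_SIZE(%ecx)
     *   ACK:         mov V4F_SIZE(cx), 4    (operands swapped; _LTOG is
     *                                        empty in 32-bit mode) */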
+
+
+#if defined(__QNX__) || defined(Lynx) || (defined(SYSV) || defined(SVR4)) && !defined(ACK_ASSEMBLER) || defined(__ELF__) || defined(__GNU__) || defined(__GNUC__) && !defined(__DJGPP__) && !defined(__MINGW32__)
+#define GLNAME(a) a
+#else
+#define GLNAME(a) CONCAT(_,a)
+#endif
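
So a symbol such as _mesa_3dnow_transform_points4_identity keeps its name on
ELF targets and gains the usual leading underscore elsewhere:

    /* ELF / SysV:         GLNAME(foo) -> foo
     * a.out / Win32 gcc:  GLNAME(foo) -> _foo   (via CONCAT) */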
+
+
+ /****************************************/
+ /* */
+ /* Select the various choices */
+ /* */
+ /****************************************/
+
+
+/* Redefine assembler directives */
+/*********************************/
+#define GLOBL CHOICE(.globl, .globl, .extern)
+#define GLOBAL GLOBL
+#define EXTERN GLOBL
+#ifndef __AOUT__
+#define ALIGNTEXT32 CHOICE(.align 32, .balign 32, .align 32)
+#define ALIGNTEXT16 CHOICE(.align 16, .balign 16, .align 16)
+#define ALIGNTEXT8 CHOICE(.align 8, .balign 8, .align 8)
+#define ALIGNTEXT4 CHOICE(.align 4, .balign 4, .align 4)
+#define ALIGNTEXT2 CHOICE(.align 2, .balign 2, .align 2)
+/* ALIGNTEXT4ifNOP is the same as ALIGNTEXT4, but only if the space is
+ * guaranteed to be filled with NOPs. Otherwise it does nothing.
+ */
+#define ALIGNTEXT32ifNOP CHOICE(.align 32, .balign ARG2(32,0x90), /*can't do it*/)
+#define ALIGNTEXT16ifNOP CHOICE(.align 16, .balign ARG2(16,0x90), /*can't do it*/)
+#define ALIGNTEXT8ifNOP CHOICE(.align 8, .balign ARG2(8,0x90), /*can't do it*/)
+#define ALIGNTEXT4ifNOP CHOICE(.align 4, .balign ARG2(4,0x90), /*can't do it*/)
+#define ALIGNDATA32 CHOICE(.align 32, .balign ARG2(32,0x0), .align 32)
+#define ALIGNDATA16 CHOICE(.align 16, .balign ARG2(16,0x0), .align 16)
+#define ALIGNDATA8 CHOICE(.align 8, .balign ARG2(8,0x0), .align 8)
+#define ALIGNDATA4 CHOICE(.align 4, .balign ARG2(4,0x0), .align 4)
+#define ALIGNDATA2 CHOICE(.align 2, .balign ARG2(2,0x0), .align 2)
+#else
+/* 'as -aout' on FreeBSD doesn't have .balign */
+#define ALIGNTEXT32 CHOICE(.align 32, .align ARG2(5,0x90), .align 32)
+#define ALIGNTEXT16 CHOICE(.align 16, .align ARG2(4,0x90), .align 16)
+#define ALIGNTEXT8 CHOICE(.align 8, .align ARG2(3,0x90), .align 8)
+#define ALIGNTEXT4 CHOICE(.align 4, .align ARG2(2,0x90), .align 4)
+#define ALIGNTEXT2 CHOICE(.align 2, .align ARG2(1,0x90), .align 2)
+/* ALIGNTEXT4ifNOP is the same as ALIGNTEXT4, but only if the space is
+ * guaranteed to be filled with NOPs. Otherwise it does nothing.
+ */
+#define ALIGNTEXT32ifNOP CHOICE(.align 32, .align ARG2(5,0x90), /*can't do it*/)
+#define ALIGNTEXT16ifNOP CHOICE(.align 16, .align ARG2(4,0x90), /*can't do it*/)
+#define ALIGNTEXT8ifNOP CHOICE(.align 8, .align ARG2(3,0x90), /*can't do it*/)
+#define ALIGNTEXT4ifNOP CHOICE(.align 4, .align ARG2(2,0x90), /*can't do it*/)
+#define ALIGNDATA32 CHOICE(.align 32, .align ARG2(5,0x0), .align 32)
+#define ALIGNDATA16 CHOICE(.align 16, .align ARG2(4,0x0), .align 16)
+#define ALIGNDATA8 CHOICE(.align 8, .align ARG2(3,0x0), .align 8)
+#define ALIGNDATA4 CHOICE(.align 4, .align ARG2(2,0x0), .align 4)
+#define ALIGNDATA2 CHOICE(.align 2, .align ARG2(1,0x0), .align 2)
+#endif /* __AOUT__ */
+#define FILE(s) CHOICE(.file s, .file s, .file s)
+#define STRING(s) CHOICE(.string s, .asciz s, .asciz s)
+#define D_LONG CHOICE(.long, .long, .data4)
+#define D_WORD CHOICE(.value, .short, .data2)
+#define D_BYTE CHOICE(.byte, .byte, .data1)
+#define SPACE CHOICE(.comm, .space, .space)
+#define COMM CHOICE(.comm, .comm, .comm)
+#define SEG_DATA CHOICE(.data, .data, .sect .data)
+#define SEG_TEXT CHOICE(.text, .text, .sect .text)
+#define SEG_BSS CHOICE(.bss, .bss, .sect .bss)
+
+#ifdef GNU_ASSEMBLER
+#define D_SPACE(n) . = . + n
+#else
+#define D_SPACE(n) .space n
+#endif
+
+/* Addressing Modes */
+/* Immediate Mode */
+#define ADDR(a) CHOICE(CONCAT($,a), $a, a)
+#define CONST(a) CHOICE(CONCAT($,a), $a, a)
+
+/* Indirect Mode */
+#define CONTENT(a) CHOICE(a, a, (a)) /* take contents of variable */
+#define REGIND(a) CHOICE((a), (a), (a)) /* Register a indirect */
+/* Register b indirect plus displacement a */
+#define REGOFF(a, b) CHOICE(a(b), a(b), a(b))
+/* Reg indirect Base + Index + Displacement - this is mainly for 16-bit mode
+ * which has no scaling
+ */
+#define REGBID(b,i,d) CHOICE(d(b,i), d(b,i), d(b)(i))
+/* Reg indirect Base + (Index * Scale) */
+#define REGBIS(b,i,s) CHOICE((b,i,s), (b,i,s), (b)(i*s))
+/* Reg indirect Base + (Index * Scale) + Displacement */
+#define REGBISD(b,i,s,d) CHOICE(d(b,i,s), d(b,i,s), d(b)(i*s))
+/* Displaced Scaled Index: */
+#define REGDIS(d,i,s) CHOICE(d(,i,s), d(,i,s), d(i * s))
+/* Indexed Base: */
+#define REGBI(b,i) CHOICE((b,i), (b,i), (b)(i))
+/* Displaced Base: */
+#define REGDB(d,b) CHOICE(d(b), d(b), d(b))
+/* Variable indirect: */
+#define VARINDIRECT(var) CHOICE(*var, *var, (var))
+/* Use register contents as jump/call target: */
+#define CODEPTR(reg) CHOICE(*reg, *reg, reg)
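
A couple of concrete expansions, using the register names from the tables
above:

    /* REGOFF(V4F_COUNT, EAX):
     *   AT&T / GNU:  V4F_COUNT(%eax)
     *   ACK:         V4F_COUNT(ax)
     * REGBISD(EBX, ESI, 4, 8):
     *   AT&T / GNU:  8(%ebx,%esi,4)
     *   ACK:         8(bx)(si*4) */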
+
+/* For expressions requiring bracketing
+ * eg. (CRT0_PM | CRT_EM)
+ */
+
+#define EXPR(a) CHOICE([a], (a), [a])
+#define ENOT(a) CHOICE(0!a, ~a, ~a)
+#define EMUL(a,b) CHOICE(a\*b, a*b, a*b)
+#define EDIV(a,b) CHOICE(a\/b, a/b, a/b)
+
+/*
+ * We have to beat the problem of commas within arguments to choice.
+ * eg. choice (add a,b, add b,a) will get argument mismatch. Luckily ANSI
+ * and other known cpp definitions evaluate arguments before substitution
+ * so the following works.
+ */
+#define ARG2(a, b) a,b
+#define ARG3(a,b,c) a,b,c
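
For instance, without the wrapper ADD_L's AT&T column would hand CHOICE four
arguments instead of three:

    /* CHOICE(addl a,b, ...)        <- comma splits the argument list
     * CHOICE(addl ARG2(a,b), ...)  <- one argument; ARG2(a,b) -> a,b
     *                                 only after a column is chosen */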
+
+/* Redefine assembler commands */
+#define AAA CHOICE(aaa, aaa, aaa)
+#define AAD CHOICE(aad, aad, aad)
+#define AAM CHOICE(aam, aam, aam)
+#define AAS CHOICE(aas, aas, aas)
+#define ADC_L(a, b) CHOICE(adcl ARG2(a,b), adcl ARG2(a,b), _LTOG adc ARG2(b,a))
+#define ADC_W(a, b) CHOICE(adcw ARG2(a,b), adcw ARG2(a,b), _WTOG adc ARG2(b,a))
+#define ADC_B(a, b) CHOICE(adcb ARG2(a,b), adcb ARG2(a,b), adcb ARG2(b,a))
+#define ADD_L(a, b) CHOICE(addl ARG2(a,b), addl ARG2(a,b), _LTOG add ARG2(b,a))
+#define ADD_W(a, b) CHOICE(addw ARG2(a,b), addw ARG2(a,b), _WTOG add ARG2(b,a))
+#define ADD_B(a, b) CHOICE(addb ARG2(a,b), addb ARG2(a,b), addb ARG2(b,a))
+#define AND_L(a, b) CHOICE(andl ARG2(a,b), andl ARG2(a,b), _LTOG and ARG2(b,a))
+#define AND_W(a, b) CHOICE(andw ARG2(a,b), andw ARG2(a,b), _WTOG and ARG2(b,a))
+#define AND_B(a, b) CHOICE(andb ARG2(a,b), andb ARG2(a,b), andb ARG2(b,a))
+#define ARPL(a,b) CHOICE(arpl ARG2(a,b), arpl ARG2(a,b), arpl ARG2(b,a))
+#define BOUND_L(a, b) CHOICE(boundl ARG2(a,b), boundl ARG2(b,a), _LTOG bound ARG2(b,a))
+#define BOUND_W(a, b) CHOICE(boundw ARG2(a,b), boundw ARG2(b,a), _WTOG bound ARG2(b,a))
+#define BSF_L(a, b) CHOICE(bsfl ARG2(a,b), bsfl ARG2(a,b), _LTOG bsf ARG2(b,a))
+#define BSF_W(a, b) CHOICE(bsfw ARG2(a,b), bsfw ARG2(a,b), _WTOG bsf ARG2(b,a))
+#define BSR_L(a, b) CHOICE(bsrl ARG2(a,b), bsrl ARG2(a,b), _LTOG bsr ARG2(b,a))
+#define BSR_W(a, b) CHOICE(bsrw ARG2(a,b), bsrw ARG2(a,b), _WTOG bsr ARG2(b,a))
+#define BT_L(a, b) CHOICE(btl ARG2(a,b), btl ARG2(a,b), _LTOG bt ARG2(b,a))
+#define BT_W(a, b) CHOICE(btw ARG2(a,b), btw ARG2(a,b), _WTOG bt ARG2(b,a))
+#define BTC_L(a, b) CHOICE(btcl ARG2(a,b), btcl ARG2(a,b), _LTOG btc ARG2(b,a))
+#define BTC_W(a, b) CHOICE(btcw ARG2(a,b), btcw ARG2(a,b), _WTOG btc ARG2(b,a))
+#define BTR_L(a, b) CHOICE(btrl ARG2(a,b), btrl ARG2(a,b), _LTOG btr ARG2(b,a))
+#define BTR_W(a, b) CHOICE(btrw ARG2(a,b), btrw ARG2(a,b), _WTOG btr ARG2(b,a))
+#define BTS_L(a, b) CHOICE(btsl ARG2(a,b), btsl ARG2(a,b), _LTOG bts ARG2(b,a))
+#define BTS_W(a, b) CHOICE(btsw ARG2(a,b), btsw ARG2(a,b), _WTOG bts ARG2(b,a))
+#define CALL(a) CHOICE(call a, call a, call a)
+#define CALLF(s,a) CHOICE(lcall ARG2(s,a), lcall ARG2(s,a), callf s:a)
+#define CBW CHOICE(cbtw, cbw, cbw)
+#define CWDE CHOICE(cwtd, cwde, cwde)
+#define CLC CHOICE(clc, clc, clc)
+#define CLD CHOICE(cld, cld, cld)
+#define CLI CHOICE(cli, cli, cli)
+#define CLTS CHOICE(clts, clts, clts)
+#define CMC CHOICE(cmc, cmc, cmc)
+#define CMP_L(a, b) CHOICE(cmpl ARG2(a,b), cmpl ARG2(a,b), _LTOG cmp ARG2(b,a))
+#define CMP_W(a, b) CHOICE(cmpw ARG2(a,b), cmpw ARG2(a,b), _WTOG cmp ARG2(b,a))
+#define CMP_B(a, b) CHOICE(cmpb ARG2(a,b), cmpb ARG2(a,b), cmpb ARG2(b,a))
+#define CMPS_L CHOICE(cmpsl, cmpsl, _LTOG cmps)
+#define CMPS_W CHOICE(cmpsw, cmpsw, _WTOG cmps)
+#define CMPS_B CHOICE(cmpsb, cmpsb, cmpsb)
+#define CWD CHOICE(cwtl, cwd, cwd)
+#define CDQ CHOICE(cltd, cdq, cdq)
+#define DAA CHOICE(daa, daa, daa)
+#define DAS CHOICE(das, das, das)
+#define DEC_L(a) CHOICE(decl a, decl a, _LTOG dec a)
+#define DEC_W(a) CHOICE(decw a, decw a, _WTOG dec a)
+#define DEC_B(a) CHOICE(decb a, decb a, decb a)
+#define DIV_L(a) CHOICE(divl a, divl a, div a)
+#define DIV_W(a) CHOICE(divw a, divw a, div a)
+#define DIV_B(a) CHOICE(divb a, divb a, divb a)
+#define ENTER(a,b) CHOICE(enter ARG2(a,b), enter ARG2(a,b), enter ARG2(b,a))
+#define HLT CHOICE(hlt, hlt, hlt)
+#define IDIV_L(a) CHOICE(idivl a, idivl a, _LTOG idiv a)
+#define IDIV_W(a) CHOICE(idivw a, idivw a, _WTOG idiv a)
+#define IDIV_B(a) CHOICE(idivb a, idivb a, idivb a)
+/* More forms than this for imul!! */
+#define IMUL_L(a, b) CHOICE(imull ARG2(a,b), imull ARG2(a,b), _LTOG imul ARG2(b,a))
+#define IMUL_W(a, b) CHOICE(imulw ARG2(a,b), imulw ARG2(a,b), _WTOG imul ARG2(b,a))
+#define IMUL_B(a) CHOICE(imulb a, imulb a, imulb a)
+#define IN_L CHOICE(inl (DX), inl ARG2(DX,EAX), _LTOG in DX)
+#define IN_W CHOICE(inw (DX), inw ARG2(DX,AX), _WTOG in DX)
+#define IN_B CHOICE(inb (DX), inb ARG2(DX,AL), inb DX)
+/* Please, AS code writer: use the following ONLY if you refer to
+ * ports < 256 directly; never write IN1_W(DX), for instance, even if
+ * IN1_ looks nicer.
+ */
+#if defined (sun)
+#define IN1_L(a) CHOICE(inl (a), inl ARG2(a,EAX), _LTOG in a)
+#define IN1_W(a) CHOICE(inw (a), inw ARG2(a,AX), _WTOG in a)
+#define IN1_B(a) CHOICE(inb (a), inb ARG2(a,AL), inb a)
+#else
+#define IN1_L(a) CHOICE(inl a, inl ARG2(a,EAX), _LTOG in a)
+#define IN1_W(a) CHOICE(inw a, inw ARG2(a,AX), _WTOG in a)
+#define IN1_B(a) CHOICE(inb a, inb ARG2(a,AL), inb a)
+#endif
+#define INC_L(a) CHOICE(incl a, incl a, _LTOG inc a)
+#define INC_W(a) CHOICE(incw a, incw a, _WTOG inc a)
+#define INC_B(a) CHOICE(incb a, incb a, incb a)
+#define INS_L CHOICE(insl, insl, _LTOG ins)
+#define INS_W CHOICE(insw, insw, _WTOG ins)
+#define INS_B CHOICE(insb, insb, insb)
+#define INT(a) CHOICE(int a, int a, int a)
+#define INT3 CHOICE(int CONST(3), int3, int CONST(3))
+#define INTO CHOICE(into, into, into)
+#define IRET CHOICE(iret, iret, iret)
+#define IRETD CHOICE(iret, iret, iretd)
+#define JA(a) CHOICE(ja a, ja a, ja a)
+#define JAE(a) CHOICE(jae a, jae a, jae a)
+#define JB(a) CHOICE(jb a, jb a, jb a)
+#define JBE(a) CHOICE(jbe a, jbe a, jbe a)
+#define JC(a) CHOICE(jc a, jc a, jc a)
+#define JE(a) CHOICE(je a, je a, je a)
+#define JG(a) CHOICE(jg a, jg a, jg a)
+#define JGE(a) CHOICE(jge a, jge a, jge a)
+#define JL(a) CHOICE(jl a, jl a, jl a)
+#define JLE(a) CHOICE(jle a, jle a, jle a)
+#define JNA(a) CHOICE(jna a, jna a, jna a)
+#define JNAE(a) CHOICE(jnae a, jnae a, jnae a)
+#define JNB(a) CHOICE(jnb a, jnb a, jnb a)
+#define JNBE(a) CHOICE(jnbe a, jnbe a, jnbe a)
+#define JNC(a) CHOICE(jnc a, jnc a, jnc a)
+#define JNE(a) CHOICE(jne a, jne a, jne a)
+#define JNG(a) CHOICE(jng a, jng a, jng a)
+#define JNGE(a) CHOICE(jnge a, jnge a, jnge a)
+#define JNL(a) CHOICE(jnl a, jnl a, jnl a)
+#define JNLE(a) CHOICE(jnle a, jnle a, jnle a)
+#define JNO(a) CHOICE(jno a, jno a, jno a)
+#define JNP(a) CHOICE(jnp a, jnp a, jnp a)
+#define JNS(a) CHOICE(jns a, jns a, jns a)
+#define JNZ(a) CHOICE(jnz a, jnz a, jnz a)
+#define JO(a) CHOICE(jo a, jo a, jo a)
+#define JP(a) CHOICE(jp a, jp a, jp a)
+#define JPE(a) CHOICE(jpe a, jpe a, jpe a)
+#define JPO(a) CHOICE(jpo a, jpo a, jpo a)
+#define JS(a) CHOICE(js a, js a, js a)
+#define JZ(a) CHOICE(jz a, jz a, jz a)
+#define JMP(a) CHOICE(jmp a, jmp a, jmp a)
+#define JMPF(s,a) CHOICE(ljmp ARG2(s,a), ljmp ARG2(s,a), jmpf s:a)
+#define LAHF CHOICE(lahf, lahf, lahf)
+#if !defined(_REAL_MODE) && !defined(_V86_MODE)
+#define LAR(a, b) CHOICE(lar ARG2(a, b), lar ARG2(a, b), lar ARG2(b, a))
+#endif
+#define LEA_L(a, b) CHOICE(leal ARG2(a,b), leal ARG2(a,b), _LTOG lea ARG2(b,a))
+#define LEA_W(a, b) CHOICE(leaw ARG2(a,b), leaw ARG2(a,b), _WTOG lea ARG2(b,a))
+#define LEAVE CHOICE(leave, leave, leave)
+#define LGDT(a) CHOICE(lgdt a, lgdt a, lgdt a)
+#define LIDT(a) CHOICE(lidt a, lidt a, lidt a)
+#define LDS(a, b) CHOICE(ldsl ARG2(a,b), lds ARG2(a,b), lds ARG2(b,a))
+#define LES(a, b) CHOICE(lesl ARG2(a,b), les ARG2(a,b), les ARG2(b,a))
+#define LFS(a, b) CHOICE(lfsl ARG2(a,b), lfs ARG2(a,b), lfs ARG2(b,a))
+#define LGS(a, b) CHOICE(lgsl ARG2(a,b), lgs ARG2(a,b), lgs ARG2(b,a))
+#define LSS(a, b) CHOICE(lssl ARG2(a,b), lss ARG2(a,b), lss ARG2(b,a))
+#define LLDT(a) CHOICE(lldt a, lldt a, lldt a)
+#define LMSW(a) CHOICE(lmsw a, lmsw a, lmsw a)
+#define LOCK CHOICE(lock, lock, lock)
+#define LODS_L CHOICE(lodsl, lodsl, _LTOG lods)
+#define LODS_W CHOICE(lodsw, lodsw, _WTOG lods)
+#define LODS_B CHOICE(lodsb, lodsb, lodsb)
+#define LOOP(a) CHOICE(loop a, loop a, loop a)
+#define LOOPE(a) CHOICE(loope a, loope a, loope a)
+#define LOOPZ(a) CHOICE(loopz a, loopz a, loopz a)
+#define LOOPNE(a) CHOICE(loopne a, loopne a, loopne a)
+#define LOOPNZ(a) CHOICE(loopnz a, loopnz a, loopnz a)
+#if !defined(_REAL_MODE) && !defined(_V86_MODE)
+#define LSL(a, b) CHOICE(lsl ARG2(a,b), lsl ARG2(a,b), lsl ARG2(b,a))
+#endif
+#define LTR(a) CHOICE(ltr a, ltr a, ltr a)
+#define MOV_SR(a, b) CHOICE(movw ARG2(a,b), mov ARG2(a,b), mov ARG2(b,a))
+#define MOV_L(a, b) CHOICE(movl ARG2(a,b), movl ARG2(a,b), _LTOG mov ARG2(b,a))
+#define MOV_W(a, b) CHOICE(movw ARG2(a,b), movw ARG2(a,b), _WTOG mov ARG2(b,a))
+#define MOV_B(a, b) CHOICE(movb ARG2(a,b), movb ARG2(a,b), movb ARG2(b,a))
+#define MOVS_L CHOICE(movsl, movsl, _LTOG movs)
+#define MOVS_W CHOICE(movsw, movsw, _WTOG movs)
+#define MOVS_B CHOICE(movsb, movsb, movsb)
+#define MOVSX_BL(a, b) CHOICE(movsbl ARG2(a,b), movsbl ARG2(a,b), movsx ARG2(b,a))
+#define MOVSX_BW(a, b) CHOICE(movsbw ARG2(a,b), movsbw ARG2(a,b), movsx ARG2(b,a))
+#define MOVSX_WL(a, b) CHOICE(movswl ARG2(a,b), movswl ARG2(a,b), movsx ARG2(b,a))
+#define MOVZX_BL(a, b) CHOICE(movzbl ARG2(a,b), movzbl ARG2(a,b), movzx ARG2(b,a))
+#define MOVZX_BW(a, b) CHOICE(movzbw ARG2(a,b), movzbw ARG2(a,b), movzx ARG2(b,a))
+#define MOVZX_WL(a, b) CHOICE(movzwl ARG2(a,b), movzwl ARG2(a,b), movzx ARG2(b,a))
+#define MUL_L(a) CHOICE(mull a, mull a, _LTOG mul a)
+#define MUL_W(a) CHOICE(mulw a, mulw a, _WTOG mul a)
+#define MUL_B(a) CHOICE(mulb a, mulb a, mulb a)
+#define NEG_L(a) CHOICE(negl a, negl a, _LTOG neg a)
+#define NEG_W(a) CHOICE(negw a, negw a, _WTOG neg a)
+#define NEG_B(a) CHOICE(negb a, negb a, negb a)
+#define NOP CHOICE(nop, nop, nop)
+#define NOT_L(a) CHOICE(notl a, notl a, _LTOG not a)
+#define NOT_W(a) CHOICE(notw a, notw a, _WTOG not a)
+#define NOT_B(a) CHOICE(notb a, notb a, notb a)
+#define OR_L(a,b) CHOICE(orl ARG2(a,b), orl ARG2(a,b), _LTOG or ARG2(b,a))
+#define OR_W(a,b) CHOICE(orw ARG2(a,b), orw ARG2(a,b), _WTOG or ARG2(b,a))
+#define OR_B(a,b) CHOICE(orb ARG2(a,b), orb ARG2(a,b), orb ARG2(b,a))
+#define OUT_L CHOICE(outl (DX), outl ARG2(EAX,DX), _LTOG out DX)
+#define OUT_W CHOICE(outw (DX), outw ARG2(AX,DX), _WTOG out DX)
+#define OUT_B CHOICE(outb (DX), outb ARG2(AL,DX), outb DX)
+/* Please, AS code writer: use the following ONLY if you refer to
+ * ports < 256 directly; never write OUT1_W(DX), for instance, even if
+ * OUT1_ looks nicer.
+ */
+#define OUT1_L(a) CHOICE(outl (a), outl ARG2(EAX,a), _LTOG out a)
+#define OUT1_W(a) CHOICE(outw (a), outw ARG2(AX,a), _WTOG out a)
+#define OUT1_B(a) CHOICE(outb (a), outb ARG2(AL,a), outb a)
+#define OUTS_L CHOICE(outsl, outsl, _LTOG outs)
+#define OUTS_W CHOICE(outsw, outsw, _WTOG outs)
+#define OUTS_B CHOICE(outsb, outsb, outsb)
+#define POP_SR(a) CHOICE(pop a, pop a, pop a)
+#define POP_L(a) CHOICE(popl a, popl a, _LTOG pop a)
+#define POP_W(a) CHOICE(popw a, popw a, _WTOG pop a)
+#define POPA_L CHOICE(popal, popal, _LTOG popa)
+#define POPA_W CHOICE(popaw, popaw, _WTOG popa)
+#define POPF_L CHOICE(popfl, popfl, _LTOG popf)
+#define POPF_W CHOICE(popfw, popfw, _WTOG popf)
+#define PUSH_SR(a) CHOICE(push a, push a, push a)
+#define PUSH_L(a) CHOICE(pushl a, pushl a, _LTOG push a)
+#define PUSH_W(a) CHOICE(pushw a, pushw a, _WTOG push a)
+#define PUSH_B(a) CHOICE(push a, pushb a, push a)
+#define PUSHA_L CHOICE(pushal, pushal, _LTOG pusha)
+#define PUSHA_W CHOICE(pushaw, pushaw, _WTOG pusha)
+#define PUSHF_L CHOICE(pushfl, pushfl, _LTOG pushf)
+#define PUSHF_W CHOICE(pushfw, pushfw, _WTOG pushf)
+#define RCL_L(a, b) CHOICE(rcll ARG2(a,b), rcll ARG2(a,b), _LTOG rcl ARG2(b,a))
+#define RCL_W(a, b) CHOICE(rclw ARG2(a,b), rclw ARG2(a,b), _WTOG rcl ARG2(b,a))
+#define RCL_B(a, b) CHOICE(rclb ARG2(a,b), rclb ARG2(a,b), rclb ARG2(b,a))
+#define RCR_L(a, b) CHOICE(rcrl ARG2(a,b), rcrl ARG2(a,b), _LTOG rcr ARG2(b,a))
+#define RCR_W(a, b) CHOICE(rcrw ARG2(a,b), rcrw ARG2(a,b), _WTOG rcr ARG2(b,a))
+#define RCR_B(a, b) CHOICE(rcrb ARG2(a,b), rcrb ARG2(a,b), rcrb ARG2(b,a))
+#define ROL_L(a, b) CHOICE(roll ARG2(a,b), roll ARG2(a,b), _LTOG rol ARG2(b,a))
+#define ROL_W(a, b) CHOICE(rolw ARG2(a,b), rolw ARG2(a,b), _WTOG rol ARG2(b,a))
+#define ROL_B(a, b) CHOICE(rolb ARG2(a,b), rolb ARG2(a,b), rolb ARG2(b,a))
+#define ROR_L(a, b) CHOICE(rorl ARG2(a,b), rorl ARG2(a,b), _LTOG ror ARG2(b,a))
+#define ROR_W(a, b) CHOICE(rorw ARG2(a,b), rorw ARG2(a,b), _WTOG ror ARG2(b,a))
+#define ROR_B(a, b) CHOICE(rorb ARG2(a,b), rorb ARG2(a,b), rorb ARG2(b,a))
+#define REP CHOICE(rep ;, rep ;, rep)
+#define REPE CHOICE(repz ;, repe ;, repe)
+#define REPNE CHOICE(repnz ;, repne ;, repne)
+#define REPNZ REPNE
+#define REPZ REPE
+#define RET CHOICE(ret, ret, ret)
+#define SAHF CHOICE(sahf, sahf, sahf)
+#define SAL_L(a, b) CHOICE(sall ARG2(a,b), sall ARG2(a,b), _LTOG sal ARG2(b,a))
+#define SAL_W(a, b) CHOICE(salw ARG2(a,b), salw ARG2(a,b), _WTOG sal ARG2(b,a))
+#define SAL_B(a, b) CHOICE(salb ARG2(a,b), salb ARG2(a,b), salb ARG2(b,a))
+#define SAR_L(a, b) CHOICE(sarl ARG2(a,b), sarl ARG2(a,b), _LTOG sar ARG2(b,a))
+#define SAR_W(a, b) CHOICE(sarw ARG2(a,b), sarw ARG2(a,b), _WTOG sar ARG2(b,a))
+#define SAR_B(a, b) CHOICE(sarb ARG2(a,b), sarb ARG2(a,b), sarb ARG2(b,a))
+#define SBB_L(a, b) CHOICE(sbbl ARG2(a,b), sbbl ARG2(a,b), _LTOG sbb ARG2(b,a))
+#define SBB_W(a, b) CHOICE(sbbw ARG2(a,b), sbbw ARG2(a,b), _WTOG sbb ARG2(b,a))
+#define SBB_B(a, b) CHOICE(sbbb ARG2(a,b), sbbb ARG2(a,b), sbbb ARG2(b,a))
+#define SCAS_L CHOICE(scasl, scasl, _LTOG scas)
+#define SCAS_W CHOICE(scasw, scasw, _WTOG scas)
+#define SCAS_B CHOICE(scasb, scasb, scasb)
+#define SETA(a) CHOICE(seta a, seta a, seta a)
+#define SETAE(a) CHOICE(setae a, setae a, setae a)
+#define SETB(a) CHOICE(setb a, setb a, setb a)
+#define SETBE(a) CHOICE(setbe a, setbe a, setbe a)
+#define SETC(a) CHOICE(setc a, setb a, setb a)
+#define SETE(a) CHOICE(sete a, sete a, sete a)
+#define SETG(a) CHOICE(setg a, setg a, setg a)
+#define SETGE(a) CHOICE(setge a, setge a, setge a)
+#define SETL(a) CHOICE(setl a, setl a, setl a)
+#define SETLE(a) CHOICE(setle a, setle a, setle a)
+#define SETNA(a) CHOICE(setna a, setna a, setna a)
+#define SETNAE(a) CHOICE(setnae a, setnae a, setnae a)
+#define SETNB(a) CHOICE(setnb a, setnb a, setnb a)
+#define SETNBE(a) CHOICE(setnbe a, setnbe a, setnbe a)
+#define SETNC(a) CHOICE(setnc a, setnb a, setnb a)
+#define SETNE(a) CHOICE(setne a, setne a, setne a)
+#define SETNG(a) CHOICE(setng a, setng a, setng a)
+#define SETNGE(a) CHOICE(setnge a, setnge a, setnge a)
+#define SETNL(a) CHOICE(setnl a, setnl a, setnl a)
+#define SETNLE(a) CHOICE(setnle a, setnle a, setnle a)
+#define SETNO(a) CHOICE(setno a, setno a, setno a)
+#define SETNP(a) CHOICE(setnp a, setnp a, setnp a)
+#define SETNS(a) CHOICE(setns a, setns a, setns a)
+#define SETNZ(a) CHOICE(setnz a, setnz a, setnz a)
+#define SETO(a) CHOICE(seto a, seto a, seto a)
+#define SETP(a) CHOICE(setp a, setp a, setp a)
+#define SETPE(a) CHOICE(setpe a, setpe a, setpe a)
+#define SETPO(a) CHOICE(setpo a, setpo a, setpo a)
+#define SETS(a) CHOICE(sets a, sets a, sets a)
+#define SETZ(a) CHOICE(setz a, setz a, setz a)
+#define SGDT(a) CHOICE(sgdt a, sgdt a, sgdt a)
+#define SIDT(a) CHOICE(sidt a, sidt a, sidt a)
+#define SHL_L(a, b) CHOICE(shll ARG2(a,b), shll ARG2(a,b), _LTOG shl ARG2(b,a))
+#define SHL_W(a, b) CHOICE(shlw ARG2(a,b), shlw ARG2(a,b), _WTOG shl ARG2(b,a))
+#define SHL_B(a, b) CHOICE(shlb ARG2(a,b), shlb ARG2(a,b), shlb ARG2(b,a))
+#define SHLD_L(a,b,c) CHOICE(shldl ARG3(a,b,c), shldl ARG3(a,b,c), _LTOG shld ARG3(c,b,a))
+#define SHLD2_L(a,b) CHOICE(shldl ARG2(a,b), shldl ARG3(CL,a,b), _LTOG shld ARG3(b,a,CL))
+#define SHLD_W(a,b,c) CHOICE(shldw ARG3(a,b,c), shldw ARG3(a,b,c), _WTOG shld ARG3(c,b,a))
+#define SHLD2_W(a,b) CHOICE(shldw ARG2(a,b), shldw ARG3(CL,a,b), _WTOG shld ARG3(b,a,CL))
+#define SHR_L(a, b) CHOICE(shrl ARG2(a,b), shrl ARG2(a,b), _LTOG shr ARG2(b,a))
+#define SHR_W(a, b) CHOICE(shrw ARG2(a,b), shrw ARG2(a,b), _WTOG shr ARG2(b,a))
+#define SHR_B(a, b) CHOICE(shrb ARG2(a,b), shrb ARG2(a,b), shrb ARG2(b,a))
+#define SHRD_L(a,b,c) CHOICE(shrdl ARG3(a,b,c), shrdl ARG3(a,b,c), _LTOG shrd ARG3(c,b,a))
+#define SHRD2_L(a,b) CHOICE(shrdl ARG2(a,b), shrdl ARG3(CL,a,b), _LTOG shrd ARG3(b,a,CL))
+#define SHRD_W(a,b,c) CHOICE(shrdw ARG3(a,b,c), shrdw ARG3(a,b,c), _WTOG shrd ARG3(c,b,a))
+#define SHRD2_W(a,b) CHOICE(shrdw ARG2(a,b), shrdw ARG3(CL,a,b), _WTOG shrd ARG3(b,a,CL))
+#define SLDT(a) CHOICE(sldt a, sldt a, sldt a)
+#define SMSW(a) CHOICE(smsw a, smsw a, smsw a)
+#define STC CHOICE(stc, stc, stc)
+#define STD CHOICE(std, std, std)
+#define STI CHOICE(sti, sti, sti)
+#define STOS_L CHOICE(stosl, stosl, _LTOG stos)
+#define STOS_W CHOICE(stosw, stosw, _WTOG stos)
+#define STOS_B CHOICE(stosb, stosb, stosb)
+#define STR(a) CHOICE(str a, str a, str a)
+#define SUB_L(a, b) CHOICE(subl ARG2(a,b), subl ARG2(a,b), _LTOG sub ARG2(b,a))
+#define SUB_W(a, b) CHOICE(subw ARG2(a,b), subw ARG2(a,b), _WTOG sub ARG2(b,a))
+#define SUB_B(a, b) CHOICE(subb ARG2(a,b), subb ARG2(a,b), subb ARG2(b,a))
+#define TEST_L(a, b) CHOICE(testl ARG2(a,b), testl ARG2(a,b), _LTOG test ARG2(b,a))
+#define TEST_W(a, b) CHOICE(testw ARG2(a,b), testw ARG2(a,b), _WTOG test ARG2(b,a))
+#define TEST_B(a, b) CHOICE(testb ARG2(a,b), testb ARG2(a,b), testb ARG2(b,a))
+#define VERR(a) CHOICE(verr a, verr a, verr a)
+#define VERW(a) CHOICE(verw a, verw a, verw a)
+#define WAIT CHOICE(wait, wait, wait)
+#define XCHG_L(a, b) CHOICE(xchgl ARG2(a,b), xchgl ARG2(a,b), _LTOG xchg ARG2(b,a))
+#define XCHG_W(a, b) CHOICE(xchgw ARG2(a,b), xchgw ARG2(a,b), _WTOG xchg ARG2(b,a))
+#define XCHG_B(a, b) CHOICE(xchgb ARG2(a,b), xchgb ARG2(a,b), xchgb ARG2(b,a))
+#define XLAT CHOICE(xlat, xlat, xlat)
+#define XOR_L(a, b) CHOICE(xorl ARG2(a,b), xorl ARG2(a,b), _LTOG xor ARG2(b,a))
+#define XOR_W(a, b) CHOICE(xorw ARG2(a,b), xorw ARG2(a,b), _WTOG xor ARG2(b,a))
+#define XOR_B(a, b) CHOICE(xorb ARG2(a,b), xorb ARG2(a,b), xorb ARG2(b,a))
+
+
+/* Floating Point Instructions */
+#define F2XM1 CHOICE(f2xm1, f2xm1, f2xm1)
+#define FABS CHOICE(fabs, fabs, fabs)
+#define FADD_D(a) CHOICE(faddl a, faddl a, faddd a)
+#define FADD_S(a) CHOICE(fadds a, fadds a, fadds a)
+#define FADD2(a, b) CHOICE(fadd ARG2(a,b), fadd ARG2(a,b), fadd ARG2(b,a))
+#define FADDP(a, b) CHOICE(faddp ARG2(a,b), faddp ARG2(a,b), faddp ARG2(b,a))
+#define FIADD_L(a) CHOICE(fiaddl a, fiaddl a, fiaddl a)
+#define FIADD_W(a) CHOICE(fiadd a, fiadds a, fiadds a)
+#define FBLD(a) CHOICE(fbld a, fbld a, fbld a)
+#define FBSTP(a) CHOICE(fbstp a, fbstp a, fbstp a)
+#define FCHS CHOICE(fchs, fchs, fchs)
+#define FCLEX CHOICE(fclex, wait; fnclex, wait; fclex)
+#define FNCLEX CHOICE(fnclex, fnclex, fclex)
+#define FCOM(a) CHOICE(fcom a, fcom a, fcom a)
+#define FCOM_D(a) CHOICE(fcoml a, fcoml a, fcomd a)
+#define FCOM_S(a) CHOICE(fcoms a, fcoms a, fcoms a)
+#define FCOMP(a) CHOICE(fcomp a, fcomp a, fcomp a)
+#define FCOMP_D(a) CHOICE(fcompl a, fcompl a, fcompd a)
+#define FCOMP_S(a) CHOICE(fcomps a, fcomps a, fcomps a)
+#define FCOMPP CHOICE(fcompp, fcompp, fcompp)
+#define FCOS CHOICE(fcos, fcos, fcos)
+#define FDECSTP CHOICE(fdecstp, fdecstp, fdecstp)
+#define FDIV_D(a) CHOICE(fdivl a, fdivl a, fdivd a)
+#define FDIV_S(a) CHOICE(fdivs a, fdivs a, fdivs a)
+#define FDIV2(a, b) CHOICE(fdiv ARG2(a,b), fdiv ARG2(a,b), fdiv ARG2(b,a))
+#define FDIVP(a, b) CHOICE(fdivp ARG2(a,b), fdivp ARG2(a,b), fdivp ARG2(b,a))
+#define FIDIV_L(a) CHOICE(fidivl a, fidivl a, fidivl a)
+#define FIDIV_W(a) CHOICE(fidiv a, fidivs a, fidivs a)
+#define FDIVR_D(a) CHOICE(fdivrl a, fdivrl a, fdivrd a)
+#define FDIVR_S(a) CHOICE(fdivrs a, fdivrs a, fdivrs a)
+#define FDIVR2(a, b) CHOICE(fdivr ARG2(a,b), fdivr ARG2(a,b), fdivr ARG2(b,a))
+#define FDIVRP(a, b) CHOICE(fdivrp ARG2(a,b), fdivrp ARG2(a,b), fdivrp ARG2(b,a))
+#define FIDIVR_L(a) CHOICE(fidivrl a, fidivrl a, fidivrl a)
+#define FIDIVR_W(a) CHOICE(fidivr a, fidivrs a, fidivrs a)
+#define FFREE(a) CHOICE(ffree a, ffree a, ffree a)
+#define FICOM_L(a) CHOICE(ficoml a, ficoml a, ficoml a)
+#define FICOM_W(a) CHOICE(ficom a, ficoms a, ficoms a)
+#define FICOMP_L(a) CHOICE(ficompl a, ficompl a, ficompl a)
+#define FICOMP_W(a) CHOICE(ficomp a, ficomps a, ficomps a)
+#define FILD_Q(a) CHOICE(fildll a, fildq a, fildq a)
+#define FILD_L(a) CHOICE(fildl a, fildl a, fildl a)
+#define FILD_W(a) CHOICE(fild a, filds a, filds a)
+#define FINCSTP CHOICE(fincstp, fincstp, fincstp)
+#define FINIT CHOICE(finit, wait; fninit, wait; finit)
+#define FNINIT CHOICE(fninit, fninit, finit)
+#define FIST_L(a) CHOICE(fistl a, fistl a, fistl a)
+#define FIST_W(a) CHOICE(fist a, fists a, fists a)
+#define FISTP_Q(a) CHOICE(fistpll a, fistpq a, fistpq a)
+#define FISTP_L(a) CHOICE(fistpl a, fistpl a, fistpl a)
+#define FISTP_W(a) CHOICE(fistp a, fistps a, fistps a)
+#define FLD_X(a) CHOICE(fldt a, fldt a, fldx a) /* 80 bit data type! */
+#define FLD_D(a) CHOICE(fldl a, fldl a, fldd a)
+#define FLD_S(a) CHOICE(flds a, flds a, flds a)
+#define FLD1 CHOICE(fld1, fld1, fld1)
+#define FLDL2T CHOICE(fldl2t, fldl2t, fldl2t)
+#define FLDL2E CHOICE(fldl2e, fldl2e, fldl2e)
+#define FLDPI CHOICE(fldpi, fldpi, fldpi)
+#define FLDLG2 CHOICE(fldlg2, fldlg2, fldlg2)
+#define FLDLN2 CHOICE(fldln2, fldln2, fldln2)
+#define FLDZ CHOICE(fldz, fldz, fldz)
+#define FLDCW(a) CHOICE(fldcw a, fldcw a, fldcw a)
+#define FLDENV(a) CHOICE(fldenv a, fldenv a, fldenv a)
+#define FMUL_S(a) CHOICE(fmuls a, fmuls a, fmuls a)
+#define FMUL_D(a) CHOICE(fmull a, fmull a, fmuld a)
+#define FMUL2(a, b) CHOICE(fmul ARG2(a,b), fmul ARG2(a,b), fmul ARG2(b,a))
+#define FMULP(a, b) CHOICE(fmulp ARG2(a,b), fmulp ARG2(a,b), fmulp ARG2(b,a))
+#define FIMUL_L(a) CHOICE(fimull a, fimull a, fimull a)
+#define FIMUL_W(a) CHOICE(fimul a, fimuls a, fimuls a)
+#define FNOP CHOICE(fnop, fnop, fnop)
+#define FPATAN CHOICE(fpatan, fpatan, fpatan)
+#define FPREM CHOICE(fprem, fprem, fprem)
+#define FPREM1 CHOICE(fprem1, fprem1, fprem1)
+#define FPTAN CHOICE(fptan, fptan, fptan)
+#define FRNDINT CHOICE(frndint, frndint, frndint)
+#define FRSTOR(a) CHOICE(frstor a, frstor a, frstor a)
+#define FSAVE(a) CHOICE(fsave a, wait; fnsave a, wait; fsave a)
+#define FNSAVE(a) CHOICE(fnsave a, fnsave a, fsave a)
+#define FSCALE CHOICE(fscale, fscale, fscale)
+#define FSIN CHOICE(fsin, fsin, fsin)
+#define FSINCOS CHOICE(fsincos, fsincos, fsincos)
+#define FSQRT CHOICE(fsqrt, fsqrt, fsqrt)
+#define FST_D(a) CHOICE(fstl a, fstl a, fstd a)
+#define FST_S(a) CHOICE(fsts a, fsts a, fsts a)
+#define FSTP_X(a) CHOICE(fstpt a, fstpt a, fstpx a)
+#define FSTP_D(a) CHOICE(fstpl a, fstpl a, fstpd a)
+#define FSTP_S(a) CHOICE(fstps a, fstps a, fstps a)
+#define FSTP(a) CHOICE(fstp a, fstp a, fstp a)
+#define FSTCW(a) CHOICE(fstcw a, wait; fnstcw a, wait; fstcw a)
+#define FNSTCW(a) CHOICE(fnstcw a, fnstcw a, fstcw a)
+#define FSTENV(a) CHOICE(fstenv a, wait; fnstenv a, wait; fstenv a)
+#define FNSTENV(a) CHOICE(fnstenv a, fnstenv a, fstenv a)
+#define FSTSW(a) CHOICE(fstsw a, wait; fnstsw a, wait; fstsw a)
+#define FNSTSW(a) CHOICE(fnstsw a, fnstsw a, fstsw a)
+#define FSUB_S(a) CHOICE(fsubs a, fsubs a, fsubs a)
+#define FSUB_D(a) CHOICE(fsubl a, fsubl a, fsubd a)
+#define FSUB2(a, b) CHOICE(fsub ARG2(a,b), fsub ARG2(a,b), fsub ARG2(b,a))
+#define FSUBP(a, b) CHOICE(fsubp ARG2(a,b), fsubp ARG2(a,b), fsubp ARG2(b,a))
+#define FISUB_L(a) CHOICE(fisubl a, fisubl a, fisubl a)
+#define FISUB_W(a) CHOICE(fisub a, fisubs a, fisubs a)
+#define FSUBR_S(a) CHOICE(fsubrs a, fsubrs a, fsubrs a)
+#define FSUBR_D(a) CHOICE(fsubrl a, fsubrl a, fsubrd a)
+#define FSUBR2(a, b) CHOICE(fsubr ARG2(a,b), fsubr ARG2(a,b), fsubr ARG2(b,a))
+#define FSUBRP(a, b) CHOICE(fsubrp ARG2(a,b), fsubrp ARG2(a,b), fsubrp ARG2(b,a))
+#define FISUBR_L(a) CHOICE(fisubrl a, fisubrl a, fisubrl a)
+#define FISUBR_W(a) CHOICE(fisubr a, fisubrs a, fisubrs a)
+#define FTST CHOICE(ftst, ftst, ftst)
+#define FUCOM(a) CHOICE(fucom a, fucom a, fucom a)
+#define FUCOMP(a) CHOICE(fucomp a, fucomp a, fucomp a)
+#define FUCOMPP CHOICE(fucompp, fucompp, fucompp)
+#define FWAIT CHOICE(wait, wait, wait)
+#define FXAM CHOICE(fxam, fxam, fxam)
+#define FXCH(a) CHOICE(fxch a, fxch a, fxch a)
+#define FXTRACT CHOICE(fxtract, fxtract, fxtract)
+#define FYL2X CHOICE(fyl2x, fyl2x, fyl2x)
+#define FYL2XP1 CHOICE(fyl2xp1, fyl2xp1, fyl2xp1)
+
+/* New instructions.  For assemblers that predate them, the AT&T and ACK
+ * columns emit the raw opcode bytes: 15,162 = 0x0F 0xA2 (cpuid) and
+ * 15,49 = 0x0F 0x31 (rdtsc). */
+#define CPUID CHOICE(D_BYTE ARG2(15, 162), cpuid, D_BYTE ARG2(15, 162))
+#define RDTSC CHOICE(D_BYTE ARG2(15, 49), rdtsc, D_BYTE ARG2(15, 49))
+
+#else /* NASM_ASSEMBLER || MASM_ASSEMBLER is defined */
+
+ /****************************************/
+ /* */
+ /* Intel style assemblers. */
+ /* (NASM and MASM) */
+ /* */
+ /****************************************/
+
+#define P_EAX EAX
+#define L_EAX EAX
+#define W_AX AX
+#define B_AH AH
+#define B_AL AL
+
+#define P_EBX EBX
+#define L_EBX EBX
+#define W_BX BX
+#define B_BH BH
+#define B_BL BL
+
+#define P_ECX ECX
+#define L_ECX ECX
+#define W_CX CX
+#define B_CH CH
+#define B_CL CL
+
+#define P_EDX EDX
+#define L_EDX EDX
+#define W_DX DX
+#define B_DH DH
+#define B_DL DL
+
+#define P_EBP EBP
+#define L_EBP EBP
+#define W_BP BP
+
+#define P_ESI ESI
+#define L_ESI ESI
+#define W_SI SI
+
+#define P_EDI EDI
+#define L_EDI EDI
+#define W_DI DI
+
+#define P_ESP ESP
+#define L_ESP ESP
+#define W_SP SP
+
+#define W_CS CS
+#define W_SS SS
+#define W_DS DS
+#define W_ES ES
+#define W_FS FS
+#define W_GS GS
+
+#define X_ST ST
+#define D_ST ST
+#define L_ST ST
+
+#define P_MM0 mm0
+#define P_MM1 mm1
+#define P_MM2 mm2
+#define P_MM3 mm3
+#define P_MM4 mm4
+#define P_MM5 mm5
+#define P_MM6 mm6
+#define P_MM7 mm7
+
+#define P_XMM0 xmm0
+#define P_XMM1 xmm1
+#define P_XMM2 xmm2
+#define P_XMM3 xmm3
+#define P_XMM4 xmm4
+#define P_XMM5 xmm5
+#define P_XMM6 xmm6
+#define P_XMM7 xmm7
+
+#define CONCAT(x, y) x ## y
+#define CONCAT3(x, y, z) x ## y ## z
+
+#if defined(NASM_ASSEMBLER)
+
+#define ST(n) st ## n
+#define ST0 st0
+
+#define TBYTE_PTR tword
+#define QWORD_PTR qword
+#define DWORD_PTR dword
+#define WORD_PTR word
+#define BYTE_PTR byte
+
+#define OFFSET
+
+#define GLOBL GLOBAL
+#define ALIGNTEXT32 ALIGN 32
+#define ALIGNTEXT16 ALIGN 16
+#define ALIGNTEXT8 ALIGN 8
+#define ALIGNTEXT4 ALIGN 4
+#define ALIGNTEXT2 ALIGN 2
+#define ALIGNTEXT32ifNOP ALIGN 32
+#define ALIGNTEXT16ifNOP ALIGN 16
+#define ALIGNTEXT8ifNOP ALIGN 8
+#define ALIGNTEXT4ifNOP ALIGN 4
+#define ALIGNDATA32 ALIGN 32
+#define ALIGNDATA16 ALIGN 16
+#define ALIGNDATA8 ALIGN 8
+#define ALIGNDATA4 ALIGN 4
+#define ALIGNDATA2 ALIGN 2
+#define FILE(s)
+#define STRING(s) db s
+#define D_LONG dd
+#define D_WORD dw
+#define D_BYTE db
+/* #define SPACE */
+/* #define COMM */
+#if defined(__WATCOMC__)
+SECTION _TEXT public align=16 class=CODE use32 flat
+SECTION _DATA public align=16 class=DATA use32 flat
+#define SEG_TEXT SECTION _TEXT
+#define SEG_DATA SECTION _DATA
+#define SEG_BSS SECTION .bss
+#else
+#define SEG_DATA SECTION .data
+#define SEG_TEXT SECTION .text
+#define SEG_BSS SECTION .bss
+#endif
+
+#define D_SPACE(n) db n REP 0
+
+#define AS_BEGIN
+
+/* Jcc's should be handled better than this... */
+#define NEAR near
+
+#else /* MASM */
+
+#define TBYTE_PTR tbyte ptr
+#define QWORD_PTR qword ptr
+#define DWORD_PTR dword ptr
+#define WORD_PTR word ptr
+#define BYTE_PTR byte ptr
+
+#define OFFSET offset
+
+#define GLOBL GLOBAL
+#define ALIGNTEXT32 ALIGN 32
+#define ALIGNTEXT16 ALIGN 16
+#define ALIGNTEXT8 ALIGN 8
+#define ALIGNTEXT4 ALIGN 4
+#define ALIGNTEXT2 ALIGN 2
+#define ALIGNTEXT32ifNOP ALIGN 32
+#define ALIGNTEXT16ifNOP ALIGN 16
+#define ALIGNTEXT8ifNOP ALIGN 8
+#define ALIGNTEXT4ifNOP ALIGN 4
+#define ALIGNDATA32 ALIGN 32
+#define ALIGNDATA16 ALIGN 16
+#define ALIGNDATA8 ALIGN 8
+#define ALIGNDATA4 ALIGN 4
+#define ALIGNDATA2 ALIGN 2
+#define FILE(s)
+#define STRING(s) db s
+#define D_LONG dd
+#define D_WORD dw
+#define D_BYTE db
+/* #define SPACE */
+/* #define COMM */
+#define SEG_DATA .DATA
+#define SEG_TEXT .CODE
+#define SEG_BSS .DATA
+
+#define D_SPACE(n) db n REP 0
+
+#define AS_BEGIN
+
+#define NEAR
+
+#endif
+
+#if defined(Lynx) || (defined(SYSV) || defined(SVR4)) \
+ || (defined(__linux__) || defined(__OS2ELF__)) && defined(__ELF__) \
+ || (defined(__FreeBSD__) && __FreeBSD__ >= 3) \
+ || (defined(__NetBSD__) && defined(__ELF__))
+#define GLNAME(a) a
+#else
+#define GLNAME(a) CONCAT(_, a)
+#endif
+
+/*
+ * Addressing Modes
+ */
+
+/* Immediate Mode */
+#define P_ADDR(a) OFFSET a
+#define X_ADDR(a) OFFSET a
+#define D_ADDR(a) OFFSET a
+#define L_ADDR(a) OFFSET a
+#define W_ADDR(a) OFFSET a
+#define B_ADDR(a) OFFSET a
+
+#define P_CONST(a) a
+#define X_CONST(a) a
+#define D_CONST(a) a
+#define L_CONST(a) a
+#define W_CONST(a) a
+#define B_CONST(a) a
+
+/* Indirect Mode */
+#ifdef NASM_ASSEMBLER
+#define P_CONTENT(a) [a]
+#define X_CONTENT(a) TBYTE_PTR [a]
+#define D_CONTENT(a) QWORD_PTR [a]
+#define L_CONTENT(a) DWORD_PTR [a]
+#define W_CONTENT(a) WORD_PTR [a]
+#define B_CONTENT(a) BYTE_PTR [a]
+#else
+#define P_CONTENT(a) a
+#define X_CONTENT(a) TBYTE_PTR a
+#define D_CONTENT(a) QWORD_PTR a
+#define L_CONTENT(a) DWORD_PTR a
+#define W_CONTENT(a) WORD_PTR a
+#define B_CONTENT(a) BYTE_PTR a
+#endif
+
+/* Register a indirect */
+#define P_REGIND(a) [a]
+#define X_REGIND(a) TBYTE_PTR [a]
+#define D_REGIND(a) QWORD_PTR [a]
+#define L_REGIND(a) DWORD_PTR [a]
+#define W_REGIND(a) WORD_PTR [a]
+#define B_REGIND(a) BYTE_PTR [a]
+
+/* Register b indirect plus displacement a */
+#define P_REGOFF(a, b) [b + a]
+#define X_REGOFF(a, b) TBYTE_PTR [b + a]
+#define D_REGOFF(a, b) QWORD_PTR [b + a]
+#define L_REGOFF(a, b) DWORD_PTR [b + a]
+#define W_REGOFF(a, b) WORD_PTR [b + a]
+#define B_REGOFF(a, b) BYTE_PTR [b + a]
+
+/* Reg indirect Base + Index + Displacement - this is mainly for 16-bit mode
+ * which has no scaling
+ */
+#define P_REGBID(b, i, d) [b + i + d]
+#define X_REGBID(b, i, d) TBYTE_PTR [b + i + d]
+#define D_REGBID(b, i, d) QWORD_PTR [b + i + d]
+#define L_REGBID(b, i, d) DWORD_PTR [b + i + d]
+#define W_REGBID(b, i, d) WORD_PTR [b + i + d]
+#define B_REGBID(b, i, d) BYTE_PTR [b + i + d]
+
+/* Reg indirect Base + (Index * Scale) */
+#define P_REGBIS(b, i, s) [b + i * s]
+#define X_REGBIS(b, i, s) TBYTE_PTR [b + i * s]
+#define D_REGBIS(b, i, s) QWORD_PTR [b + i * s]
+#define L_REGBIS(b, i, s) DWORD_PTR [b + i * s]
+#define W_REGBIS(b, i, s) WORD_PTR [b + i * s]
+#define B_REGBIS(b, i, s) BYTE_PTR [b + i * s]
+
+/* Reg indirect Base + (Index * Scale) + Displacement */
+#define P_REGBISD(b, i, s, d) [b + i * s + d]
+#define X_REGBISD(b, i, s, d) TBYTE_PTR [b + i * s + d]
+#define D_REGBISD(b, i, s, d) QWORD_PTR [b + i * s + d]
+#define L_REGBISD(b, i, s, d) DWORD_PTR [b + i * s + d]
+#define W_REGBISD(b, i, s, d) WORD_PTR [b + i * s + d]
+#define B_REGBISD(b, i, s, d) BYTE_PTR [b + i * s + d]
+
+/* Displaced Scaled Index: */
+#define P_REGDIS(d, i, s) [i * s + d]
+#define X_REGDIS(d, i, s) TBYTE_PTR [i * s + d]
+#define D_REGDIS(d, i, s) QWORD_PTR [i * s + d]
+#define L_REGDIS(d, i, s) DWORD_PTR [i * s + d]
+#define W_REGDIS(d, i, s) WORD_PTR [i * s + d]
+#define B_REGDIS(d, i, s) BYTE_PTR [i * s + d]
+
+/* Indexed Base: */
+#define P_REGBI(b, i) [b + i]
+#define X_REGBI(b, i) TBYTE_PTR [b + i]
+#define D_REGBI(b, i) QWORD_PTR [b + i]
+#define L_REGBI(b, i) DWORD_PTR [b + i]
+#define W_REGBI(b, i) WORD_PTR [b + i]
+#define B_REGBI(b, i) BYTE_PTR [b + i]
+
+/* Displaced Base: */
+#define P_REGDB(d, b) [b + d]
+#define X_REGDB(d, b) TBYTE_PTR [b + d]
+#define D_REGDB(d, b) QWORD_PTR [b + d]
+#define L_REGDB(d, b) DWORD_PTR [b + d]
+#define W_REGDB(d, b) WORD_PTR [b + d]
+#define B_REGDB(d, b) BYTE_PTR [b + d]
+
+/* Variable indirect: */
+#define VARINDIRECT(var) [var]
+
+/* Use register contents as jump/call target: */
+#define CODEPTR(reg) P_(reg)
+
+/*
+ * Redefine assembler commands
+ */
+
+#define P_(a) P_ ## a
+#define X_(a) X_ ## a
+#define D_(a) D_ ## a
+#define SR_(a) W_ ## a
+#define S_(a) L_ ## a
+#define L_(a) L_ ## a
+#define W_(a) W_ ## a
+#define B_(a) B_ ## a
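
These size dispatchers let one operand spelling serve registers, constants,
and memory alike; stepping through one expansion:

    /* MOV_L(REGOFF(8, EBP), EAX):
     *   mov L_(EAX), L_(REGOFF(8, EBP))
     *   mov L_EAX,   L_REGOFF(8, EBP)
     *   mov EAX,     DWORD_PTR [EBP + 8]   (NASM: dword [EBP + 8]) */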
+
+#define AAA aaa
+#define AAD aad
+#define AAM aam
+#define AAS aas
+#define ADC_L(a, b) adc L_(b), L_(a)
+#define ADC_W(a, b) adc W_(b), W_(a)
+#define ADC_B(a, b) adc B_(b), B_(a)
+#define ADD_L(a, b) add L_(b), L_(a)
+#define ADD_W(a, b) add W_(b), W_(a)
+#define ADD_B(a, b) add B_(b), B_(a)
+#define AND_L(a, b) and L_(b), L_(a)
+#define AND_W(a, b) and W_(b), W_(a)
+#define AND_B(a, b) and B_(b), B_(a)
+#define ARPL(a,b) arpl W_(b), a
+#define BOUND_L(a, b) bound L_(b), L_(a)
+#define BOUND_W(a, b) bound W_(b), W_(a)
+#define BSF_L(a, b) bsf L_(b), L_(a)
+#define BSF_W(a, b) bsf W_(b), W_(a)
+#define BSR_L(a, b) bsr L_(b), L_(a)
+#define BSR_W(a, b) bsr W_(b), W_(a)
+#define BT_L(a, b) bt L_(b), L_(a)
+#define BT_W(a, b) bt W_(b), W_(a)
+#define BTC_L(a, b) btc L_(b), L_(a)
+#define BTC_W(a, b) btc W_(b), W_(a)
+#define BTR_L(a, b) btr L_(b), L_(a)
+#define BTR_W(a, b) btr W_(b), W_(a)
+#define BTS_L(a, b) bts L_(b), L_(a)
+#define BTS_W(a, b) bts W_(b), W_(a)
+#define CALL(a) call a
+#define CALLF(s,a) call far s:a
+#define CBW cbw
+#define CWDE cwde
+#define CLC clc
+#define CLD cld
+#define CLI cli
+#define CLTS clts
+#define CMC cmc
+#define CMP_L(a, b) cmp L_(b), L_(a)
+#define CMP_W(a, b) cmp W_(b), W_(a)
+#define CMP_B(a, b) cmp B_(b), B_(a)
+#define CMPS_L cmpsd
+#define CMPS_W cmpsw
+#define CMPS_B cmpsb
+#define CPUID cpuid
+#define CWD cwd
+#define CDQ cdq
+#define DAA daa
+#define DAS das
+#define DEC_L(a) dec L_(a)
+#define DEC_W(a) dec W_(a)
+#define DEC_B(a) dec B_(a)
+#define DIV_L(a) div L_(a)
+#define DIV_W(a) div W_(a)
+#define DIV_B(a) div B_(a)
+#define ENTER(a,b) enter b, a
+#define HLT hlt
+#define IDIV_L(a) idiv L_(a)
+#define IDIV_W(a) idiv W_(a)
+#define IDIV_B(a) idiv B_(a)
+#define IMUL_L(a, b) imul L_(b), L_(a)
+#define IMUL_W(a, b) imul W_(b), W_(a)
+#define IMUL_B(a) imul B_(a)
+#define IN_L in EAX, DX
+#define IN_W in AX, DX
+#define IN_B in AL, DX
+#define IN1_L(a) in EAX, B_(a)
+#define IN1_W(a) in AX, B_(a)
+#define IN1_B(a) in AL, B_(a)
+#define INC_L(a) inc L_(a)
+#define INC_W(a) inc W_(a)
+#define INC_B(a) inc B_(a)
+#define INS_L insd
+#define INS_W insw
+#define INS_B insb
+#define INT(a) int B_(a)
+#define INT3 int3
+#define INTO into
+#define IRET iret
+#define IRETD iretd
+#define JA(a) ja NEAR a
+#define JAE(a) jae NEAR a
+#define JB(a) jb NEAR a
+#define JBE(a) jbe NEAR a
+#define JC(a) jc NEAR a
+#define JE(a) je NEAR a
+#define JG(a) jg NEAR a
+#define JGE(a) jge NEAR a
+#define JL(a) jl NEAR a
+#define JLE(a) jle NEAR a
+#define JNA(a) jna NEAR a
+#define JNAE(a) jnae NEAR a
+#define JNB(a) jnb NEAR a
+#define JNBE(a) jnbe NEAR a
+#define JNC(a) jnc NEAR a
+#define JNE(a) jne NEAR a
+#define JNG(a) jng NEAR a
+#define JNGE(a) jnge NEAR a
+#define JNL(a) jnl NEAR a
+#define JNLE(a) jnle NEAR a
+#define JNO(a) jno NEAR a
+#define JNP(a) jnp NEAR a
+#define JNS(a) jns NEAR a
+#define JNZ(a) jnz NEAR a
+#define JO(a) jo NEAR a
+#define JP(a) jp NEAR a
+#define JPE(a) jpe NEAR a
+#define JPO(a) jpo NEAR a
+#define JS(a) js NEAR a
+#define JZ(a) jz NEAR a
+#define JMP(a) jmp a
+#define JMPF(s,a) jmp far s:a
+#define LAHF lahf
+#define LAR(a, b) lar b, a
+#define LEA_L(a, b) lea P_(b), P_(a)
+#define LEA_W(a, b) lea P_(b), P_(a)
+#define LEAVE leave
+#define LGDT(a) lgdt a
+#define LIDT(a) lidt a
+#define LDS(a, b) lds b, P_(a)
+#define LES(a, b) les b, P_(a)
+#define LFS(a, b) lfs b, P_(a)
+#define LGS(a, b) lgs b, P_(a)
+#define LSS(a, b) lss b, P_(a)
+#define LLDT(a) lldt a
+#define LMSW(a) lmsw a
+#define LOCK lock
+#define LODS_L lodsd
+#define LODS_W lodsw
+#define LODS_B lodsb
+#define LOOP(a) loop a
+#define LOOPE(a) loope a
+#define LOOPZ(a) loopz a
+#define LOOPNE(a) loopne a
+#define LOOPNZ(a) loopnz a
+#define LSL(a, b) lsl b, a
+#define LTR(a) ltr a
+#define MOV_SR(a, b) mov SR_(b), SR_(a)
+#define MOV_L(a, b) mov L_(b), L_(a)
+#define MOV_W(a, b) mov W_(b), W_(a)
+#define MOV_B(a, b) mov B_(b), B_(a)
+#define MOVS_L movsd
+#define MOVS_W movsw
+#define MOVS_B movsb
+#define MOVSX_BL(a, b) movsx L_(b), B_(a)
+#define MOVSX_BW(a, b) movsx W_(b), B_(a)
+#define MOVSX_WL(a, b) movsx L_(b), W_(a)
+#define MOVZX_BL(a, b) movzx L_(b), B_(a)
+#define MOVZX_BW(a, b) movzx W_(b), B_(a)
+#define MOVZX_WL(a, b) movzx L_(b), W_(a)
+#define MUL_L(a) mul L_(a)
+#define MUL_W(a) mul W_(a)
+#define MUL_B(a) mul B_(a)
+#define NEG_L(a) neg L_(a)
+#define NEG_W(a) neg W_(a)
+#define NEG_B(a) neg B_(a)
+#define NOP nop
+#define NOT_L(a) not L_(a)
+#define NOT_W(a) not W_(a)
+#define NOT_B(a) not B_(a)
+#define OR_L(a,b) or L_(b), L_(a)
+#define OR_W(a,b) or W_(b), W_(a)
+#define OR_B(a,b) or B_(b), B_(a)
+#define OUT_L out DX, EAX
+#define OUT_W out DX, AX
+#define OUT_B out DX, AL
+#define OUT1_L(a) out B_(a), EAX
+#define OUT1_W(a) out B_(a), AX
+#define OUT1_B(a) out B_(a), AL
+#define OUTS_L outsd
+#define OUTS_W outsw
+#define OUTS_B outsb
+#define POP_SR(a) pop SR_(a)
+#define POP_L(a) pop L_(a)
+#define POP_W(a) pop W_(a)
+#define POPA_L popad
+#define POPA_W popa
+#define POPF_L popfd
+#define POPF_W popf
+#define PUSH_SR(a) push SR_(a)
+#define PUSH_L(a) push L_(a)
+#define PUSH_W(a) push W_(a)
+#define PUSH_B(a) push B_(a)
+#define PUSHA_L pushad
+#define PUSHA_W pusha
+#define PUSHF_L pushfd
+#define PUSHF_W pushf
+#define RCL_L(a, b) rcl L_(b), L_(a)
+#define RCL_W(a, b) rcl W_(b), W_(a)
+#define RCL_B(a, b) rcl B_(b), B_(a)
+#define RCR_L(a, b) rcr L_(b), L_(a)
+#define RCR_W(a, b) rcr W_(b), W_(a)
+#define RCR_B(a, b) rcr B_(b), B_(a)
+#define RDTSC rdtsc
+#define ROL_L(a, b) rol L_(b), L_(a)
+#define ROL_W(a, b) rol W_(b), W_(a)
+#define ROL_B(a, b) rol B_(b), B_(a)
+#define ROR_L(a, b) ror L_(b), L_(a)
+#define ROR_W(a, b) ror W_(b), W_(a)
+#define ROR_B(a, b) ror B_(b), B_(a)
+#define REP rep
+#define REPE repe
+#define REPNE repne
+#define REPNZ REPNE
+#define REPZ REPE
+#define RET ret
+#define SAHF sahf
+#define SAL_L(a, b) sal L_(b), B_(a)
+#define SAL_W(a, b) sal W_(b), B_(a)
+#define SAL_B(a, b) sal B_(b), B_(a)
+#define SAR_L(a, b) sar L_(b), B_(a)
+#define SAR_W(a, b) sar W_(b), B_(a)
+#define SAR_B(a, b) sar B_(b), B_(a)
+#define SBB_L(a, b) sbb L_(b), L_(a)
+#define SBB_W(a, b) sbb W_(b), W_(a)
+#define SBB_B(a, b) sbb B_(b), B_(a)
+#define SCAS_L scasd
+#define SCAS_W scasw
+#define SCAS_B scasb
+#define SETA(a) seta a
+#define SETAE(a) setae a
+#define SETB(a) setb a
+#define SETBE(a) setbe a
+#define SETC(a) setc a
+#define SETE(a) sete a
+#define SETG(a) setg a
+#define SETGE(a) setge a
+#define SETL(a) setl a
+#define SETLE(a) setle a
+#define SETNA(a) setna a
+#define SETNAE(a) setnae a
+#define SETNB(a) setnb a
+#define SETNBE(a) setnbe a
+#define SETNC(a) setnc a
+#define SETNE(a) setne a
+#define SETNG(a) setng a
+#define SETNGE(a) setnge a
+#define SETNL(a) setnl a
+#define SETNLE(a) setnle a
+#define SETNO(a) setno a
+#define SETNP(a) setnp a
+#define SETNS(a) setns a
+#define SETNZ(a) setnz a
+#define SETO(a) seto a
+#define SETP(a) setp a
+#define SETPE(a) setpe a
+#define SETPO(a) setpo a
+#define SETS(a) sets a
+#define SETZ(a) setz a
+#define SGDT(a) sgdt a
+#define SIDT(a) sidt a
+#define SHL_L(a, b) shl L_(b), B_(a)
+#define SHL_W(a, b) shl W_(b), B_(a)
+#define SHL_B(a, b) shl B_(b), B_(a)
+#define SHLD_L(a,b,c) shld L_(c), L_(b), B_(a)
+#define SHLD2_L(a,b) shld L_(b), L_(a), CL
+#define SHLD_W(a,b,c) shld W_(c), W_(b), B_(a)
+#define SHLD2_W(a,b) shld W_(b), W_(a), CL
+#define SHR_L(a, b) shr L_(b), B_(a)
+#define SHR_W(a, b) shr W_(b), B_(a)
+#define SHR_B(a, b) shr B_(b), B_(a)
+#define SHRD_L(a,b,c) shrd L_(c), L_(b), B_(a)
+#define SHRD2_L(a,b) shrd L_(b), L_(a), CL
+#define SHRD_W(a,b,c) shrd W_(c), W_(b), B_(a)
+#define SHRD2_W(a,b) shrd W_(b), W_(a), CL
+#define SLDT(a) sldt a
+#define SMSW(a) smsw a
+#define STC stc
+#define STD std
+#define STI sti
+#define STOS_L stosd
+#define STOS_W stosw
+#define STOS_B stosb
+#define STR(a) str a
+#define SUB_L(a, b) sub L_(b), L_(a)
+#define SUB_W(a, b) sub W_(b), W_(a)
+#define SUB_B(a, b) sub B_(b), B_(a)
+#define TEST_L(a, b) test L_(b), L_(a)
+#define TEST_W(a, b) test W_(b), W_(a)
+#define TEST_B(a, b) test B_(b), B_(a)
+#define VERR(a) verr a
+#define VERW(a) verw a
+#define WAIT wait
+#define XCHG_L(a, b) xchg L_(b), L_(a)
+#define XCHG_W(a, b) xchg W_(b), W_(a)
+#define XCHG_B(a, b) xchg B_(b), B_(a)
+#define XLAT xlat
+#define XOR_L(a, b) xor L_(b), L_(a)
+#define XOR_W(a, b) xor W_(b), W_(a)
+#define XOR_B(a, b) xor B_(b), B_(a)
+
+
+/* Floating Point Instructions */
+#define F2XM1 f2xm1
+#define FABS fabs
+#define FADD_D(a) fadd D_(a)
+#define FADD_S(a) fadd S_(a)
+#define FADD2(a, b) fadd b, a
+#define FADDP(a, b) faddp b, a
+#define FIADD_L(a) fiadd L_(a)
+#define FIADD_W(a) fiadd W_(a)
+#define FBLD(a) fbld a
+#define FBSTP(a) fbstp a
+#define FCHS fchs
+#define FCLEX fclex
+#define FNCLEX fnclex
+#define FCOM(a) fcom a
+#define FCOM_D(a) fcom D_(a)
+#define FCOM_S(a) fcom S_(a)
+#define FCOMP(a) fcomp a
+#define FCOMP_D(a) fcomp D_(a)
+#define FCOMP_S(a) fcomp S_(a)
+#define FCOMPP fcompp
+#define FCOS fcos
+#define FDECSTP fdecstp
+#define FDIV_D(a) fdiv D_(a)
+#define FDIV_S(a) fdiv S_(a)
+#define FDIV2(a, b) fdiv b, a
+#define FDIVP(a, b) fdivp b, a
+#define FIDIV_L(a) fidiv L_(a)
+#define FIDIV_W(a) fidiv W_(a)
+#define FDIVR_D(a) fdivr D_(a)
+#define FDIVR_S(a) fdivr S_(a)
+#define FDIVR2(a, b) fdivr b, a
+#define FDIVRP(a, b) fdivrp b, a
+#define FIDIVR_L(a) fidivr L_(a)
+#define FIDIVR_W(a) fidivr W_(a)
+#define FFREE(a) ffree a
+#define FICOM_L(a) ficom L_(a)
+#define FICOM_W(a) ficom W_(a)
+#define FICOMP_L(a) ficomp L_(a)
+#define FICOMP_W(a) ficomp W_(a)
+#define FILD_Q(a) fild D_(a)
+#define FILD_L(a) fild L_(a)
+#define FILD_W(a) fild W_(a)
+#define FINCSTP fincstp
+#define FINIT finit
+#define FNINIT fninit
+#define FIST_L(a) fist L_(a)
+#define FIST_W(a) fist W_(a)
+#define FISTP_Q(a) fistp D_(a)
+#define FISTP_L(a) fistp L_(a)
+#define FISTP_W(a) fistp W_(a)
+#define FLD_X(a) fld X_(a)
+#define FLD_D(a) fld D_(a)
+#define FLD_S(a) fld S_(a)
+#define FLD1 fld1
+#define FLDL2T fldl2t
+#define FLDL2E fldl2e
+#define FLDPI fldpi
+#define FLDLG2 fldlg2
+#define FLDLN2 fldln2
+#define FLDZ fldz
+#define FLDCW(a) fldcw a
+#define FLDENV(a) fldenv a
+#define FMUL_S(a) fmul S_(a)
+#define FMUL_D(a) fmul D_(a)
+#define FMUL2(a, b) fmul b, a
+#define FMULP(a, b) fmulp b, a
+#define FIMUL_L(a) fimul L_(a)
+#define FIMUL_W(a) fimul W_(a)
+#define FNOP fnop
+#define FPATAN fpatan
+#define FPREM fprem
+#define FPREM1 fprem1
+#define FPTAN fptan
+#define FRNDINT frndint
+#define FRSTOR(a) frstor a
+#define FSAVE(a) fsave a
+#define FNSAVE(a) fnsave a
+#define FSCALE fscale
+#define FSIN fsin
+#define FSINCOS fsincos
+#define FSQRT fsqrt
+#define FST_D(a) fst D_(a)
+#define FST_S(a) fst S_(a)
+#define FSTP_X(a) fstp X_(a)
+#define FSTP_D(a) fstp D_(a)
+#define FSTP_S(a) fstp S_(a)
+#define FSTP(a) fstp a
+#define FSTCW(a) fstcw a
+#define FNSTCW(a) fnstcw a
+#define FSTENV(a) fstenv a
+#define FNSTENV(a) fnstenv a
+#define FSTSW(a) fstsw a
+#define FNSTSW(a) fnstsw a
+#define FSUB_S(a) fsub S_(a)
+#define FSUB_D(a) fsub D_(a)
+#define FSUB2(a, b) fsub b, a
+#define FSUBP(a, b) fsubp b, a
+#define FISUB_L(a) fisub L_(a)
+#define FISUB_W(a) fisub W_(a)
+#define FSUBR_S(a) fsubr S_(a)
+#define FSUBR_D(a) fsubr D_(a)
+#define FSUBR2(a, b) fsubr b, a
+#define FSUBRP(a, b) fsubrp b, a
+#define FISUBR_L(a) fisubr L_(a)
+#define FISUBR_W(a) fisubr W_(a)
+#define FTST ftst
+#define FUCOM(a) fucom a
+#define FUCOMP(a) fucomp a
+#define FUCOMPP fucompp
+#define FWAIT fwait
+#define FXAM fxam
+#define FXCH(a) fxch a
+#define FXTRACT fxtract
+#define FYL2X fyl2x
+#define FYL2XP1 fyl2xp1
+
+#endif /* NASM_ASSEMBLER, MASM_ASSEMBLER */
+
+ /****************************************/
+ /* */
+ /* Extensions to x86 insn set - */
+ /* MMX, 3DNow! */
+ /* */
+ /****************************************/
+
+#if defined(NASM_ASSEMBLER) || defined(MASM_ASSEMBLER)
+#define P_ARG1(a) P_ ## a
+#define P_ARG2(a, b) P_ ## b, P_ ## a
+#define P_ARG3(a, b, c) P_ ## c, P_ ## b, P_ ## a
+#else
+#define P_ARG1(a) a
+#define P_ARG2(a, b) a, b
+#define P_ARG3(a, b, c) a, b, c
+#endif
+
+/* MMX */
+#define MOVD(a, b) movd P_ARG2(a, b)
+#define MOVQ(a, b) movq P_ARG2(a, b)
+
+#define PADDB(a, b) paddb P_ARG2(a, b)
+#define PADDW(a, b) paddw P_ARG2(a, b)
+#define PADDD(a, b) paddd P_ARG2(a, b)
+
+#define PADDSB(a, b) paddsb P_ARG2(a, b)
+#define PADDSW(a, b) paddsw P_ARG2(a, b)
+
+#define PADDUSB(a, b) paddusb P_ARG2(a, b)
+#define PADDUSW(a, b) paddusw P_ARG2(a, b)
+
+#define PSUBB(a, b) psubb P_ARG2(a, b)
+#define PSUBW(a, b) psubw P_ARG2(a, b)
+#define PSUBD(a, b) psubd P_ARG2(a, b)
+
+#define PSUBSB(a, b) psubsb P_ARG2(a, b)
+#define PSUBSW(a, b) psubsw P_ARG2(a, b)
+
+#define PSUBUSB(a, b) psubusb P_ARG2(a, b)
+#define PSUBUSW(a, b) psubusw P_ARG2(a, b)
+
+#define PCMPEQB(a, b) pcmpeqb P_ARG2(a, b)
+#define PCMPEQW(a, b) pcmpeqw P_ARG2(a, b)
+#define PCMPEQD(a, b) pcmpeqd P_ARG2(a, b)
+
+#define PCMPGTB(a, b) pcmpgtb P_ARG2(a, b)
+#define PCMPGTW(a, b) pcmpgtw P_ARG2(a, b)
+#define PCMPGTD(a, b) pcmpgtd P_ARG2(a, b)
+
+#define PMULHW(a, b) pmulhw P_ARG2(a, b)
+#define PMULLW(a, b) pmullw P_ARG2(a, b)
+
+#define PMADDWD(a, b) pmaddwd P_ARG2(a, b)
+
+#define PAND(a, b) pand P_ARG2(a, b)
+
+#define PANDN(a, b) pandn P_ARG2(a, b)
+
+#define POR(a, b) por P_ARG2(a, b)
+
+#define PXOR(a, b) pxor P_ARG2(a, b)
+
+#define PSRAW(a, b) psraw P_ARG2(a, b)
+#define PSRAD(a, b) psrad P_ARG2(a, b)
+
+#define PSRLW(a, b) psrlw P_ARG2(a, b)
+#define PSRLD(a, b) psrld P_ARG2(a, b)
+#define PSRLQ(a, b) psrlq P_ARG2(a, b)
+
+#define PSLLW(a, b) psllw P_ARG2(a, b)
+#define PSLLD(a, b) pslld P_ARG2(a, b)
+#define PSLLQ(a, b) psllq P_ARG2(a, b)
+
+#define PACKSSWB(a, b) packsswb P_ARG2(a, b)
+#define PACKSSDW(a, b) packssdw P_ARG2(a, b)
+#define PACKUSWB(a, b) packuswb P_ARG2(a, b)
+
+#define PUNPCKHBW(a, b) punpckhbw P_ARG2(a, b)
+#define PUNPCKHWD(a, b) punpckhwd P_ARG2(a, b)
+#define PUNPCKHDQ(a, b) punpckhdq P_ARG2(a, b)
+#define PUNPCKLBW(a, b) punpcklbw P_ARG2(a, b)
+#define PUNPCKLWD(a, b) punpcklwd P_ARG2(a, b)
+#define PUNPCKLDQ(a, b) punpckldq P_ARG2(a, b)
+
+#define EMMS emms
+
+/* AMD 3DNow! */
+#define PAVGUSB(a, b) pavgusb P_ARG2(a, b)
+#define PFADD(a, b) pfadd P_ARG2(a, b)
+#define PFSUB(a, b) pfsub P_ARG2(a, b)
+#define PFSUBR(a, b) pfsubr P_ARG2(a, b)
+#define PFACC(a, b) pfacc P_ARG2(a, b)
+#define PFCMPGE(a, b) pfcmpge P_ARG2(a, b)
+#define PFCMPGT(a, b) pfcmpgt P_ARG2(a, b)
+#define PFCMPEQ(a, b) pfcmpeq P_ARG2(a, b)
+#define PFMIN(a, b) pfmin P_ARG2(a, b)
+#define PFMAX(a, b) pfmax P_ARG2(a, b)
+#define PI2FD(a, b) pi2fd P_ARG2(a, b)
+#define PF2ID(a, b) pf2id P_ARG2(a, b)
+#define PFRCP(a, b) pfrcp P_ARG2(a, b)
+#define PFRSQRT(a, b) pfrsqrt P_ARG2(a, b)
+#define PFMUL(a, b) pfmul P_ARG2(a, b)
+#define PFRCPIT1(a, b) pfrcpit1 P_ARG2(a, b)
+#define PFRSQIT1(a, b) pfrsqit1 P_ARG2(a, b)
+#define PFRCPIT2(a, b) pfrcpit2 P_ARG2(a, b)
+#define PMULHRW(a, b) pmulhrw P_ARG2(a, b)
+
+#define FEMMS femms
+#define PREFETCH(a) prefetch P_ARG1(a)
+#define PREFETCHW(a) prefetchw P_ARG1(a)
+
+/* Intel SSE */
+#define ADDPS(a, b) addps P_ARG2(a, b)
+#define ADDSS(a, b) addss P_ARG2(a, b)
+#define ANDNPS(a, b) andnps P_ARG2(a, b)
+#define ANDPS(a, b) andps P_ARG2(a, b)
+/* NASM only knows the pseudo ops for these.
+#define CMPPS(a, b, c) cmpps P_ARG3(a, b, c)
+#define CMPSS(a, b, c) cmpss P_ARG3(a, b, c)
+*/
+#define CMPEQPS(a, b) cmpeqps P_ARG2(a, b)
+#define CMPLTPS(a, b) cmpltps P_ARG2(a, b)
+#define CMPLEPS(a, b) cmpleps P_ARG2(a, b)
+#define CMPUNORDPS(a, b) cmpunordps P_ARG2(a, b)
+#define CMPNEQPS(a, b) cmpneqps P_ARG2(a, b)
+#define CMPNLTPS(a, b) cmpnltps P_ARG2(a, b)
+#define CMPNLEPS(a, b) cmpnleps P_ARG2(a, b)
+#define CMPORDPS(a, b) cmpordps P_ARG2(a, b)
+#define CMPEQSS(a, b) cmpeqss P_ARG2(a, b)
+#define CMPLTSS(a, b) cmpltss P_ARG2(a, b)
+#define CMPLESS(a, b) cmpless P_ARG2(a, b)
+#define CMPUNORDSS(a, b) cmpunordss P_ARG2(a, b)
+#define CMPNEQSS(a, b) cmpneqss P_ARG2(a, b)
+#define CMPNLTSS(a, b) cmpnltss P_ARG2(a, b)
+#define CMPNLESS(a, b) cmpnless P_ARG2(a, b)
+#define CMPORDSS(a, b) cmpordss P_ARG2(a, b)
+#define COMISS(a, b) comiss P_ARG2(a, b)
+#define CVTPI2PS(a, b) cvtpi2ps P_ARG2(a, b)
+#define CVTPS2PI(a, b) cvtps2pi P_ARG2(a, b)
+#define CVTSI2SS(a, b) cvtsi2ss P_ARG2(a, b)
+#define CVTSS2SI(a, b) cvtss2si P_ARG2(a, b)
+#define CVTTPS2PI(a, b) cvttps2pi P_ARG2(a, b)
+#define CVTTSS2SI(a, b) cvttss2si P_ARG2(a, b)
+#define DIVPS(a, b) divps P_ARG2(a, b)
+#define DIVSS(a, b) divss P_ARG2(a, b)
+#define FXRSTOR(a) fxrstor P_ARG1(a)
+#define FXSAVE(a) fxsave P_ARG1(a)
+#define LDMXCSR(a) ldmxcsr P_ARG1(a)
+#define MAXPS(a, b) maxps P_ARG2(a, b)
+#define MAXSS(a, b) maxss P_ARG2(a, b)
+#define MINPS(a, b) minps P_ARG2(a, b)
+#define MINSS(a, b) minss P_ARG2(a, b)
+#define MOVAPS(a, b) movaps P_ARG2(a, b)
+#define MOVHLPS(a, b) movhlps P_ARG2(a, b)
+#define MOVHPS(a, b) movhps P_ARG2(a, b)
+#define MOVLHPS(a, b) movlhps P_ARG2(a, b)
+#define MOVLPS(a, b) movlps P_ARG2(a, b)
+#define MOVMSKPS(a, b) movmskps P_ARG2(a, b)
+#define MOVNTPS(a, b) movntps P_ARG2(a, b)
+#define MOVNTQ(a, b) movntq P_ARG2(a, b)
+#define MOVSS(a, b) movss P_ARG2(a, b)
+#define MOVUPS(a, b) movups P_ARG2(a, b)
+#define MULPS(a, b) mulps P_ARG2(a, b)
+#define MULSS(a, b) mulss P_ARG2(a, b)
+#define ORPS(a, b) orps P_ARG2(a, b)
+#define RCPPS(a, b) rcpps P_ARG2(a, b)
+#define RCPSS(a, b) rcpss P_ARG2(a, b)
+#define RSQRTPS(a, b) rsqrtps P_ARG2(a, b)
+#define RSQRTSS(a, b) rsqrtss P_ARG2(a, b)
+#define SHUFPS(a, b, c) shufps P_ARG3(a, b, c)
+#define SQRTPS(a, b) sqrtps P_ARG2(a, b)
+#define SQRTSS(a, b) sqrtss P_ARG2(a, b)
+#define STMXCSR(a) stmxcsr P_ARG1(a)
+#define SUBPS(a, b) subps P_ARG2(a, b)
+#define UCOMISS(a, b) ucomiss P_ARG2(a, b)
+#define UNPCKHPS(a, b) unpckhps P_ARG2(a, b)
+#define UNPCKLPS(a, b) unpcklps P_ARG2(a, b)
+#define XORPS(a, b) xorps P_ARG2(a, b)
+
+#define PREFETCHNTA(a) prefetchnta P_ARG1(a)
+#define PREFETCHT0(a) prefetcht0 P_ARG1(a)
+#define PREFETCHT1(a) prefetcht1 P_ARG1(a)
+#define PREFETCHT2(a) prefetcht2 P_ARG1(a)
+#define SFENCE sfence
+
+/* Added by BrianP for FreeBSD (per David Dawes) */
+#if !defined(NASM_ASSEMBLER) && !defined(MASM_ASSEMBLER) && !defined(__bsdi__)
+#define LLBL(a) CONCAT(.L,a)
+#define LLBL2(a,b) CONCAT3(.L,a,b)
+#else
+#define LLBL(a) a
+#define LLBL2(a,b) CONCAT(a,b)
+#endif
+
+/* Segment overrides */
+#define SEGCS D_BYTE 46
+#define SEGDS D_BYTE 62
+#define SEGES D_BYTE 38
+#define SEGFS D_BYTE 100
+#define SEGGS D_BYTE 101
+
+/* Temporary labels: valid until next non-local label */
+#ifdef NASM_ASSEMBLER
+#define TLBL(a) CONCAT(.,a)
+#else
+#define TLBL(a) CONCAT(a,$)
+#endif
+
+/* Hidden symbol visibility support.
+ * If we build with gcc's -fvisibility=hidden flag, we'll need to change
+ * the symbol visibility mode to 'default'.
+ */
+#if defined(GNU_ASSEMBLER) && !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__)
+# define HIDDEN(x) .hidden x
+#elif defined(__GNUC__) && !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__)
+# pragma GCC visibility push(default)
+# define HIDDEN(x) .hidden x
+#else
+# define HIDDEN(x)
+#endif
+
+#endif /* __ASSYNTAX_H__ */
diff --git a/src/arch/x86/clip_args.h b/src/arch/x86/clip_args.h
new file mode 100644
index 0000000..796611f
--- /dev/null
+++ b/src/arch/x86/clip_args.h
@@ -0,0 +1,59 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Clip test function interface for assembly code. Simply define
+ * FRAME_OFFSET to the number of bytes pushed onto the stack before
+ * using the ARG_* argument macros.
+ *
+ * Gareth Hughes
+ */
+
+#ifndef __CLIP_ARGS_H__
+#define __CLIP_ARGS_H__
+
+/*
+ * Offsets for clip_func arguments
+ *
+ * typedef GLvector4f *(*clip_func)( GLvector4f *clip_vec,
+ * GLvector4f *proj_vec,
+ * GLubyte clipMask[],
+ * GLubyte *orMask,
+ * GLubyte *andMask );
+ */
+
+#define OFFSET_SOURCE 4
+#define OFFSET_DEST 8
+#define OFFSET_CLIP 12
+#define OFFSET_OR 16
+#define OFFSET_AND 20
+
+#define ARG_SOURCE REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP)
+#define ARG_DEST REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP)
+#define ARG_CLIP REGOFF(FRAME_OFFSET+OFFSET_CLIP, ESP)
+#define ARG_OR REGOFF(FRAME_OFFSET+OFFSET_OR, ESP)
+#define ARG_AND REGOFF(FRAME_OFFSET+OFFSET_AND, ESP)
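+
+/* For example (illustrative values, not from the original sources): a
+ * routine that pushes two registers before using these macros would
+ * define FRAME_OFFSET to 8, so that ARG_SOURCE resolves to
+ * REGOFF(12, ESP):
+ *
+ *    #define FRAME_OFFSET 8
+ *    MOV_L( ARG_SOURCE, ESI )
+ */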
+
+#endif
diff --git a/src/arch/x86/common_x86.c b/src/arch/x86/common_x86.c
new file mode 100644
index 0000000..6299507
--- /dev/null
+++ b/src/arch/x86/common_x86.c
@@ -0,0 +1,336 @@
+/*
+ * Mesa 3-D graphics library
+ * Version: 6.5.1
+ *
+ * Copyright (C) 1999-2006 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file common_x86.c
+ *
+ * Check CPU capabilities & initialize optimized functions for this particular
+ * processor.
+ *
+ * Changed by Andre Werthmann to use the new SSE functions.
+ *
+ * \author Holger Waechtler <holger@akaflieg.extern.tu-berlin.de>
+ * \author Andre Werthmann <wertmann@cs.uni-potsdam.de>
+ */
+
+/* XXX these includes should probably go into imports.h or glheader.h */
+#if defined(USE_SSE_ASM) && defined(__linux__)
+#include <linux/version.h>
+#endif
+#if defined(USE_SSE_ASM) && defined(__FreeBSD__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+#if defined(USE_SSE_ASM) && defined(__OpenBSD__)
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+#endif
+
+#include "main/imports.h"
+#include "common_x86_asm.h"
+
+
+/** Bitmask of X86_FEATURE_x bits */
+int _mesa_x86_cpu_features = 0x0;
+
+static int detection_debug = GL_FALSE;
+
+/* No reason for this to be public.
+ */
+extern GLuint _ASMAPI _mesa_x86_has_cpuid(void);
+extern void _ASMAPI _mesa_x86_cpuid(GLuint op, GLuint *reg_eax, GLuint *reg_ebx, GLuint *reg_ecx, GLuint *reg_edx);
+extern GLuint _ASMAPI _mesa_x86_cpuid_eax(GLuint op);
+extern GLuint _ASMAPI _mesa_x86_cpuid_ebx(GLuint op);
+extern GLuint _ASMAPI _mesa_x86_cpuid_ecx(GLuint op);
+extern GLuint _ASMAPI _mesa_x86_cpuid_edx(GLuint op);
+
+
+#if defined(USE_SSE_ASM)
+/*
+ * We must verify that the Streaming SIMD Extensions are truly supported
+ * on this processor before we go ahead and hook out the optimized code.
+ *
+ * However, I have been told by Alan Cox that all 2.4 (and later) Linux
+ * kernels provide full SSE support on all processors that expose SSE via
+ * the CPUID mechanism.
+ */
+
+/* These are assembly functions: */
+extern void _mesa_test_os_sse_support( void );
+extern void _mesa_test_os_sse_exception_support( void );
+
+
+#if defined(_WIN32)
+#ifndef STATUS_FLOAT_MULTIPLE_TRAPS
+# define STATUS_FLOAT_MULTIPLE_TRAPS (0xC00002B5L)
+#endif
+static LONG WINAPI ExceptionFilter(LPEXCEPTION_POINTERS exp)
+{
+ PEXCEPTION_RECORD rec = exp->ExceptionRecord;
+ PCONTEXT ctx = exp->ContextRecord;
+
+ if ( rec->ExceptionCode == EXCEPTION_ILLEGAL_INSTRUCTION ) {
+ _mesa_debug(NULL, "EXCEPTION_ILLEGAL_INSTRUCTION\n" );
+ _mesa_x86_cpu_features &= ~(X86_FEATURE_XMM);
+ } else if ( rec->ExceptionCode == STATUS_FLOAT_MULTIPLE_TRAPS ) {
+ _mesa_debug(NULL, "STATUS_FLOAT_MULTIPLE_TRAPS\n");
+      /* Windows seems to clear the exception flag itself; we just have to increment Eip */
+ } else {
+      _mesa_debug(NULL, "UNEXPECTED EXCEPTION (0x%08x), terminating!\n", rec->ExceptionCode );
+ return EXCEPTION_EXECUTE_HANDLER;
+ }
+
+ if ( (ctx->ContextFlags & CONTEXT_CONTROL) != CONTEXT_CONTROL ) {
+ _mesa_debug(NULL, "Context does not contain control registers, terminating!\n");
+ return EXCEPTION_EXECUTE_HANDLER;
+ }
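+   /* Skip the faulting SSE instruction: the DIVPS issued by
+    * _mesa_test_os_sse_exception_support() is assumed to encode to
+    * 3 bytes.
+    */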
+ ctx->Eip += 3;
+
+ return EXCEPTION_CONTINUE_EXECUTION;
+}
+#endif /* _WIN32 */
+
+
+/**
+ * Check if SSE is supported.
+ * If not, turn off the X86_FEATURE_XMM flag in _mesa_x86_cpu_features.
+ */
+void _mesa_check_os_sse_support( void )
+{
+#if defined(__FreeBSD__)
+ {
+ int ret, enabled;
+ unsigned int len;
+ len = sizeof(enabled);
+ ret = sysctlbyname("hw.instruction_sse", &enabled, &len, NULL, 0);
+ if (ret || !enabled)
+ _mesa_x86_cpu_features &= ~(X86_FEATURE_XMM);
+ }
+#elif defined (__NetBSD__)
+ {
+ int ret, enabled;
+ size_t len = sizeof(enabled);
+ ret = sysctlbyname("machdep.sse", &enabled, &len, (void *)NULL, 0);
+ if (ret || !enabled)
+ _mesa_x86_cpu_features &= ~(X86_FEATURE_XMM);
+ }
+#elif defined(__OpenBSD__)
+ {
+ int mib[2];
+ int ret, enabled;
+ size_t len = sizeof(enabled);
+
+ mib[0] = CTL_MACHDEP;
+ mib[1] = CPU_SSE;
+
+ ret = sysctl(mib, 2, &enabled, &len, NULL, 0);
+ if (ret || !enabled)
+ _mesa_x86_cpu_features &= ~(X86_FEATURE_XMM);
+ }
+#elif defined(_WIN32)
+ LPTOP_LEVEL_EXCEPTION_FILTER oldFilter;
+
+ /* Install our ExceptionFilter */
+ oldFilter = SetUnhandledExceptionFilter( ExceptionFilter );
+
+ if ( cpu_has_xmm ) {
+ _mesa_debug(NULL, "Testing OS support for SSE...\n");
+
+ _mesa_test_os_sse_support();
+
+ if ( cpu_has_xmm ) {
+ _mesa_debug(NULL, "Yes.\n");
+ } else {
+ _mesa_debug(NULL, "No!\n");
+ }
+ }
+
+ if ( cpu_has_xmm ) {
+ _mesa_debug(NULL, "Testing OS support for SSE unmasked exceptions...\n");
+
+ _mesa_test_os_sse_exception_support();
+
+ if ( cpu_has_xmm ) {
+ _mesa_debug(NULL, "Yes.\n");
+ } else {
+ _mesa_debug(NULL, "No!\n");
+ }
+ }
+
+ /* Restore previous exception filter */
+ SetUnhandledExceptionFilter( oldFilter );
+
+ if ( cpu_has_xmm ) {
+ _mesa_debug(NULL, "Tests of OS support for SSE passed.\n");
+ } else {
+ _mesa_debug(NULL, "Tests of OS support for SSE failed!\n");
+ }
+#else
+ /* Do nothing on other platforms for now.
+ */
+ if (detection_debug)
+ _mesa_debug(NULL, "Not testing OS support for SSE, leaving enabled.\n");
+#endif /* __FreeBSD__ */
+}
+
+#endif /* USE_SSE_ASM */
+
+
+/**
+ * Initialize the _mesa_x86_cpu_features bitfield.
+ * This is a no-op if called more than once.
+ */
+void
+_mesa_get_x86_features(void)
+{
+ static int called = 0;
+
+ if (called)
+ return;
+
+ called = 1;
+
+#ifdef USE_X86_ASM
+ _mesa_x86_cpu_features = 0x0;
+
+ if (_mesa_getenv( "MESA_NO_ASM")) {
+ return;
+ }
+
+ if (!_mesa_x86_has_cpuid()) {
+ _mesa_debug(NULL, "CPUID not detected\n");
+ }
+ else {
+ GLuint cpu_features;
+ GLuint cpu_ext_features;
+ GLuint cpu_ext_info;
+ char cpu_vendor[13];
+ GLuint result;
+
+ /* get vendor name */
+ _mesa_x86_cpuid(0, &result, (GLuint *)(cpu_vendor + 0), (GLuint *)(cpu_vendor + 8), (GLuint *)(cpu_vendor + 4));
+ cpu_vendor[12] = '\0';
+
+ if (detection_debug)
+ _mesa_debug(NULL, "CPU vendor: %s\n", cpu_vendor);
+
+ /* get cpu features */
+ cpu_features = _mesa_x86_cpuid_edx(1);
+
+ if (cpu_features & X86_CPU_FPU)
+ _mesa_x86_cpu_features |= X86_FEATURE_FPU;
+ if (cpu_features & X86_CPU_CMOV)
+ _mesa_x86_cpu_features |= X86_FEATURE_CMOV;
+
+#ifdef USE_MMX_ASM
+ if (cpu_features & X86_CPU_MMX)
+ _mesa_x86_cpu_features |= X86_FEATURE_MMX;
+#endif
+
+#ifdef USE_SSE_ASM
+ if (cpu_features & X86_CPU_XMM)
+ _mesa_x86_cpu_features |= X86_FEATURE_XMM;
+ if (cpu_features & X86_CPU_XMM2)
+ _mesa_x86_cpu_features |= X86_FEATURE_XMM2;
+#endif
+
+ /* query extended cpu features */
+ if ((cpu_ext_info = _mesa_x86_cpuid_eax(0x80000000)) > 0x80000000) {
+ if (cpu_ext_info >= 0x80000001) {
+
+ cpu_ext_features = _mesa_x86_cpuid_edx(0x80000001);
+
+ if (cpu_features & X86_CPU_MMX) {
+
+#ifdef USE_3DNOW_ASM
+ if (cpu_ext_features & X86_CPUEXT_3DNOW)
+ _mesa_x86_cpu_features |= X86_FEATURE_3DNOW;
+ if (cpu_ext_features & X86_CPUEXT_3DNOW_EXT)
+ _mesa_x86_cpu_features |= X86_FEATURE_3DNOWEXT;
+#endif
+
+#ifdef USE_MMX_ASM
+ if (cpu_ext_features & X86_CPUEXT_MMX_EXT)
+ _mesa_x86_cpu_features |= X86_FEATURE_MMXEXT;
+#endif
+ }
+ }
+
+ /* query cpu name */
+ if (cpu_ext_info >= 0x80000002) {
+ GLuint ofs;
+ char cpu_name[49];
+ for (ofs = 0; ofs < 3; ofs++)
+ _mesa_x86_cpuid(0x80000002+ofs, (GLuint *)(cpu_name + (16*ofs)+0), (GLuint *)(cpu_name + (16*ofs)+4), (GLuint *)(cpu_name + (16*ofs)+8), (GLuint *)(cpu_name + (16*ofs)+12));
+ cpu_name[48] = '\0'; /* the name should be NULL terminated, but just to be sure */
+
+ if (detection_debug)
+ _mesa_debug(NULL, "CPU name: %s\n", cpu_name);
+ }
+ }
+
+ }
+
+#ifdef USE_MMX_ASM
+ if ( cpu_has_mmx ) {
+ if ( _mesa_getenv( "MESA_NO_MMX" ) == 0 ) {
+ if (detection_debug)
+ _mesa_debug(NULL, "MMX cpu detected.\n");
+ } else {
+ _mesa_x86_cpu_features &= ~(X86_FEATURE_MMX);
+ }
+ }
+#endif
+
+#ifdef USE_3DNOW_ASM
+ if ( cpu_has_3dnow ) {
+ if ( _mesa_getenv( "MESA_NO_3DNOW" ) == 0 ) {
+ if (detection_debug)
+ _mesa_debug(NULL, "3DNow! cpu detected.\n");
+ } else {
+ _mesa_x86_cpu_features &= ~(X86_FEATURE_3DNOW);
+ }
+ }
+#endif
+
+#ifdef USE_SSE_ASM
+ if ( cpu_has_xmm ) {
+ if ( _mesa_getenv( "MESA_NO_SSE" ) == 0 ) {
+ if (detection_debug)
+ _mesa_debug(NULL, "SSE cpu detected.\n");
+ if ( _mesa_getenv( "MESA_FORCE_SSE" ) == 0 ) {
+ _mesa_check_os_sse_support();
+ }
+ } else {
+ _mesa_debug(NULL, "SSE cpu detected, but switched off by user.\n");
+ _mesa_x86_cpu_features &= ~(X86_FEATURE_XMM);
+ }
+ }
+#endif
+
+#endif /* USE_X86_ASM */
+
+ (void) detection_debug;
+}
diff --git a/src/arch/x86/common_x86_asm.S b/src/arch/x86/common_x86_asm.S
new file mode 100644
index 0000000..ea4047a
--- /dev/null
+++ b/src/arch/x86/common_x86_asm.S
@@ -0,0 +1,220 @@
+/*
+ * Mesa 3-D graphics library
+ * Version: 6.3
+ *
+ * Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Check extended CPU capabilities. Now just returns the raw CPUID
+ * feature information, allowing the higher level code to interpret the
+ * results.
+ *
+ * Written by Holger Waechtler <holger@akaflieg.extern.tu-berlin.de>
+ *
+ * Cleaned up and simplified by Gareth Hughes <gareth@valinux.com>
+ *
+ */
+
+/*
+ * NOTE: Avoid using spaces in between '(' ')' and arguments, especially
+ * with macros like CONST, LLBL that expand to CONCAT(...). Putting spaces
+ * in there will break the build on some platforms.
+ */
+
+#include "matypes.h"
+#include "assyntax.h"
+#include "common_x86_features.h"
+
+ SEG_TEXT
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_x86_has_cpuid)
+HIDDEN(_mesa_x86_has_cpuid)
+GLNAME(_mesa_x86_has_cpuid):
+
+ /* Test for the CPUID command. If the ID Flag bit in EFLAGS
+ * (bit 21) is writable, the CPUID command is present */
+ PUSHF_L
+ POP_L (EAX)
+ MOV_L (EAX, ECX)
+ XOR_L (CONST(0x00200000), EAX)
+ PUSH_L (EAX)
+ POPF_L
+ PUSHF_L
+ POP_L (EAX)
+
+ /* Verify the ID Flag bit has been written. */
+ CMP_L (ECX, EAX)
+ SETNE (AL)
+ XOR_L (CONST(0xff), EAX)
+
+ RET
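+
+/* For reference, a rough C-level equivalent on compilers that provide
+ * <cpuid.h> (a sketch only; Mesa uses the assembly above):
+ *
+ *    unsigned int eax, ebx, ecx, edx;
+ *    int have_cpuid = __get_cpuid(0, &eax, &ebx, &ecx, &edx);
+ */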
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_x86_cpuid)
+HIDDEN(_mesa_x86_cpuid)
+GLNAME(_mesa_x86_cpuid):
+
+ MOV_L (REGOFF(4, ESP), EAX) /* cpuid op */
+ PUSH_L (EDI)
+ PUSH_L (EBX)
+
+ CPUID
+
+ MOV_L (REGOFF(16, ESP), EDI) /* *eax */
+ MOV_L (EAX, REGIND(EDI))
+ MOV_L (REGOFF(20, ESP), EDI) /* *ebx */
+ MOV_L (EBX, REGIND(EDI))
+ MOV_L (REGOFF(24, ESP), EDI) /* *ecx */
+ MOV_L (ECX, REGIND(EDI))
+ MOV_L (REGOFF(28, ESP), EDI) /* *edx */
+ MOV_L (EDX, REGIND(EDI))
+
+ POP_L (EBX)
+ POP_L (EDI)
+ RET
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_x86_cpuid_eax)
+HIDDEN(_mesa_x86_cpuid_eax)
+GLNAME(_mesa_x86_cpuid_eax):
+
+ MOV_L (REGOFF(4, ESP), EAX) /* cpuid op */
+ PUSH_L (EBX)
+
+ CPUID
+
+ POP_L (EBX)
+ RET
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_x86_cpuid_ebx)
+HIDDEN(_mesa_x86_cpuid_ebx)
+GLNAME(_mesa_x86_cpuid_ebx):
+
+ MOV_L (REGOFF(4, ESP), EAX) /* cpuid op */
+ PUSH_L (EBX)
+
+ CPUID
+ MOV_L (EBX, EAX) /* return EBX */
+
+ POP_L (EBX)
+ RET
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_x86_cpuid_ecx)
+HIDDEN(_mesa_x86_cpuid_ecx)
+GLNAME(_mesa_x86_cpuid_ecx):
+
+ MOV_L (REGOFF(4, ESP), EAX) /* cpuid op */
+ PUSH_L (EBX)
+
+ CPUID
+ MOV_L (ECX, EAX) /* return ECX */
+
+ POP_L (EBX)
+ RET
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_x86_cpuid_edx)
+HIDDEN(_mesa_x86_cpuid_edx)
+GLNAME(_mesa_x86_cpuid_edx):
+
+ MOV_L (REGOFF(4, ESP), EAX) /* cpuid op */
+ PUSH_L (EBX)
+
+ CPUID
+ MOV_L (EDX, EAX) /* return EDX */
+
+ POP_L (EBX)
+ RET
+
+#ifdef USE_SSE_ASM
+/* Execute an SSE instruction to see if the operating system correctly
+ * supports SSE. A signal handler for SIGILL should have been set
+ * before calling this function, otherwise this could kill the client
+ * application.
+ *
+ * -----> !!!! ATTENTION DEVELOPERS !!!! <-----
+ *
+ * If you're debugging with gdb and you get stopped in this function,
+ * just type 'continue'! Execution will proceed normally.
+ * See freedesktop.org bug #1709 for more info.
+ */
+ALIGNTEXT4
+GLOBL GLNAME( _mesa_test_os_sse_support )
+HIDDEN(_mesa_test_os_sse_support)
+GLNAME( _mesa_test_os_sse_support ):
+
+ XORPS ( XMM0, XMM0 )
+
+ RET
+
+
+/* Perform an SSE divide-by-zero to see if the operating system
+ * correctly supports unmasked SIMD FPU exceptions. Signal handlers for
+ * SIGILL and SIGFPE should have been set before calling this function,
+ * otherwise this could kill the client application.
+ */
+ALIGNTEXT4
+GLOBL GLNAME( _mesa_test_os_sse_exception_support )
+HIDDEN(_mesa_test_os_sse_exception_support)
+GLNAME( _mesa_test_os_sse_exception_support ):
+
+ PUSH_L ( EBP )
+ MOV_L ( ESP, EBP )
+ SUB_L ( CONST( 8 ), ESP )
+
+ /* Save the original MXCSR register value.
+ */
+ STMXCSR ( REGOFF( -4, EBP ) )
+
+ /* Unmask the divide-by-zero exception and perform one.
+ */
+ STMXCSR ( REGOFF( -8, EBP ) )
+ AND_L ( CONST( 0xfffffdff ), REGOFF( -8, EBP ) )
+ LDMXCSR ( REGOFF( -8, EBP ) )
+
+ XORPS ( XMM0, XMM0 )
+
+ PUSH_L ( CONST( 0x3f800000 ) )
+ PUSH_L ( CONST( 0x3f800000 ) )
+ PUSH_L ( CONST( 0x3f800000 ) )
+ PUSH_L ( CONST( 0x3f800000 ) )
+
+ MOVUPS ( REGIND( ESP ), XMM1 )
+
+ DIVPS ( XMM0, XMM1 )
+
+ /* Restore the original MXCSR register value.
+ */
+ LDMXCSR ( REGOFF( -4, EBP ) )
+
+ LEAVE
+ RET
+
+#endif
+
+
+#if defined (__ELF__) && defined (__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/arch/x86/common_x86_asm.h b/src/arch/x86/common_x86_asm.h
new file mode 100644
index 0000000..0d39e3d
--- /dev/null
+++ b/src/arch/x86/common_x86_asm.h
@@ -0,0 +1,53 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Check CPU capabilities & initialize optimized functions for this particular
+ * processor.
+ *
+ * Written by Holger Waechtler <holger@akaflieg.extern.tu-berlin.de>
+ * Changed by Andre Werthmann <wertmann@cs.uni-potsdam.de> to use the
+ * new SSE functions.
+ *
+ * Reimplemented by Gareth Hughes in a more
+ * future-proof manner, based on code in the Linux kernel.
+ */
+
+#ifndef __COMMON_X86_ASM_H__
+#define __COMMON_X86_ASM_H__
+
+/* Do not reference mtypes.h from this file.
+ */
+#include "common_x86_features.h"
+
+extern int _mesa_x86_cpu_features;
+
+extern void _mesa_get_x86_features(void);
+
+extern void _mesa_check_os_sse_support(void);
+
+extern void _mesa_init_all_x86_transform_asm( void );
+
+#endif
diff --git a/src/arch/x86/common_x86_features.h b/src/arch/x86/common_x86_features.h
new file mode 100644
index 0000000..676af8c
--- /dev/null
+++ b/src/arch/x86/common_x86_features.h
@@ -0,0 +1,67 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 5.1
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * x86 CPUID feature information. The raw data is returned by
+ * _mesa_get_x86_features() and interpreted with the cpu_has_*
+ * helper macros.
+ *
+ * Gareth Hughes
+ */
+
+#ifndef __COMMON_X86_FEATURES_H__
+#define __COMMON_X86_FEATURES_H__
+
+#define X86_FEATURE_FPU (1<<0)
+#define X86_FEATURE_CMOV (1<<1)
+#define X86_FEATURE_MMXEXT (1<<2)
+#define X86_FEATURE_MMX (1<<3)
+#define X86_FEATURE_FXSR (1<<4)
+#define X86_FEATURE_XMM (1<<5)
+#define X86_FEATURE_XMM2 (1<<6)
+#define X86_FEATURE_3DNOWEXT (1<<7)
+#define X86_FEATURE_3DNOW (1<<8)
+
+/* standard X86 CPU features */
+#define X86_CPU_FPU (1<<0)
+#define X86_CPU_CMOV (1<<15)
+#define X86_CPU_MMX (1<<23)
+#define X86_CPU_XMM (1<<25)
+#define X86_CPU_XMM2 (1<<26)
+
+/* extended X86 CPU features */
+#define X86_CPUEXT_MMX_EXT (1<<22)
+#define X86_CPUEXT_3DNOW_EXT (1<<30)
+#define X86_CPUEXT_3DNOW (1<<31)
+
+#define cpu_has_mmx (_mesa_x86_cpu_features & X86_FEATURE_MMX)
+#define cpu_has_mmxext (_mesa_x86_cpu_features & X86_FEATURE_MMXEXT)
+#define cpu_has_xmm (_mesa_x86_cpu_features & X86_FEATURE_XMM)
+#define cpu_has_xmm2 (_mesa_x86_cpu_features & X86_FEATURE_XMM2)
+#define cpu_has_3dnow (_mesa_x86_cpu_features & X86_FEATURE_3DNOW)
+#define cpu_has_3dnowext (_mesa_x86_cpu_features & X86_FEATURE_3DNOWEXT)
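+
+/* Typical usage of these helpers (illustrative):
+ *
+ *    _mesa_get_x86_features();
+ *    if (cpu_has_xmm) {
+ *       ... install SSE code paths ...
+ *    }
+ */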
+
+#endif
+
diff --git a/src/arch/x86/gen_matypes.c b/src/arch/x86/gen_matypes.c
new file mode 100644
index 0000000..61f181c
--- /dev/null
+++ b/src/arch/x86/gen_matypes.c
@@ -0,0 +1,240 @@
+/*
+ * Mesa 3-D graphics library
+ * Version: 6.5.1
+ *
+ * Copyright (C) 1999-2006 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Gareth Hughes
+ */
+
+/*
+ * This generates an asm version of mtypes.h (called matypes.h), so that
+ * Mesa's x86 assembly code can access the internal structures easily.
+ * This will be particularly useful when developing new x86 asm code for
+ * Mesa, including lighting, clipping, texture image conversion etc.
+ */
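+
+/* The generated header consists of plain #define lines, e.g. (offsets
+ * here are illustrative only; the real values depend on the build):
+ *
+ *    #define V4F_START 4
+ *    #define MATRIX_INV 64
+ */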
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+#include <inttypes.h>
+
+#include "main/glheader.h"
+#include "main/mtypes.h"
+#include "tnl/t_context.h"
+
+
+#undef offsetof
+#define offsetof( type, member ) ((size_t) &((type *)0)->member)
+
+
+#define OFFSET_HEADER( x ) \
+do { \
+ printf( "\n" ); \
+ printf( "\n" ); \
+ printf( "/* =====================================================" \
+ "========\n" ); \
+ printf( " * Offsets for %s\n", x ); \
+ printf( " */\n" ); \
+ printf( "\n" ); \
+} while (0)
+
+#define DEFINE_HEADER( x ) \
+do { \
+ printf( "\n" ); \
+ printf( "/*\n" ); \
+ printf( " * Flags for %s\n", x ); \
+ printf( " */\n" ); \
+ printf( "\n" ); \
+} while (0)
+
+#define OFFSET( s, t, m ) \
+ printf( "#define %s\t%lu\n", s, (unsigned long) offsetof( t, m ) );
+
+#define SIZEOF( s, t ) \
+ printf( "#define %s\t%lu\n", s, (unsigned long) sizeof(t) );
+
+#define DEFINE( s, d ) \
+ printf( "#define %s\t0x%" PRIx64 "\n", s, (uint64_t) d );
+
+
+
+int main( int argc, char **argv )
+{
+ printf( "/*\n" );
+ printf( " * This file is automatically generated from the Mesa internal type\n" );
+ printf( " * definitions. Do not edit directly.\n" );
+ printf( " */\n" );
+ printf( "\n" );
+ printf( "#ifndef __ASM_TYPES_H__\n" );
+ printf( "#define __ASM_TYPES_H__\n" );
+ printf( "\n" );
+
+
+ /* struct gl_context offsets:
+ */
+ OFFSET_HEADER( "struct gl_context" );
+
+ printf( "\n" );
+ OFFSET( "CTX_LIGHT_ENABLED ", struct gl_context, Light.Enabled );
+ OFFSET( "CTX_LIGHT_SHADE_MODEL ", struct gl_context, Light.ShadeModel );
+ OFFSET( "CTX_LIGHT_COLOR_MAT_FACE ", struct gl_context, Light.ColorMaterialFace );
+ OFFSET( "CTX_LIGHT_COLOR_MAT_MODE ", struct gl_context, Light.ColorMaterialMode );
+ OFFSET( "CTX_LIGHT_COLOR_MAT_MASK ", struct gl_context, Light._ColorMaterialBitmask );
+ OFFSET( "CTX_LIGHT_COLOR_MAT_ENABLED ", struct gl_context, Light.ColorMaterialEnabled );
+ OFFSET( "CTX_LIGHT_ENABLED_LIST ", struct gl_context, Light.EnabledList );
+ OFFSET( "CTX_LIGHT_NEED_VERTS ", struct gl_context, Light._NeedVertices );
+ OFFSET( "CTX_LIGHT_BASE_COLOR ", struct gl_context, Light._BaseColor );
+
+
+ /* struct vertex_buffer offsets:
+ */
+ OFFSET_HEADER( "struct vertex_buffer" );
+
+ OFFSET( "VB_SIZE ", struct vertex_buffer, Size );
+ OFFSET( "VB_COUNT ", struct vertex_buffer, Count );
+ printf( "\n" );
+ OFFSET( "VB_ELTS ", struct vertex_buffer, Elts );
+ OFFSET( "VB_OBJ_PTR ", struct vertex_buffer, AttribPtr[_TNL_ATTRIB_POS] );
+ OFFSET( "VB_EYE_PTR ", struct vertex_buffer, EyePtr );
+ OFFSET( "VB_CLIP_PTR ", struct vertex_buffer, ClipPtr );
+ OFFSET( "VB_PROJ_CLIP_PTR ", struct vertex_buffer, NdcPtr );
+ OFFSET( "VB_CLIP_OR_MASK ", struct vertex_buffer, ClipOrMask );
+ OFFSET( "VB_CLIP_MASK ", struct vertex_buffer, ClipMask );
+ OFFSET( "VB_NORMAL_PTR ", struct vertex_buffer, AttribPtr[_TNL_ATTRIB_NORMAL] );
+ OFFSET( "VB_EDGE_FLAG ", struct vertex_buffer, EdgeFlag );
+ OFFSET( "VB_TEX0_COORD_PTR ", struct vertex_buffer, AttribPtr[_TNL_ATTRIB_TEX0] );
+ OFFSET( "VB_TEX1_COORD_PTR ", struct vertex_buffer, AttribPtr[_TNL_ATTRIB_TEX1] );
+ OFFSET( "VB_TEX2_COORD_PTR ", struct vertex_buffer, AttribPtr[_TNL_ATTRIB_TEX2] );
+ OFFSET( "VB_TEX3_COORD_PTR ", struct vertex_buffer, AttribPtr[_TNL_ATTRIB_TEX3] );
+ OFFSET( "VB_INDEX_PTR ", struct vertex_buffer, AttribPtr[_TNL_ATTRIB_COLOR_INDEX] );
+ OFFSET( "VB_COLOR_PTR ", struct vertex_buffer, AttribPtr[_TNL_ATTRIB_COLOR0] );
+ OFFSET( "VB_SECONDARY_COLOR_PTR ", struct vertex_buffer, AttribPtr[_TNL_ATTRIB_COLOR1] );
+ OFFSET( "VB_FOG_COORD_PTR ", struct vertex_buffer, AttribPtr[_TNL_ATTRIB_FOG] );
+ OFFSET( "VB_PRIMITIVE ", struct vertex_buffer, Primitive );
+ printf( "\n" );
+
+ DEFINE_HEADER( "struct vertex_buffer" );
+
+   /* XXX use new labels here someday after vertex program is done */
+ DEFINE( "VERT_BIT_OBJ ", VERT_BIT_POS );
+ DEFINE( "VERT_BIT_NORM ", VERT_BIT_NORMAL );
+ DEFINE( "VERT_BIT_RGBA ", VERT_BIT_COLOR0 );
+ DEFINE( "VERT_BIT_SPEC_RGB ", VERT_BIT_COLOR1 );
+ DEFINE( "VERT_BIT_FOG_COORD ", VERT_BIT_FOG );
+ DEFINE( "VERT_BIT_TEX0 ", VERT_BIT_TEX0 );
+ DEFINE( "VERT_BIT_TEX1 ", VERT_BIT_TEX1 );
+ DEFINE( "VERT_BIT_TEX2 ", VERT_BIT_TEX2 );
+ DEFINE( "VERT_BIT_TEX3 ", VERT_BIT_TEX3 );
+
+
+ /* GLvector4f offsets:
+ */
+ OFFSET_HEADER( "GLvector4f" );
+
+ OFFSET( "V4F_DATA ", GLvector4f, data );
+ OFFSET( "V4F_START ", GLvector4f, start );
+ OFFSET( "V4F_COUNT ", GLvector4f, count );
+ OFFSET( "V4F_STRIDE ", GLvector4f, stride );
+ OFFSET( "V4F_SIZE ", GLvector4f, size );
+ OFFSET( "V4F_FLAGS ", GLvector4f, flags );
+
+ DEFINE_HEADER( "GLvector4f" );
+
+ DEFINE( "VEC_MALLOC ", VEC_MALLOC );
+ DEFINE( "VEC_NOT_WRITEABLE ", VEC_NOT_WRITEABLE );
+ DEFINE( "VEC_BAD_STRIDE ", VEC_BAD_STRIDE );
+ printf( "\n" );
+ DEFINE( "VEC_SIZE_1 ", VEC_SIZE_1 );
+ DEFINE( "VEC_SIZE_2 ", VEC_SIZE_2 );
+ DEFINE( "VEC_SIZE_3 ", VEC_SIZE_3 );
+ DEFINE( "VEC_SIZE_4 ", VEC_SIZE_4 );
+
+
+ /* GLmatrix offsets:
+ */
+ OFFSET_HEADER( "GLmatrix" );
+
+ OFFSET( "MATRIX_DATA ", GLmatrix, m );
+ OFFSET( "MATRIX_INV ", GLmatrix, inv );
+ OFFSET( "MATRIX_FLAGS ", GLmatrix, flags );
+ OFFSET( "MATRIX_TYPE ", GLmatrix, type );
+
+
+ /* struct gl_light offsets:
+ */
+ OFFSET_HEADER( "struct gl_light" );
+
+ OFFSET( "LIGHT_NEXT ", struct gl_light, next );
+ OFFSET( "LIGHT_PREV ", struct gl_light, prev );
+ printf( "\n" );
+ OFFSET( "LIGHT_AMBIENT ", struct gl_light, Ambient );
+ OFFSET( "LIGHT_DIFFUSE ", struct gl_light, Diffuse );
+ OFFSET( "LIGHT_SPECULAR ", struct gl_light, Specular );
+ OFFSET( "LIGHT_EYE_POSITION ", struct gl_light, EyePosition );
+ OFFSET( "LIGHT_SPOT_DIRECTION ", struct gl_light, SpotDirection );
+ OFFSET( "LIGHT_SPOT_EXPONENT ", struct gl_light, SpotExponent );
+ OFFSET( "LIGHT_SPOT_CUTOFF ", struct gl_light, SpotCutoff );
+ OFFSET( "LIGHT_COS_CUTOFF ", struct gl_light, _CosCutoff );
+ OFFSET( "LIGHT_CONST_ATTEN ", struct gl_light, ConstantAttenuation );
+ OFFSET( "LIGHT_LINEAR_ATTEN ", struct gl_light, LinearAttenuation );
+ OFFSET( "LIGHT_QUADRATIC_ATTEN ", struct gl_light, QuadraticAttenuation );
+ OFFSET( "LIGHT_ENABLED ", struct gl_light, Enabled );
+ printf( "\n" );
+ OFFSET( "LIGHT_FLAGS ", struct gl_light, _Flags );
+ printf( "\n" );
+ OFFSET( "LIGHT_POSITION ", struct gl_light, _Position );
+ OFFSET( "LIGHT_VP_INF_NORM ", struct gl_light, _VP_inf_norm );
+ OFFSET( "LIGHT_H_INF_NORM ", struct gl_light, _h_inf_norm );
+ OFFSET( "LIGHT_NORM_DIRECTION ", struct gl_light, _NormSpotDirection );
+ OFFSET( "LIGHT_VP_INF_SPOT_ATTEN ", struct gl_light, _VP_inf_spot_attenuation );
+ printf( "\n" );
+ OFFSET( "LIGHT_MAT_AMBIENT ", struct gl_light, _MatAmbient );
+ OFFSET( "LIGHT_MAT_DIFFUSE ", struct gl_light, _MatDiffuse );
+ OFFSET( "LIGHT_MAT_SPECULAR ", struct gl_light, _MatSpecular );
+ printf( "\n" );
+ SIZEOF( "SIZEOF_GL_LIGHT ", struct gl_light );
+
+ DEFINE_HEADER( "struct gl_light" );
+
+ DEFINE( "LIGHT_SPOT ", LIGHT_SPOT );
+ DEFINE( "LIGHT_LOCAL_VIEWER ", LIGHT_LOCAL_VIEWER );
+ DEFINE( "LIGHT_POSITIONAL ", LIGHT_POSITIONAL );
+ printf( "\n" );
+ DEFINE( "LIGHT_NEED_VERTICES ", LIGHT_NEED_VERTICES );
+
+
+ /* struct gl_lightmodel offsets:
+ */
+ OFFSET_HEADER( "struct gl_lightmodel" );
+
+ OFFSET( "LIGHT_MODEL_AMBIENT ", struct gl_lightmodel, Ambient );
+ OFFSET( "LIGHT_MODEL_LOCAL_VIEWER ", struct gl_lightmodel, LocalViewer );
+ OFFSET( "LIGHT_MODEL_TWO_SIDE ", struct gl_lightmodel, TwoSide );
+ OFFSET( "LIGHT_MODEL_COLOR_CONTROL ", struct gl_lightmodel, ColorControl );
+
+
+ printf( "\n" );
+ printf( "\n" );
+ printf( "#endif /* __ASM_TYPES_H__ */\n" );
+
+ return 0;
+}
diff --git a/src/arch/x86/mmx.h b/src/arch/x86/mmx.h
new file mode 100644
index 0000000..74e9979
--- /dev/null
+++ b/src/arch/x86/mmx.h
@@ -0,0 +1,59 @@
+/*
+ * Mesa 3-D graphics library
+ * Version: 6.5.2
+ *
+ * Copyright (C) 1999-2006 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+#ifndef ASM_MMX_H
+#define ASM_MMX_H
+
+#include "main/compiler.h"
+#include "main/glheader.h"
+
+struct gl_context;
+
+extern void _ASMAPI
+_mesa_mmx_blend_transparency( struct gl_context *ctx, GLuint n, const GLubyte mask[],
+ GLvoid *rgba, const GLvoid *dest,
+ GLenum chanType );
+
+extern void _ASMAPI
+_mesa_mmx_blend_add( struct gl_context *ctx, GLuint n, const GLubyte mask[],
+ GLvoid *rgba, const GLvoid *dest,
+ GLenum chanType );
+
+extern void _ASMAPI
+_mesa_mmx_blend_min( struct gl_context *ctx, GLuint n, const GLubyte mask[],
+ GLvoid *rgba, const GLvoid *dest,
+ GLenum chanType );
+
+extern void _ASMAPI
+_mesa_mmx_blend_max( struct gl_context *ctx, GLuint n, const GLubyte mask[],
+ GLvoid *rgba, const GLvoid *dest,
+ GLenum chanType );
+
+extern void _ASMAPI
+_mesa_mmx_blend_modulate( struct gl_context *ctx, GLuint n, const GLubyte mask[],
+ GLvoid *rgba, const GLvoid *dest,
+ GLenum chanType );
+
+#endif
diff --git a/src/arch/x86/mmx_blend.S b/src/arch/x86/mmx_blend.S
new file mode 100644
index 0000000..eeaf43e
--- /dev/null
+++ b/src/arch/x86/mmx_blend.S
@@ -0,0 +1,402 @@
+/*
+ * Written by José Fonseca <j_r_fonseca@yahoo.co.uk>
+ */
+
+
+#ifdef USE_MMX_ASM
+#include "assyntax.h"
+#include "matypes.h"
+
+/* integer multiplication - alpha plus one
+ *
+ * makes the following approximation to the division (Sree)
+ *
+ * rgb*a/255 ~= (rgb*(a+1)) >> 8
+ *
+ * which is the fastest method that satisfies the following OpenGL criteria
+ *
+ * 0*0 = 0 and 255*255 = 255
+ *
+ * note that MX1 is a register holding the 0xffffffffffffffff constant, which can easily be obtained with
+ *
+ * PCMPEQW ( MX1, MX1 )
+ */
+#define GMB_MULT_AP1( MP1, MA1, MP2, MA2, MX1 ) \
+ PSUBW ( MX1, MA1 ) /* a1 + 1 | a1 + 1 | a1 + 1 | a1 + 1 */ ;\
+ PMULLW ( MP1, MA1 ) /* t1 = p1*a1 */ ;\
+ ;\
+TWO(PSUBW ( MX1, MA2 )) /* a2 + 1 | a2 + 1 | a2 + 1 | a2 + 1 */ ;\
+TWO(PMULLW ( MP2, MA2 )) /* t2 = p2*a2 */ ;\
+ ;\
+ PSRLW ( CONST(8), MA1 ) /* t1 >> 8 ~= t1/255 */ ;\
+TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 ~= t2/255 */
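+
+/* Scalar equivalent of the above for a single channel (a reference
+ * sketch, not part of the build):
+ *
+ *    unsigned t = rgb * (a + 1);
+ *    result = t >> 8;            0*0 = 0 and 255*255 = 255 hold
+ */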
+
+
+/* integer multiplication - geometric series
+ *
+ * takes the geometric series approximation to the division
+ *
+ * t/255 = (t >> 8) + (t >> 16) + (t >> 24) + ...
+ *
+ * in this case just the first two terms to fit in 16bit arithmetic
+ *
+ * t/255 ~= (t + (t >> 8)) >> 8
+ *
+ * note that by itself it doesn't satisfy the OpenGL criteria, as it yields 255*255 = 254,
+ * so the special case a = 255 must be handled, or roundoff must be used
+ */
+#define GMB_MULT_GS( MP1, MA1, MP2, MA2 ) \
+ PMULLW ( MP1, MA1 ) /* t1 = p1*a1 */ ;\
+TWO(PMULLW ( MP2, MA2 )) /* t2 = p2*a2 */ ;\
+ ;\
+ MOVQ ( MA1, MP1 ) ;\
+ PSRLW ( CONST(8), MA1 ) /* t1 >> 8 */ ;\
+ ;\
+TWO(MOVQ ( MA2, MP2 )) ;\
+TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 */ ;\
+ ;\
+ PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
+ PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\
+ ;\
+TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
+TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */
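+
+/* Scalar equivalent for a single channel (reference sketch):
+ *
+ *    unsigned t = rgb * a;
+ *    result = (t + (t >> 8)) >> 8;    note: 255*255 yields 254
+ */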
+
+
+/* integer multiplication - geometric series plus rounding
+ *
+ * when using a geometric series division instead of truncating the result
+ * use roundoff in the approximation (Jim Blinn)
+ *
+ * t = rgb*a + 0x80
+ *
+ * achieving exact results
+ *
+ * note that M80 is a register with the 0x0080008000800080 constant
+ */
+#define GMB_MULT_GSR( MP1, MA1, MP2, MA2, M80 ) \
+ PMULLW ( MP1, MA1 ) /* t1 = p1*a1 */ ;\
+ PADDW ( M80, MA1 ) /* t1 += 0x80 */ ;\
+ ;\
+TWO(PMULLW ( MP2, MA2 )) /* t2 = p2*a2 */ ;\
+TWO(PADDW ( M80, MA2 )) /* t2 += 0x80 */ ;\
+ ;\
+ MOVQ ( MA1, MP1 ) ;\
+ PSRLW ( CONST(8), MA1 ) /* t1 >> 8 */ ;\
+ ;\
+TWO(MOVQ ( MA2, MP2 )) ;\
+TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 */ ;\
+ ;\
+ PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
+ PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\
+ ;\
+TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
+TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */
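+
+/* Scalar equivalent for a single channel (reference sketch):
+ *
+ *    unsigned t = rgb * a + 0x80;
+ *    result = (t + (t >> 8)) >> 8;    exact rounded division by 255
+ */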
+
+
+/* linear interpolation - geometric series
+ */
+#define GMB_LERP_GS( MP1, MQ1, MA1, MP2, MQ2, MA2) \
+ PSUBW ( MQ1, MP1 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
+ PSLLW ( CONST(8), MQ1 ) /* q1 << 8 */ ;\
+ PMULLW ( MP1, MA1 ) /* t1 = (q1 - p1)*pa1 */ ;\
+ ;\
+TWO(PSUBW ( MQ2, MP2 )) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
+TWO(PSLLW ( CONST(8), MQ2 )) /* q2 << 8 */ ;\
+TWO(PMULLW ( MP2, MA2 )) /* t2 = (q2 - p2)*pa2 */ ;\
+ ;\
+ MOVQ ( MA1, MP1 ) ;\
+ PSRLW ( CONST(8), MA1 ) /* t1 >> 8 */ ;\
+ ;\
+TWO(MOVQ ( MA2, MP2 )) ;\
+TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 */ ;\
+ ;\
+ PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
+TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
+ ;\
+ PADDW ( MQ1, MA1 ) /* (t1/255 + q1) << 8 */ ;\
+TWO(PADDW ( MQ2, MA2 )) /* (t2/255 + q2) << 8 */ ;\
+ ;\
+ PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\
+TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */
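+
+/* Per channel, and using this macro's p/q/pa naming, the above computes
+ * (reference sketch):
+ *
+ *    result = q + ((p - q) * pa) / 255
+ */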
+
+
+/* linear interpolation - geometric series with roundoff
+ *
+ * this is a generalization of Blinn's formula to signed arithmetic
+ *
+ * note that M80 is a register with the 0x0080008000800080 constant
+ */
+#define GMB_LERP_GSR( MP1, MQ1, MA1, MP2, MQ2, MA2, M80) \
+ PSUBW ( MQ1, MP1 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
+ PSLLW ( CONST(8), MQ1 ) /* q1 << 8 */ ;\
+ PMULLW ( MP1, MA1 ) /* t1 = (q1 - p1)*pa1 */ ;\
+ ;\
+TWO(PSUBW ( MQ2, MP2 )) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
+TWO(PSLLW ( CONST(8), MQ2 )) /* q2 << 8 */ ;\
+TWO(PMULLW ( MP2, MA2 )) /* t2 = (q2 - p2)*pa2 */ ;\
+ ;\
+ PSRLW ( CONST(15), MP1 ) /* q1 > p1 ? 1 : 0 */ ;\
+TWO(PSRLW ( CONST(15), MP2 )) /* q2 > p2 ? 1 : 0 */ ;\
+ ;\
+ PSLLW ( CONST(8), MP1 ) /* q1 > p1 ? 0x100 : 0 */ ;\
+TWO(PSLLW ( CONST(8), MP2 )) /* q2 > p2 ? 0x100 : 0 */ ;\
+ ;\
+ PSUBW ( MP1, MA1 ) /* t1 -=? 0x100 */ ;\
+TWO(PSUBW ( MP2, MA2 )) /* t2 -=? 0x100 */ ;\
+ ;\
+ PADDW ( M80, MA1 ) /* t1 += 0x80 */ ;\
+TWO(PADDW ( M80, MA2 )) /* t2 += 0x80 */ ;\
+ ;\
+ MOVQ ( MA1, MP1 ) ;\
+ PSRLW ( CONST(8), MA1 ) /* t1 >> 8 */ ;\
+ ;\
+TWO(MOVQ ( MA2, MP2 )) ;\
+TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 */ ;\
+ ;\
+ PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
+TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
+ ;\
+ PADDW ( MQ1, MA1 ) /* (t1/255 + q1) << 8 */ ;\
+TWO(PADDW ( MQ2, MA2 )) /* (t2/255 + q2) << 8 */ ;\
+ ;\
+ PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\
+TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */
+
+
+/* linear interpolation - geometric series with correction
+ *
+ * instead of the roundoff this adds a small correction to satisfy the OpenGL criteria
+ *
+ * t/255 ~= (t + (t >> 8) + (t >> 15)) >> 8
+ *
+ * note that although it is faster than rounding off, it doesn't always give exact results
+ */
+#define GMB_LERP_GSC( MP1, MQ1, MA1, MP2, MQ2, MA2) \
+ PSUBW ( MQ1, MP1 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
+ PSLLW ( CONST(8), MQ1 ) /* q1 << 8 */ ;\
+ PMULLW ( MP1, MA1 ) /* t1 = (q1 - p1)*pa1 */ ;\
+ ;\
+TWO(PSUBW ( MQ2, MP2 )) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
+TWO(PSLLW ( CONST(8), MQ2 )) /* q2 << 8 */ ;\
+TWO(PMULLW ( MP2, MA2 )) /* t2 = (q2 - p2)*pa2 */ ;\
+ ;\
+ MOVQ ( MA1, MP1 ) ;\
+ PSRLW ( CONST(8), MA1 ) /* t1 >> 8 */ ;\
+ ;\
+TWO(MOVQ ( MA2, MP2 )) ;\
+TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 */ ;\
+ ;\
+ PADDW ( MA1, MP1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
+ PSRLW ( CONST(7), MA1 ) /* t1 >> 15 */ ;\
+ ;\
+TWO(PADDW ( MA2, MP2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
+TWO(PSRLW ( CONST(7), MA2 )) /* t2 >> 15 */ ;\
+ ;\
+ PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8 */ ;\
+TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) + (t2 >>15) ~= (t2/255) << 8 */ ;\
+ ;\
+ PADDW ( MQ1, MA1 ) /* (t1/255 + q1) << 8 */ ;\
+TWO(PADDW ( MQ2, MA2 )) /* (t2/255 + q2) << 8 */ ;\
+ ;\
+ PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\
+TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */
+
+
+/* common blending setup code
+ *
+ * note that M00 is a register holding the 0x0000000000000000 constant, which can easily be obtained with
+ *
+ * PXOR ( M00, M00 )
+ */
+#define GMB_LOAD(rgba, dest, MPP, MQQ) \
+ONE(MOVD ( REGIND(rgba), MPP )) /* | | | | qa1 | qb1 | qg1 | qr1 */ ;\
+ONE(MOVD ( REGIND(dest), MQQ )) /* | | | | pa1 | pb1 | pg1 | pr1 */ ;\
+ ;\
+TWO(MOVQ ( REGIND(rgba), MPP )) /* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */ ;\
+TWO(MOVQ ( REGIND(dest), MQQ )) /* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */
+
+#define GMB_UNPACK(MP1, MQ1, MP2, MQ2, M00) \
+TWO(MOVQ ( MP1, MP2 )) ;\
+TWO(MOVQ ( MQ1, MQ2 )) ;\
+ ;\
+ PUNPCKLBW ( M00, MQ1 ) /* qa1 | qb1 | qg1 | qr1 */ ;\
+TWO(PUNPCKHBW ( M00, MQ2 )) /* qa2 | qb2 | qg2 | qr2 */ ;\
+ PUNPCKLBW ( M00, MP1 ) /* pa1 | pb1 | pg1 | pr1 */ ;\
+TWO(PUNPCKHBW ( M00, MP2 )) /* pa2 | pb2 | pg2 | pr2 */
+
+#define GMB_ALPHA(MP1, MA1, MP2, MA2) \
+ MOVQ ( MP1, MA1 ) ;\
+TWO(MOVQ ( MP2, MA2 )) ;\
+ ;\
+ PUNPCKHWD ( MA1, MA1 ) /* pa1 | pa1 | | */ ;\
+TWO(PUNPCKHWD ( MA2, MA2 )) /* pa2 | pa2 | | */ ;\
+ PUNPCKHDQ ( MA1, MA1 ) /* pa1 | pa1 | pa1 | pa1 */ ;\
+TWO(PUNPCKHDQ ( MA2, MA2 )) /* pa2 | pa2 | pa2 | pa2 */
+
+#define GMB_PACK( MS1, MS2 ) \
+ PACKUSWB ( MS2, MS1 ) /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */ ;\
+
+#define GMB_STORE(rgba, MSS ) \
+ONE(MOVD ( MSS, REGIND(rgba) )) /* | | | | sa1 | sb1 | sg1 | sr1 */ ;\
+TWO(MOVQ ( MSS, REGIND(rgba) )) /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */
+
+/* Kevin F. Quinn <kevquinn@gentoo.org> 2 July 2006
+ * Replace data segment constants with text-segment
+ * constants (via pushl/movq)
+ SEG_DATA
+
+ALIGNDATA8
+const_0080:
+ D_LONG 0x00800080, 0x00800080
+
+const_80:
+ D_LONG 0x80808080, 0x80808080
+*/
+#define const_0080_l 0x00800080
+#define const_0080_h 0x00800080
+#define const_80_l 0x80808080
+#define const_80_h 0x80808080
+
+ SEG_TEXT
+
+
+/* Blend transparency function
+ */
+
+#define TAG(x) CONCAT(x,_transparency)
+#define LLTAG(x) LLBL2(x,_transparency)
+
+#define INIT \
+ PXOR ( MM0, MM0 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */
+
+#define MAIN( rgba, dest ) \
+ GMB_LOAD( rgba, dest, MM1, MM2 ) ;\
+ GMB_UNPACK( MM1, MM2, MM4, MM5, MM0 ) ;\
+ GMB_ALPHA( MM1, MM3, MM4, MM6 ) ;\
+ GMB_LERP_GSC( MM1, MM2, MM3, MM4, MM5, MM6 ) ;\
+ GMB_PACK( MM3, MM6 ) ;\
+ GMB_STORE( rgba, MM3 )
+
+#include "mmx_blendtmp.h"
+
+
+/* Blend add function
+ *
+ * FIXME: Add some loop unrolling here...
+ */
+
+#define TAG(x) CONCAT(x,_add)
+#define LLTAG(x) LLBL2(x,_add)
+
+#define INIT
+
+#define MAIN( rgba, dest ) \
+ONE(MOVD ( REGIND(rgba), MM1 )) /* | | | | qa1 | qb1 | qg1 | qr1 */ ;\
+ONE(MOVD ( REGIND(dest), MM2 )) /* | | | | pa1 | pb1 | pg1 | pr1 */ ;\
+ONE(PADDUSB ( MM2, MM1 )) ;\
+ONE(MOVD ( MM1, REGIND(rgba) )) /* | | | | sa1 | sb1 | sg1 | sr1 */ ;\
+ ;\
+TWO(MOVQ ( REGIND(rgba), MM1 )) /* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */ ;\
+TWO(PADDUSB ( REGIND(dest), MM1 )) /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */ ;\
+TWO(MOVQ ( MM1, REGIND(rgba) ))
+
+#include "mmx_blendtmp.h"
+
+
+/* Blend min function
+ */
+
+#define TAG(x) CONCAT(x,_min)
+#define LLTAG(x) LLBL2(x,_min)
+
+/* Kevin F. Quinn 2nd July 2006
+ * Replace data segment constants with text-segment instructions
+#define INIT \
+ MOVQ ( CONTENT(const_80), MM7 )
+ */
+#define INIT \
+ PUSH_L ( CONST(const_80_h) ) /* 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80*/ ;\
+ PUSH_L ( CONST(const_80_l) ) ;\
+ MOVQ ( REGIND(ESP), MM7 ) ;\
+ ADD_L ( CONST(8), ESP)
+
+#define MAIN( rgba, dest ) \
+ GMB_LOAD( rgba, dest, MM1, MM2 ) ;\
+ MOVQ ( MM1, MM3 ) ;\
+ MOVQ ( MM2, MM4 ) ;\
+ PXOR ( MM7, MM3 ) /* unsigned -> signed */ ;\
+ PXOR ( MM7, MM4 ) /* unsigned -> signed */ ;\
+ PCMPGTB ( MM3, MM4 ) /* q > p ? 0xff : 0x00 */ ;\
+ PAND ( MM4, MM1 ) /* q > p ? p : 0 */ ;\
+ PANDN ( MM2, MM4 ) /* q > p ? 0 : q */ ;\
+ POR ( MM1, MM4 ) /* q > p ? p : q */ ;\
+ GMB_STORE( rgba, MM4 )
+
+#include "mmx_blendtmp.h"
+
+
+/* Blend max function
+ */
+
+#define TAG(x) CONCAT(x,_max)
+#define LLTAG(x) LLBL2(x,_max)
+
+/* Kevin F. Quinn 2nd July 2006
+ * Replace data segment constants with text-segment instructions
+#define INIT \
+ MOVQ ( CONTENT(const_80), MM7 )
+ */
+#define INIT \
+ PUSH_L ( CONST(const_80_l) ) /* 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80*/ ;\
+ PUSH_L ( CONST(const_80_h) ) ;\
+ MOVQ ( REGIND(ESP), MM7 ) ;\
+ ADD_L ( CONST(8), ESP)
+
+#define MAIN( rgba, dest ) \
+ GMB_LOAD( rgba, dest, MM1, MM2 ) ;\
+ MOVQ ( MM1, MM3 ) ;\
+ MOVQ ( MM2, MM4 ) ;\
+ PXOR ( MM7, MM3 ) /* unsigned -> signed */ ;\
+ PXOR ( MM7, MM4 ) /* unsigned -> signed */ ;\
+ PCMPGTB ( MM3, MM4 ) /* q > p ? 0xff : 0x00 */ ;\
+ PAND ( MM4, MM2 ) /* q > p ? q : 0 */ ;\
+ PANDN ( MM1, MM4 ) /* q > p ? 0 : p */ ;\
+ POR ( MM2, MM4 ) /* q > p ? q : p */ ;\
+ GMB_STORE( rgba, MM4 )
+
+#include "mmx_blendtmp.h"
+
+
+/* Blend modulate function
+ */
+
+#define TAG(x) CONCAT(x,_modulate)
+#define LLTAG(x) LLBL2(x,_modulate)
+
+/* Kevin F. Quinn 2nd July 2006
+ * Replace data segment constants with text-segment instructions
+#define INIT \
+ MOVQ ( CONTENT(const_0080), MM7 )
+ */
+#define INIT \
+ PXOR ( MM0, MM0 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */ ;\
+ PUSH_L ( CONST(const_0080_l) ) /* 0x0080 | 0x0080 | 0x0080 | 0x0080 */ ;\
+ PUSH_L ( CONST(const_0080_h) ) ;\
+ MOVQ ( REGIND(ESP), MM7 ) ;\
+ ADD_L ( CONST(8), ESP)
+
+#define MAIN( rgba, dest ) \
+ GMB_LOAD( rgba, dest, MM1, MM2 ) ;\
+ GMB_UNPACK( MM1, MM2, MM4, MM5, MM0 ) ;\
+ GMB_MULT_GSR( MM1, MM2, MM4, MM5, MM7 ) ;\
+ GMB_PACK( MM2, MM5 ) ;\
+ GMB_STORE( rgba, MM2 )
+
+#include "mmx_blendtmp.h"
+
+#endif
+
+#if defined (__ELF__) && defined (__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/arch/x86/mmx_blendtmp.h b/src/arch/x86/mmx_blendtmp.h
new file mode 100644
index 0000000..8534792
--- /dev/null
+++ b/src/arch/x86/mmx_blendtmp.h
@@ -0,0 +1,114 @@
+/*
+ * Written by José Fonseca <j_r_fonseca@yahoo.co.uk>
+ */
+
+
+/*
+ * void _mesa_mmx_blend( struct gl_context *ctx,
+ * GLuint n,
+ * const GLubyte mask[],
+ * GLchan rgba[][4],
+ * CONST GLchan dest[][4] )
+ *
+ */
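+
+/* The including file defines MAIN in terms of the ONE()/TWO() wrappers;
+ * this template expands it three times: with ONE active for the
+ * unaligned lead-in pixel, with TWO active for the two-pixel unrolled
+ * main loop, and with ONE active again for the trailing pixel.
+ */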
+ALIGNTEXT16
+GLOBL GLNAME( TAG(_mesa_mmx_blend) )
+HIDDEN( TAG(_mesa_mmx_blend) )
+GLNAME( TAG(_mesa_mmx_blend) ):
+
+ PUSH_L ( EBP )
+ MOV_L ( ESP, EBP )
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+ PUSH_L ( EBX )
+
+ MOV_L ( REGOFF(12, EBP), ECX ) /* n */
+ CMP_L ( CONST(0), ECX)
+ JE ( LLTAG(GMB_return) )
+
+ MOV_L ( REGOFF(16, EBP), EBX ) /* mask */
+ MOV_L ( REGOFF(20, EBP), EDI ) /* rgba */
+ MOV_L ( REGOFF(24, EBP), ESI ) /* dest */
+
+ INIT
+
+ TEST_L ( CONST(4), EDI ) /* align rgba on an 8-byte boundary */
+ JZ ( LLTAG(GMB_align_end) )
+
+ CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */
+ JE ( LLTAG(GMB_align_continue) )
+
+ /* runin */
+#define ONE(x) x
+#define TWO(x)
+ MAIN ( EDI, ESI )
+#undef ONE
+#undef TWO
+
+LLTAG(GMB_align_continue):
+
+ DEC_L ( ECX ) /* n -= 1 */
+ INC_L ( EBX ) /* mask += 1 */
+ ADD_L ( CONST(4), EDI ) /* rgba += 1 */
+ ADD_L ( CONST(4), ESI ) /* dest += 1 */
+
+LLTAG(GMB_align_end):
+
+ CMP_L ( CONST(2), ECX)
+ JB ( LLTAG(GMB_loop_end) )
+
+ALIGNTEXT16
+LLTAG(GMB_loop_begin):
+
+ CMP_W ( CONST(0), REGIND(EBX) ) /* *mask == 0 && *(mask + 1) == 0 */
+ JE ( LLTAG(GMB_loop_continue) )
+
+ /* main loop */
+#define ONE(x)
+#define TWO(x) x
+ MAIN ( EDI, ESI )
+#undef ONE
+#undef TWO
+
+LLTAG(GMB_loop_continue):
+
+ DEC_L ( ECX )
+ DEC_L ( ECX ) /* n -= 2 */
+ ADD_L ( CONST(2), EBX ) /* mask += 2 */
+ ADD_L ( CONST(8), EDI ) /* rgba += 2 */
+ ADD_L ( CONST(8), ESI ) /* dest += 2 */
+ CMP_L ( CONST(2), ECX )
+ JAE ( LLTAG(GMB_loop_begin) )
+
+LLTAG(GMB_loop_end):
+
+ CMP_L ( CONST(1), ECX )
+ JB ( LLTAG(GMB_done) )
+
+ CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */
+ JE ( LLTAG(GMB_done) )
+
+ /* runout */
+#define ONE(x) x
+#define TWO(x)
+ MAIN ( EDI, ESI )
+#undef ONE
+#undef TWO
+
+LLTAG(GMB_done):
+
+ EMMS
+
+LLTAG(GMB_return):
+
+ POP_L ( EBX )
+ POP_L ( EDI )
+ POP_L ( ESI )
+ MOV_L ( EBP, ESP )
+ POP_L ( EBP )
+ RET
+
+#undef TAG
+#undef LLTAG
+#undef INIT
+#undef MAIN
diff --git a/src/arch/x86/norm_args.h b/src/arch/x86/norm_args.h
new file mode 100644
index 0000000..e22f8bb
--- /dev/null
+++ b/src/arch/x86/norm_args.h
@@ -0,0 +1,57 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Normal transform function interface for assembly code. Simply define
+ * FRAME_OFFSET to the number of bytes pushed onto the stack before
+ * using the ARG_* argument macros.
+ *
+ * Gareth Hughes
+ */
+
+#ifndef __NORM_ARGS_H__
+#define __NORM_ARGS_H__
+
+/* Offsets for normal_func arguments
+ *
+ * typedef void (*normal_func)( const GLmatrix *mat,
+ * GLfloat scale,
+ * const GLvector4f *in,
+ * const GLfloat lengths[],
+ * GLvector4f *dest );
+ */
+#define OFFSET_MAT 4
+#define OFFSET_SCALE 8
+#define OFFSET_IN 12
+#define OFFSET_LENGTHS 16
+#define OFFSET_DEST 20
+
+#define ARG_MAT REGOFF(FRAME_OFFSET+OFFSET_MAT, ESP)
+#define ARG_SCALE REGOFF(FRAME_OFFSET+OFFSET_SCALE, ESP)
+#define ARG_IN REGOFF(FRAME_OFFSET+OFFSET_IN, ESP)
+#define ARG_LENGTHS REGOFF(FRAME_OFFSET+OFFSET_LENGTHS, ESP)
+#define ARG_DEST REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP)
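+
+/* For example, a routine that begins with two 4-byte pushes defines
+ * FRAME_OFFSET as 8, so ARG_MAT expands to [ESP + 12]: the first
+ * argument, above the return address and the two saved registers.
+ */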
+
+#endif
diff --git a/src/arch/x86/read_rgba_span_x86.S b/src/arch/x86/read_rgba_span_x86.S
new file mode 100644
index 0000000..3be4515
--- /dev/null
+++ b/src/arch/x86/read_rgba_span_x86.S
@@ -0,0 +1,686 @@
+/*
+ * (C) Copyright IBM Corporation 2004
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file read_rgba_span_x86.S
+ * Optimized routines to transfer pixel data from the framebuffer to a
+ * buffer in main memory.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+ .file "read_rgba_span_x86.S"
+#if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
+/* Kevin F. Quinn 2nd July 2006
+ * Replaced data segment constants with text-segment instructions.
+ */
+#define LOAD_MASK(mvins,m1,m2) \
+ pushl $0xff00ff00 ;\
+ pushl $0xff00ff00 ;\
+ pushl $0xff00ff00 ;\
+ pushl $0xff00ff00 ;\
+ mvins (%esp), m1 ;\
+ pushl $0x00ff0000 ;\
+ pushl $0x00ff0000 ;\
+ pushl $0x00ff0000 ;\
+ pushl $0x00ff0000 ;\
+ mvins (%esp), m2 ;\
+ addl $32, %esp
+
+/* I implemented these as macros because they appear in several places,
+ * and I've tweaked them a number of times. I got tired of changing every
+ * place they appear. :)
+ */
+
+#define DO_ONE_PIXEL() \
+ movl (%ebx), %eax ; \
+ addl $4, %ebx ; \
+ bswap %eax /* ARGB -> BGRA */ ; \
+ rorl $8, %eax /* BGRA -> ABGR */ ; \
+ movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \
+ addl $4, %ecx
+
+#define DO_ONE_LAST_PIXEL() \
+ movl (%ebx), %eax ; \
+ bswap %eax /* ARGB -> BGRA */ ; \
+ rorl $8, %eax /* BGRA -> ABGR */ ; \
+	movl	%eax, (%ecx) /* ABGR -> R, G, B, A */
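+
+/* Roughly what DO_ONE_PIXEL() computes, as C (an illustrative sketch,
+ * assuming a little-endian 32bpp source):
+ *
+ *     uint32_t v = *(const uint32_t *) src;   // v = 0xAARRGGBB
+ *     v = __builtin_bswap32(v);               // v = 0xBBGGRRAA
+ *     v = (v >> 8) | (v << 24);               // v = 0xAABBGGRR
+ *     *(uint32_t *) dst = v;                  // stores bytes R,G,B,A
+ */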
+
+
+/**
+ * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
+ *
+ * \warning
+ * This function assumes that the caller will issue the EMMS instruction
+ * at the correct places.
+ */
+
+.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
+#ifndef USE_DRICORE
+.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
+#endif
+ .type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
+_generic_read_RGBA_span_BGRA8888_REV_MMX:
+ pushl %ebx
+
+#ifdef USE_INNER_EMMS
+ emms
+#endif
+ LOAD_MASK(movq,%mm1,%mm2)
+
+ movl 8(%esp), %ebx /* source pointer */
+ movl 16(%esp), %edx /* number of pixels to copy */
+ movl 12(%esp), %ecx /* destination pointer */
+
+ testl %edx, %edx
+ jle .L20 /* Bail if there's nothing to do. */
+
+ movl %ebx, %eax
+
+ negl %eax
+ sarl $2, %eax
+ andl $1, %eax
+ je .L17
+
+ subl %eax, %edx
+ DO_ONE_PIXEL()
+.L17:
+
+ /* Would it be faster to unroll this loop once and process 4 pixels
+ * per pass, instead of just two?
+ */
+
+ movl %edx, %eax
+ shrl %eax
+ jmp .L18
+.L19:
+ movq (%ebx), %mm0
+ addl $8, %ebx
+
+ /* These 9 instructions do what PSHUFB (if there were such an
+ * instruction) could do in 1. :(
+ */
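+
+	/* Per 32-bit pixel, the mask/shift/or sequence amounts to this C
+	 * (illustrative), with %mm1 = 0xff00ff00 and %mm2 = 0x00ff0000
+	 * replicated across the quadword:
+	 *
+	 *     out = (v & 0xff00ff00)            // keep G and A
+	 *         | ((v & 0x00ff0000) >> 16)    // move R down to byte 0
+	 *         | ((v << 16) & 0x00ff0000);   // move B up to byte 2
+	 */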
+
+ movq %mm0, %mm3
+ movq %mm0, %mm4
+
+ pand %mm2, %mm3
+ psllq $16, %mm4
+ psrlq $16, %mm3
+ pand %mm2, %mm4
+
+ pand %mm1, %mm0
+ por %mm4, %mm3
+ por %mm3, %mm0
+
+ movq %mm0, (%ecx)
+ addl $8, %ecx
+ subl $1, %eax
+.L18:
+ jne .L19
+
+#ifdef USE_INNER_EMMS
+ emms
+#endif
+
+ /* At this point there are either 1 or 0 pixels remaining to be
+ * converted. Convert the last pixel, if needed.
+ */
+
+ testl $1, %edx
+ je .L20
+
+ DO_ONE_LAST_PIXEL()
+
+.L20:
+ popl %ebx
+ ret
+ .size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
+
+
+/**
+ * SSE optimized version of the BGRA8888_REV to RGBA copy routine. SSE
+ * instructions are only actually used to read data from the framebuffer.
+ * In practice, the speed-up is pretty small.
+ *
+ * \todo
+ * Do some more testing and determine if there's any reason to have this
+ * function in addition to the MMX version.
+ *
+ * \warning
+ * This function assumes that the caller will issue the EMMS instruction
+ * at the correct places.
+ */
+
+.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
+#ifndef USE_DRICORE
+.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
+#endif
+ .type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
+_generic_read_RGBA_span_BGRA8888_REV_SSE:
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+
+#ifdef USE_INNER_EMMS
+ emms
+#endif
+
+ LOAD_MASK(movq,%mm1,%mm2)
+
+ movl 16(%esp), %ebx /* source pointer */
+ movl 24(%esp), %edx /* number of pixels to copy */
+ movl 20(%esp), %ecx /* destination pointer */
+
+ testl %edx, %edx
+ jle .L35 /* Bail if there's nothing to do. */
+
+ movl %esp, %ebp
+ subl $16, %esp
+ andl $0xfffffff0, %esp
+
+ movl %ebx, %eax
+ movl %edx, %esi
+
+ negl %eax
+ andl $15, %eax
+ sarl $2, %eax
+ cmpl %edx, %eax
+ cmovle %eax, %esi
+
+ subl %esi, %edx
+
+ testl $1, %esi
+ je .L32
+
+ DO_ONE_PIXEL()
+.L32:
+
+ testl $2, %esi
+ je .L31
+
+ movq (%ebx), %mm0
+ addl $8, %ebx
+
+ movq %mm0, %mm3
+ movq %mm0, %mm4
+
+ pand %mm2, %mm3
+ psllq $16, %mm4
+ psrlq $16, %mm3
+ pand %mm2, %mm4
+
+ pand %mm1, %mm0
+ por %mm4, %mm3
+ por %mm3, %mm0
+
+ movq %mm0, (%ecx)
+ addl $8, %ecx
+.L31:
+
+ movl %edx, %eax
+ shrl $2, %eax
+ jmp .L33
+.L34:
+ movaps (%ebx), %xmm0
+ addl $16, %ebx
+
+ /* This would be so much better if we could just move directly from
+ * an SSE register to an MMX register. Unfortunately, that
+ * functionality wasn't introduced until SSE2 with the MOVDQ2Q
+ * instruction.
+ */
+
+ movaps %xmm0, (%esp)
+ movq (%esp), %mm0
+ movq 8(%esp), %mm5
+
+ movq %mm0, %mm3
+ movq %mm0, %mm4
+ movq %mm5, %mm6
+ movq %mm5, %mm7
+
+ pand %mm2, %mm3
+ pand %mm2, %mm6
+
+ psllq $16, %mm4
+ psllq $16, %mm7
+
+ psrlq $16, %mm3
+ psrlq $16, %mm6
+
+ pand %mm2, %mm4
+ pand %mm2, %mm7
+
+ pand %mm1, %mm0
+ pand %mm1, %mm5
+
+ por %mm4, %mm3
+ por %mm7, %mm6
+
+ por %mm3, %mm0
+ por %mm6, %mm5
+
+ movq %mm0, (%ecx)
+ movq %mm5, 8(%ecx)
+ addl $16, %ecx
+
+ subl $1, %eax
+.L33:
+ jne .L34
+
+#ifdef USE_INNER_EMMS
+ emms
+#endif
+ movl %ebp, %esp
+
+ /* At this point there are either [0, 3] pixels remaining to be
+ * converted.
+ */
+
+ testl $2, %edx
+ je .L36
+
+ movq (%ebx), %mm0
+ addl $8, %ebx
+
+ movq %mm0, %mm3
+ movq %mm0, %mm4
+
+ pand %mm2, %mm3
+ psllq $16, %mm4
+ psrlq $16, %mm3
+ pand %mm2, %mm4
+
+ pand %mm1, %mm0
+ por %mm4, %mm3
+ por %mm3, %mm0
+
+ movq %mm0, (%ecx)
+ addl $8, %ecx
+.L36:
+
+ testl $1, %edx
+ je .L35
+
+ DO_ONE_LAST_PIXEL()
+.L35:
+ popl %ebp
+ popl %ebx
+ popl %esi
+ ret
+ .size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE
+
+
+/**
+ * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
+ */
+
+ .text
+.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
+#ifndef USE_DRICORE
+.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
+#endif
+ .type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
+_generic_read_RGBA_span_BGRA8888_REV_SSE2:
+ pushl %esi
+ pushl %ebx
+
+ LOAD_MASK(movdqu,%xmm1,%xmm2)
+
+ movl 12(%esp), %ebx /* source pointer */
+ movl 20(%esp), %edx /* number of pixels to copy */
+ movl 16(%esp), %ecx /* destination pointer */
+
+ movl %ebx, %eax
+ movl %edx, %esi
+
+ testl %edx, %edx
+ jle .L46 /* Bail if there's nothing to do. */
+
+	/* If the source pointer isn't a multiple of 16, we have to process
+	 * a few pixels the "slow" way to get the address aligned for
+	 * the SSE fetch instructions.
+ */
+
+ negl %eax
+ andl $15, %eax
+ sarl $2, %eax
+
+ cmpl %edx, %eax
+ cmovbe %eax, %esi
+ subl %esi, %edx
+
+ testl $1, %esi
+ je .L41
+
+ DO_ONE_PIXEL()
+.L41:
+ testl $2, %esi
+ je .L40
+
+ movq (%ebx), %xmm0
+ addl $8, %ebx
+
+ movdqa %xmm0, %xmm3
+ movdqa %xmm0, %xmm4
+ andps %xmm1, %xmm0
+
+ andps %xmm2, %xmm3
+ pslldq $2, %xmm4
+ psrldq $2, %xmm3
+ andps %xmm2, %xmm4
+
+ orps %xmm4, %xmm3
+ orps %xmm3, %xmm0
+
+ movq %xmm0, (%ecx)
+ addl $8, %ecx
+.L40:
+
+ /* Would it be worth having a specialized version of this loop for
+ * the case where the destination is 16-byte aligned? That version
+	 * would be identical except that it could use movdqa instead of
+ * movdqu.
+ */
+
+ movl %edx, %eax
+ shrl $2, %eax
+ jmp .L42
+.L43:
+ movdqa (%ebx), %xmm0
+ addl $16, %ebx
+
+ movdqa %xmm0, %xmm3
+ movdqa %xmm0, %xmm4
+ andps %xmm1, %xmm0
+
+ andps %xmm2, %xmm3
+ pslldq $2, %xmm4
+ psrldq $2, %xmm3
+ andps %xmm2, %xmm4
+
+ orps %xmm4, %xmm3
+ orps %xmm3, %xmm0
+
+ movdqu %xmm0, (%ecx)
+ addl $16, %ecx
+ subl $1, %eax
+.L42:
+ jne .L43
+
+
+	/* There may be up to 3 pixels remaining to be copied.  Take care
+	 * of them now.  We do the 2-pixel case first because the data
+	 * will be aligned.
+ */
+
+ testl $2, %edx
+ je .L47
+
+ movq (%ebx), %xmm0
+ addl $8, %ebx
+
+ movdqa %xmm0, %xmm3
+ movdqa %xmm0, %xmm4
+ andps %xmm1, %xmm0
+
+ andps %xmm2, %xmm3
+ pslldq $2, %xmm4
+ psrldq $2, %xmm3
+ andps %xmm2, %xmm4
+
+ orps %xmm4, %xmm3
+ orps %xmm3, %xmm0
+
+ movq %xmm0, (%ecx)
+ addl $8, %ecx
+.L47:
+
+ testl $1, %edx
+ je .L46
+
+ DO_ONE_LAST_PIXEL()
+.L46:
+
+ popl %ebx
+ popl %esi
+ ret
+ .size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
+
+
+
+#define MASK_565_L 0x07e0f800
+#define MASK_565_H 0x0000001f
+/* Setting SCALE_ADJUST to 5 gives a perfect match with the
+ * classic C implementation in Mesa. Setting SCALE_ADJUST
+ * to 0 is slightly faster but at a small cost to accuracy.
+ */
+#define SCALE_ADJUST 5
+#if SCALE_ADJUST == 5
+#define PRESCALE_L 0x00100001
+#define PRESCALE_H 0x00000200
+#define SCALE_L 0x40C620E8
+#define SCALE_H 0x0000839d
+#elif SCALE_ADJUST == 0
+#define PRESCALE_L 0x00200001
+#define PRESCALE_H 0x00000800
+#define SCALE_L 0x01040108
+#define SCALE_H 0x00000108
+#else
+#error SCALE_ADJUST must either be 5 or 0.
+#endif
+#define ALPHA_L 0x00000000
+#define ALPHA_H 0x00ff0000
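+
+/* Per 16-bit channel the code below computes, in effect,
+ *
+ *     out = ((channel * PRESCALE) >> SCALE_ADJUST) * SCALE >> 16
+ *
+ * e.g. for red with SCALE_ADJUST == 5: the masked channel is r << 11,
+ * its prescale is 1, and ((r << 6) * 0x20E8) >> 16 == (r * 0x20E8) >> 10,
+ * which maps 31 -> 255 exactly.  Alpha is then forced to 0xff.
+ */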
+
+/**
+ * MMX optimized version of the RGB565 to RGBA copy routine.
+ */
+
+ .text
+ .globl _generic_read_RGBA_span_RGB565_MMX
+#ifndef USE_DRICORE
+ .hidden _generic_read_RGBA_span_RGB565_MMX
+#endif
+ .type _generic_read_RGBA_span_RGB565_MMX, @function
+
+_generic_read_RGBA_span_RGB565_MMX:
+
+#ifdef USE_INNER_EMMS
+ emms
+#endif
+
+ movl 4(%esp), %eax /* source pointer */
+ movl 8(%esp), %edx /* destination pointer */
+ movl 12(%esp), %ecx /* number of pixels to copy */
+
+ pushl $MASK_565_H
+ pushl $MASK_565_L
+ movq (%esp), %mm5
+ pushl $PRESCALE_H
+ pushl $PRESCALE_L
+ movq (%esp), %mm6
+ pushl $SCALE_H
+ pushl $SCALE_L
+ movq (%esp), %mm7
+ pushl $ALPHA_H
+ pushl $ALPHA_L
+ movq (%esp), %mm3
+ addl $32,%esp
+
+ sarl $2, %ecx
+ jl .L01 /* Bail early if the count is negative. */
+ jmp .L02
+
+.L03:
+ /* Fetch 4 RGB565 pixels into %mm4. Distribute the first and
+ * second pixels into the four words of %mm0 and %mm2.
+ */
+
+ movq (%eax), %mm4
+ addl $8, %eax
+
+ pshufw $0x00, %mm4, %mm0
+ pshufw $0x55, %mm4, %mm2
+
+
+ /* Mask the pixels so that each word of each register contains only
+ * one color component.
+ */
+
+ pand %mm5, %mm0
+ pand %mm5, %mm2
+
+
+ /* Adjust the component values so that they are as small as possible,
+ * but large enough so that we can multiply them by an unsigned 16-bit
+ * number and get a value as large as 0x00ff0000.
+ */
+
+ pmullw %mm6, %mm0
+ pmullw %mm6, %mm2
+#if SCALE_ADJUST > 0
+ psrlw $SCALE_ADJUST, %mm0
+ psrlw $SCALE_ADJUST, %mm2
+#endif
+
+	/* Scale the input component values to be in the range
+	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
+ */
+
+ pmulhuw %mm7, %mm0
+ pmulhuw %mm7, %mm2
+
+
+ /* Always set the alpha value to 0xff.
+ */
+
+ por %mm3, %mm0
+ por %mm3, %mm2
+
+
+ /* Pack the 16-bit values to 8-bit values and store the converted
+ * pixel data.
+ */
+
+ packuswb %mm2, %mm0
+ movq %mm0, (%edx)
+ addl $8, %edx
+
+ pshufw $0xaa, %mm4, %mm0
+ pshufw $0xff, %mm4, %mm2
+
+ pand %mm5, %mm0
+ pand %mm5, %mm2
+ pmullw %mm6, %mm0
+ pmullw %mm6, %mm2
+#if SCALE_ADJUST > 0
+ psrlw $SCALE_ADJUST, %mm0
+ psrlw $SCALE_ADJUST, %mm2
+#endif
+ pmulhuw %mm7, %mm0
+ pmulhuw %mm7, %mm2
+
+ por %mm3, %mm0
+ por %mm3, %mm2
+
+ packuswb %mm2, %mm0
+
+ movq %mm0, (%edx)
+ addl $8, %edx
+
+ subl $1, %ecx
+.L02:
+ jne .L03
+
+
+ /* At this point there can be at most 3 pixels left to process. If
+	 * there are either 2 or 3 left, process 2.
+ */
+
+ movl 12(%esp), %ecx
+ testl $0x02, %ecx
+ je .L04
+
+ movd (%eax), %mm4
+ addl $4, %eax
+
+ pshufw $0x00, %mm4, %mm0
+ pshufw $0x55, %mm4, %mm2
+
+ pand %mm5, %mm0
+ pand %mm5, %mm2
+ pmullw %mm6, %mm0
+ pmullw %mm6, %mm2
+#if SCALE_ADJUST > 0
+ psrlw $SCALE_ADJUST, %mm0
+ psrlw $SCALE_ADJUST, %mm2
+#endif
+ pmulhuw %mm7, %mm0
+ pmulhuw %mm7, %mm2
+
+ por %mm3, %mm0
+ por %mm3, %mm2
+
+ packuswb %mm2, %mm0
+
+ movq %mm0, (%edx)
+ addl $8, %edx
+
+.L04:
+ /* At this point there can be at most 1 pixel left to process.
+ * Process it if needed.
+ */
+
+ testl $0x01, %ecx
+ je .L01
+
+ movzwl (%eax), %ecx
+ movd %ecx, %mm4
+
+ pshufw $0x00, %mm4, %mm0
+
+ pand %mm5, %mm0
+ pmullw %mm6, %mm0
+#if SCALE_ADJUST > 0
+ psrlw $SCALE_ADJUST, %mm0
+#endif
+ pmulhuw %mm7, %mm0
+
+ por %mm3, %mm0
+
+ packuswb %mm0, %mm0
+
+ movd %mm0, (%edx)
+
+.L01:
+#ifdef USE_INNER_EMMS
+ emms
+#endif
+ ret
+#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */
+
+#if defined (__ELF__) && defined (__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/arch/x86/read_rgba_span_x86.h b/src/arch/x86/read_rgba_span_x86.h
new file mode 100644
index 0000000..564b1bb
--- /dev/null
+++ b/src/arch/x86/read_rgba_span_x86.h
@@ -0,0 +1,56 @@
+/*
+ * (C) Copyright IBM Corporation 2004
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file read_rgba_span_x86.h
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#ifndef READ_RGBA_SPAN_X86_H
+#define READ_RGBA_SPAN_X86_H
+
+#if defined(USE_SSE_ASM) || defined(USE_MMX_ASM)
+#include "x86/common_x86_asm.h"
+#endif
+
+#if defined(USE_SSE_ASM)
+extern void _generic_read_RGBA_span_BGRA8888_REV_SSE2( const unsigned char *,
+ unsigned char *, unsigned );
+#endif
+
+#if defined(USE_SSE_ASM)
+extern void _generic_read_RGBA_span_BGRA8888_REV_SSE( const unsigned char *,
+ unsigned char *, unsigned );
+#endif
+
+#if defined(USE_MMX_ASM)
+extern void _generic_read_RGBA_span_BGRA8888_REV_MMX( const unsigned char *,
+ unsigned char *, unsigned );
+
+extern void _generic_read_RGBA_span_RGB565_MMX( const unsigned char *,
+ unsigned char *, unsigned );
+#endif
+
+#endif /* READ_RGBA_SPAN_X86_H */
diff --git a/src/arch/x86/rtasm/x86sse.c b/src/arch/x86/rtasm/x86sse.c
new file mode 100644
index 0000000..c93faba
--- /dev/null
+++ b/src/arch/x86/rtasm/x86sse.c
@@ -0,0 +1,1203 @@
+#ifdef USE_X86_ASM
+#if defined(__i386__) || defined(__386__)
+
+#include "main/imports.h"
+#include "x86sse.h"
+
+#define DISASSEM 0
+#define X86_TWOB 0x0f
+
+#if 0
+static unsigned char *cptr( void (*label)() )
+{
+ return (unsigned char *)(unsigned long)label;
+}
+#endif
+
+
+static void do_realloc( struct x86_function *p )
+{
+ if (p->size == 0) {
+ p->size = 1024;
+ p->store = _mesa_exec_malloc(p->size);
+ p->csr = p->store;
+ }
+ else {
+ unsigned used = p->csr - p->store;
+ unsigned char *tmp = p->store;
+ p->size *= 2;
+ p->store = _mesa_exec_malloc(p->size);
+ memcpy(p->store, tmp, used);
+ p->csr = p->store + used;
+ _mesa_exec_free(tmp);
+ }
+}
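+
+/* Note: reallocation moves the code buffer, so raw pointers obtained
+ * from x86_get_label() before a realloc (e.g. pending forward-jump
+ * fixups) are left dangling.  Sizing the buffer up front with
+ * x86_init_func_size() can avoid this.
+ */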
+
+/* Emit bytes to the instruction stream:
+ */
+static unsigned char *reserve( struct x86_function *p, int bytes )
+{
+ if (p->csr + bytes - p->store > p->size)
+ do_realloc(p);
+
+ {
+ unsigned char *csr = p->csr;
+ p->csr += bytes;
+ return csr;
+ }
+}
+
+
+
+static void emit_1b( struct x86_function *p, char b0 )
+{
+ char *csr = (char *)reserve(p, 1);
+ *csr = b0;
+}
+
+static void emit_1i( struct x86_function *p, int i0 )
+{
+ int *icsr = (int *)reserve(p, sizeof(i0));
+ *icsr = i0;
+}
+
+static void emit_1ub( struct x86_function *p, unsigned char b0 )
+{
+ unsigned char *csr = reserve(p, 1);
+ *csr++ = b0;
+}
+
+static void emit_2ub( struct x86_function *p, unsigned char b0, unsigned char b1 )
+{
+ unsigned char *csr = reserve(p, 2);
+ *csr++ = b0;
+ *csr++ = b1;
+}
+
+static void emit_3ub( struct x86_function *p, unsigned char b0, unsigned char b1, unsigned char b2 )
+{
+ unsigned char *csr = reserve(p, 3);
+ *csr++ = b0;
+ *csr++ = b1;
+ *csr++ = b2;
+}
+
+
+/* Build a modRM byte + possible displacement. No treatment of SIB
+ * indexing. BZZT - no way to encode an absolute address.
+ */
+static void emit_modrm( struct x86_function *p,
+ struct x86_reg reg,
+ struct x86_reg regmem )
+{
+ unsigned char val = 0;
+
+ assert(reg.mod == mod_REG);
+
+ val |= regmem.mod << 6; /* mod field */
+ val |= reg.idx << 3; /* reg field */
+ val |= regmem.idx; /* r/m field */
+
+ emit_1ub(p, val);
+
+ /* Oh-oh we've stumbled into the SIB thing.
+ */
+ if (regmem.file == file_REG32 &&
+ regmem.idx == reg_SP) {
+ emit_1ub(p, 0x24); /* simplistic! */
+ }
+
+ switch (regmem.mod) {
+ case mod_REG:
+ case mod_INDIRECT:
+ break;
+ case mod_DISP8:
+ emit_1b(p, regmem.disp);
+ break;
+ case mod_DISP32:
+ emit_1i(p, regmem.disp);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+}
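+
+/* Example encoding (illustrative): loading EAX from [EBX + 8], i.e.
+ * x86_mov() with dst = EAX and src = x86_make_disp(EBX, 8), emits
+ * 8B 43 08: opcode 0x8B, then mod=01 (disp8), reg=000 (EAX),
+ * r/m=011 (EBX), then the 8-bit displacement.
+ */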
+
+
+static void emit_modrm_noreg( struct x86_function *p,
+ unsigned op,
+ struct x86_reg regmem )
+{
+ struct x86_reg dummy = x86_make_reg(file_REG32, op);
+ emit_modrm(p, dummy, regmem);
+}
+
+/* Many x86 instructions have two opcodes to cope with the situations
+ * where the destination is a register or memory reference
+ * respectively. This function selects the correct opcode based on
+ * the arguments presented.
+ */
+static void emit_op_modrm( struct x86_function *p,
+ unsigned char op_dst_is_reg,
+ unsigned char op_dst_is_mem,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ switch (dst.mod) {
+ case mod_REG:
+ emit_1ub(p, op_dst_is_reg);
+ emit_modrm(p, dst, src);
+ break;
+ case mod_INDIRECT:
+ case mod_DISP32:
+ case mod_DISP8:
+ assert(src.mod == mod_REG);
+ emit_1ub(p, op_dst_is_mem);
+ emit_modrm(p, src, dst);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+}
+
+
+
+
+
+
+
+/* Create and manipulate registers and regmem values:
+ */
+struct x86_reg x86_make_reg( enum x86_reg_file file,
+ enum x86_reg_name idx )
+{
+ struct x86_reg reg;
+
+ reg.file = file;
+ reg.idx = idx;
+ reg.mod = mod_REG;
+ reg.disp = 0;
+
+ return reg;
+}
+
+struct x86_reg x86_make_disp( struct x86_reg reg,
+ int disp )
+{
+ assert(reg.file == file_REG32);
+
+ if (reg.mod == mod_REG)
+ reg.disp = disp;
+ else
+ reg.disp += disp;
+
+ if (reg.disp == 0)
+ reg.mod = mod_INDIRECT;
+ else if (reg.disp <= 127 && reg.disp >= -128)
+ reg.mod = mod_DISP8;
+ else
+ reg.mod = mod_DISP32;
+
+ return reg;
+}
+
+struct x86_reg x86_deref( struct x86_reg reg )
+{
+ return x86_make_disp(reg, 0);
+}
+
+struct x86_reg x86_get_base_reg( struct x86_reg reg )
+{
+ return x86_make_reg( reg.file, reg.idx );
+}
+
+unsigned char *x86_get_label( struct x86_function *p )
+{
+ return p->csr;
+}
+
+
+
+/***********************************************************************
+ * x86 instructions
+ */
+
+
+void x86_jcc( struct x86_function *p,
+ enum x86_cc cc,
+ unsigned char *label )
+{
+ int offset = label - (x86_get_label(p) + 2);
+
+ if (offset <= 127 && offset >= -128) {
+ emit_1ub(p, 0x70 + cc);
+ emit_1b(p, (char) offset);
+ }
+ else {
+ offset = label - (x86_get_label(p) + 6);
+ emit_2ub(p, 0x0f, 0x80 + cc);
+ emit_1i(p, offset);
+ }
+}
+
+/* Always use a 32bit offset for forward jumps:
+ */
+unsigned char *x86_jcc_forward( struct x86_function *p,
+ enum x86_cc cc )
+{
+ emit_2ub(p, 0x0f, 0x80 + cc);
+ emit_1i(p, 0);
+ return x86_get_label(p);
+}
+
+unsigned char *x86_jmp_forward( struct x86_function *p)
+{
+ emit_1ub(p, 0xe9);
+ emit_1i(p, 0);
+ return x86_get_label(p);
+}
+
+unsigned char *x86_call_forward( struct x86_function *p)
+{
+ emit_1ub(p, 0xe8);
+ emit_1i(p, 0);
+ return x86_get_label(p);
+}
+
+/* Fixup offset from forward jump:
+ */
+void x86_fixup_fwd_jump( struct x86_function *p,
+ unsigned char *fixup )
+{
+ *(int *)(fixup - 4) = x86_get_label(p) - fixup;
+}
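+
+/* Typical use (a sketch; ecx built via x86_make_reg()):
+ *
+ *     unsigned char *fixup;
+ *     x86_test(p, ecx, ecx);
+ *     fixup = x86_jcc_forward(p, cc_E);   // 32-bit placeholder offset
+ *     ... emit the skipped body ...
+ *     x86_fixup_fwd_jump(p, fixup);       // patch the jump to land here
+ */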
+
+void x86_jmp( struct x86_function *p, unsigned char *label)
+{
+ emit_1ub(p, 0xe9);
+ emit_1i(p, label - x86_get_label(p) - 4);
+}
+
+#if 0
+/* This doesn't work once we start reallocating & copying the
+ * generated code on buffer fills, because the call is relative to the
+ * current pc.
+ */
+void x86_call( struct x86_function *p, void (*label)())
+{
+ emit_1ub(p, 0xe8);
+ emit_1i(p, cptr(label) - x86_get_label(p) - 4);
+}
+#else
+void x86_call( struct x86_function *p, struct x86_reg reg)
+{
+ emit_1ub(p, 0xff);
+ emit_modrm_noreg(p, 2, reg);
+}
+#endif
+
+
+/* michal:
+ * Temporary. As I need immediate operands and don't want to mess with the
+ * codegen, I load the immediate into a general-purpose register and use it.
+ */
+void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+ assert(dst.mod == mod_REG);
+ emit_1ub(p, 0xb8 + dst.idx);
+ emit_1i(p, imm);
+}
+
+void x86_push( struct x86_function *p,
+ struct x86_reg reg )
+{
+ assert(reg.mod == mod_REG);
+ emit_1ub(p, 0x50 + reg.idx);
+ p->stack_offset += 4;
+}
+
+void x86_pop( struct x86_function *p,
+ struct x86_reg reg )
+{
+ assert(reg.mod == mod_REG);
+ emit_1ub(p, 0x58 + reg.idx);
+ p->stack_offset -= 4;
+}
+
+void x86_inc( struct x86_function *p,
+ struct x86_reg reg )
+{
+ assert(reg.mod == mod_REG);
+ emit_1ub(p, 0x40 + reg.idx);
+}
+
+void x86_dec( struct x86_function *p,
+ struct x86_reg reg )
+{
+ assert(reg.mod == mod_REG);
+ emit_1ub(p, 0x48 + reg.idx);
+}
+
+void x86_ret( struct x86_function *p )
+{
+ emit_1ub(p, 0xc3);
+}
+
+void x86_sahf( struct x86_function *p )
+{
+ emit_1ub(p, 0x9e);
+}
+
+void x86_mov( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_op_modrm( p, 0x8b, 0x89, dst, src );
+}
+
+void x86_xor( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_op_modrm( p, 0x33, 0x31, dst, src );
+}
+
+void x86_cmp( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_op_modrm( p, 0x3b, 0x39, dst, src );
+}
+
+void x86_lea( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_1ub(p, 0x8d);
+ emit_modrm( p, dst, src );
+}
+
+void x86_test( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_1ub(p, 0x85);
+ emit_modrm( p, dst, src );
+}
+
+void x86_add( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_op_modrm(p, 0x03, 0x01, dst, src );
+}
+
+void x86_mul( struct x86_function *p,
+ struct x86_reg src )
+{
+ assert (src.file == file_REG32 && src.mod == mod_REG);
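+   /* MUL r/m32 is F7 /4; reg_SP (index 4) supplies the /4 opcode extension */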
+ emit_op_modrm(p, 0xf7, 0, x86_make_reg (file_REG32, reg_SP), src );
+}
+
+void x86_sub( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_op_modrm(p, 0x2b, 0x29, dst, src );
+}
+
+void x86_or( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_op_modrm( p, 0x0b, 0x09, dst, src );
+}
+
+void x86_and( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_op_modrm( p, 0x23, 0x21, dst, src );
+}
+
+
+
+/***********************************************************************
+ * SSE instructions
+ */
+
+
+void sse_movss( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_2ub(p, 0xF3, X86_TWOB);
+ emit_op_modrm( p, 0x10, 0x11, dst, src );
+}
+
+void sse_movaps( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_1ub(p, X86_TWOB);
+ emit_op_modrm( p, 0x28, 0x29, dst, src );
+}
+
+void sse_movups( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_1ub(p, X86_TWOB);
+ emit_op_modrm( p, 0x10, 0x11, dst, src );
+}
+
+void sse_movhps( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ assert(dst.mod != mod_REG || src.mod != mod_REG);
+ emit_1ub(p, X86_TWOB);
+ emit_op_modrm( p, 0x16, 0x17, dst, src ); /* cf movlhps */
+}
+
+void sse_movlps( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ assert(dst.mod != mod_REG || src.mod != mod_REG);
+ emit_1ub(p, X86_TWOB);
+ emit_op_modrm( p, 0x12, 0x13, dst, src ); /* cf movhlps */
+}
+
+void sse_maxps( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_2ub(p, X86_TWOB, 0x5F);
+ emit_modrm( p, dst, src );
+}
+
+void sse_maxss( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_3ub(p, 0xF3, X86_TWOB, 0x5F);
+ emit_modrm( p, dst, src );
+}
+
+void sse_divss( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_3ub(p, 0xF3, X86_TWOB, 0x5E);
+ emit_modrm( p, dst, src );
+}
+
+void sse_minps( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_2ub(p, X86_TWOB, 0x5D);
+ emit_modrm( p, dst, src );
+}
+
+void sse_subps( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_2ub(p, X86_TWOB, 0x5C);
+ emit_modrm( p, dst, src );
+}
+
+void sse_mulps( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_2ub(p, X86_TWOB, 0x59);
+ emit_modrm( p, dst, src );
+}
+
+void sse_mulss( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_3ub(p, 0xF3, X86_TWOB, 0x59);
+ emit_modrm( p, dst, src );
+}
+
+void sse_addps( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_2ub(p, X86_TWOB, 0x58);
+ emit_modrm( p, dst, src );
+}
+
+void sse_addss( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_3ub(p, 0xF3, X86_TWOB, 0x58);
+ emit_modrm( p, dst, src );
+}
+
+void sse_andnps( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_2ub(p, X86_TWOB, 0x55);
+ emit_modrm( p, dst, src );
+}
+
+void sse_andps( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_2ub(p, X86_TWOB, 0x54);
+ emit_modrm( p, dst, src );
+}
+
+void sse_rsqrtps( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_2ub(p, X86_TWOB, 0x52);
+ emit_modrm( p, dst, src );
+}
+
+void sse_rsqrtss( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_3ub(p, 0xF3, X86_TWOB, 0x52);
+ emit_modrm( p, dst, src );
+
+}
+
+void sse_movhlps( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ assert(dst.mod == mod_REG && src.mod == mod_REG);
+ emit_2ub(p, X86_TWOB, 0x12);
+ emit_modrm( p, dst, src );
+}
+
+void sse_movlhps( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ assert(dst.mod == mod_REG && src.mod == mod_REG);
+ emit_2ub(p, X86_TWOB, 0x16);
+ emit_modrm( p, dst, src );
+}
+
+void sse_orps( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_2ub(p, X86_TWOB, 0x56);
+ emit_modrm( p, dst, src );
+}
+
+void sse_xorps( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_2ub(p, X86_TWOB, 0x57);
+ emit_modrm( p, dst, src );
+}
+
+void sse_cvtps2pi( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ assert(dst.file == file_MMX &&
+ (src.file == file_XMM || src.mod != mod_REG));
+
+ p->need_emms = 1;
+
+ emit_2ub(p, X86_TWOB, 0x2d);
+ emit_modrm( p, dst, src );
+}
+
+
+/* Shufps can also be used to implement a reduced swizzle when dest ==
+ * arg0.
+ */
+void sse_shufps( struct x86_function *p,
+ struct x86_reg dest,
+ struct x86_reg arg0,
+ unsigned char shuf)
+{
+ emit_2ub(p, X86_TWOB, 0xC6);
+ emit_modrm(p, dest, arg0);
+ emit_1ub(p, shuf);
+}
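+
+/* e.g. with dst == src, sse_shufps(p, r, r, SHUF(0,0,0,0)) broadcasts
+ * element 0 of r to all four lanes; SHUFPS takes its low two result
+ * elements from dst and its high two from src.  SHUF() is defined in
+ * x86sse.h.
+ */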
+
+void sse_cmpps( struct x86_function *p,
+ struct x86_reg dest,
+ struct x86_reg arg0,
+ unsigned char cc)
+{
+ emit_2ub(p, X86_TWOB, 0xC2);
+ emit_modrm(p, dest, arg0);
+ emit_1ub(p, cc);
+}
+
+void sse_pmovmskb( struct x86_function *p,
+ struct x86_reg dest,
+ struct x86_reg src)
+{
+ emit_3ub(p, 0x66, X86_TWOB, 0xD7);
+ emit_modrm(p, dest, src);
+}
+
+/***********************************************************************
+ * SSE2 instructions
+ */
+
+/**
+ * Perform a reduced swizzle:
+ */
+void sse2_pshufd( struct x86_function *p,
+ struct x86_reg dest,
+ struct x86_reg arg0,
+ unsigned char shuf)
+{
+ emit_3ub(p, 0x66, X86_TWOB, 0x70);
+ emit_modrm(p, dest, arg0);
+ emit_1ub(p, shuf);
+}
+
+void sse2_cvttps2dq( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_3ub( p, 0xF3, X86_TWOB, 0x5B );
+ emit_modrm( p, dst, src );
+}
+
+void sse2_cvtps2dq( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_3ub(p, 0x66, X86_TWOB, 0x5B);
+ emit_modrm( p, dst, src );
+}
+
+void sse2_packssdw( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_3ub(p, 0x66, X86_TWOB, 0x6B);
+ emit_modrm( p, dst, src );
+}
+
+void sse2_packsswb( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_3ub(p, 0x66, X86_TWOB, 0x63);
+ emit_modrm( p, dst, src );
+}
+
+void sse2_packuswb( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_3ub(p, 0x66, X86_TWOB, 0x67);
+ emit_modrm( p, dst, src );
+}
+
+void sse2_rcpps( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_2ub(p, X86_TWOB, 0x53);
+ emit_modrm( p, dst, src );
+}
+
+void sse2_rcpss( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_3ub(p, 0xF3, X86_TWOB, 0x53);
+ emit_modrm( p, dst, src );
+}
+
+void sse2_movd( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ emit_2ub(p, 0x66, X86_TWOB);
+ emit_op_modrm( p, 0x6e, 0x7e, dst, src );
+}
+
+
+
+
+/***********************************************************************
+ * x87 instructions
+ */
+void x87_fist( struct x86_function *p, struct x86_reg dst )
+{
+ emit_1ub(p, 0xdb);
+ emit_modrm_noreg(p, 2, dst);
+}
+
+void x87_fistp( struct x86_function *p, struct x86_reg dst )
+{
+ emit_1ub(p, 0xdb);
+ emit_modrm_noreg(p, 3, dst);
+}
+
+void x87_fild( struct x86_function *p, struct x86_reg arg )
+{
+ emit_1ub(p, 0xdf);
+ emit_modrm_noreg(p, 0, arg);
+}
+
+void x87_fldz( struct x86_function *p )
+{
+ emit_2ub(p, 0xd9, 0xee);
+}
+
+
+void x87_fldcw( struct x86_function *p, struct x86_reg arg )
+{
+ assert(arg.file == file_REG32);
+ assert(arg.mod != mod_REG);
+ emit_1ub(p, 0xd9);
+ emit_modrm_noreg(p, 5, arg);
+}
+
+void x87_fld1( struct x86_function *p )
+{
+ emit_2ub(p, 0xd9, 0xe8);
+}
+
+void x87_fldl2e( struct x86_function *p )
+{
+ emit_2ub(p, 0xd9, 0xea);
+}
+
+void x87_fldln2( struct x86_function *p )
+{
+ emit_2ub(p, 0xd9, 0xed);
+}
+
+void x87_fwait( struct x86_function *p )
+{
+ emit_1ub(p, 0x9b);
+}
+
+void x87_fnclex( struct x86_function *p )
+{
+ emit_2ub(p, 0xdb, 0xe2);
+}
+
+void x87_fclex( struct x86_function *p )
+{
+ x87_fwait(p);
+ x87_fnclex(p);
+}
+
+
+static void x87_arith_op( struct x86_function *p, struct x86_reg dst, struct x86_reg arg,
+ unsigned char dst0ub0,
+ unsigned char dst0ub1,
+ unsigned char arg0ub0,
+ unsigned char arg0ub1,
+ unsigned char argmem_noreg)
+{
+ assert(dst.file == file_x87);
+
+ if (arg.file == file_x87) {
+ if (dst.idx == 0)
+ emit_2ub(p, dst0ub0, dst0ub1+arg.idx);
+ else if (arg.idx == 0)
+	 emit_2ub(p, arg0ub0, arg0ub1+dst.idx); /* arg is st(0) here */
+ else
+ assert(0);
+ }
+ else if (dst.idx == 0) {
+ assert(arg.file == file_REG32);
+ emit_1ub(p, 0xd8);
+ emit_modrm_noreg(p, argmem_noreg, arg);
+ }
+ else
+ assert(0);
+}
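+
+/* Encoding scheme (illustrative): two-register x87 arithmetic uses
+ * D8 /r when st(0) is the destination and DC /r when st(0) is the
+ * source, e.g. x87_fmul(p, st0, st2) -> D8 CA and
+ * x87_fmul(p, st2, st0) -> DC CA.  With a memory operand, D8 plus the
+ * argmem_noreg opcode extension is emitted and st(0) must be the
+ * destination.
+ */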
+
+void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+{
+ x87_arith_op(p, dst, arg,
+ 0xd8, 0xc8,
+ 0xdc, 0xc8,
+ 4);
+}
+
+void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+{
+ x87_arith_op(p, dst, arg,
+ 0xd8, 0xe0,
+ 0xdc, 0xe8,
+ 4);
+}
+
+void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+{
+ x87_arith_op(p, dst, arg,
+ 0xd8, 0xe8,
+ 0xdc, 0xe0,
+ 5);
+}
+
+void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+{
+ x87_arith_op(p, dst, arg,
+ 0xd8, 0xc0,
+ 0xdc, 0xc0,
+ 0);
+}
+
+void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+{
+ x87_arith_op(p, dst, arg,
+ 0xd8, 0xf0,
+ 0xdc, 0xf8,
+ 6);
+}
+
+void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+{
+ x87_arith_op(p, dst, arg,
+ 0xd8, 0xf8,
+ 0xdc, 0xf0,
+ 7);
+}
+
+void x87_fmulp( struct x86_function *p, struct x86_reg dst )
+{
+ assert(dst.file == file_x87);
+ assert(dst.idx >= 1);
+ emit_2ub(p, 0xde, 0xc8+dst.idx);
+}
+
+void x87_fsubp( struct x86_function *p, struct x86_reg dst )
+{
+ assert(dst.file == file_x87);
+ assert(dst.idx >= 1);
+ emit_2ub(p, 0xde, 0xe8+dst.idx);
+}
+
+void x87_fsubrp( struct x86_function *p, struct x86_reg dst )
+{
+ assert(dst.file == file_x87);
+ assert(dst.idx >= 1);
+ emit_2ub(p, 0xde, 0xe0+dst.idx);
+}
+
+void x87_faddp( struct x86_function *p, struct x86_reg dst )
+{
+ assert(dst.file == file_x87);
+ assert(dst.idx >= 1);
+ emit_2ub(p, 0xde, 0xc0+dst.idx);
+}
+
+void x87_fdivp( struct x86_function *p, struct x86_reg dst )
+{
+ assert(dst.file == file_x87);
+ assert(dst.idx >= 1);
+ emit_2ub(p, 0xde, 0xf8+dst.idx);
+}
+
+void x87_fdivrp( struct x86_function *p, struct x86_reg dst )
+{
+ assert(dst.file == file_x87);
+ assert(dst.idx >= 1);
+ emit_2ub(p, 0xde, 0xf0+dst.idx);
+}
+
+void x87_fucom( struct x86_function *p, struct x86_reg arg )
+{
+ assert(arg.file == file_x87);
+ emit_2ub(p, 0xdd, 0xe0+arg.idx);
+}
+
+void x87_fucomp( struct x86_function *p, struct x86_reg arg )
+{
+ assert(arg.file == file_x87);
+ emit_2ub(p, 0xdd, 0xe8+arg.idx);
+}
+
+void x87_fucompp( struct x86_function *p )
+{
+ emit_2ub(p, 0xda, 0xe9);
+}
+
+void x87_fxch( struct x86_function *p, struct x86_reg arg )
+{
+ assert(arg.file == file_x87);
+ emit_2ub(p, 0xd9, 0xc8+arg.idx);
+}
+
+void x87_fabs( struct x86_function *p )
+{
+ emit_2ub(p, 0xd9, 0xe1);
+}
+
+void x87_fchs( struct x86_function *p )
+{
+ emit_2ub(p, 0xd9, 0xe0);
+}
+
+void x87_fcos( struct x86_function *p )
+{
+ emit_2ub(p, 0xd9, 0xff);
+}
+
+
+void x87_fprndint( struct x86_function *p )
+{
+ emit_2ub(p, 0xd9, 0xfc);
+}
+
+void x87_fscale( struct x86_function *p )
+{
+ emit_2ub(p, 0xd9, 0xfd);
+}
+
+void x87_fsin( struct x86_function *p )
+{
+ emit_2ub(p, 0xd9, 0xfe);
+}
+
+void x87_fsincos( struct x86_function *p )
+{
+ emit_2ub(p, 0xd9, 0xfb);
+}
+
+void x87_fsqrt( struct x86_function *p )
+{
+ emit_2ub(p, 0xd9, 0xfa);
+}
+
+void x87_fxtract( struct x86_function *p )
+{
+ emit_2ub(p, 0xd9, 0xf4);
+}
+
+/* st0 = (2^st0)-1
+ *
+ * Restrictions: -1.0 <= st0 <= 1.0
+ */
+void x87_f2xm1( struct x86_function *p )
+{
+ emit_2ub(p, 0xd9, 0xf0);
+}
+
+/* st1 = st1 * log2(st0);
+ * pop_stack;
+ */
+void x87_fyl2x( struct x86_function *p )
+{
+ emit_2ub(p, 0xd9, 0xf1);
+}
+
+/* st1 = st1 * log2(st0 + 1.0);
+ * pop_stack;
+ *
+ * A fast operation, with restrictions: -.29 < st0 < .29
+ */
+void x87_fyl2xp1( struct x86_function *p )
+{
+ emit_2ub(p, 0xd9, 0xf9);
+}
+
+
+void x87_fld( struct x86_function *p, struct x86_reg arg )
+{
+ if (arg.file == file_x87)
+ emit_2ub(p, 0xd9, 0xc0 + arg.idx);
+ else {
+ emit_1ub(p, 0xd9);
+ emit_modrm_noreg(p, 0, arg);
+ }
+}
+
+void x87_fst( struct x86_function *p, struct x86_reg dst )
+{
+ if (dst.file == file_x87)
+ emit_2ub(p, 0xdd, 0xd0 + dst.idx);
+ else {
+ emit_1ub(p, 0xd9);
+ emit_modrm_noreg(p, 2, dst);
+ }
+}
+
+void x87_fstp( struct x86_function *p, struct x86_reg dst )
+{
+ if (dst.file == file_x87)
+ emit_2ub(p, 0xdd, 0xd8 + dst.idx);
+ else {
+ emit_1ub(p, 0xd9);
+ emit_modrm_noreg(p, 3, dst);
+ }
+}
+
+void x87_fcom( struct x86_function *p, struct x86_reg dst )
+{
+ if (dst.file == file_x87)
+ emit_2ub(p, 0xd8, 0xd0 + dst.idx);
+ else {
+ emit_1ub(p, 0xd8);
+ emit_modrm_noreg(p, 2, dst);
+ }
+}
+
+void x87_fcomp( struct x86_function *p, struct x86_reg dst )
+{
+ if (dst.file == file_x87)
+ emit_2ub(p, 0xd8, 0xd8 + dst.idx);
+ else {
+ emit_1ub(p, 0xd8);
+ emit_modrm_noreg(p, 3, dst);
+ }
+}
+
+
+void x87_fnstsw( struct x86_function *p, struct x86_reg dst )
+{
+ assert(dst.file == file_REG32);
+
+ if (dst.idx == reg_AX &&
+ dst.mod == mod_REG)
+ emit_2ub(p, 0xdf, 0xe0);
+ else {
+ emit_1ub(p, 0xdd);
+ emit_modrm_noreg(p, 7, dst);
+ }
+}
+
+
+
+
+/***********************************************************************
+ * MMX instructions
+ */
+
+void mmx_emms( struct x86_function *p )
+{
+ assert(p->need_emms);
+ emit_2ub(p, 0x0f, 0x77);
+ p->need_emms = 0;
+}
+
+void mmx_packssdw( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ assert(dst.file == file_MMX &&
+ (src.file == file_MMX || src.mod != mod_REG));
+
+ p->need_emms = 1;
+
+ emit_2ub(p, X86_TWOB, 0x6b);
+ emit_modrm( p, dst, src );
+}
+
+void mmx_packuswb( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ assert(dst.file == file_MMX &&
+ (src.file == file_MMX || src.mod != mod_REG));
+
+ p->need_emms = 1;
+
+ emit_2ub(p, X86_TWOB, 0x67);
+ emit_modrm( p, dst, src );
+}
+
+void mmx_movd( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ p->need_emms = 1;
+ emit_1ub(p, X86_TWOB);
+ emit_op_modrm( p, 0x6e, 0x7e, dst, src );
+}
+
+void mmx_movq( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src )
+{
+ p->need_emms = 1;
+ emit_1ub(p, X86_TWOB);
+ emit_op_modrm( p, 0x6f, 0x7f, dst, src );
+}
+
+
+/***********************************************************************
+ * Helper functions
+ */
+
+
+/* Retrieve a reference to one of the function arguments, taking into
+ * account any push/pop activity:
+ */
+struct x86_reg x86_fn_arg( struct x86_function *p,
+ unsigned arg )
+{
+ return x86_make_disp(x86_make_reg(file_REG32, reg_SP),
+ p->stack_offset + arg * 4); /* ??? */
+}
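+
+/* e.g. right after function entry (stack_offset == 0), x86_fn_arg(p, 1)
+ * yields [ESP + 4], the first cdecl argument; after one x86_push() it
+ * becomes [ESP + 8].
+ */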
+
+
+void x86_init_func( struct x86_function *p )
+{
+ p->size = 0;
+ p->store = NULL;
+ p->csr = p->store;
+}
+
+int x86_init_func_size( struct x86_function *p, unsigned code_size )
+{
+ p->size = code_size;
+ p->store = _mesa_exec_malloc(code_size);
+ p->csr = p->store;
+ return p->store != NULL;
+}
+
+void x86_release_func( struct x86_function *p )
+{
+ _mesa_exec_free(p->store);
+ p->store = NULL;
+ p->csr = NULL;
+ p->size = 0;
+}
+
+
+void (*x86_get_func( struct x86_function *p ))(void)
+{
+ if (DISASSEM && p->store)
+ printf("disassemble %p %p\n", p->store, p->csr);
+ return (void (*)(void)) (unsigned long) p->store;
+}
+
+#else
+
+void x86sse_dummy( void )
+{
+}
+
+#endif
+
+#else /* USE_X86_ASM */
+
+int x86sse_c_dummy_var; /* silence warning */
+
+#endif /* USE_X86_ASM */
diff --git a/src/arch/x86/rtasm/x86sse.h b/src/arch/x86/rtasm/x86sse.h
new file mode 100644
index 0000000..f6282f5
--- /dev/null
+++ b/src/arch/x86/rtasm/x86sse.h
@@ -0,0 +1,256 @@
+
+#ifndef _X86SSE_H_
+#define _X86SSE_H_
+
+#if defined(__i386__) || defined(__386__)
+
+/* It is up to the caller to ensure that instructions issued are
+ * suitable for the host cpu. There are no checks made in this module
+ * for mmx/sse/sse2 support on the cpu.
+ */
+struct x86_reg {
+ unsigned file:3;
+ unsigned idx:3;
+ unsigned mod:2; /* mod_REG if this is just a register */
+   int disp:24;                /* only +/- 23 bits of offset - should be enough... */
+};
+
+struct x86_function {
+ unsigned size;
+ unsigned char *store;
+ unsigned char *csr;
+ unsigned stack_offset;
+ int need_emms;
+ const char *fn;
+};
+
+enum x86_reg_file {
+ file_REG32,
+ file_MMX,
+ file_XMM,
+ file_x87
+};
+
+/* Values for mod field of modr/m byte
+ */
+enum x86_reg_mod {
+ mod_INDIRECT,
+ mod_DISP8,
+ mod_DISP32,
+ mod_REG
+};
+
+enum x86_reg_name {
+ reg_AX,
+ reg_CX,
+ reg_DX,
+ reg_BX,
+ reg_SP,
+ reg_BP,
+ reg_SI,
+ reg_DI
+};
+
+
+enum x86_cc {
+ cc_O, /* overflow */
+ cc_NO, /* not overflow */
+ cc_NAE, /* not above or equal / carry */
+ cc_AE, /* above or equal / not carry */
+ cc_E, /* equal / zero */
+ cc_NE /* not equal / not zero */
+};
+
+enum sse_cc {
+ cc_Equal,
+ cc_LessThan,
+ cc_LessThanEqual,
+ cc_Unordered,
+ cc_NotEqual,
+ cc_NotLessThan,
+ cc_NotLessThanEqual,
+ cc_Ordered
+};
+
+#define cc_Z cc_E
+#define cc_NZ cc_NE
+
+/* Begin/end/retrieve function creation:
+ */
+
+
+void x86_init_func( struct x86_function *p );
+int x86_init_func_size( struct x86_function *p, unsigned code_size );
+void x86_release_func( struct x86_function *p );
+void (*x86_get_func( struct x86_function *p ))( void );
+
+
+
+/* Create and manipulate registers and regmem values:
+ */
+struct x86_reg x86_make_reg( enum x86_reg_file file,
+ enum x86_reg_name idx );
+
+struct x86_reg x86_make_disp( struct x86_reg reg,
+ int disp );
+
+struct x86_reg x86_deref( struct x86_reg reg );
+
+struct x86_reg x86_get_base_reg( struct x86_reg reg );
+
+
+/* Labels, jumps and fixup:
+ */
+unsigned char *x86_get_label( struct x86_function *p );
+
+void x86_jcc( struct x86_function *p,
+ enum x86_cc cc,
+ unsigned char *label );
+
+unsigned char *x86_jcc_forward( struct x86_function *p,
+ enum x86_cc cc );
+
+unsigned char *x86_jmp_forward( struct x86_function *p);
+
+unsigned char *x86_call_forward( struct x86_function *p);
+
+void x86_fixup_fwd_jump( struct x86_function *p,
+ unsigned char *fixup );
+
+void x86_jmp( struct x86_function *p, unsigned char *label );
+
+/* void x86_call( struct x86_function *p, void (*label)() ); */
+void x86_call( struct x86_function *p, struct x86_reg reg);
+
+/* michal:
+ * Temporary. As I need immediate operands and don't want to mess with the
+ * codegen, I load the immediate into a general-purpose register and use it.
+ */
+void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm );
+
+
+/* Macro for sse_shufps() and sse2_pshufd():
+ */
+#define SHUF(_x,_y,_z,_w) (((_x)<<0) | ((_y)<<2) | ((_z)<<4) | ((_w)<<6))
+#define SHUF_NOOP SHUF(0,1,2,3)
+#define GET_SHUF(swz, idx) (((swz) >> ((idx)*2)) & 0x3)
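+
+/* e.g. SHUF(0,1,2,3) (0xE4) is the identity shuffle, SHUF(3,2,1,0)
+ * reverses the four elements, and SHUF(0,0,0,0) broadcasts element 0.
+ */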
+
+void mmx_emms( struct x86_function *p );
+void mmx_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void mmx_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void mmx_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void mmx_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
+void sse2_cvtps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_packsswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
+ unsigned char shuf );
+void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
+void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_divss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_andnps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_andps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_cmpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src,
+ unsigned char cc );
+void sse_maxps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_maxss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_minps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movaps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movhlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movlhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movups( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_mulps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_mulss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_orps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_xorps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_subps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_rsqrtps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_rsqrtss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
+ unsigned char shuf );
+void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src );
+
+void x86_add( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_and( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_cmp( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_dec( struct x86_function *p, struct x86_reg reg );
+void x86_inc( struct x86_function *p, struct x86_reg reg );
+void x86_lea( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_mov( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_mul( struct x86_function *p, struct x86_reg src );
+void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_pop( struct x86_function *p, struct x86_reg reg );
+void x86_push( struct x86_function *p, struct x86_reg reg );
+void x86_ret( struct x86_function *p );
+void x86_sub( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_sahf( struct x86_function *p );
+
+void x87_f2xm1( struct x86_function *p );
+void x87_fabs( struct x86_function *p );
+void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_faddp( struct x86_function *p, struct x86_reg dst );
+void x87_fchs( struct x86_function *p );
+void x87_fclex( struct x86_function *p );
+void x87_fcom( struct x86_function *p, struct x86_reg dst );
+void x87_fcomp( struct x86_function *p, struct x86_reg dst );
+void x87_fcos( struct x86_function *p );
+void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_fdivp( struct x86_function *p, struct x86_reg dst );
+void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_fdivrp( struct x86_function *p, struct x86_reg dst );
+void x87_fild( struct x86_function *p, struct x86_reg arg );
+void x87_fist( struct x86_function *p, struct x86_reg dst );
+void x87_fistp( struct x86_function *p, struct x86_reg dst );
+void x87_fld( struct x86_function *p, struct x86_reg arg );
+void x87_fld1( struct x86_function *p );
+void x87_fldcw( struct x86_function *p, struct x86_reg arg );
+void x87_fldl2e( struct x86_function *p );
+void x87_fldln2( struct x86_function *p );
+void x87_fldz( struct x86_function *p );
+void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_fmulp( struct x86_function *p, struct x86_reg dst );
+void x87_fnclex( struct x86_function *p );
+void x87_fprndint( struct x86_function *p );
+void x87_fscale( struct x86_function *p );
+void x87_fsin( struct x86_function *p );
+void x87_fsincos( struct x86_function *p );
+void x87_fsqrt( struct x86_function *p );
+void x87_fst( struct x86_function *p, struct x86_reg dst );
+void x87_fstp( struct x86_function *p, struct x86_reg dst );
+void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_fsubp( struct x86_function *p, struct x86_reg dst );
+void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_fsubrp( struct x86_function *p, struct x86_reg dst );
+void x87_fxch( struct x86_function *p, struct x86_reg dst );
+void x87_fxtract( struct x86_function *p );
+void x87_fyl2x( struct x86_function *p );
+void x87_fyl2xp1( struct x86_function *p );
+void x87_fwait( struct x86_function *p );
+void x87_fnstsw( struct x86_function *p, struct x86_reg dst );
+void x87_fucompp( struct x86_function *p );
+void x87_fucomp( struct x86_function *p, struct x86_reg arg );
+void x87_fucom( struct x86_function *p, struct x86_reg arg );
+
+
+
+/* Retrieve a reference to one of the function arguments, taking into
+ * account any push/pop activity. Note - doesn't track explicit
+ * manipulation of ESP by other instructions.
+ */
+struct x86_reg x86_fn_arg( struct x86_function *p, unsigned arg );
+
+#endif
+#endif
diff --git a/src/arch/x86/sse.c b/src/arch/x86/sse.c
new file mode 100644
index 0000000..aef15b5
--- /dev/null
+++ b/src/arch/x86/sse.c
@@ -0,0 +1,123 @@
+/*
+ * Mesa 3-D graphics library
+ * Version: 6.0
+ *
+ * Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * PentiumIII-SIMD (SSE) optimizations contributed by
+ * Andre Werthmann <wertmann@cs.uni-potsdam.de>
+ */
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "math/m_xform.h"
+#include "tnl/t_context.h"
+
+#include "sse.h"
+#include "x86_xform.h"
+
+#ifdef DEBUG_MATH
+#include "math/m_debug.h"
+#endif
+
+
+#ifdef USE_SSE_ASM
+DECLARE_XFORM_GROUP( sse, 2 )
+DECLARE_XFORM_GROUP( sse, 3 )
+
+#if 1
+/* Some functions are not written in SSE assembly because the FPU versions are faster */
+extern void _ASMAPI _mesa_sse_transform_normals_no_rot( NORM_ARGS );
+extern void _ASMAPI _mesa_sse_transform_rescale_normals( NORM_ARGS );
+extern void _ASMAPI _mesa_sse_transform_rescale_normals_no_rot( NORM_ARGS );
+
+extern void _ASMAPI _mesa_sse_transform_points4_general( XFORM_ARGS );
+extern void _ASMAPI _mesa_sse_transform_points4_3d( XFORM_ARGS );
+/* XXX this function segfaults, see below */
+extern void _ASMAPI _mesa_sse_transform_points4_identity( XFORM_ARGS );
+/* XXX this one works, see below */
+extern void _ASMAPI _mesa_x86_transform_points4_identity( XFORM_ARGS );
+#else
+DECLARE_NORM_GROUP( sse )
+#endif
+
+
+extern void _ASMAPI
+_mesa_v16_sse_general_xform( GLfloat *first_vert,
+ const GLfloat *m,
+ const GLfloat *src,
+ GLuint src_stride,
+ GLuint count );
+
+extern void _ASMAPI
+_mesa_sse_project_vertices( GLfloat *first,
+ GLfloat *last,
+ const GLfloat *m,
+ GLuint stride );
+
+extern void _ASMAPI
+_mesa_sse_project_clipped_vertices( GLfloat *first,
+ GLfloat *last,
+ const GLfloat *m,
+ GLuint stride,
+ const GLubyte *clipmask );
+#endif
+
+
+void _mesa_init_sse_transform_asm( void )
+{
+#ifdef USE_SSE_ASM
+ ASSIGN_XFORM_GROUP( sse, 2 );
+ ASSIGN_XFORM_GROUP( sse, 3 );
+
+#if 1
+ /* TODO: Finish these off.
+ */
+ _mesa_transform_tab[4][MATRIX_GENERAL] =
+ _mesa_sse_transform_points4_general;
+ _mesa_transform_tab[4][MATRIX_3D] =
+ _mesa_sse_transform_points4_3d;
+ /* XXX NOTE: _mesa_sse_transform_points4_identity segfaults with the
+ conformance tests, so use the x86 version.
+ */
+ _mesa_transform_tab[4][MATRIX_IDENTITY] =
+ _mesa_x86_transform_points4_identity;/*_mesa_sse_transform_points4_identity;*/
+
+ _mesa_normal_tab[NORM_TRANSFORM_NO_ROT] =
+ _mesa_sse_transform_normals_no_rot;
+ _mesa_normal_tab[NORM_TRANSFORM | NORM_RESCALE] =
+ _mesa_sse_transform_rescale_normals;
+ _mesa_normal_tab[NORM_TRANSFORM_NO_ROT | NORM_RESCALE] =
+ _mesa_sse_transform_rescale_normals_no_rot;
+#else
+ ASSIGN_XFORM_GROUP( sse, 4 );
+
+ ASSIGN_NORM_GROUP( sse );
+#endif
+
+#ifdef DEBUG_MATH
+ _math_test_all_transform_functions( "SSE" );
+ _math_test_all_normal_transform_functions( "SSE" );
+#endif
+#endif
+}
+
diff --git a/src/arch/x86/sse.h b/src/arch/x86/sse.h
new file mode 100644
index 0000000..e92ddc1
--- /dev/null
+++ b/src/arch/x86/sse.h
@@ -0,0 +1,36 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * PentiumIII-SIMD (SSE) optimizations contributed by
+ * Andre Werthmann <wertmann@cs.uni-potsdam.de>
+ */
+
+#ifndef __SSE_H__
+#define __SSE_H__
+
+void _mesa_init_sse_transform_asm( void );
+
+#endif
diff --git a/src/arch/x86/sse_normal.S b/src/arch/x86/sse_normal.S
new file mode 100644
index 0000000..a8c0d38
--- /dev/null
+++ b/src/arch/x86/sse_normal.S
@@ -0,0 +1,261 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/** TODO:
+ * - insert PREFETCH instructions to avoid cache misses
+ * - some further optimizations are possible
+ * - for 40-50% more performance in the SSE functions, the data
+ *   (transformation matrix, src_vert, dst_vert) needs to be 16-byte aligned
+ */
+
+#ifdef USE_SSE_ASM
+#include "assyntax.h"
+#include "matypes.h"
+#include "norm_args.h"
+
+ SEG_TEXT
+
+#define M(i) REGOFF(i * 4, EDX)
+#define S(i) REGOFF(i * 4, ESI)
+#define D(i) REGOFF(i * 4, EDI)
+#define STRIDE REGOFF(12, ESI)
+
+
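+/* C sketch of the no-rot rescale kernel below (illustrative, not the
+ * actual templates in the math/ module; m = matrix->inv as loaded
+ * below).  With a diagonal matrix the transform collapses to three
+ * independent multiplies:
+ *
+ *    for (i = 0; i < count; i++) {
+ *       out[i][0] = scale * m[0]  * in[i][0];
+ *       out[i][1] = scale * m[5]  * in[i][1];
+ *       out[i][2] = scale * m[10] * in[i][2];
+ *    }
+ */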
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_sse_transform_rescale_normals_no_rot)
+HIDDEN(_mesa_sse_transform_rescale_normals_no_rot)
+GLNAME(_mesa_sse_transform_rescale_normals_no_rot):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L ( ARG_IN, ESI ) /* ptr to source GLvector3f */
+ MOV_L ( ARG_DEST, EDI ) /* ptr to dest GLvector3f */
+
+ MOV_L ( ARG_MAT, EDX ) /* ptr to matrix */
+ MOV_L ( REGOFF(MATRIX_INV, EDX), EDX) /* matrix->inv */
+
+ MOV_L ( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L ( ECX, ECX )
+ JZ( LLBL(K_G3TRNNRR_finish) ) /* count was zero; go to finish */
+
+ MOV_L ( STRIDE, EAX ) /* stride */
+ MOV_L ( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest-count */
+
+ IMUL_L( CONST(16), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVSS ( M(0), XMM1 ) /* m0 */
+ MOVSS ( M(5), XMM2 ) /* m5 */
+ UNPCKLPS( XMM2, XMM1 ) /* m5 | m0 */
+ MOVSS ( ARG_SCALE, XMM0 ) /* scale */
+ SHUFPS ( CONST(0x0), XMM0, XMM0 ) /* scale | scale */
+ MULPS ( XMM0, XMM1 ) /* m5*scale | m0*scale */
+ MULSS ( M(10), XMM0 ) /* m10*scale */
+
+ALIGNTEXT32
+LLBL(K_G3TRNNRR_top):
+ MOVLPS ( S(0), XMM2 ) /* uy | ux */
+ MULPS ( XMM1, XMM2 ) /* uy*m5*scale | ux*m0*scale */
+	MOVLPS ( XMM2, D(0) )		/* ->D(1) | ->D(0) */
+
+ MOVSS ( S(2), XMM2 ) /* uz */
+ MULSS ( XMM0, XMM2 ) /* uz*m10*scale */
+ MOVSS ( XMM2, D(2) ) /* ->D(2) */
+
+LLBL(K_G3TRNNRR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_G3TRNNRR_top) )
+
+LLBL(K_G3TRNNRR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
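+/* The full rescale kernel below computes, per normal (m = matrix->inv):
+ *
+ *    out.x = scale * (m[0]*ux + m[1]*uy + m[2]*uz)
+ *    out.y = scale * (m[4]*ux + m[5]*uy + m[6]*uz)
+ *    out.z = scale * (m[8]*ux + m[9]*uy + m[10]*uz)
+ *
+ * i.e. u times the transpose of the inverse matrix, the usual
+ * inverse-transpose normal transform, with the rescale folded in.
+ */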
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_sse_transform_rescale_normals)
+HIDDEN(_mesa_sse_transform_rescale_normals)
+GLNAME(_mesa_sse_transform_rescale_normals):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L ( ARG_IN, ESI ) /* ptr to source GLvector3f */
+ MOV_L ( ARG_DEST, EDI ) /* ptr to dest GLvector3f */
+
+ MOV_L ( ARG_MAT, EDX ) /* ptr to matrix */
+ MOV_L ( REGOFF(MATRIX_INV, EDX), EDX) /* matrix->inv */
+
+ MOV_L ( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L ( ECX, ECX )
+ JZ( LLBL(K_G3TRNR_finish) ) /* count was zero; go to finish */
+
+ MOV_L ( STRIDE, EAX ) /* stride */
+ MOV_L ( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest-count */
+
+ IMUL_L( CONST(16), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVSS ( M(0), XMM0 ) /* m0 */
+ MOVSS ( M(4), XMM1 ) /* m4 */
+ UNPCKLPS( XMM1, XMM0 ) /* m4 | m0 */
+
+ MOVSS ( ARG_SCALE, XMM4 ) /* scale */
+ SHUFPS ( CONST(0x0), XMM4, XMM4 ) /* scale | scale */
+
+ MULPS ( XMM4, XMM0 ) /* m4*scale | m0*scale */
+ MOVSS ( M(1), XMM1 ) /* m1 */
+ MOVSS ( M(5), XMM2 ) /* m5 */
+ UNPCKLPS( XMM2, XMM1 ) /* m5 | m1 */
+ MULPS ( XMM4, XMM1 ) /* m5*scale | m1*scale */
+ MOVSS ( M(2), XMM2 ) /* m2 */
+ MOVSS ( M(6), XMM3 ) /* m6 */
+ UNPCKLPS( XMM3, XMM2 ) /* m6 | m2 */
+ MULPS ( XMM4, XMM2 ) /* m6*scale | m2*scale */
+
+ MOVSS ( M(8), XMM6 ) /* m8 */
+ MULSS ( ARG_SCALE, XMM6 ) /* m8*scale */
+ MOVSS ( M(9), XMM7 ) /* m9 */
+ MULSS ( ARG_SCALE, XMM7 ) /* m9*scale */
+
+ALIGNTEXT32
+LLBL(K_G3TRNR_top):
+ MOVSS ( S(0), XMM3 ) /* ux */
+ SHUFPS ( CONST(0x0), XMM3, XMM3 ) /* ux | ux */
+ MULPS ( XMM0, XMM3 ) /* ux*m4 | ux*m0 */
+ MOVSS ( S(1), XMM4 ) /* uy */
+ SHUFPS ( CONST(0x0), XMM4, XMM4 ) /* uy | uy */
+ MULPS ( XMM1, XMM4 ) /* uy*m5 | uy*m1 */
+ MOVSS ( S(2), XMM5 ) /* uz */
+ SHUFPS ( CONST(0x0), XMM5, XMM5 ) /* uz | uz */
+ MULPS ( XMM2, XMM5 ) /* uz*m6 | uz*m2 */
+
+ ADDPS ( XMM4, XMM3 )
+ ADDPS ( XMM5, XMM3 )
+ MOVLPS ( XMM3, D(0) )
+
+ MOVSS ( M(10), XMM3 ) /* m10 */
+ MULSS ( ARG_SCALE, XMM3 ) /* m10*scale */
+ MULSS ( S(2), XMM3 ) /* m10*scale*uz */
+ MOVSS ( S(1), XMM4 ) /* uy */
+ MULSS ( XMM7, XMM4 ) /* uy*m9*scale */
+ MOVSS ( S(0), XMM5 ) /* ux */
+ MULSS ( XMM6, XMM5 ) /* ux*m8*scale */
+
+ ADDSS ( XMM4, XMM3 )
+ ADDSS ( XMM5, XMM3 )
+ MOVSS ( XMM3, D(2) )
+
+LLBL(K_G3TRNR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_G3TRNR_top) )
+
+LLBL(K_G3TRNR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
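+/* Same diagonal kernel as _mesa_sse_transform_rescale_normals_no_rot
+ * above, minus the scale factor: out = (m[0]*ux, m[5]*uy, m[10]*uz).
+ */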
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_sse_transform_normals_no_rot)
+HIDDEN(_mesa_sse_transform_normals_no_rot)
+GLNAME(_mesa_sse_transform_normals_no_rot):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L ( ARG_IN, ESI ) /* ptr to source GLvector3f */
+ MOV_L ( ARG_DEST, EDI ) /* ptr to dest GLvector3f */
+
+ MOV_L ( ARG_MAT, EDX ) /* ptr to matrix */
+ MOV_L ( REGOFF(MATRIX_INV, EDX), EDX) /* matrix->inv */
+
+ MOV_L ( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L ( ECX, ECX )
+ JZ( LLBL(K_G3TNNRR_finish) ) /* count was zero; go to finish */
+
+ MOV_L ( STRIDE, EAX ) /* stride */
+ MOV_L ( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest-count */
+
+ IMUL_L( CONST(16), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVSS( M(0), XMM0 ) /* m0 */
+ MOVSS( M(5), XMM1 ) /* m5 */
+ UNPCKLPS( XMM1, XMM0 ) /* m5 | m0 */
+ MOVSS( M(10), XMM1 ) /* m10 */
+
+ALIGNTEXT32
+LLBL(K_G3TNNRR_top):
+ MOVLPS( S(0), XMM2 ) /* uy | ux */
+ MULPS( XMM0, XMM2 ) /* uy*m5 | ux*m0 */
+ MOVLPS( XMM2, D(0) )
+
+ MOVSS( S(2), XMM2 ) /* uz */
+ MULSS( XMM1, XMM2 ) /* uz*m10 */
+ MOVSS( XMM2, D(2) )
+
+LLBL(K_G3TNNRR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_G3TNNRR_top) )
+
+LLBL(K_G3TNNRR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+#endif
+
+#if defined (__ELF__) && defined (__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/arch/x86/sse_xform1.S b/src/arch/x86/sse_xform1.S
new file mode 100644
index 0000000..4aa9de6
--- /dev/null
+++ b/src/arch/x86/sse_xform1.S
@@ -0,0 +1,446 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/** TODO:
+ * - insert PREFETCH instructions to avoid cache misses
+ * - some further optimizations are possible
+ * - for 40-50% more performance in the SSE functions, the data
+ *   (transformation matrix, src_vert, dst_vert) needs to be 16-byte aligned
+ */
+
+#ifdef USE_SSE_ASM
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define S(i) REGOFF(i * 4, ESI)
+#define D(i) REGOFF(i * 4, EDI)
+#define M(i) REGOFF(i * 4, EDX)
+
+
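+/* The 1-D general transform treats each input as (ox, 0, 0, 1), so the
+ * full 4x4 product collapses to (C sketch):
+ *
+ *    for (j = 0; j < 4; j++)
+ *       dst[j] = ox * m[j] + m[12 + j];
+ */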
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_general)
+HIDDEN( _mesa_sse_transform_points1_general )
+GLNAME( _mesa_sse_transform_points1_general ):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ CMP_L( CONST(0), ECX ) /* count == 0 ? */
+ JE( LLBL(K_GTP1GR_finish) ) /* yes -> nothing to do. */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+
+ALIGNTEXT32
+ MOVAPS( M(0), XMM0 ) /* m3 | m2 | m1 | m0 */
+ MOVAPS( M(12), XMM1 ) /* m15 | m14 | m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP1GR_top):
+ MOVSS( S(0), XMM2 ) /* ox */
+ SHUFPS( CONST(0x0), XMM2, XMM2 ) /* ox | ox | ox | ox */
+ MULPS( XMM0, XMM2 ) /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+ ADDPS( XMM1, XMM2 ) /* + | + | + | + */
+ MOVUPS( XMM2, D(0) )
+
+LLBL(K_GTP1GR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTP1GR_top) )
+
+LLBL(K_GTP1GR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_identity)
+HIDDEN(_mesa_sse_transform_points1_identity)
+GLNAME( _mesa_sse_transform_points1_identity ):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP1IR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_1), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(1), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ CMP_L( ESI, EDI )
+ JE( LLBL(K_GTP1IR_finish) )
+
+
+ALIGNTEXT32
+LLBL(K_GTP1IR_top):
+ MOV_L( S(0), EDX )
+ MOV_L( EDX, D(0) )
+
+LLBL(K_GTP1IR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTP1IR_top) )
+
+LLBL(K_GTP1IR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_3d_no_rot)
+HIDDEN(_mesa_sse_transform_points1_3d_no_rot)
+GLNAME(_mesa_sse_transform_points1_3d_no_rot):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP13DNRR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+
+ALIGNTEXT32
+ MOVSS( M(0), XMM0 ) /* m0 */
+ MOVSS( M(12), XMM1 ) /* m12 */
+ MOVSS( M(13), XMM2 ) /* m13 */
+ MOVSS( M(14), XMM3 ) /* m14 */
+
+ALIGNTEXT32
+LLBL(K_GTP13DNRR_top):
+ MOVSS( S(0), XMM4 ) /* ox */
+ MULSS( XMM0, XMM4 ) /* ox*m0 */
+ ADDSS( XMM1, XMM4 ) /* ox*m0+m12 */
+ MOVSS( XMM4, D(0) )
+
+ MOVSS( XMM2, D(1) )
+ MOVSS( XMM3, D(2) )
+
+LLBL(K_GTP13DNRR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTP13DNRR_top) )
+
+LLBL(K_GTP13DNRR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_perspective)
+HIDDEN(_mesa_sse_transform_points1_perspective)
+GLNAME(_mesa_sse_transform_points1_perspective):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP13PR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+
+ALIGNTEXT32
+ XORPS( XMM0, XMM0 ) /* 0 | 0 | 0 | 0 */
+ MOVSS( M(0), XMM1 ) /* m0 */
+ MOVSS( M(14), XMM2 ) /* m14 */
+
+ALIGNTEXT32
+LLBL(K_GTP13PR_top):
+ MOVSS( S(0), XMM3 ) /* ox */
+ MULSS( XMM1, XMM3 ) /* ox*m0 */
+ MOVSS( XMM3, D(0) ) /* ox*m0->D(0) */
+ MOVSS( XMM2, D(2) ) /* m14->D(2) */
+
+ MOVSS( XMM0, D(1) )
+ MOVSS( XMM0, D(3) )
+
+LLBL(K_GTP13PR_skip):
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(K_GTP13PR_top) )
+
+LLBL(K_GTP13PR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_2d)
+HIDDEN(_mesa_sse_transform_points1_2d)
+GLNAME(_mesa_sse_transform_points1_2d):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP13P2DR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVLPS( M(0), XMM0 ) /* m1 | m0 */
+ MOVLPS( M(12), XMM1 ) /* m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP13P2DR_top):
+ MOVSS( S(0), XMM2 ) /* ox */
+ SHUFPS( CONST(0x0), XMM2, XMM2 ) /* ox | ox | ox | ox */
+ MULPS( XMM0, XMM2 ) /* - | - | ox*m1 | ox*m0 */
+ ADDPS( XMM1, XMM2 ) /* - | - | ox*m1+m13 | ox*m0+m12 */
+ MOVLPS( XMM2, D(0) )
+
+LLBL(K_GTP13P2DR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTP13P2DR_top) )
+
+LLBL(K_GTP13P2DR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_2d_no_rot)
+HIDDEN(_mesa_sse_transform_points1_2d_no_rot)
+GLNAME(_mesa_sse_transform_points1_2d_no_rot):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP13P2DNRR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVSS( M(0), XMM0 ) /* m0 */
+ MOVSS( M(12), XMM1 ) /* m12 */
+ MOVSS( M(13), XMM2 ) /* m13 */
+
+ALIGNTEXT32
+LLBL(K_GTP13P2DNRR_top):
+ MOVSS( S(0), XMM3 ) /* ox */
+ MULSS( XMM0, XMM3 ) /* ox*m0 */
+ ADDSS( XMM1, XMM3 ) /* ox*m0+m12 */
+ MOVSS( XMM3, D(0) )
+ MOVSS( XMM2, D(1) )
+
+LLBL(K_GTP13P2DNRR_skip):
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(K_GTP13P2DNRR_top) )
+
+LLBL(K_GTP13P2DNRR_finish):
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_3d)
+HIDDEN(_mesa_sse_transform_points1_3d)
+GLNAME(_mesa_sse_transform_points1_3d):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP13P3DR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+
+ALIGNTEXT32
+ MOVAPS( M(0), XMM0 ) /* m3 | m2 | m1 | m0 */
+ MOVAPS( M(12), XMM1 ) /* m15 | m14 | m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP13P3DR_top):
+ MOVSS( S(0), XMM2 ) /* ox */
+ SHUFPS( CONST(0x0), XMM2, XMM2 ) /* ox | ox | ox | ox */
+ MULPS( XMM0, XMM2 ) /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+ ADDPS( XMM1, XMM2 ) /* +m15 | +m14 | +m13 | +m12 */
+ MOVLPS( XMM2, D(0) ) /* - | - | ->D(1)| ->D(0)*/
+ UNPCKHPS( XMM2, XMM2 ) /* ox*m3+m15 | ox*m3+m15 | ox*m2+m14 | ox*m2+m14 */
+ MOVSS( XMM2, D(2) )
+
+LLBL(K_GTP13P3DR_skip):
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(K_GTP13P3DR_top) )
+
+LLBL(K_GTP13P3DR_finish):
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+#endif
+
+#if defined (__ELF__) && defined (__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/arch/x86/sse_xform2.S b/src/arch/x86/sse_xform2.S
new file mode 100644
index 0000000..a443dad
--- /dev/null
+++ b/src/arch/x86/sse_xform2.S
@@ -0,0 +1,466 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/** TODO:
+ * - insert PREFETCH instructions to avoid cache misses
+ * - some further optimizations are possible
+ * - for 40-50% more performance in the SSE functions, the data
+ *   (transformation matrix, src_vert, dst_vert) needs to be 16-byte aligned
+ */
+
+#ifdef USE_SSE_ASM
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define S(i) REGOFF(i * 4, ESI)
+#define D(i) REGOFF(i * 4, EDI)
+#define M(i) REGOFF(i * 4, EDX)
+
+
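+/* The 2-D general transform treats each input as (ox, oy, 0, 1), so
+ * per vertex (C sketch):
+ *
+ *    for (j = 0; j < 4; j++)
+ *       dst[j] = ox * m[j] + oy * m[4 + j] + m[12 + j];
+ */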
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_general)
+HIDDEN (_mesa_sse_transform_points2_general)
+GLNAME( _mesa_sse_transform_points2_general ):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(K_GTP2GR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVAPS( M(0), XMM0 ) /* m3 | m2 | m1 | m0 */
+ MOVAPS( M(4), XMM1 ) /* m7 | m6 | m5 | m4 */
+ MOVAPS( M(12), XMM2 ) /* m15 | m14 | m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP2GR_top):
+ MOVSS( S(0), XMM3 ) /* ox */
+ SHUFPS( CONST(0x0), XMM3, XMM3 ) /* ox | ox | ox | ox */
+ MULPS( XMM0, XMM3 ) /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+ MOVSS( S(1), XMM4 ) /* oy */
+ SHUFPS( CONST(0x0), XMM4, XMM4 ) /* oy | oy | oy | oy */
+ MULPS( XMM1, XMM4 ) /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
+
+ ADDPS( XMM4, XMM3 )
+ ADDPS( XMM2, XMM3 )
+ MOVAPS( XMM3, D(0) )
+
+LLBL(K_GTP2GR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTP2GR_top) )
+
+LLBL(K_GTP2GR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_identity)
+HIDDEN(_mesa_sse_transform_points2_identity)
+GLNAME( _mesa_sse_transform_points2_identity ):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP2IR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ CMP_L( ESI, EDI )
+ JE( LLBL(K_GTP2IR_finish) )
+
+
+ALIGNTEXT32
+LLBL(K_GTP2IR_top):
+ MOV_L ( S(0), EDX )
+ MOV_L ( EDX, D(0) )
+ MOV_L ( S(1), EDX )
+ MOV_L ( EDX, D(1) )
+
+LLBL(K_GTP2IR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTP2IR_top) )
+
+LLBL(K_GTP2IR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_3d_no_rot)
+HIDDEN(_mesa_sse_transform_points2_3d_no_rot)
+GLNAME(_mesa_sse_transform_points2_3d_no_rot):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP23DNRR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ XORPS( XMM0, XMM0 ) /* clean the working register */
+
+ALIGNTEXT32
+ MOVSS ( M(0), XMM1 ) /* - | - | - | m0 */
+ MOVSS ( M(5), XMM2 ) /* - | - | - | m5 */
+ UNPCKLPS ( XMM2, XMM1 ) /* - | - | m5 | m0 */
+ MOVLPS ( M(12), XMM2 ) /* - | - | m13 | m12 */
+ MOVSS ( M(14), XMM3 ) /* - | - | - | m14 */
+
+ALIGNTEXT32
+LLBL(K_GTP23DNRR_top):
+ MOVLPS ( S(0), XMM0 ) /* - | - | oy | ox */
+ MULPS ( XMM1, XMM0 ) /* - | - | oy*m5 | ox*m0 */
+ ADDPS ( XMM2, XMM0 ) /* - | - | +m13 | +m12 */
+ MOVLPS ( XMM0, D(0) ) /* -> D(1) | -> D(0) */
+
+ MOVSS ( XMM3, D(2) ) /* -> D(2) */
+
+LLBL(K_GTP23DNRR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTP23DNRR_top) )
+
+LLBL(K_GTP23DNRR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_perspective)
+HIDDEN(_mesa_sse_transform_points2_perspective)
+GLNAME(_mesa_sse_transform_points2_perspective):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP23PR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVSS ( M(0), XMM1 ) /* - | - | - | m0 */
+ MOVSS ( M(5), XMM2 ) /* - | - | - | m5 */
+ UNPCKLPS ( XMM2, XMM1 ) /* - | - | m5 | m0 */
+ MOVSS ( M(14), XMM3 ) /* m14 */
+ XORPS ( XMM0, XMM0 ) /* 0 | 0 | 0 | 0 */
+
+ALIGNTEXT32
+LLBL(K_GTP23PR_top):
+ MOVLPS( S(0), XMM4 ) /* oy | ox */
+ MULPS( XMM1, XMM4 ) /* oy*m5 | ox*m0 */
+ MOVLPS( XMM4, D(0) ) /* ->D(1) | ->D(0) */
+ MOVSS( XMM3, D(2) ) /* ->D(2) */
+ MOVSS( XMM0, D(3) ) /* ->D(3) */
+
+LLBL(K_GTP23PR_skip):
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(K_GTP23PR_top) )
+
+LLBL(K_GTP23PR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_2d)
+HIDDEN(_mesa_sse_transform_points2_2d)
+GLNAME(_mesa_sse_transform_points2_2d):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP23P2DR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVLPS( M(0), XMM0 ) /* m1 | m0 */
+ MOVLPS( M(4), XMM1 ) /* m5 | m4 */
+ MOVLPS( M(12), XMM2 ) /* m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP23P2DR_top):
+ MOVSS( S(0), XMM3 ) /* ox */
+ SHUFPS( CONST(0x0), XMM3, XMM3 ) /* ox | ox */
+ MULPS( XMM0, XMM3 ) /* ox*m1 | ox*m0 */
+
+ MOVSS( S(1), XMM4 ) /* oy */
+ SHUFPS( CONST(0x0), XMM4, XMM4 ) /* oy | oy */
+ MULPS( XMM1, XMM4 ) /* oy*m5 | oy*m4 */
+
+ ADDPS( XMM4, XMM3 )
+ ADDPS( XMM2, XMM3 )
+ MOVLPS( XMM3, D(0) ) /* ->D(1) | ->D(0) */
+
+LLBL(K_GTP23P2DR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTP23P2DR_top) )
+
+LLBL(K_GTP23P2DR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_2d_no_rot)
+HIDDEN(_mesa_sse_transform_points2_2d_no_rot)
+GLNAME(_mesa_sse_transform_points2_2d_no_rot):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP23P2DNRR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVSS ( M(0), XMM1 ) /* m0 */
+ MOVSS ( M(5), XMM2 ) /* m5 */
+ UNPCKLPS ( XMM2, XMM1 ) /* m5 | m0 */
+ MOVLPS ( M(12), XMM2 ) /* m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP23P2DNRR_top):
+ MOVLPS( S(0), XMM0 ) /* oy | ox */
+ MULPS( XMM1, XMM0 ) /* oy*m5 | ox*m0 */
+ ADDPS( XMM2, XMM0 ) /* +m13 | +m12 */
+ MOVLPS( XMM0, D(0) ) /* ->D(1) | ->D(0) */
+
+LLBL(K_GTP23P2DNRR_skip):
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(K_GTP23P2DNRR_top) )
+
+LLBL(K_GTP23P2DNRR_finish):
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_3d)
+HIDDEN(_mesa_sse_transform_points2_3d)
+GLNAME(_mesa_sse_transform_points2_3d):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP23P3DR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVAPS( M(0), XMM0 ) /* m2 | m1 | m0 */
+ MOVAPS( M(4), XMM1 ) /* m6 | m5 | m4 */
+ MOVAPS( M(12), XMM2 ) /* m14 | m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP23P3DR_top):
+ MOVSS( S(0), XMM3 ) /* ox */
+ SHUFPS( CONST(0x0), XMM3, XMM3 ) /* ox | ox | ox */
+ MULPS( XMM0, XMM3 ) /* ox*m2 | ox*m1 | ox*m0 */
+
+ MOVSS( S(1), XMM4 ) /* oy */
+ SHUFPS( CONST(0x0), XMM4, XMM4 ) /* oy | oy | oy */
+ MULPS( XMM1, XMM4 ) /* oy*m6 | oy*m5 | oy*m4 */
+
+ ADDPS( XMM4, XMM3 )
+ ADDPS( XMM2, XMM3 )
+
+ MOVLPS( XMM3, D(0) ) /* ->D(1) | ->D(0) */
+ UNPCKHPS( XMM3, XMM3 )
+ MOVSS( XMM3, D(2) ) /* ->D(2) */
+
+LLBL(K_GTP23P3DR_skip):
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(K_GTP23P3DR_top) )
+
+LLBL(K_GTP23P3DR_finish):
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+#endif
+
+#if defined (__ELF__) && defined (__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/arch/x86/sse_xform3.S b/src/arch/x86/sse_xform3.S
new file mode 100644
index 0000000..4bc22d8
--- /dev/null
+++ b/src/arch/x86/sse_xform3.S
@@ -0,0 +1,512 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/** TODO:
+ * - insert PREFETCH instructions to avoid cache misses
+ * - some further optimizations are possible
+ * - for 40-50% more performance in the SSE functions, the data
+ *   (transformation matrix, src_vert, dst_vert) needs to be 16-byte aligned
+ */
+
+#ifdef USE_SSE_ASM
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define S(i) REGOFF(i * 4, ESI)
+#define D(i) REGOFF(i * 4, EDI)
+#define M(i) REGOFF(i * 4, EDX)
+
+
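+/* The 3-D general transform treats each input as (ox, oy, oz, 1), so
+ * per vertex (C sketch):
+ *
+ *    for (j = 0; j < 4; j++)
+ *       dst[j] = ox * m[j] + oy * m[4 + j] + oz * m[8 + j] + m[12 + j];
+ */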
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_general)
+HIDDEN(_mesa_sse_transform_points3_general)
+GLNAME( _mesa_sse_transform_points3_general ):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ CMP_L ( CONST(0), ECX ) /* count == 0 ? */
+ JE ( LLBL(K_GTPGR_finish) ) /* yes -> nothing to do. */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+
+ALIGNTEXT32
+	MOVAPS ( REGOFF(0, EDX), XMM0 )		/* m3  | m2  | m1  | m0  */
+	MOVAPS ( REGOFF(16, EDX), XMM1 )	/* m7  | m6  | m5  | m4  */
+	MOVAPS ( REGOFF(32, EDX), XMM2 )	/* m11 | m10 | m9  | m8  */
+	MOVAPS ( REGOFF(48, EDX), XMM3 )	/* m15 | m14 | m13 | m12 */
+
+
+ALIGNTEXT32
+LLBL(K_GTPGR_top):
+ MOVSS ( REGOFF(0, ESI), XMM4 ) /* | | | ox */
+ SHUFPS ( CONST(0x0), XMM4, XMM4 ) /* ox | ox | ox | ox */
+ MOVSS ( REGOFF(4, ESI), XMM5 ) /* | | | oy */
+ SHUFPS ( CONST(0x0), XMM5, XMM5 ) /* oy | oy | oy | oy */
+ MOVSS ( REGOFF(8, ESI), XMM6 ) /* | | | oz */
+ SHUFPS ( CONST(0x0), XMM6, XMM6 ) /* oz | oz | oz | oz */
+
+ MULPS ( XMM0, XMM4 ) /* m3*ox | m2*ox | m1*ox | m0*ox */
+ MULPS ( XMM1, XMM5 ) /* m7*oy | m6*oy | m5*oy | m4*oy */
+ MULPS ( XMM2, XMM6 ) /* m11*oz | m10*oz | m9*oz | m8*oz */
+
+ ADDPS ( XMM5, XMM4 )
+ ADDPS ( XMM6, XMM4 )
+ ADDPS ( XMM3, XMM4 )
+
+ MOVAPS ( XMM4, REGOFF(0, EDI) )
+
+LLBL(K_GTPGR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTPGR_top) )
+
+LLBL(K_GTPGR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_identity)
+HIDDEN(_mesa_sse_transform_points3_identity)
+GLNAME( _mesa_sse_transform_points3_identity ):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTPIR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ CMP_L( ESI, EDI )
+ JE( LLBL(K_GTPIR_finish) )
+
+
+ALIGNTEXT32
+LLBL(K_GTPIR_top):
+ MOVLPS ( S(0), XMM0 )
+ MOVLPS ( XMM0, D(0) )
+ MOVSS ( S(2), XMM0 )
+ MOVSS ( XMM0, D(2) )
+
+LLBL(K_GTPIR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTPIR_top) )
+
+LLBL(K_GTPIR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_3d_no_rot)
+HIDDEN(_mesa_sse_transform_points3_3d_no_rot)
+GLNAME(_mesa_sse_transform_points3_3d_no_rot):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP3DNRR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ XORPS( XMM0, XMM0 ) /* clean the working register */
+
+ALIGNTEXT32
+ MOVSS ( M(0), XMM1 ) /* - | - | - | m0 */
+ MOVSS ( M(5), XMM2 ) /* - | - | - | m5 */
+ UNPCKLPS ( XMM2, XMM1 ) /* - | - | m5 | m0 */
+ MOVLPS ( M(12), XMM2 ) /* - | - | m13 | m12 */
+ MOVSS ( M(10), XMM3 ) /* - | - | - | m10 */
+ MOVSS ( M(14), XMM4 ) /* - | - | - | m14 */
+
+ALIGNTEXT32
+LLBL(K_GTP3DNRR_top):
+
+ MOVLPS ( S(0), XMM0 ) /* - | - | s1 | s0 */
+ MULPS ( XMM1, XMM0 ) /* - | - | s1*m5 | s0*m0 */
+ ADDPS ( XMM2, XMM0 ) /* - | - | +m13 | +m12 */
+ MOVLPS ( XMM0, D(0) ) /* -> D(1) | -> D(0) */
+
+ MOVSS ( S(2), XMM0 ) /* sz */
+ MULSS ( XMM3, XMM0 ) /* sz*m10 */
+ ADDSS ( XMM4, XMM0 ) /* +m14 */
+ MOVSS ( XMM0, D(2) ) /* -> D(2) */
+
+LLBL(K_GTP3DNRR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTP3DNRR_top) )
+
+LLBL(K_GTP3DNRR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_perspective)
+HIDDEN(_mesa_sse_transform_points3_perspective)
+GLNAME(_mesa_sse_transform_points3_perspective):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP3PR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVSS ( M(0), XMM1 ) /* - | - | - | m0 */
+ MOVSS ( M(5), XMM2 ) /* - | - | - | m5 */
+ UNPCKLPS ( XMM2, XMM1 ) /* - | - | m5 | m0 */
+ MOVLPS ( M(8), XMM2 ) /* - | - | m9 | m8 */
+ MOVSS ( M(10), XMM3 ) /* m10 */
+ MOVSS ( M(14), XMM4 ) /* m14 */
+ XORPS ( XMM6, XMM6 ) /* 0 */
+
+ALIGNTEXT32
+LLBL(K_GTP3PR_top):
+ MOVLPS ( S(0), XMM0 ) /* oy | ox */
+ MULPS ( XMM1, XMM0 ) /* oy*m5 | ox*m0 */
+ MOVSS ( S(2), XMM5 ) /* oz */
+ SHUFPS ( CONST(0x0), XMM5, XMM5 ) /* oz | oz */
+ MULPS ( XMM2, XMM5 ) /* oz*m9 | oz*m8 */
+ ADDPS ( XMM5, XMM0 ) /* +oy*m5 | +ox*m0 */
+ MOVLPS ( XMM0, D(0) ) /* ->D(1) | ->D(0) */
+
+ MOVSS ( S(2), XMM0 ) /* oz */
+ MULSS ( XMM3, XMM0 ) /* oz*m10 */
+ ADDSS ( XMM4, XMM0 ) /* +m14 */
+ MOVSS ( XMM0, D(2) ) /* ->D(2) */
+
+ MOVSS ( S(2), XMM0 ) /* oz */
+ MOVSS ( XMM6, XMM5 ) /* 0 */
+ SUBPS ( XMM0, XMM5 ) /* -oz */
+ MOVSS ( XMM5, D(3) ) /* ->D(3) */
+
+LLBL(K_GTP3PR_skip):
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(K_GTP3PR_top) )
+
+LLBL(K_GTP3PR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_2d)
+HIDDEN(_mesa_sse_transform_points3_2d)
+GLNAME(_mesa_sse_transform_points3_2d):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP3P2DR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVLPS( M(0), XMM0 ) /* m1 | m0 */
+ MOVLPS( M(4), XMM1 ) /* m5 | m4 */
+ MOVLPS( M(12), XMM2 ) /* m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP3P2DR_top):
+ MOVSS ( S(0), XMM3 ) /* ox */
+ SHUFPS ( CONST(0x0), XMM3, XMM3 ) /* ox | ox */
+ MULPS ( XMM0, XMM3 ) /* ox*m1 | ox*m0 */
+ MOVSS ( S(1), XMM4 ) /* oy */
+ SHUFPS ( CONST(0x0), XMM4, XMM4 ) /* oy | oy */
+ MULPS ( XMM1, XMM4 ) /* oy*m5 | oy*m4 */
+
+ ADDPS ( XMM4, XMM3 )
+ ADDPS ( XMM2, XMM3 )
+ MOVLPS ( XMM3, D(0) )
+
+ MOVSS ( S(2), XMM3 )
+ MOVSS ( XMM3, D(2) )
+
+LLBL(K_GTP3P2DR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTP3P2DR_top) )
+
+LLBL(K_GTP3P2DR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_2d_no_rot)
+HIDDEN(_mesa_sse_transform_points3_2d_no_rot)
+GLNAME(_mesa_sse_transform_points3_2d_no_rot):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP3P2DNRR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVSS ( M(0), XMM1 ) /* m0 */
+ MOVSS ( M(5), XMM2 ) /* m5 */
+ UNPCKLPS ( XMM2, XMM1 ) /* m5 | m0 */
+ MOVLPS ( M(12), XMM2 ) /* m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP3P2DNRR_top):
+ MOVLPS( S(0), XMM0 ) /* oy | ox */
+ MULPS( XMM1, XMM0 ) /* oy*m5 | ox*m0 */
+ ADDPS( XMM2, XMM0 ) /* +m13 | +m12 */
+ MOVLPS( XMM0, D(0) ) /* ->D(1) | ->D(0) */
+
+ MOVSS( S(2), XMM0 )
+ MOVSS( XMM0, D(2) )
+
+LLBL(K_GTP3P2DNRR_skip):
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(K_GTP3P2DNRR_top) )
+
+LLBL(K_GTP3P2DNRR_finish):
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_3d)
+HIDDEN(_mesa_sse_transform_points3_3d)
+GLNAME(_mesa_sse_transform_points3_3d):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP3P3DR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+
+ALIGNTEXT32
+ MOVAPS( M(0), XMM0 ) /* m2 | m1 | m0 */
+ MOVAPS( M(4), XMM1 ) /* m6 | m5 | m4 */
+ MOVAPS( M(8), XMM2 ) /* m10 | m9 | m8 */
+ MOVAPS( M(12), XMM3 ) /* m14 | m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP3P3DR_top):
+ MOVSS( S(0), XMM4 )
+ SHUFPS( CONST(0x0), XMM4, XMM4 ) /* ox | ox | ox */
+ MULPS( XMM0, XMM4 ) /* ox*m2 | ox*m1 | ox*m0 */
+
+ MOVSS( S(1), XMM5 )
+ SHUFPS( CONST(0x0), XMM5, XMM5 ) /* oy | oy | oy */
+ MULPS( XMM1, XMM5 ) /* oy*m6 | oy*m5 | oy*m4 */
+
+ MOVSS( S(2), XMM6 )
+ SHUFPS( CONST(0x0), XMM6, XMM6 ) /* oz | oz | oz */
+ MULPS( XMM2, XMM6 ) /* oz*m10 | oz*m9 | oz*m8 */
+
+ ADDPS( XMM5, XMM4 ) /* + | + | + */
+ ADDPS( XMM6, XMM4 ) /* + | + | + */
+ ADDPS( XMM3, XMM4 ) /* + | + | + */
+
+ MOVLPS( XMM4, D(0) ) /* => D(1) | => D(0) */
+ UNPCKHPS( XMM4, XMM4 )
+ MOVSS( XMM4, D(2) )
+
+LLBL(K_GTP3P3DR_skip):
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(K_GTP3P3DR_top) )
+
+LLBL(K_GTP3P3DR_finish):
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+#endif
+
+#if defined (__ELF__) && defined (__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/arch/x86/sse_xform4.S b/src/arch/x86/sse_xform4.S
new file mode 100644
index 0000000..fb1fa74
--- /dev/null
+++ b/src/arch/x86/sse_xform4.S
@@ -0,0 +1,235 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef USE_SSE_ASM
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define FRAME_OFFSET 8
+
+#define SRC(i) REGOFF(i * 4, ESI)
+#define DST(i) REGOFF(i * 4, EDI)
+#define MAT(i) REGOFF(i * 4, EDX)
+
+#define SELECT(r0, r1, r2, r3) CONST( r0 * 64 + r1 * 16 + r2 * 4 + r3 )
+
+
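+/* Full 4x4 transform:
+ *    dst[j] = ox*m[j] + oy*m[4+j] + oz*m[8+j] + ow*m[12+j]
+ * The four matrix rows stay resident in XMM4-XMM7 for the whole loop,
+ * so only the source components are reloaded per vertex.
+ */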
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_sse_transform_points4_general )
+HIDDEN(_mesa_sse_transform_points4_general)
+GLNAME( _mesa_sse_transform_points4_general ):
+
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX ) /* verify non-zero count */
+ JE( LLBL( sse_general_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )/* set dest size */
+
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+
+ PREFETCHT0( REGIND(ESI) )
+
+ MOVAPS( MAT(0), XMM4 ) /* m3 | m2 | m1 | m0 */
+ MOVAPS( MAT(4), XMM5 ) /* m7 | m6 | m5 | m4 */
+ MOVAPS( MAT(8), XMM6 ) /* m11 | m10 | m9 | m8 */
+ MOVAPS( MAT(12), XMM7 ) /* m15 | m14 | m13 | m12 */
+
+ALIGNTEXT16
+LLBL( sse_general_loop ):
+
+ MOVSS( SRC(0), XMM0 ) /* ox */
+ SHUFPS( CONST(0x0), XMM0, XMM0 ) /* ox | ox | ox | ox */
+ MULPS( XMM4, XMM0 ) /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+
+ MOVSS( SRC(1), XMM1 ) /* oy */
+ SHUFPS( CONST(0x0), XMM1, XMM1 ) /* oy | oy | oy | oy */
+ MULPS( XMM5, XMM1 ) /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
+
+ MOVSS( SRC(2), XMM2 ) /* oz */
+ SHUFPS( CONST(0x0), XMM2, XMM2 ) /* oz | oz | oz | oz */
+ MULPS( XMM6, XMM2 ) /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
+
+ MOVSS( SRC(3), XMM3 ) /* ow */
+ SHUFPS( CONST(0x0), XMM3, XMM3 ) /* ow | ow | ow | ow */
+ MULPS( XMM7, XMM3 ) /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
+
+ ADDPS( XMM1, XMM0 ) /* ox*m3+oy*m7 | ... */
+ ADDPS( XMM2, XMM0 ) /* ox*m3+oy*m7+oz*m11 | ... */
+ ADDPS( XMM3, XMM0 ) /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
+ MOVAPS( XMM0, DST(0) ) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+
+ DEC_L( ECX )
+ JNZ( LLBL( sse_general_loop ) )
+
+LLBL( sse_general_done ):
+
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME( _mesa_sse_transform_points4_3d )
+HIDDEN(_mesa_sse_transform_points4_3d)
+GLNAME( _mesa_sse_transform_points4_3d ):
+
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI ) /* ptr to source GLvector4f */
+ MOV_L( ARG_DEST, EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP43P3DR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )/* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ MOVAPS( MAT(0), XMM0 ) /* m3 | m2 | m1 | m0 */
+ MOVAPS( MAT(4), XMM1 ) /* m7 | m6 | m5 | m4 */
+ MOVAPS( MAT(8), XMM2 ) /* m11 | m10 | m9 | m8 */
+ MOVAPS( MAT(12), XMM3 ) /* m15 | m14 | m13 | m12 */
+
+ALIGNTEXT32
+LLBL( K_GTP43P3DR_top ):
+ MOVSS( SRC(0), XMM4 ) /* ox */
+ SHUFPS( CONST(0x0), XMM4, XMM4 ) /* ox | ox | ox | ox */
+ MULPS( XMM0, XMM4 ) /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+
+ MOVSS( SRC(1), XMM5 ) /* oy */
+ SHUFPS( CONST(0x0), XMM5, XMM5 ) /* oy | oy | oy | oy */
+ MULPS( XMM1, XMM5 ) /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
+
+ MOVSS( SRC(2), XMM6 ) /* oz */
+ SHUFPS( CONST(0x0), XMM6, XMM6 ) /* oz | oz | oz | oz */
+ MULPS( XMM2, XMM6 ) /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
+
+ MOVSS( SRC(3), XMM7 ) /* ow */
+ SHUFPS( CONST(0x0), XMM7, XMM7 ) /* ow | ow | ow | ow */
+ MULPS( XMM3, XMM7 ) /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
+
+ ADDPS( XMM5, XMM4 ) /* ox*m3+oy*m7 | ... */
+ ADDPS( XMM6, XMM4 ) /* ox*m3+oy*m7+oz*m11 | ... */
+ ADDPS( XMM7, XMM4 ) /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
+ MOVAPS( XMM4, DST(0) ) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
+
+ MOVSS( SRC(3), XMM4 ) /* ow */
+ MOVSS( XMM4, DST(3) ) /* ->D(3) */
+
+LLBL( K_GTP43P3DR_skip ):
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(K_GTP43P3DR_top) )
+
+LLBL( K_GTP43P3DR_finish ):
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_sse_transform_points4_identity )
+HIDDEN(_mesa_sse_transform_points4_identity)
+GLNAME( _mesa_sse_transform_points4_identity ):
+
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX ) /* verify non-zero count */
+ JE( LLBL( sse_identity_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )/* set dest size */
+
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+
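+/*
+ * Plain aligned 16-byte copy per vertex; the non-temporal prefetch
+ * pulls source data ahead of the read pointer while minimizing
+ * cache pollution.
+ */
+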
+ALIGNTEXT16
+LLBL( sse_identity_loop ):
+
+ PREFETCHNTA( REGOFF(32, ESI) )
+
+ MOVAPS( REGIND(ESI), XMM0 )
+ ADD_L( EAX, ESI )
+
+ MOVAPS( XMM0, REGIND(EDI) )
+ ADD_L( CONST(16), EDI )
+
+ DEC_L( ECX )
+ JNZ( LLBL( sse_identity_loop ) )
+
+LLBL( sse_identity_done ):
+
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#endif
+
+#if defined (__ELF__) && defined (__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/arch/x86/x86_cliptest.S b/src/arch/x86/x86_cliptest.S
new file mode 100644
index 0000000..e413aee
--- /dev/null
+++ b/src/arch/x86/x86_cliptest.S
@@ -0,0 +1,407 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * NOTE: Avoid using spaces in between '(' ')' and arguments, especially
+ * with macros like CONST, LLBL that expand to CONCAT(...). Putting spaces
+ * in there will break the build on some platforms.
+ */
+
+#include "assyntax.h"
+#include "matypes.h"
+#include "clip_args.h"
+
+#define SRC0 REGOFF(0, ESI)
+#define SRC1 REGOFF(4, ESI)
+#define SRC2 REGOFF(8, ESI)
+#define SRC3 REGOFF(12, ESI)
+#define DST0 REGOFF(0, EDI)
+#define DST1 REGOFF(4, EDI)
+#define DST2 REGOFF(8, EDI)
+#define DST3 REGOFF(12, EDI)
+#define MAT0 REGOFF(0, EDX)
+#define MAT1 REGOFF(4, EDX)
+#define MAT2 REGOFF(8, EDX)
+#define MAT3 REGOFF(12, EDX)
+
+
+/*
+ * Table for clip test.
+ *
+ * bit6 = SRC3 < 0
+ * bit5 = SRC2 < 0
+ * bit4 = abs(S(2)) > abs(S(3))
+ * bit3 = SRC1 < 0
+ * bit2 = abs(S(1)) > abs(S(3))
+ * bit1 = SRC0 < 0
+ * bit0 = abs(S(0)) > abs(S(3))
+ */
+
+ SEG_DATA
+
+clip_table:
+ D_BYTE 0x00, 0x01, 0x00, 0x02, 0x04, 0x05, 0x04, 0x06
+ D_BYTE 0x00, 0x01, 0x00, 0x02, 0x08, 0x09, 0x08, 0x0a
+ D_BYTE 0x20, 0x21, 0x20, 0x22, 0x24, 0x25, 0x24, 0x26
+ D_BYTE 0x20, 0x21, 0x20, 0x22, 0x28, 0x29, 0x28, 0x2a
+ D_BYTE 0x00, 0x01, 0x00, 0x02, 0x04, 0x05, 0x04, 0x06
+ D_BYTE 0x00, 0x01, 0x00, 0x02, 0x08, 0x09, 0x08, 0x0a
+ D_BYTE 0x10, 0x11, 0x10, 0x12, 0x14, 0x15, 0x14, 0x16
+ D_BYTE 0x10, 0x11, 0x10, 0x12, 0x18, 0x19, 0x18, 0x1a
+ D_BYTE 0x3f, 0x3d, 0x3f, 0x3e, 0x37, 0x35, 0x37, 0x36
+ D_BYTE 0x3f, 0x3d, 0x3f, 0x3e, 0x3b, 0x39, 0x3b, 0x3a
+ D_BYTE 0x2f, 0x2d, 0x2f, 0x2e, 0x27, 0x25, 0x27, 0x26
+ D_BYTE 0x2f, 0x2d, 0x2f, 0x2e, 0x2b, 0x29, 0x2b, 0x2a
+ D_BYTE 0x3f, 0x3d, 0x3f, 0x3e, 0x37, 0x35, 0x37, 0x36
+ D_BYTE 0x3f, 0x3d, 0x3f, 0x3e, 0x3b, 0x39, 0x3b, 0x3a
+ D_BYTE 0x1f, 0x1d, 0x1f, 0x1e, 0x17, 0x15, 0x17, 0x16
+ D_BYTE 0x1f, 0x1d, 0x1f, 0x1e, 0x1b, 0x19, 0x1b, 0x1a
+
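+/*
+ * Each entry is the clipmask for one 7-bit index: 0x01/0x02 flag x
+ * outside the +x/-x planes, 0x04/0x08 the +y/-y planes, and
+ * 0x20/0x10 the +z/-z planes.  The top half of the table (bit6 set,
+ * i.e. w < 0) accounts for the flipped sense of every comparison.
+ */
+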
+
+ SEG_TEXT
+
+/*
+ * _mesa_x86_cliptest_points4
+ *
+ * AL: ormask
+ * AH: andmask
+ * EBX: temp0
+ * ECX: temp1
+ * EDX: clipmask[]
+ * ESI: clip[]
+ * EDI: proj[]
+ * EBP: temp2
+ */
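+
+/*
+ * Scalar sketch of one iteration of the loop below (reference only,
+ * not built; fabsf() stands in for the unsigned compare of the
+ * doubled bit patterns that the assembly uses to test magnitudes):
+ *
+ *    oow = 1.0F / S(3);
+ *    idx = 0;
+ *    idx = (idx << 1) | (S(3) < 0.0F);
+ *    idx = (idx << 1) | (S(2) < 0.0F);
+ *    idx = (idx << 1) | (fabsf(S(2)) > fabsf(S(3)));
+ *    idx = (idx << 1) | (S(1) < 0.0F);
+ *    idx = (idx << 1) | (fabsf(S(1)) > fabsf(S(3)));
+ *    idx = (idx << 1) | (S(0) < 0.0F);
+ *    idx = (idx << 1) | (fabsf(S(0)) > fabsf(S(3)));
+ *    mask = clip_table[idx];
+ *    ormask |= mask;  andmask &= mask;  clipmask[i] = mask;
+ *    if (mask == 0)
+ *       D = ( S(0)*oow, S(1)*oow, S(2)*oow, oow );   (project)
+ *    else
+ *       D = ( 0, 0, 0, 1 );                          (culled, skip)
+ */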
+
+#if defined(__ELF__) && defined(__PIC__) && defined(GNU_ASSEMBLER) && !defined(ELFPIC)
+#define ELFPIC
+#endif
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_cliptest_points4 )
+HIDDEN(_mesa_x86_cliptest_points4)
+GLNAME( _mesa_x86_cliptest_points4 ):
+
+#ifdef ELFPIC
+#define FRAME_OFFSET 20
+#else
+#define FRAME_OFFSET 16
+#endif
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBP )
+ PUSH_L( EBX )
+
+#ifdef ELFPIC
+ /* store pointer to clip_table on stack */
+ CALL( LLBL(ctp4_get_eip) )
+ ADD_L( CONST(_GLOBAL_OFFSET_TABLE_), EBX )
+ MOV_L( REGOFF(clip_table@GOT, EBX), EBX )
+ PUSH_L( EBX )
+ JMP( LLBL(ctp4_clip_table_ready) )
+
+LLBL(ctp4_get_eip):
+ /* store eip in ebx */
+ MOV_L( REGIND(ESP), EBX )
+ RET
+
+LLBL(ctp4_clip_table_ready):
+#endif
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_CLIP, EDX )
+ MOV_L( ARG_OR, EBX )
+
+ MOV_L( ARG_AND, EBP )
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+ MOV_L( EAX, ARG_SOURCE ) /* put stride in ARG_SOURCE */
+
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDX, ECX )
+
+ MOV_L( ECX, ARG_CLIP ) /* put clipmask + count in ARG_CLIP */
+ CMP_L( ECX, EDX )
+
+ MOV_B( REGIND(EBX), AL )
+ MOV_B( REGIND(EBP), AH )
+
+ JZ( LLBL(ctp4_finish) )
+
+ALIGNTEXT16
+LLBL(ctp4_top):
+
+ FLD1 /* F3 */
+ FDIV_S( SRC3 ) /* GH: don't care about div-by-zero */
+
+ MOV_L( SRC3, EBP )
+ MOV_L( SRC2, EBX )
+
+ XOR_L( ECX, ECX )
+ ADD_L( EBP, EBP ) /* ebp = abs(S(3))*2 ; carry = sign of S(3) */
+
+ ADC_L( ECX, ECX )
+ ADD_L( EBX, EBX ) /* ebx = abs(S(2))*2 ; carry = sign of S(2) */
+
+ ADC_L( ECX, ECX )
+ CMP_L( EBX, EBP ) /* carry = abs(S(2))*2 > abs(S(3))*2 */
+
+ ADC_L( ECX, ECX )
+ MOV_L( SRC1, EBX )
+
+ ADD_L( EBX, EBX ) /* ebx = abs(S(1))*2 ; carry = sign of S(1) */
+
+ ADC_L( ECX, ECX )
+ CMP_L( EBX, EBP ) /* carry = abs(S(1))*2 > abs(S(3))*2 */
+
+ ADC_L( ECX, ECX )
+ MOV_L( SRC0, EBX )
+
+ ADD_L( EBX, EBX ) /* ebx = abs(S(0))*2 ; carry = sign of S(0) */
+
+ ADC_L( ECX, ECX )
+ CMP_L( EBX, EBP ) /* carry = abs(S(0))*2 > abs(S(3))*2 */
+
+ ADC_L( ECX, ECX )
+
+#ifdef ELFPIC
+ MOV_L( REGIND(ESP), EBP ) /* clip_table */
+
+ MOV_B( REGBI(EBP, ECX), CL )
+#else
+ MOV_B( REGOFF(clip_table,ECX), CL )
+#endif
+
+ OR_B( CL, AL )
+ AND_B( CL, AH )
+
+ TEST_B( CL, CL )
+ MOV_B( CL, REGIND(EDX) )
+
+ JZ( LLBL(ctp4_proj) )
+
+LLBL(ctp4_noproj):
+
+ FSTP( ST(0) ) /* */
+
+ MOV_L( CONST(0), DST0 )
+ MOV_L( CONST(0), DST1 )
+ MOV_L( CONST(0), DST2 )
+	MOV_L( CONST(0x3f800000), DST3 )	/* 1.0F */
+
+ JMP( LLBL(ctp4_next) )
+
+LLBL(ctp4_proj):
+
+ FLD_S( SRC0 ) /* F0 F3 */
+ FMUL2( ST(1), ST0 )
+
+ FLD_S( SRC1 ) /* F1 F0 F3 */
+ FMUL2( ST(2), ST0 )
+
+ FLD_S( SRC2 ) /* F2 F1 F0 F3 */
+ FMUL2( ST(3), ST0 )
+
+ FXCH( ST(2) ) /* F0 F1 F2 F3 */
+ FSTP_S( DST0 ) /* F1 F2 F3 */
+ FSTP_S( DST1 ) /* F2 F3 */
+ FSTP_S( DST2 ) /* F3 */
+ FSTP_S( DST3 ) /* */
+
+LLBL(ctp4_next):
+
+ INC_L( EDX )
+ ADD_L( CONST(16), EDI )
+
+ ADD_L( ARG_SOURCE, ESI )
+ CMP_L( EDX, ARG_CLIP )
+
+ JNZ( LLBL(ctp4_top) )
+
+ MOV_L( ARG_OR, ECX )
+ MOV_L( ARG_AND, EDX )
+
+ MOV_B( AL, REGIND(ECX) )
+ MOV_B( AH, REGIND(EDX) )
+
+LLBL(ctp4_finish):
+
+ MOV_L( ARG_DEST, EAX )
+#ifdef ELFPIC
+ POP_L( ESI ) /* discard ptr to clip_table */
+#endif
+ POP_L( EBX )
+ POP_L( EBP )
+ POP_L( EDI )
+ POP_L( ESI )
+
+ RET
+
+
+
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_cliptest_points4_np )
+HIDDEN(_mesa_x86_cliptest_points4_np)
+GLNAME( _mesa_x86_cliptest_points4_np ):
+
+#ifdef ELFPIC
+#define FRAME_OFFSET 20
+#else
+#define FRAME_OFFSET 16
+#endif
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBP )
+ PUSH_L( EBX )
+
+#ifdef ELFPIC
+ /* store pointer to clip_table on stack */
+ CALL( LLBL(ctp4_np_get_eip) )
+ ADD_L( CONST(_GLOBAL_OFFSET_TABLE_), EBX )
+ MOV_L( REGOFF(clip_table@GOT, EBX), EBX )
+ PUSH_L( EBX )
+ JMP( LLBL(ctp4_np_clip_table_ready) )
+
+LLBL(ctp4_np_get_eip):
+ /* store eip in ebx */
+ MOV_L( REGIND(ESP), EBX )
+ RET
+
+LLBL(ctp4_np_clip_table_ready):
+#endif
+
+ MOV_L( ARG_SOURCE, ESI )
+ /* slot */
+
+ MOV_L( ARG_CLIP, EDX )
+ MOV_L( ARG_OR, EBX )
+
+ MOV_L( ARG_AND, EBP )
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( EAX, ARG_DEST ) /* put stride in ARG_DEST */
+ ADD_L( EDX, ECX )
+
+ MOV_L( ECX, EDI ) /* put clipmask + count in EDI */
+ CMP_L( ECX, EDX )
+
+ MOV_B( REGIND(EBX), AL )
+ MOV_B( REGIND(EBP), AH )
+
+ JZ( LLBL(ctp4_np_finish) )
+
+ALIGNTEXT16
+LLBL(ctp4_np_top):
+
+ MOV_L( SRC3, EBP )
+ MOV_L( SRC2, EBX )
+
+ XOR_L( ECX, ECX )
+ ADD_L( EBP, EBP ) /* ebp = abs(S(3))*2 ; carry = sign of S(3) */
+
+ ADC_L( ECX, ECX )
+ ADD_L( EBX, EBX ) /* ebx = abs(S(2))*2 ; carry = sign of S(2) */
+
+ ADC_L( ECX, ECX )
+ CMP_L( EBX, EBP ) /* carry = abs(S(2))*2 > abs(S(3))*2 */
+
+ ADC_L( ECX, ECX )
+ MOV_L( SRC1, EBX )
+
+ ADD_L( EBX, EBX ) /* ebx = abs(S(1))*2 ; carry = sign of S(1) */
+
+ ADC_L( ECX, ECX )
+ CMP_L( EBX, EBP ) /* carry = abs(S(1))*2 > abs(S(3))*2 */
+
+ ADC_L( ECX, ECX )
+ MOV_L( SRC0, EBX )
+
+ ADD_L( EBX, EBX ) /* ebx = abs(S(0))*2 ; carry = sign of S(0) */
+
+ ADC_L( ECX, ECX )
+ CMP_L( EBX, EBP ) /* carry = abs(S(0))*2 > abs(S(3))*2 */
+
+ ADC_L( ECX, ECX )
+
+#ifdef ELFPIC
+ MOV_L( REGIND(ESP), EBP ) /* clip_table */
+
+ MOV_B( REGBI(EBP, ECX), CL )
+#else
+ MOV_B( REGOFF(clip_table,ECX), CL )
+#endif
+
+ OR_B( CL, AL )
+ AND_B( CL, AH )
+
+ TEST_B( CL, CL )
+ MOV_B( CL, REGIND(EDX) )
+
+ INC_L( EDX )
+ /* slot */
+
+ ADD_L( ARG_DEST, ESI )
+ CMP_L( EDX, EDI )
+
+ JNZ( LLBL(ctp4_np_top) )
+
+ MOV_L( ARG_OR, ECX )
+ MOV_L( ARG_AND, EDX )
+
+ MOV_B( AL, REGIND(ECX) )
+ MOV_B( AH, REGIND(EDX) )
+
+LLBL(ctp4_np_finish):
+
+ MOV_L( ARG_SOURCE, EAX )
+#ifdef ELFPIC
+ POP_L( ESI ) /* discard ptr to clip_table */
+#endif
+ POP_L( EBX )
+ POP_L( EBP )
+ POP_L( EDI )
+ POP_L( ESI )
+
+ RET
+
+#if defined (__ELF__) && defined (__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/arch/x86/x86_xform.c b/src/arch/x86/x86_xform.c
new file mode 100644
index 0000000..3dcc55e
--- /dev/null
+++ b/src/arch/x86/x86_xform.c
@@ -0,0 +1,126 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Intel x86 assembly code by Josh Vanderhoof
+ */
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "math/m_xform.h"
+
+#include "x86_xform.h"
+#include "common_x86_asm.h"
+
+#ifdef USE_X86_ASM
+#ifdef USE_3DNOW_ASM
+#include "3dnow.h"
+#endif
+#ifdef USE_SSE_ASM
+#include "sse.h"
+#endif
+#endif
+
+#ifdef DEBUG_MATH
+#include "math/m_debug.h"
+#endif
+
+
+#ifdef USE_X86_ASM
+DECLARE_XFORM_GROUP( x86, 2 )
+DECLARE_XFORM_GROUP( x86, 3 )
+DECLARE_XFORM_GROUP( x86, 4 )
+
+
+extern GLvector4f * _ASMAPI
+_mesa_x86_cliptest_points4( GLvector4f *clip_vec,
+ GLvector4f *proj_vec,
+ GLubyte clipMask[],
+ GLubyte *orMask,
+ GLubyte *andMask,
+ GLboolean viewport_z_clip );
+
+extern GLvector4f * _ASMAPI
+_mesa_x86_cliptest_points4_np( GLvector4f *clip_vec,
+ GLvector4f *proj_vec,
+ GLubyte clipMask[],
+ GLubyte *orMask,
+ GLubyte *andMask,
+ GLboolean viewport_z_clip );
+
+extern void _ASMAPI
+_mesa_v16_x86_cliptest_points4( GLfloat *first_vert,
+ GLfloat *last_vert,
+ GLubyte *or_mask,
+ GLubyte *and_mask,
+ GLubyte *clip_mask,
+ GLboolean viewport_z_clip );
+
+extern void _ASMAPI
+_mesa_v16_x86_general_xform( GLfloat *dest,
+ const GLfloat *m,
+ const GLfloat *src,
+ GLuint src_stride,
+ GLuint count );
+#endif
+
+
+#ifdef USE_X86_ASM
+static void _mesa_init_x86_transform_asm( void )
+{
+ ASSIGN_XFORM_GROUP( x86, 2 );
+ ASSIGN_XFORM_GROUP( x86, 3 );
+ ASSIGN_XFORM_GROUP( x86, 4 );
+
+ _mesa_clip_tab[4] = _mesa_x86_cliptest_points4;
+ _mesa_clip_np_tab[4] = _mesa_x86_cliptest_points4_np;
+
+#ifdef DEBUG_MATH
+ _math_test_all_transform_functions( "x86" );
+ _math_test_all_cliptest_functions( "x86" );
+#endif
+}
+#endif
+
+
+void _mesa_init_all_x86_transform_asm( void )
+{
+ _mesa_get_x86_features();
+
+#ifdef USE_X86_ASM
+ if ( _mesa_x86_cpu_features ) {
+ _mesa_init_x86_transform_asm();
+ }
+
+#ifdef USE_3DNOW_ASM
+   if ( cpu_has_3dnow ) {
+      _mesa_init_3dnow_transform_asm();
+   }
+#endif
+
+#ifdef USE_SSE_ASM
+   if ( cpu_has_xmm ) {
+      _mesa_init_sse_transform_asm();
+   }
+#endif
+
+#endif
+}
diff --git a/src/arch/x86/x86_xform.h b/src/arch/x86/x86_xform.h
new file mode 100644
index 0000000..e886d9a
--- /dev/null
+++ b/src/arch/x86/x86_xform.h
@@ -0,0 +1,106 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Gareth Hughes
+ */
+
+#ifndef X86_XFORM_H
+#define X86_XFORM_H
+
+
+/* =============================================================
+ * Transformation function declarations:
+ */
+
+#define XFORM_ARGS GLvector4f *to_vec, \
+ const GLfloat m[16], \
+ const GLvector4f *from_vec
+
+#define DECLARE_XFORM_GROUP( pfx, sz ) \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_general( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_identity( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_3d_no_rot( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_perspective( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_2d( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_2d_no_rot( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_3d( XFORM_ARGS );
+
+#define ASSIGN_XFORM_GROUP( pfx, sz ) \
+ _mesa_transform_tab[sz][MATRIX_GENERAL] = \
+ _mesa_##pfx##_transform_points##sz##_general; \
+ _mesa_transform_tab[sz][MATRIX_IDENTITY] = \
+ _mesa_##pfx##_transform_points##sz##_identity; \
+ _mesa_transform_tab[sz][MATRIX_3D_NO_ROT] = \
+ _mesa_##pfx##_transform_points##sz##_3d_no_rot; \
+ _mesa_transform_tab[sz][MATRIX_PERSPECTIVE] = \
+ _mesa_##pfx##_transform_points##sz##_perspective; \
+ _mesa_transform_tab[sz][MATRIX_2D] = \
+ _mesa_##pfx##_transform_points##sz##_2d; \
+ _mesa_transform_tab[sz][MATRIX_2D_NO_ROT] = \
+ _mesa_##pfx##_transform_points##sz##_2d_no_rot; \
+ _mesa_transform_tab[sz][MATRIX_3D] = \
+ _mesa_##pfx##_transform_points##sz##_3d;
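+
+/*
+ * A minimal usage sketch, assuming the dispatch table and matrix
+ * types declared in math/m_xform.h: after ASSIGN_XFORM_GROUP( x86, 4 )
+ * a caller dispatches on the matrix type, e.g.
+ *
+ *    _mesa_transform_tab[4][MATRIX_GENERAL]( &out, mat->m, &in );
+ */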
+
+
+/* =============================================================
+ * Normal transformation function declarations:
+ */
+
+#define NORM_ARGS const GLmatrix *mat, \
+ GLfloat scale, \
+ const GLvector4f *in, \
+ const GLfloat *lengths, \
+ GLvector4f *dest
+
+#define DECLARE_NORM_GROUP( pfx ) \
+extern void _ASMAPI _mesa_##pfx##_rescale_normals( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_normalize_normals( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_normals( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_normals_no_rot( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_rescale_normals( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_rescale_normals_no_rot( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_normalize_normals( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_normalize_normals_no_rot( NORM_ARGS );
+
+#define ASSIGN_NORM_GROUP( pfx ) \
+ _mesa_normal_tab[NORM_RESCALE] = \
+ _mesa_##pfx##_rescale_normals; \
+ _mesa_normal_tab[NORM_NORMALIZE] = \
+ _mesa_##pfx##_normalize_normals; \
+ _mesa_normal_tab[NORM_TRANSFORM] = \
+ _mesa_##pfx##_transform_normals; \
+ _mesa_normal_tab[NORM_TRANSFORM_NO_ROT] = \
+ _mesa_##pfx##_transform_normals_no_rot; \
+ _mesa_normal_tab[NORM_TRANSFORM | NORM_RESCALE] = \
+ _mesa_##pfx##_transform_rescale_normals; \
+ _mesa_normal_tab[NORM_TRANSFORM_NO_ROT | NORM_RESCALE] = \
+ _mesa_##pfx##_transform_rescale_normals_no_rot; \
+ _mesa_normal_tab[NORM_TRANSFORM | NORM_NORMALIZE] = \
+ _mesa_##pfx##_transform_normalize_normals; \
+ _mesa_normal_tab[NORM_TRANSFORM_NO_ROT | NORM_NORMALIZE] = \
+ _mesa_##pfx##_transform_normalize_normals_no_rot;
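+
+/*
+ * The normal-transform table is indexed by OR-ed NORM_* flags; a
+ * hedged dispatch sketch (names assumed from math/m_xform.h, lengths
+ * may be NULL) is:
+ *
+ *    _mesa_normal_tab[NORM_TRANSFORM | NORM_NORMALIZE]( mat, scale,
+ *                                                       in, NULL, dest );
+ */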
+
+
+#endif /* X86_XFORM_H */
diff --git a/src/arch/x86/x86_xform2.S b/src/arch/x86/x86_xform2.S
new file mode 100644
index 0000000..980725e
--- /dev/null
+++ b/src/arch/x86/x86_xform2.S
@@ -0,0 +1,574 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * NOTE: Avoid using spaces in between '(' ')' and arguments, especially
+ * with macros like CONST, LLBL that expand to CONCAT(...). Putting spaces
+ * in there will break the build on some platforms.
+ */
+
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define FP_ONE 1065353216	/* IEEE-754 bit pattern of 1.0F (0x3f800000) */
+#define FP_ZERO 0		/* IEEE-754 bit pattern of 0.0F */
+
+#define SRC0 REGOFF(0, ESI)
+#define SRC1 REGOFF(4, ESI)
+#define SRC2 REGOFF(8, ESI)
+#define SRC3 REGOFF(12, ESI)
+#define DST0 REGOFF(0, EDI)
+#define DST1 REGOFF(4, EDI)
+#define DST2 REGOFF(8, EDI)
+#define DST3 REGOFF(12, EDI)
+#define MAT0 REGOFF(0, EDX)
+#define MAT1 REGOFF(4, EDX)
+#define MAT2 REGOFF(8, EDX)
+#define MAT3 REGOFF(12, EDX)
+#define MAT4 REGOFF(16, EDX)
+#define MAT5 REGOFF(20, EDX)
+#define MAT6 REGOFF(24, EDX)
+#define MAT7 REGOFF(28, EDX)
+#define MAT8 REGOFF(32, EDX)
+#define MAT9 REGOFF(36, EDX)
+#define MAT10 REGOFF(40, EDX)
+#define MAT11 REGOFF(44, EDX)
+#define MAT12 REGOFF(48, EDX)
+#define MAT13 REGOFF(52, EDX)
+#define MAT14 REGOFF(56, EDX)
+#define MAT15 REGOFF(60, EDX)
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points2_general )
+HIDDEN(_mesa_x86_transform_points2_general)
+GLNAME( _mesa_x86_transform_points2_general ):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p2_gr_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
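+/*
+ * Loop invariants: EAX = source stride, ECX = dest end pointer
+ * (start + count*16).  Each pass computes
+ *    D[i] = ox*m[i] + oy*m[i+4] + m[i+12],  i = 0..3,
+ * i.e. the full 4x4 transform of (ox, oy, 0, 1).
+ */
+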
+ALIGNTEXT16
+LLBL(x86_p2_gr_loop):
+
+ FLD_S( SRC0 ) /* F4 */
+ FMUL_S( MAT0 )
+ FLD_S( SRC0 ) /* F5 F4 */
+ FMUL_S( MAT1 )
+ FLD_S( SRC0 ) /* F6 F5 F4 */
+ FMUL_S( MAT2 )
+ FLD_S( SRC0 ) /* F7 F6 F5 F4 */
+ FMUL_S( MAT3 )
+
+ FLD_S( SRC1 ) /* F0 F7 F6 F5 F4 */
+ FMUL_S( MAT4 )
+ FLD_S( SRC1 ) /* F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT5 )
+ FLD_S( SRC1 ) /* F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT6 )
+ FLD_S( SRC1 ) /* F3 F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT7 )
+
+ FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */
+ FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(5) ) /* F2 F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(3) ) /* F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(1) ) /* F7 F6 F5 F4 */
+
+ FXCH( ST(3) ) /* F4 F6 F5 F7 */
+ FADD_S( MAT12 )
+ FXCH( ST(2) ) /* F5 F6 F4 F7 */
+ FADD_S( MAT13 )
+ FXCH( ST(1) ) /* F6 F5 F4 F7 */
+ FADD_S( MAT14 )
+ FXCH( ST(3) ) /* F7 F5 F4 F6 */
+ FADD_S( MAT15 )
+
+ FXCH( ST(2) ) /* F4 F5 F7 F6 */
+ FSTP_S( DST0 ) /* F5 F7 F6 */
+ FSTP_S( DST1 ) /* F7 F6 */
+ FXCH( ST(1) ) /* F6 F7 */
+ FSTP_S( DST2 ) /* F7 */
+ FSTP_S( DST3 ) /* */
+
+LLBL(x86_p2_gr_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p2_gr_loop) )
+
+LLBL(x86_p2_gr_done):
+
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points2_perspective )
+HIDDEN(_mesa_x86_transform_points2_perspective)
+GLNAME( _mesa_x86_transform_points2_perspective ):
+
+#define FRAME_OFFSET 12
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p2_pr_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ MOV_L( MAT14, EBX )
+
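+/*
+ * For a perspective matrix and a 2D source (z = 0, w = 1) only m0 and
+ * m5 contribute, so each pass stores D = ( ox*m0, oy*m5, m14, 0 ).
+ */
+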
+ALIGNTEXT16
+LLBL(x86_p2_pr_loop):
+
+ FLD_S( SRC0 ) /* F4 */
+ FMUL_S( MAT0 )
+
+ FLD_S( SRC1 ) /* F1 F4 */
+ FMUL_S( MAT5 )
+
+ FXCH( ST(1) ) /* F4 F1 */
+ FSTP_S( DST0 ) /* F1 */
+ FSTP_S( DST1 ) /* */
+ MOV_L( EBX, DST2 )
+ MOV_L( CONST(FP_ZERO), DST3 )
+
+LLBL(x86_p2_pr_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p2_pr_loop) )
+
+LLBL(x86_p2_pr_done):
+
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points2_3d )
+HIDDEN(_mesa_x86_transform_points2_3d)
+GLNAME( _mesa_x86_transform_points2_3d ):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p2_3dr_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p2_3dr_loop):
+
+ FLD_S( SRC0 ) /* F4 */
+ FMUL_S( MAT0 )
+ FLD_S( SRC0 ) /* F5 F4 */
+ FMUL_S( MAT1 )
+ FLD_S( SRC0 ) /* F6 F5 F4 */
+ FMUL_S( MAT2 )
+
+ FLD_S( SRC1 ) /* F0 F6 F5 F4 */
+ FMUL_S( MAT4 )
+ FLD_S( SRC1 ) /* F1 F0 F6 F5 F4 */
+ FMUL_S( MAT5 )
+ FLD_S( SRC1 ) /* F2 F1 F0 F6 F5 F4 */
+ FMUL_S( MAT6 )
+
+ FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
+ FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */
+ FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */
+ FADDP( ST0, ST(1) ) /* F6 F5 F4 */
+
+ FXCH( ST(2) ) /* F4 F5 F6 */
+ FADD_S( MAT12 )
+ FXCH( ST(1) ) /* F5 F4 F6 */
+ FADD_S( MAT13 )
+ FXCH( ST(2) ) /* F6 F4 F5 */
+ FADD_S( MAT14 )
+
+ FXCH( ST(1) ) /* F4 F6 F5 */
+ FSTP_S( DST0 ) /* F6 F5 */
+ FXCH( ST(1) ) /* F5 F6 */
+ FSTP_S( DST1 ) /* F6 */
+ FSTP_S( DST2 ) /* */
+
+LLBL(x86_p2_3dr_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p2_3dr_loop) )
+
+LLBL(x86_p2_3dr_done):
+
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points2_3d_no_rot )
+HIDDEN(_mesa_x86_transform_points2_3d_no_rot)
+GLNAME( _mesa_x86_transform_points2_3d_no_rot ):
+
+#define FRAME_OFFSET 12
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p2_3dnrr_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ MOV_L( MAT14, EBX )
+
+ALIGNTEXT16
+LLBL(x86_p2_3dnrr_loop):
+
+ FLD_S( SRC0 ) /* F4 */
+ FMUL_S( MAT0 )
+
+ FLD_S( SRC1 ) /* F1 F4 */
+ FMUL_S( MAT5 )
+
+ FXCH( ST(1) ) /* F4 F1 */
+ FADD_S( MAT12 )
+ FLD_S( MAT13 ) /* F5 F4 F1 */
+ FXCH( ST(2) ) /* F1 F4 F5 */
+ FADDP( ST0, ST(2) ) /* F4 F5 */
+
+ FSTP_S( DST0 ) /* F5 */
+ FSTP_S( DST1 ) /* */
+ MOV_L( EBX, DST2 )
+
+LLBL(x86_p2_3dnrr_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p2_3dnrr_loop) )
+
+LLBL(x86_p2_3dnrr_done):
+
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points2_2d )
+HIDDEN(_mesa_x86_transform_points2_2d)
+GLNAME( _mesa_x86_transform_points2_2d ):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p2_2dr_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p2_2dr_loop):
+
+ FLD_S( SRC0 ) /* F4 */
+ FMUL_S( MAT0 )
+ FLD_S( SRC0 ) /* F5 F4 */
+ FMUL_S( MAT1 )
+
+ FLD_S( SRC1 ) /* F0 F5 F4 */
+ FMUL_S( MAT4 )
+ FLD_S( SRC1 ) /* F1 F0 F5 F4 */
+ FMUL_S( MAT5 )
+
+ FXCH( ST(1) ) /* F0 F1 F5 F4 */
+ FADDP( ST0, ST(3) ) /* F1 F5 F4 */
+ FADDP( ST0, ST(1) ) /* F5 F4 */
+
+ FXCH( ST(1) ) /* F4 F5 */
+ FADD_S( MAT12 )
+ FXCH( ST(1) ) /* F5 F4 */
+ FADD_S( MAT13 )
+
+ FXCH( ST(1) ) /* F4 F5 */
+ FSTP_S( DST0 ) /* F5 */
+ FSTP_S( DST1 ) /* */
+
+LLBL(x86_p2_2dr_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p2_2dr_loop) )
+
+LLBL(x86_p2_2dr_done):
+
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME( _mesa_x86_transform_points2_2d_no_rot )
+HIDDEN(_mesa_x86_transform_points2_2d_no_rot)
+GLNAME( _mesa_x86_transform_points2_2d_no_rot ):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p2_2dnrr_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p2_2dnrr_loop):
+
+ FLD_S( SRC0 ) /* F4 */
+ FMUL_S( MAT0 )
+
+ FLD_S( SRC1 ) /* F1 F4 */
+ FMUL_S( MAT5 )
+
+ FXCH( ST(1) ) /* F4 F1 */
+ FADD_S( MAT12 )
+ FLD_S( MAT13 ) /* F5 F4 F1 */
+ FXCH( ST(2) ) /* F1 F4 F5 */
+ FADDP( ST0, ST(2) ) /* F4 F5 */
+
+ FSTP_S( DST0 ) /* F5 */
+ FSTP_S( DST1 ) /* */
+
+LLBL(x86_p2_2dnrr_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p2_2dnrr_loop) )
+
+LLBL(x86_p2_2dnrr_done):
+
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points2_identity )
+HIDDEN(_mesa_x86_transform_points2_identity)
+GLNAME( _mesa_x86_transform_points2_identity ):
+
+#define FRAME_OFFSET 12
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p2_ir_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ CMP_L( ESI, EDI )
+ JE( LLBL(x86_p2_ir_done) )
+
+ALIGNTEXT16
+LLBL(x86_p2_ir_loop):
+
+ MOV_L( SRC0, EBX )
+ MOV_L( SRC1, EDX )
+
+ MOV_L( EBX, DST0 )
+ MOV_L( EDX, DST1 )
+
+LLBL(x86_p2_ir_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p2_ir_loop) )
+
+LLBL(x86_p2_ir_done):
+
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+#if defined (__ELF__) && defined (__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/arch/x86/x86_xform3.S b/src/arch/x86/x86_xform3.S
new file mode 100644
index 0000000..1c782f1
--- /dev/null
+++ b/src/arch/x86/x86_xform3.S
@@ -0,0 +1,644 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * NOTE: Avoid using spaces in between '(' ')' and arguments, especially
+ * with macros like CONST, LLBL that expand to CONCAT(...). Putting spaces
+ * in there will break the build on some platforms.
+ */
+
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define FP_ONE 1065353216	/* IEEE-754 bit pattern of 1.0F (0x3f800000) */
+#define FP_ZERO 0		/* IEEE-754 bit pattern of 0.0F */
+
+#define SRC0 REGOFF(0, ESI)
+#define SRC1 REGOFF(4, ESI)
+#define SRC2 REGOFF(8, ESI)
+#define SRC3 REGOFF(12, ESI)
+#define DST0 REGOFF(0, EDI)
+#define DST1 REGOFF(4, EDI)
+#define DST2 REGOFF(8, EDI)
+#define DST3 REGOFF(12, EDI)
+#define MAT0 REGOFF(0, EDX)
+#define MAT1 REGOFF(4, EDX)
+#define MAT2 REGOFF(8, EDX)
+#define MAT3 REGOFF(12, EDX)
+#define MAT4 REGOFF(16, EDX)
+#define MAT5 REGOFF(20, EDX)
+#define MAT6 REGOFF(24, EDX)
+#define MAT7 REGOFF(28, EDX)
+#define MAT8 REGOFF(32, EDX)
+#define MAT9 REGOFF(36, EDX)
+#define MAT10 REGOFF(40, EDX)
+#define MAT11 REGOFF(44, EDX)
+#define MAT12 REGOFF(48, EDX)
+#define MAT13 REGOFF(52, EDX)
+#define MAT14 REGOFF(56, EDX)
+#define MAT15 REGOFF(60, EDX)
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points3_general )
+HIDDEN(_mesa_x86_transform_points3_general)
+GLNAME( _mesa_x86_transform_points3_general ):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p3_gr_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
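+/*
+ * Each pass computes
+ *    D[i] = ox*m[i] + oy*m[i+4] + oz*m[i+8] + m[i+12],  i = 0..3,
+ * the full 4x4 transform of (ox, oy, oz, 1).
+ */
+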
+ALIGNTEXT16
+LLBL(x86_p3_gr_loop):
+
+ FLD_S( SRC0 ) /* F4 */
+ FMUL_S( MAT0 )
+ FLD_S( SRC0 ) /* F5 F4 */
+ FMUL_S( MAT1 )
+ FLD_S( SRC0 ) /* F6 F5 F4 */
+ FMUL_S( MAT2 )
+ FLD_S( SRC0 ) /* F7 F6 F5 F4 */
+ FMUL_S( MAT3 )
+
+ FLD_S( SRC1 ) /* F0 F7 F6 F5 F4 */
+ FMUL_S( MAT4 )
+ FLD_S( SRC1 ) /* F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT5 )
+ FLD_S( SRC1 ) /* F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT6 )
+ FLD_S( SRC1 ) /* F3 F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT7 )
+
+ FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */
+ FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(5) ) /* F2 F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(3) ) /* F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(1) ) /* F7 F6 F5 F4 */
+
+ FLD_S( SRC2 ) /* F0 F7 F6 F5 F4 */
+ FMUL_S( MAT8 )
+ FLD_S( SRC2 ) /* F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT9 )
+ FLD_S( SRC2 ) /* F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT10 )
+ FLD_S( SRC2 ) /* F3 F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT11 )
+
+ FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */
+ FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(5) ) /* F2 F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(3) ) /* F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(1) ) /* F7 F6 F5 F4 */
+
+ FXCH( ST(3) ) /* F4 F6 F5 F7 */
+ FADD_S( MAT12 )
+ FXCH( ST(2) ) /* F5 F6 F4 F7 */
+ FADD_S( MAT13 )
+ FXCH( ST(1) ) /* F6 F5 F4 F7 */
+ FADD_S( MAT14 )
+ FXCH( ST(3) ) /* F7 F5 F4 F6 */
+ FADD_S( MAT15 )
+
+ FXCH( ST(2) ) /* F4 F5 F7 F6 */
+ FSTP_S( DST0 ) /* F5 F7 F6 */
+ FSTP_S( DST1 ) /* F7 F6 */
+ FXCH( ST(1) ) /* F6 F7 */
+ FSTP_S( DST2 ) /* F7 */
+ FSTP_S( DST3 ) /* */
+
+LLBL(x86_p3_gr_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p3_gr_loop) )
+
+LLBL(x86_p3_gr_done):
+
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points3_perspective )
+HIDDEN(_mesa_x86_transform_points3_perspective)
+GLNAME( _mesa_x86_transform_points3_perspective ):
+
+#define FRAME_OFFSET 12
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p3_pr_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p3_pr_loop):
+
+ FLD_S( SRC0 ) /* F4 */
+ FMUL_S( MAT0 )
+
+ FLD_S( SRC1 ) /* F5 F4 */
+ FMUL_S( MAT5 )
+
+ FLD_S( SRC2 ) /* F0 F5 F4 */
+ FMUL_S( MAT8 )
+ FLD_S( SRC2 ) /* F1 F0 F5 F4 */
+ FMUL_S( MAT9 )
+ FLD_S( SRC2 ) /* F2 F1 F0 F5 F4 */
+ FMUL_S( MAT10 )
+
+ FXCH( ST(2) ) /* F0 F1 F2 F5 F4 */
+ FADDP( ST0, ST(4) ) /* F1 F2 F5 F4 */
+ FADDP( ST0, ST(2) ) /* F2 F5 F4 */
+ FLD_S( MAT14 ) /* F6 F2 F5 F4 */
+ FXCH( ST(1) ) /* F2 F6 F5 F4 */
+ FADDP( ST0, ST(1) ) /* F6 F5 F4 */
+
+ MOV_L( SRC2, EBX )
+ XOR_L( CONST(-2147483648), EBX )/* change sign */
+
+ FXCH( ST(2) ) /* F4 F5 F6 */
+ FSTP_S( DST0 ) /* F5 F6 */
+ FSTP_S( DST1 ) /* F6 */
+ FSTP_S( DST2 ) /* */
+ MOV_L( EBX, DST3 )
+
+LLBL(x86_p3_pr_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p3_pr_loop) )
+
+LLBL(x86_p3_pr_done):
+
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points3_3d )
+HIDDEN(_mesa_x86_transform_points3_3d)
+GLNAME( _mesa_x86_transform_points3_3d ):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p3_3dr_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p3_3dr_loop):
+
+ FLD_S( SRC0 ) /* F4 */
+ FMUL_S( MAT0 )
+ FLD_S( SRC0 ) /* F5 F4 */
+ FMUL_S( MAT1 )
+ FLD_S( SRC0 ) /* F6 F5 F4 */
+ FMUL_S( MAT2 )
+
+ FLD_S( SRC1 ) /* F0 F6 F5 F4 */
+ FMUL_S( MAT4 )
+ FLD_S( SRC1 ) /* F1 F0 F6 F5 F4 */
+ FMUL_S( MAT5 )
+ FLD_S( SRC1 ) /* F2 F1 F0 F6 F5 F4 */
+ FMUL_S( MAT6 )
+
+ FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
+ FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */
+ FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */
+ FADDP( ST0, ST(1) ) /* F6 F5 F4 */
+
+ FLD_S( SRC2 ) /* F0 F6 F5 F4 */
+ FMUL_S( MAT8 )
+ FLD_S( SRC2 ) /* F1 F0 F6 F5 F4 */
+ FMUL_S( MAT9 )
+ FLD_S( SRC2 ) /* F2 F1 F0 F6 F5 F4 */
+ FMUL_S( MAT10 )
+
+ FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
+ FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */
+ FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */
+ FADDP( ST0, ST(1) ) /* F6 F5 F4 */
+
+ FXCH( ST(2) ) /* F4 F5 F6 */
+ FADD_S( MAT12 )
+ FXCH( ST(1) ) /* F5 F4 F6 */
+ FADD_S( MAT13 )
+ FXCH( ST(2) ) /* F6 F4 F5 */
+ FADD_S( MAT14 )
+
+ FXCH( ST(1) ) /* F4 F6 F5 */
+ FSTP_S( DST0 ) /* F6 F5 */
+ FXCH( ST(1) ) /* F5 F6 */
+ FSTP_S( DST1 ) /* F6 */
+ FSTP_S( DST2 ) /* */
+
+LLBL(x86_p3_3dr_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p3_3dr_loop) )
+
+LLBL(x86_p3_3dr_done):
+
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points3_3d_no_rot )
+HIDDEN(_mesa_x86_transform_points3_3d_no_rot)
+GLNAME( _mesa_x86_transform_points3_3d_no_rot ):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p3_3dnrr_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p3_3dnrr_loop):
+
+ FLD_S( SRC0 ) /* F4 */
+ FMUL_S( MAT0 )
+
+ FLD_S( SRC1 ) /* F1 F4 */
+ FMUL_S( MAT5 )
+
+ FLD_S( SRC2 ) /* F2 F1 F4 */
+ FMUL_S( MAT10 )
+
+ FXCH( ST(2) ) /* F4 F1 F2 */
+ FADD_S( MAT12 )
+ FLD_S( MAT13 ) /* F5 F4 F1 F2 */
+ FXCH( ST(2) ) /* F1 F4 F5 F2 */
+ FADDP( ST0, ST(2) ) /* F4 F5 F2 */
+ FLD_S( MAT14 ) /* F6 F4 F5 F2 */
+ FXCH( ST(3) ) /* F2 F4 F5 F6 */
+ FADDP( ST0, ST(3) ) /* F4 F5 F6 */
+
+ FSTP_S( DST0 ) /* F5 F6 */
+ FSTP_S( DST1 ) /* F6 */
+ FSTP_S( DST2 ) /* */
+
+LLBL(x86_p3_3dnrr_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p3_3dnrr_loop) )
+
+LLBL(x86_p3_3dnrr_done):
+
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points3_2d )
+HIDDEN(_mesa_x86_transform_points3_2d)
+GLNAME( _mesa_x86_transform_points3_2d ):
+
+#define FRAME_OFFSET 12
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p3_2dr_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p3_2dr_loop):
+
+ FLD_S( SRC0 ) /* F4 */
+ FMUL_S( MAT0 )
+ FLD_S( SRC0 ) /* F5 F4 */
+ FMUL_S( MAT1 )
+
+ FLD_S( SRC1 ) /* F0 F5 F4 */
+ FMUL_S( MAT4 )
+ FLD_S( SRC1 ) /* F1 F0 F5 F4 */
+ FMUL_S( MAT5 )
+
+ FXCH( ST(1) ) /* F0 F1 F5 F4 */
+ FADDP( ST0, ST(3) ) /* F1 F5 F4 */
+ FADDP( ST0, ST(1) ) /* F5 F4 */
+
+ FXCH( ST(1) ) /* F4 F5 */
+ FADD_S( MAT12 )
+ FXCH( ST(1) ) /* F5 F4 */
+ FADD_S( MAT13 )
+
+ MOV_L( SRC2, EBX )
+
+ FXCH( ST(1) ) /* F4 F5 */
+ FSTP_S( DST0 ) /* F5 */
+ FSTP_S( DST1 ) /* */
+ MOV_L( EBX, DST2 )
+
+LLBL(x86_p3_2dr_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p3_2dr_loop) )
+
+LLBL(x86_p3_2dr_done):
+
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points3_2d_no_rot )
+HIDDEN(_mesa_x86_transform_points3_2d_no_rot)
+GLNAME( _mesa_x86_transform_points3_2d_no_rot ):
+
+#define FRAME_OFFSET 12
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p3_2dnrr_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p3_2dnrr_loop):
+
+ FLD_S( SRC0 ) /* F4 */
+ FMUL_S( MAT0 )
+
+ FLD_S( SRC1 ) /* F1 F4 */
+ FMUL_S( MAT5 )
+
+ FXCH( ST(1) ) /* F4 F1 */
+ FADD_S( MAT12 )
+ FLD_S( MAT13 ) /* F5 F4 F1 */
+
+ FXCH( ST(2) ) /* F1 F4 F5 */
+ FADDP( ST0, ST(2) ) /* F4 F5 */
+
+ MOV_L( SRC2, EBX )
+
+ FSTP_S( DST0 ) /* F5 */
+ FSTP_S( DST1 ) /* */
+ MOV_L( EBX, DST2 )
+
+LLBL(x86_p3_2dnrr_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p3_2dnrr_loop) )
+
+LLBL(x86_p3_2dnrr_done):
+
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points3_identity )
+HIDDEN(_mesa_x86_transform_points3_identity)
+GLNAME( _mesa_x86_transform_points3_identity ):
+
+#define FRAME_OFFSET 16
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+ PUSH_L( EBP )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p3_ir_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ CMP_L( ESI, EDI )
+ JE( LLBL(x86_p3_ir_done) )
+
+ALIGNTEXT16
+LLBL(x86_p3_ir_loop):
+
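+	/* Integer moves copy the three floats; an equivalent x87
+	 * sequence is kept under #else for reference.
+	 */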
+#if 1
+ MOV_L( SRC0, EBX )
+ MOV_L( SRC1, EBP )
+ MOV_L( SRC2, EDX )
+
+ MOV_L( EBX, DST0 )
+ MOV_L( EBP, DST1 )
+ MOV_L( EDX, DST2 )
+#else
+ FLD_S( SRC0 )
+ FLD_S( SRC1 )
+ FLD_S( SRC2 )
+
+ FSTP_S( DST2 )
+ FSTP_S( DST1 )
+ FSTP_S( DST0 )
+#endif
+
+LLBL(x86_p3_ir_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p3_ir_loop) )
+
+LLBL(x86_p3_ir_done):
+
+ POP_L( EBP )
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+#if defined (__ELF__) && defined (__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/arch/x86/x86_xform4.S b/src/arch/x86/x86_xform4.S
new file mode 100644
index 0000000..97a8411
--- /dev/null
+++ b/src/arch/x86/x86_xform4.S
@@ -0,0 +1,677 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * NOTE: Avoid using spaces in between '(' ')' and arguments, especially
+ * with macros like CONST, LLBL that expand to CONCAT(...). Putting spaces
+ * in there will break the build on some platforms.
+ */
+
+#include "assyntax.h"
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define FP_ONE 1065353216	/* IEEE-754 bit pattern of 1.0F (0x3f800000) */
+#define FP_ZERO 0		/* IEEE-754 bit pattern of 0.0F */
+
+#define SRC0 REGOFF(0, ESI)
+#define SRC1 REGOFF(4, ESI)
+#define SRC2 REGOFF(8, ESI)
+#define SRC3 REGOFF(12, ESI)
+#define DST0 REGOFF(0, EDI)
+#define DST1 REGOFF(4, EDI)
+#define DST2 REGOFF(8, EDI)
+#define DST3 REGOFF(12, EDI)
+#define MAT0 REGOFF(0, EDX)
+#define MAT1 REGOFF(4, EDX)
+#define MAT2 REGOFF(8, EDX)
+#define MAT3 REGOFF(12, EDX)
+#define MAT4 REGOFF(16, EDX)
+#define MAT5 REGOFF(20, EDX)
+#define MAT6 REGOFF(24, EDX)
+#define MAT7 REGOFF(28, EDX)
+#define MAT8 REGOFF(32, EDX)
+#define MAT9 REGOFF(36, EDX)
+#define MAT10 REGOFF(40, EDX)
+#define MAT11 REGOFF(44, EDX)
+#define MAT12 REGOFF(48, EDX)
+#define MAT13 REGOFF(52, EDX)
+#define MAT14 REGOFF(56, EDX)
+#define MAT15 REGOFF(60, EDX)
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points4_general )
+HIDDEN(_mesa_x86_transform_points4_general)
+GLNAME( _mesa_x86_transform_points4_general ):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p4_gr_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
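+/*
+ * Each pass computes the full 4x4 transform
+ *    D[i] = ox*m[i] + oy*m[i+4] + oz*m[i+8] + ow*m[i+12],  i = 0..3.
+ */
+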
+ALIGNTEXT16
+LLBL(x86_p4_gr_loop):
+
+ FLD_S( SRC0 ) /* F4 */
+ FMUL_S( MAT0 )
+ FLD_S( SRC0 ) /* F5 F4 */
+ FMUL_S( MAT1 )
+ FLD_S( SRC0 ) /* F6 F5 F4 */
+ FMUL_S( MAT2 )
+ FLD_S( SRC0 ) /* F7 F6 F5 F4 */
+ FMUL_S( MAT3 )
+
+ FLD_S( SRC1 ) /* F0 F7 F6 F5 F4 */
+ FMUL_S( MAT4 )
+ FLD_S( SRC1 ) /* F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT5 )
+ FLD_S( SRC1 ) /* F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT6 )
+ FLD_S( SRC1 ) /* F3 F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT7 )
+
+ FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */
+ FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(5) ) /* F2 F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(3) ) /* F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(1) ) /* F7 F6 F5 F4 */
+
+ FLD_S( SRC2 ) /* F0 F7 F6 F5 F4 */
+ FMUL_S( MAT8 )
+ FLD_S( SRC2 ) /* F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT9 )
+ FLD_S( SRC2 ) /* F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT10 )
+ FLD_S( SRC2 ) /* F3 F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT11 )
+
+ FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */
+ FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(5) ) /* F2 F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(3) ) /* F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(1) ) /* F7 F6 F5 F4 */
+
+ FLD_S( SRC3 ) /* F0 F7 F6 F5 F4 */
+ FMUL_S( MAT12 )
+ FLD_S( SRC3 ) /* F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT13 )
+ FLD_S( SRC3 ) /* F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT14 )
+ FLD_S( SRC3 ) /* F3 F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT15 )
+
+ FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */
+ FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(5) ) /* F2 F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(3) ) /* F3 F7 F6 F5 F4 */
+ FADDP( ST0, ST(1) ) /* F7 F6 F5 F4 */
+
+ FXCH( ST(3) ) /* F4 F6 F5 F7 */
+ FSTP_S( DST0 ) /* F6 F5 F7 */
+ FXCH( ST(1) ) /* F5 F6 F7 */
+ FSTP_S( DST1 ) /* F6 F7 */
+ FSTP_S( DST2 ) /* F7 */
+ FSTP_S( DST3 ) /* */
+
+LLBL(x86_p4_gr_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p4_gr_loop) )
+
+LLBL(x86_p4_gr_done):
+
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points4_perspective )
+HIDDEN(_mesa_x86_transform_points4_perspective)
+GLNAME( _mesa_x86_transform_points4_perspective ):
+
+#define FRAME_OFFSET 12
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p4_pr_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p4_pr_loop):
+
+ FLD_S( SRC0 ) /* F4 */
+ FMUL_S( MAT0 )
+
+ FLD_S( SRC1 ) /* F5 F4 */
+ FMUL_S( MAT5 )
+
+ FLD_S( SRC2 ) /* F0 F5 F4 */
+ FMUL_S( MAT8 )
+ FLD_S( SRC2 ) /* F1 F0 F5 F4 */
+ FMUL_S( MAT9 )
+ FLD_S( SRC2 ) /* F6 F1 F0 F5 F4 */
+ FMUL_S( MAT10 )
+
+ FXCH( ST(2) ) /* F0 F1 F6 F5 F4 */
+ FADDP( ST0, ST(4) ) /* F1 F6 F5 F4 */
+ FADDP( ST0, ST(2) ) /* F6 F5 F4 */
+
+ FLD_S( SRC3 ) /* F2 F6 F5 F4 */
+ FMUL_S( MAT14 )
+
+ FADDP( ST0, ST(1) ) /* F6 F5 F4 */
+
+ MOV_L( SRC2, EBX )
+ XOR_L( CONST(-2147483648), EBX )/* change sign */
+
+ FXCH( ST(2) ) /* F4 F5 F6 */
+ FSTP_S( DST0 ) /* F5 F6 */
+ FSTP_S( DST1 ) /* F6 */
+ FSTP_S( DST2 ) /* */
+ MOV_L( EBX, DST3 )
+
+LLBL(x86_p4_pr_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p4_pr_loop) )
+
+LLBL(x86_p4_pr_done):
+
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points4_3d )
+HIDDEN(_mesa_x86_transform_points4_3d)
+GLNAME( _mesa_x86_transform_points4_3d ):
+
+#define FRAME_OFFSET 12
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p4_3dr_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p4_3dr_loop):
+
+ FLD_S( SRC0 ) /* F4 */
+ FMUL_S( MAT0 )
+ FLD_S( SRC0 ) /* F5 F4 */
+ FMUL_S( MAT1 )
+ FLD_S( SRC0 ) /* F6 F5 F4 */
+ FMUL_S( MAT2 )
+
+ FLD_S( SRC1 ) /* F0 F6 F5 F4 */
+ FMUL_S( MAT4 )
+ FLD_S( SRC1 ) /* F1 F0 F6 F5 F4 */
+ FMUL_S( MAT5 )
+ FLD_S( SRC1 ) /* F2 F1 F0 F6 F5 F4 */
+ FMUL_S( MAT6 )
+
+ FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
+ FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */
+ FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */
+ FADDP( ST0, ST(1) ) /* F6 F5 F4 */
+
+ FLD_S( SRC2 ) /* F0 F6 F5 F4 */
+ FMUL_S( MAT8 )
+ FLD_S( SRC2 ) /* F1 F0 F6 F5 F4 */
+ FMUL_S( MAT9 )
+ FLD_S( SRC2 ) /* F2 F1 F0 F6 F5 F4 */
+ FMUL_S( MAT10 )
+
+ FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
+ FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */
+ FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */
+ FADDP( ST0, ST(1) ) /* F6 F5 F4 */
+
+ FLD_S( SRC3 ) /* F0 F6 F5 F4 */
+ FMUL_S( MAT12 )
+ FLD_S( SRC3 ) /* F1 F0 F6 F5 F4 */
+ FMUL_S( MAT13 )
+ FLD_S( SRC3 ) /* F2 F1 F0 F6 F5 F4 */
+ FMUL_S( MAT14 )
+
+ FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
+ FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */
+ FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */
+ FADDP( ST0, ST(1) ) /* F6 F5 F4 */
+
+ MOV_L( SRC3, EBX )
+
+ FXCH( ST(2) ) /* F4 F5 F6 */
+ FSTP_S( DST0 ) /* F5 F6 */
+ FSTP_S( DST1 ) /* F6 */
+ FSTP_S( DST2 ) /* */
+ MOV_L( EBX, DST3 )
+
+LLBL(x86_p4_3dr_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p4_3dr_loop) )
+
+LLBL(x86_p4_3dr_done):
+
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points4_3d_no_rot )
+HIDDEN(_mesa_x86_transform_points4_3d_no_rot)
+GLNAME( _mesa_x86_transform_points4_3d_no_rot ):
+
+#define FRAME_OFFSET 12
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p4_3dnrr_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p4_3dnrr_loop):
+
+ FLD_S( SRC0 ) /* F4 */
+ FMUL_S( MAT0 )
+
+ FLD_S( SRC1 ) /* F5 F4 */
+ FMUL_S( MAT5 )
+
+ FLD_S( SRC2 ) /* F6 F5 F4 */
+ FMUL_S( MAT10 )
+
+ FLD_S( SRC3 ) /* F0 F6 F5 F4 */
+ FMUL_S( MAT12 )
+ FLD_S( SRC3 ) /* F1 F0 F6 F5 F4 */
+ FMUL_S( MAT13 )
+ FLD_S( SRC3 ) /* F2 F1 F0 F6 F5 F4 */
+ FMUL_S( MAT14 )
+
+ FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
+ FADDP( ST0, ST(5) ) /* F1 F2 F6 F5 F4 */
+ FADDP( ST0, ST(3) ) /* F2 F6 F5 F4 */
+ FADDP( ST0, ST(1) ) /* F6 F5 F4 */
+
+ MOV_L( SRC3, EBX )
+
+ FXCH( ST(2) ) /* F4 F5 F6 */
+ FSTP_S( DST0 ) /* F5 F6 */
+ FSTP_S( DST1 ) /* F6 */
+ FSTP_S( DST2 ) /* */
+ MOV_L( EBX, DST3 )
+
+LLBL(x86_p4_3dnrr_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p4_3dnrr_loop) )
+
+LLBL(x86_p4_3dnrr_done):
+
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
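+/* 2D case: a 2x2 rotate/scale block plus translation; z and w are copied
+ * through the integer unit (EBX/EBP).  Plain-C sketch (illustrative only):
+ *
+ *	dst[0] = m[0]*x + m[4]*y + m[12]*w
+ *	dst[1] = m[1]*x + m[5]*y + m[13]*w
+ *	dst[2] = z
+ *	dst[3] = w
+ */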
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points4_2d )
+HIDDEN(_mesa_x86_transform_points4_2d)
+GLNAME( _mesa_x86_transform_points4_2d ):
+
+#define FRAME_OFFSET 16
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+ PUSH_L( EBP )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p4_2dr_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p4_2dr_loop):
+
+ FLD_S( SRC0 ) /* F4 */
+ FMUL_S( MAT0 )
+ FLD_S( SRC0 ) /* F5 F4 */
+ FMUL_S( MAT1 )
+
+ FLD_S( SRC1 ) /* F0 F5 F4 */
+ FMUL_S( MAT4 )
+ FLD_S( SRC1 ) /* F1 F0 F5 F4 */
+ FMUL_S( MAT5 )
+
+ FXCH( ST(1) ) /* F0 F1 F5 F4 */
+ FADDP( ST0, ST(3) ) /* F1 F5 F4 */
+ FADDP( ST0, ST(1) ) /* F5 F4 */
+
+ FLD_S( SRC3 ) /* F0 F5 F4 */
+ FMUL_S( MAT12 )
+ FLD_S( SRC3 ) /* F1 F0 F5 F4 */
+ FMUL_S( MAT13 )
+
+ FXCH( ST(1) ) /* F0 F1 F5 F4 */
+ FADDP( ST0, ST(3) ) /* F1 F5 F4 */
+ FADDP( ST0, ST(1) ) /* F5 F4 */
+
+ MOV_L( SRC2, EBX )
+ MOV_L( SRC3, EBP )
+
+ FXCH( ST(1) ) /* F4 F5 */
+ FSTP_S( DST0 ) /* F5 */
+ FSTP_S( DST1 ) /* */
+ MOV_L( EBX, DST2 )
+ MOV_L( EBP, DST3 )
+
+LLBL(x86_p4_2dr_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p4_2dr_loop) )
+
+LLBL(x86_p4_2dr_done):
+
+ POP_L( EBP )
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
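+/* 2D no-rotation case: x/y scale plus translation; z and w are copied.
+ * Plain-C sketch of one loop iteration (illustrative only):
+ *
+ *	dst[0] = m[0]*x + m[12]*w
+ *	dst[1] = m[5]*y + m[13]*w
+ *	dst[2] = z
+ *	dst[3] = w
+ */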
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points4_2d_no_rot )
+HIDDEN(_mesa_x86_transform_points4_2d_no_rot)
+GLNAME( _mesa_x86_transform_points4_2d_no_rot ):
+
+#define FRAME_OFFSET 16
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+ PUSH_L( EBP )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p4_2dnrr_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL(x86_p4_2dnrr_loop):
+
+ FLD_S( SRC0 ) /* F4 */
+ FMUL_S( MAT0 )
+
+ FLD_S( SRC1 ) /* F5 F4 */
+ FMUL_S( MAT5 )
+
+ FLD_S( SRC3 ) /* F0 F5 F4 */
+ FMUL_S( MAT12 )
+ FLD_S( SRC3 ) /* F1 F0 F5 F4 */
+ FMUL_S( MAT13 )
+
+ FXCH( ST(1) ) /* F0 F1 F5 F4 */
+ FADDP( ST0, ST(3) ) /* F1 F5 F4 */
+ FADDP( ST0, ST(1) ) /* F5 F4 */
+
+ MOV_L( SRC2, EBX )
+ MOV_L( SRC3, EBP )
+
+ FXCH( ST(1) ) /* F4 F5 */
+ FSTP_S( DST0 ) /* F5 */
+ FSTP_S( DST1 ) /* */
+ MOV_L( EBX, DST2 )
+ MOV_L( EBP, DST3 )
+
+LLBL(x86_p4_2dnrr_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p4_2dnrr_loop) )
+
+LLBL(x86_p4_2dnrr_done):
+
+ POP_L( EBP )
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
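+/* Identity case: a straight 16-byte copy per point, moved four dwords at
+ * a time through EBX/EDX.  The whole loop is skipped when source and
+ * destination are the same vector (the CMP/JE below).
+ */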
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points4_identity )
+HIDDEN(_mesa_x86_transform_points4_identity)
+GLNAME( _mesa_x86_transform_points4_identity ):
+
+#define FRAME_OFFSET 12
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(x86_p4_ir_done) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ CMP_L( ESI, EDI )
+ JE( LLBL(x86_p4_ir_done) )
+
+ALIGNTEXT16
+LLBL(x86_p4_ir_loop):
+
+ MOV_L( SRC0, EBX )
+ MOV_L( SRC1, EDX )
+
+ MOV_L( EBX, DST0 )
+ MOV_L( EDX, DST1 )
+
+ MOV_L( SRC2, EBX )
+ MOV_L( SRC3, EDX )
+
+ MOV_L( EBX, DST2 )
+ MOV_L( EDX, DST3 )
+
+LLBL(x86_p4_ir_skip):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(x86_p4_ir_loop) )
+
+LLBL(x86_p4_ir_done):
+
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+	RET
+#undef FRAME_OFFSET
+
+#if defined (__ELF__) && defined (__linux__)
+ .section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/arch/x86/xform_args.h b/src/arch/x86/xform_args.h
new file mode 100644
index 0000000..b773f51
--- /dev/null
+++ b/src/arch/x86/xform_args.h
@@ -0,0 +1,51 @@
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Transform function interface for assembly code.  Define FRAME_OFFSET
+ * to the number of bytes of saved registers pushed onto the stack since
+ * function entry before using the ARG_* argument macros.
+ *
+ * Gareth Hughes
+ */
+
+#ifndef __XFORM_ARGS_H__
+#define __XFORM_ARGS_H__
+
+/* Offsets for transform_func arguments
+ *
+ * typedef void (*transform_func)( GLvector4f *to_vec,
+ * const GLfloat m[16],
+ * const GLvector4f *from_vec );
+ */
+#define OFFSET_DEST 4
+#define OFFSET_MATRIX 8
+#define OFFSET_SOURCE 12
+
+#define ARG_DEST REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP)
+#define ARG_MATRIX REGOFF(FRAME_OFFSET+OFFSET_MATRIX, ESP)
+#define ARG_SOURCE REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP)
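+
+/* Worked example (illustrative only, assuming the usual cdecl layout):
+ * at function entry the return address is at [ESP], so the first
+ * argument lives at [ESP + 4] before any pushes.  A routine that saves
+ * three registers, e.g.
+ *
+ *	PUSH_L( ESI )
+ *	PUSH_L( EDI )
+ *	PUSH_L( EBX )		->  #define FRAME_OFFSET 12
+ *
+ * then finds ARG_DEST at [ESP + 16], ARG_MATRIX at [ESP + 20] and
+ * ARG_SOURCE at [ESP + 24].
+ */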
+
+#endif