summary | refs | log | tree | commit | diff
path: root/reference/dd.asm
diff options
context:
space:
mode:
Diffstat (limited to 'reference/dd.asm')
-rw-r--r-- reference/dd.asm 409
1 files changed, 171 insertions, 238 deletions
diff --git a/reference/dd.asm b/reference/dd.asm
index 3008314..332831f 100644
--- a/reference/dd.asm
+++ b/reference/dd.asm
@@ -6,263 +6,196 @@
@out(r1.y) out1
@out(r1.z) out2
@out(r1.w) out3
+@const(c4.x) 0x3f800000, 0x00000000, 0x3e4ccccd, 0x3f1013a9
+@const(c5.x) 0x3f400d1b, 0xbf800000, 0x3ccccccd, 0x3d4ccccd
+@const(c6.x) 0x3fb8aa65, 0x40000000, 0x3f800000, 0xc39044fe
+@const(c7.x) 0xbe2ab368, 0x41200000, 0x00000000, 0x00000000
+@const(c8.x) 0x00000000, 0x3f800000, 0x43000000, 0x00000000
(sy)(ss)absneg.f r0.z, (neg)c0.y
-bary.f r0.w, 3, r0.x
-bary.f r1.x, 0, r0.x
+bary.f r0.w, 5, r0.x
bary.f r1.y, 4, r0.x
+bary.f r1.x, 3, r0.x
mul.f r1.z, r0.z, r0.z
-mov.f32f32 r0.w, r0.w
-mov.f32f32 r1.w, r1.x
-bary.f r2.y, 1, r0.x
+mov.f32f32 r1.w, r0.w
+mov.f32f32 r2.x, r1.y
+mov.f32f32 r2.y, r1.x
add.f r1.z, c4.x, (neg)r1.z
-mov.f32f32 r2.z, r0.w
-mov.f32f32 r3.x, r1.y
-mov.f32f32 r2.x, r2.y
-mov.f32f32 r1.z, r1.z
-mul.f r3.y, c3.x, r0.w
-mov.f32f32 r2.w, r3.x
-mul.f r3.w, c3.x, r1.x
-mul.f r1.x, r1.z, c4.w
-mul.f r3.z, c3.x, r3.x
-dsx (f32)(xy)r4.y, r1.w
-(sy)mul.f r1.z, r4.y, r4.y
-mul.f r4.x, c3.x, r2.y
-mov.f32f32 r1.x, r1.x
-(ss)nop
-dsx (f32)(xy)r1.w, r2.z
-(sy)mul.f r1.w, r1.w, r1.w
-mad.f32 r1.z, r4.z, r4.z, r1.z
-mad.f32 r1.w, r2.x, r2.x, r1.w
-add.f r1.x, c4.x, (neg)r1.x
-dsy (f32)(xy)r2.x, r3.y
-(sy)mul.f r2.x, r2.x, r2.x
-(ss)nop
-dsy (f32)(xy)r3.y, r3.w
-mov.f32f32 r1.z, r1.z
-mov.f32f32 r1.w, r1.w
-mov.f32f32 r1.x, r1.x
-bary.f r2.z, 5, r0.x
-bary.f (ei)r0.x, 2, r0.x
-mad.f32 r0.y, r2.y, r2.y, r2.x
-mov.f32f32 r2.x, r1.x
-mov.f32f32 r2.y, r2.z
-cmps.f.lt r1.x, r1.x, c4.y
-mov.f32f32 r2.z, r0.x
-mov.f32f32 r0.y, r0.y
-mov.f32f32 r2.w, r2.y
-cov.u32f32 r1.x, r1.x
-sqrt r2.x, r2.x
-(ss)mov.f32f32 r2.x, r2.x
-mov.f32f32 r3.w, r0.z
-mul.f r4.x, c3.x, r2.y
-(sy)mul.f r3.y, r3.y, r3.y
-dsx (f32)(x)r4.y, r2.z
-cmps.f.ne r1.x, r1.x, c4.y
-mad.f32 r2.x, c5.x, r3.w, r2.x
-dsx (f32)(x)r4.z, r2.w
-(sy)mad.f32 r1.w, r4.z, r4.z, r1.w
-mad.f32 r1.z, r4.y, r4.y, r1.z
-(ss)mad.f32 r2.z, r3.z, r3.z, r3.y
-mov.f32f32 r2.x, r2.x
-dsy (f32)(x)r3.y, r4.x
-(sy)mad.f32 r0.y, r3.y, r3.y, r0.y
+mul.f r2.z, c3.x, r1.w
+absneg.f r1.w, (neg)r1.w
+absneg.f r2.w, (neg)r2.x
+mul.f r1.z, r1.z, c4.w
+absneg.f r3.x, (neg)r2.y
+bary.f r3.y, 2, r0.x
+mov.f32f32 r3.z, r2.w
+add.f r1.z, c4.x, (neg)r1.z
+dsy (f32)(x)r3.w, r2.z
+mul.f r2.y, c3.x, r2.y
+(ss)mul.f r2.z, c3.x, r2.x
+mov.f32f32 r2.x, r1.w
+mov.f32f32 r4.x, r1.z
+cmps.f.lt r1.z, r1.z, c4.y
+add.f r4.y, c1.y, r3.z
+mov.f32f32 r4.z, r3.x
+mul.f r4.w, c3.x, r3.y
+mov.f32f32 r5.x, c2.x
+dsy (f32)(xy)r5.y, r2.y
+(sy)(ss)mul.f r2.y, r5.y, r5.y
+sqrt r2.z, r4.x
+(ss)mad.f32 r2.z, c5.x, r0.z, r2.z
+mad.f32 r2.y, r5.z, r5.z, r2.y
+cov.u32f32 r1.z, r1.z
+mad.f32 r2.y, r3.w, r3.w, r2.y
+mov.f32f32 r3.w, r2.z
+mul.f r2.z, r2.z, c4.x
+cmps.f.ne r1.z, r1.z, c4.y
+(ss)add.f r4.x, c1.z, r2.x
+mul.f r5.y, r3.w, c4.y
+absneg.f r5.z, (neg)c0.x
+mul.f r3.w, r3.w, c4.y
+absneg.f r5.w, (neg)c0.z
+mad.f32 r0.z, c5.x, r0.z, (neg)r2.z
+mad.f32 r2.z, c5.x, r5.z, (neg)r5.y
+mov.f32f32 r5.y, c4.y
+mad.f32 r3.w, c5.x, r5.w, (neg)r3.w
+mov.f32f32 r5.z, c4.y
+mov.f32f32 r5.w, c4.y
+sel.b32 r2.z, r5.y, r1.z, r2.z
+add.f r5.y, c1.x, r4.z
+rcp r5.x, r5.x
+sel.b32 r3.w, r5.z, r1.z, r3.w
+sel.b32 r0.z, r5.w, r1.z, r0.z
+absneg.f r1.z, (neg)r2.z
+(ss)mul.f r5.y, r5.y, r5.x
+mul.f r4.x, r4.x, r5.x
+absneg.f r5.z, (neg)r3.w
+absneg.f r5.w, (neg)r0.z
+mul.f r6.x, r5.y, (neg)r2.z
+mov.f32f32 r6.y, r4.x
+rcp r6.z, r1.z
+add.f r3.x, c5.y, r3.x
+mul.f r4.y, r4.y, r5.x
+(ss)rcp r1.z, r1.z
+add.f r4.z, c6.z, r4.z
+mul.f r5.x, r6.y, r0.z
+(ss)mul.f r3.x, r3.x, r6.z
+mad.f32 r6.x, r4.y, (neg)r0.z, r6.x
+(ss)mul.f r1.z, r4.z, r1.z
+mad.f32 r4.x, r4.x, (neg)r3.w, r6.x
+mov.f32f32 r4.z, r3.x
+mov.f32f32 r4.y, r4.y
+max.f r3.x, r3.x, r1.z
+rcp r6.x, r5.w
+add.f r2.w, c5.y, r2.w
+mov.f32f32 r6.z, r4.x
+mul.f r4.x, r4.x, c6.y
mov.f32f32 r1.z, r1.z
-mov.f32f32 r2.z, r2.z
-mul.f r2.w, r2.x, c4.x
-mul.f r3.y, r2.x, c4.y
-mul.f r2.x, r2.x, c4.y
-sqrt r1.w, r1.w
-(ss)mov.f32f32 r1.w, r1.w
-mov.f32f32 r2.w, r2.w
-mov.f32f32 r0.z, r0.z
-mov.f32f32 r3.y, r3.y
-absneg.f r3.z, (neg)c0.x
-mov.f32f32 r2.x, r2.x
-mad.f32 r0.z, c5.x, r0.z, (neg)r2.w
-mov.f32f32 r2.w, c4.y
+(ss)mul.f r2.w, r2.w, r6.x
+(ss)rcp r5.w, r5.w
+add.f r3.z, c6.y, r3.z
+mad.f32 r6.x, c5.z, r6.z, c5.w
+max.f r4.x, r4.x, c4.y
+min.f r1.z, r4.z, r1.z
+(ss)mul.f r3.z, r3.z, r5.w
+mov.f32f32 r4.z, r2.w
+min.f r4.x, r4.x, c4.x
+mad.f32 r5.x, r4.y, r3.w, (neg)r5.x
+max.f r2.w, r2.w, r3.z
+(ss)rcp r5.w, r6.x
mov.f32f32 r3.z, r3.z
-absneg.f r3.w, (neg)c0.z
-sqrt r0.y, r0.y
-(ss)mov.f32f32 r0.y, r0.y
-sel.b32 r0.z, r2.w, r1.x, r0.z
-absneg.f r2.y, (neg)r2.y
-mad.f32 r2.w, c5.x, r3.z, (neg)r3.y
-mov.f32f32 r3.y, c4.y
-mov.f32f32 r3.z, r3.w
-mov.f32f32 r2.y, r2.y
-absneg.f r3.w, (neg)r0.z
-sel.b32 r2.w, r3.y, r1.x, r2.w
-absneg.f r0.w, (neg)r0.w
-add.f r3.y, c1.z, r2.y
-mov.f32f32 r4.x, c2.x
-absneg.f r4.y, (neg)r2.w
-mov.f32f32 r0.w, r0.w
-mad.f32 r2.x, c5.x, r3.z, (neg)r2.x
-rcp r3.z, r3.w
-mov.f32f32 r4.z, c4.y
-absneg.f r3.x, (neg)r3.x
-add.f r4.w, c1.x, r0.w
-rcp r4.x, r4.x
-(ss)mul.f r3.y, r3.y, r4.x
-rcp r5.x, r4.y
-add.f r5.y, c5.y, r0.w
-sel.b32 r1.x, r4.z, r1.x, r2.x
-mul.f r2.x, r4.w, r4.x
-mov.f32f32 r3.y, r3.y
-(ss)mul.f r4.z, r5.y, r5.x
-absneg.f r4.w, (neg)r1.x
-mov.f32f32 r3.x, r3.x
-mul.f r5.x, r3.y, r0.z
-mov.f32f32 r4.z, r4.z
-(ss)rcp r4.y, r4.y
-add.f r0.w, c6.z, r0.w
-add.f r5.y, c1.y, r3.x
+(ss)add.f r6.x, c8.y, (neg)r4.x
+mov.f32f32 r6.z, r5.x
+min.f r2.w, r3.x, r2.w
+rcp r3.x, r5.z
+add.f r1.w, c5.y, r1.w
+mul.f r6.x, r6.x, c4.x
+mul.f r5.x, r5.x, r6.z
+min.f r3.z, r4.z, r3.z
+(ss)mul.f r1.w, r1.w, r3.x
+rcp r3.x, r5.z
+add.f r2.x, c6.z, r2.x
+mov.f32f32 r4.z, r5.y
+max.f r1.z, r1.z, r3.z
+mov.f32f32 r3.z, r1.w
+(ss)mul.f r2.x, r2.x, r3.x
+mul.f r3.x, r4.z, r3.w
+mul.f r3.w, r4.y, r2.z
+mad.f32 r2.z, r6.y, r2.z, (neg)r3.x
+max.f r1.w, r1.w, r2.x
mov.f32f32 r2.x, r2.x
-rcp r5.z, r4.w
-add.f r5.w, c5.y, r3.x
-mov.f32f32 r0.w, r0.w
-mul.f r4.x, r5.y, r4.x
-mul.f r5.y, r2.x, (neg)r2.w
-add.f r6.x, c5.y, r2.y
-mul.f r3.z, r5.w, r3.z
-mov.f32f32 r4.x, r4.x
-(ss)mul.f r0.w, r0.w, r4.y
-(ss)mul.f r4.y, r6.x, r5.z
-mov.f32f32 r3.z, r3.z
-mad.f32 r5.x, r4.x, r1.x, (neg)r5.x
-mov.f32f32 r0.w, r0.w
-mad.f32 r5.y, r4.x, (neg)r0.z, r5.y
-mov.f32f32 r4.y, r4.y
-mov.f32f32 r5.x, r5.x
-max.f r5.z, r4.z, r0.w
-mov.f32f32 r5.y, r5.y
-min.f r0.w, r4.z, r0.w
-mul.f r4.z, r5.x, r5.x
-mul.f r5.x, r2.x, r1.x
-mov.f32f32 r5.z, r5.z
-mad.f32 r5.x, r3.y, r2.w, (neg)r5.x
-rcp r3.w, r3.w
-add.f r3.x, c6.y, r3.x
-mad.f32 r1.x, r3.y, (neg)r1.x, r5.y
-rcp r3.y, r4.w
-add.f r2.y, c6.z, r2.y
-(ss)mov.f32f32 r4.w, r5.x
-mov.f32f32 r3.x, r3.x
-mov.f32f32 r1.x, r1.x
-mov.f32f32 r2.y, r2.y
-mad.f32 r4.z, r4.w, r4.w, r4.z
-(ss)mul.f r3.x, r3.x, r3.w
-mul.f r3.w, r1.x, c6.y
-mad.f32 r1.x, c5.z, r1.x, c5.w
-mov.f32f32 r4.z, r4.z
-mul.f r2.w, r4.x, r2.w
-mov.f32f32 r3.x, r3.x
-mad.f32 r2.x, r2.x, r0.z, (neg)r2.w
-mov.f32f32 r2.w, r3.w
-mul.f r2.y, r2.y, r3.y
-max.f r3.y, r3.z, r3.x
-mov.f32f32 r2.x, r2.x
-max.f r2.w, r2.w, c4.y
-min.f r3.x, r3.z, r3.x
-mov.f32f32 r3.y, r3.y
-mad.f32 r2.x, r2.x, r2.x, r4.z
-min.f r2.w, r2.w, c4.x
-max.f r0.w, r0.w, r3.x
-min.f r3.x, r5.z, r3.y
-mov.f32f32 r2.x, r2.x
-mov.f32f32 r2.y, r2.y
-add.f r3.y, c8.y, (neg)r2.w
-rcp r1.x, r1.x
-(ss)mov.f32f32 r1.x, r1.x
-add.f r2.x, r2.x, c5.y
-max.f r3.z, r4.y, r2.y
-mul.f r3.y, r3.y, c4.x
-min.f r2.y, r4.y, r2.y
-mov.f32f32 r2.x, r2.x
-mov.f32f32 r3.z, r3.z
-mul.f r0.y, r1.w, r0.y
-sqrt r1.z, r1.z
-max.f r0.w, r0.w, r2.y
-mad.f32 r1.x, r2.x, r1.x, c4.x
-min.f r1.w, r3.x, r3.z
-mov.f32f32 r0.y, r0.y
-(ss)mov.f32f32 r1.z, r1.z
-mov.f32f32 r1.x, r1.x
-mov.f32f32 r1.w, r1.w
-mov.f32f32 r0.w, r0.w
-mul.f r0.x, c3.x, r0.x
-mul.f r1.x, (neg)r1.x, c6.x
+mad.f32 r3.x, r4.z, r0.z, (neg)r3.w
+nop
+min.f r1.w, r2.w, r1.w
+mov.f32f32 r2.w, r2.z
+min.f r2.x, r3.z, r2.x
+mov.f32f32 r3.z, r3.x
+mov.f32f32 r3.w, r1.w
+mad.f32 r2.z, r2.z, r2.w, r5.x
+max.f r1.z, r1.z, r2.x
+mad.f32 r2.x, r3.x, r3.z, r2.z
mul.f r0.z, r0.z, r1.w
-add.f r0.w, r1.w, (neg)r0.w
-rcp r0.y, r0.y
-(ss)mov.f32f32 r0.y, r0.y
-mov.f32f32 r1.x, r1.x
+sqrt r2.y, r2.y
+dsx (f32)(x)r6.y, r0.w
+dsx (f32)(xy)r6.z, r1.x
+(sy)(ss)mul.f r0.w, r6.z, r6.z
+add.f r1.x, r3.w, (neg)r1.z
+add.f r1.z, r2.x, c5.y
add.f r0.z, r1.y, (neg)r0.z
-mov.f32f32 r0.w, r0.w
-dsy (f32)(x)r3.z, r0.x
-(sy)(ss)mad.f32 r0.x, r3.z, r3.z, r2.z
-mov.f32f32 r1.y, c4.y
-mov.f32f32 r2.x, c4.y
-mov.f32f32 r0.z, r0.z
-exp2 r1.x, r1.x
-(ss)mov.f32f32 r1.x, r1.x
-mad.f32 r0.w, c7.y, r0.w, c4.x
-mov.f32f32 r0.x, r0.x
-mov.f32f32 r1.y, r1.y
-add.f r1.x, c4.x, r1.x
+mad.f32 r0.w, r6.w, r6.w, r0.w
+mad.f32 r1.x, c7.y, r1.x, c4.x
+mad.f32 r1.y, r1.z, r5.w, c4.x
add.f r0.z, r0.z, c7.x
-mov.f32f32 r0.w, r0.w
-mov.f32f32 r1.w, r1.y
-mov.f32f32 r1.x, r1.x
-mov.f32f32 r0.z, r0.z
-sqrt r0.x, r0.x
-(ss)mov.f32f32 r0.x, r0.x
-mov.f32f32 r1.y, r2.x
-(rpt1)nop
+mad.f32 r0.w, r6.y, r6.y, r0.w
+dsy (f32)(x)r4.y, r4.w
+bary.f r2.z, 0, r0.x
+dsx (f32)(x)r3.x, r3.y
+mul.f r1.y, (neg)r1.y, c6.x
mul.f r0.z, c6.w, r0.z
rcp r1.x, r1.x
-(ss)mov.f32f32 r1.x, r1.x
-rcp r0.w, r0.w
-(rpt1)nop
-mov.f32f32 r0.z, r0.z
-mov.f32f32 r1.x, r1.x
-(ss)mov.f32f32 r0.w, r0.w
-mul.f r0.x, r1.z, r0.x
-mov.f32f32 r1.z, r1.y
-mov.f32f32 r1.x, r1.x
-mul.f r0.z, r0.z, r0.w
-mov.f32f32 r0.x, r0.x
+bary.f (ei)r2.w, 1, r0.x
+mul.f r0.x, c3.x, r2.z
+mov.f32f32 r1.w, c4.y
+(ss)mul.f r0.z, r0.z, r1.x
+sqrt r0.y, r0.w
+(ss)mul.f r0.w, r0.y, r2.y
+mul.f r0.y, c3.x, r2.w
+mov.f32f32 r1.z, c4.y
nop
-max.f r0.w, r1.x, c4.y
-mov.f32f32 r0.z, r0.z
-mul.f r0.x, r0.x, r0.y
-nop
-min.f r0.y, r0.w, c4.x
-(rpt2)nop
-mul.f r0.y, r2.w, r0.y
+exp2 r1.x, r1.y
+(ss)add.f r1.x, c4.x, r1.x
+dsx (f32)(xy)r2.x, r2.z
+(sy)(ss)mul.f r1.y, r2.x, r2.x
exp2 r0.z, r0.z
-(ss)mov.f32f32 r0.z, r0.z
-mov.f32f32 r0.x, r0.x
+(ss)add.f r0.z, c4.x, r0.z
+rcp r0.w, r0.w
+mad.f32 r1.y, r2.y, r2.y, r1.y
+dsy (f32)(xy)r2.x, r0.x
nop
-add.f r0.y, r0.y, r3.y
-add.f r0.z, c4.x, r0.z
-mul.f r0.x, r0.x, c4.z
+(sy)(ss)mul.f r0.x, r2.x, r2.x
+mad.f32 r0.y, r3.x, r3.x, r1.y
+mad.f32 r0.x, r2.y, r2.y, r0.x
+rcp r1.x, r1.x
+(ss)max.f r1.x, r1.x, c4.y
+rcp r0.z, r0.z
+mad.f32 r0.x, r4.y, r4.y, r0.x
+(rpt5)nop
+sqrt r0.x, r0.x
nop
-mov.f32f32 r0.y, r0.y
-mov.f32f32 r0.z, r0.z
+sqrt r0.y, r0.y
+(ss)mul.f r0.x, r0.y, r0.x
+min.f r0.y, r1.x, c4.x
+(rpt1)nop
+mul.f r0.x, r0.x, r0.w
+mul.f r0.y, r4.x, r0.y
(rpt1)nop
-mov.f32f32 r1.y, r0.y
-(rpt2)nop
-rcp r0.y, r0.z
-(ss)mov.f32f32 r0.y, r0.y
-(rpt2)nop
-mul.f r0.x, r0.x, r0.y
-(rpt2)nop
-mov.f32f32 r1.x, r0.x
+mul.f r0.x, r0.x, c4.z
+add.f r1.y, r0.y, r6.x
+(rpt1)nop
+mul.f r1.x, r0.x, r0.z
end
nop
nop
+nop
; FRAG: outputs: r1.x (1:0)
-; FRAG: inputs: r0.x (5:20,cm=f,il=8,b=1) r1.x (5:21,cm=f,il=12,b=1)
-; FRAG: 265 instructions, 0 half, 7 full
+; FRAG: inputs: r0.y (5:20,cm=f,il=8,b=1) r1.x (5:21,cm=f,il=12,b=1)
+; FRAG: 189 instructions, 0 half, 7 full