diff options
author | Matt Turner <mattst88@gmail.com> | 2018-03-30 18:16:20 -0700 |
---|---|---|
committer | Matt Turner <mattst88@gmail.com> | 2018-03-30 18:16:20 -0700 |
commit | 8480fed68dcacdaedbecbcde0022e985e698758f (patch) | |
tree | 7157eb863106873af7cb37947b782efe19ba26c2 | |
parent | 7738826ca690794663e493d80bcc74221f03cc7e (diff) |
mmxloongson3
-rw-r--r-- | pixman/loongson-mmintrin.h | 11 | ||||
-rw-r--r-- | pixman/pixman-mmx.c | 889 |
2 files changed, 899 insertions, 1 deletions
diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h index 086c6e0..1471c11 100644 --- a/pixman/loongson-mmintrin.h +++ b/pixman/loongson-mmintrin.h @@ -110,6 +110,17 @@ _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2) return ret; } +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2) +{ + __m64 ret; + asm("pcmpgth %0, %1, %2\n\t" + : "=f" (ret) + : "f" (__m1), "f" (__m2) + ); + return ret; +} + extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_empty (void) { diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c index dec3974..74e97c2 100644 --- a/pixman/pixman-mmx.c +++ b/pixman/pixman-mmx.c @@ -59,6 +59,74 @@ _mm_empty (void) } #endif +#ifdef USE_LOONGSON_MMI +#define COMBINE_A_OUT 1 +#define COMBINE_A_IN 2 +#define COMBINE_B_OUT 4 +#define COMBINE_B_IN 8 + +#define COMBINE_CLEAR 0 +#define COMBINE_A (COMBINE_A_OUT | COMBINE_A_IN) +#define COMBINE_B (COMBINE_B_OUT | COMBINE_B_IN) +#define COMBINE_A_OVER (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_A_IN) +#define COMBINE_B_OVER (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_B_IN) +#define COMBINE_A_ATOP (COMBINE_B_OUT | COMBINE_A_IN) +#define COMBINE_B_ATOP (COMBINE_A_OUT | COMBINE_B_IN) +#define COMBINE_XOR (COMBINE_A_OUT | COMBINE_B_OUT) + +/* no SIMD instructions for div, so leave it alone */ +/* portion covered by a but not b */ +static uint8_t +combine_disjoint_out_part (uint8_t a, uint8_t b) +{ + /* min (1, (1-b) / a) */ + + b = ~b; /* 1 - b */ + if (b >= a) /* 1 - b >= a -> (1-b)/a >= 1 */ + return MASK; /* 1 */ + return DIV_UN8 (b, a); /* (1-b) / a */ +} + +/* portion covered by both a and b */ +static uint8_t +combine_disjoint_in_part (uint8_t a, uint8_t b) +{ + /* max (1-(1-b)/a,0) */ + /* = - min ((1-b)/a - 1, 0) */ + /* = 1 - min (1, (1-b)/a) */ + + b = ~b; /* 1 - b */ + if (b >= a) /* 1 - b >= a -> (1-b)/a >= 1 */ + return 0; /* 1 - 1 */ + return ~DIV_UN8(b, a); /* 1 - (1-b) / a */ +} + +/* portion covered by a but not b */ +static uint8_t +combine_conjoint_out_part (uint8_t a, uint8_t b) +{ + /* max (1-b/a,0) */ + /* = 1-min(b/a,1) */ + + /* min (1, (1-b) / a) */ + + if (b >= a) /* b >= a -> b/a >= 1 */ + return 0x00; /* 0 */ + return ~DIV_UN8(b, a); /* 1 - b/a */ +} + +/* portion covered by both a and b */ +static uint8_t +combine_conjoint_in_part (uint8_t a, uint8_t b) +{ + /* min (1,b/a) */ + + if (b >= a) /* b >= a -> b/a >= 1 */ + return MASK; /* 1 */ + return DIV_UN8 (b, a); /* b/a */ +} +#endif + #ifdef USE_X86_MMX # if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64)) # include <xmmintrin.h> @@ -710,6 +778,36 @@ combine (const uint32_t *src, const uint32_t *mask) return vsrc; } +#ifdef USE_LOONGSON_MMI +static force_inline void +mmx_combine_mask_ca(const uint32_t *src, const uint32_t *mask, __m64 *s64, __m64 *m64) +{ + __m64 res, tmp; + + if(!(*mask)) + { + *s64 = 0; + *m64 = 0; + return; + } + + *s64 = load8888(src); + + if (*mask == ~0) + { + *m64 = expand_alpha(*s64); + return; + } + + *m64 = load8888(mask); + + res = pix_multiply(*s64, *m64); + tmp = expand_alpha(*s64); + *s64 = res; + *m64 = pix_multiply(*m64, tmp); +} +#endif + static force_inline __m64 core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst) { @@ -728,6 +826,41 @@ core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst) return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()); } +#ifdef USE_LOONGSON_MMI +static void +mmx_combine_disjoint_over_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + uint32_t *end = dest + width; + uint32_t s32; + uint64_t sa64; + __m64 s64, d64; + + while (dest < end) + { + s64 = combine (src, mask); + + if (s64) + { + store8888(&s32, s64); + sa64 = combine_disjoint_out_part (*dest >> A_SHIFT, s32 >> A_SHIFT); + d64 = pix_add (pix_multiply (load8888 (dest),expand_alpha_rev ((*(__m64*)&sa64))), s64); + store8888 (dest, d64); + } + + ++dest; + ++src; + if (mask) + ++mask; + + } +} +#endif + static void mmx_combine_over_u (pixman_implementation_t *imp, pixman_op_t op, @@ -1063,6 +1196,298 @@ mmx_combine_saturate_u (pixman_implementation_t *imp, _mm_empty (); } +#ifdef USE_LOONGSON_MMI +/* 在combine_conjoint_general_u等函数中,有多个分支,由参数combine决定,*/ +/*此值在函数运行期间并不会改变,所以原始代码中每个值都去判断是没有必要的.*/ +/*可以在函数入口处判断,并设置好相应的函数指针,以后直接调用即可.*/ +/*有两个分支是直接返回常量,为了做到统一,自定义两个内联函数,其作用为直接返回相应的常量.*/ +/*_u和_ca后綴的两套代均码可用*/ +#define DEF_FUNC_ZERO_MASK(type, zm, suffix, res) \ + static type inline combine_joint_ ##zm## _ ##suffix( type sa, type da, type io_flag) \ + { \ + return res; \ + } + +/* 四个分支中的另外两个分支函数,conjoint与disjoint的代码结构一样,函数名不一样,设置此宏来生成相应的函数*/ +/* 参数顺序不一样,由io_flag标记决定,0表示in_part,1表示out_part*/ +/*由于是底层函数,并不向外提供此接口,因此这里没有判断其值是否合法. */ +#define DEF_FUNC_COMBINE_JOINT_U(cd, io) \ + static uint8_t inline combine_ ##cd## joint_ ##io## _part_u(uint8_t sa, uint8_t da, uint8_t io_flag) \ + { \ + uint8_t parm[2]; \ + parm[0] = sa * (io_flag ^ 0x1) + da * (io_flag ^ 0x0); \ + parm[1] = sa * (io_flag ^ 0x0) + da * (io_flag ^ 0x1); \ + return combine_ ##cd## joint_ ##io## _part (parm[0], parm[1]); \ + } + +/* 设置函数指针数组的宏,在函数入口处存放正确的处理函数. */ +/*_u和_ca后綴的两套代均码可用*/ +#define DEF_COMB_FUNC_ARR(cd,SUFFIX,suffix) \ + COMBINE_JOINT_FUNC_##SUFFIX combine_ ##cd## joint_ ##suffix[4] ={ \ + combine_joint_zero_ ##suffix, \ + combine_ ##cd## joint_out_part_ ##suffix, \ + combine_ ##cd## joint_in_part_ ##suffix, \ + combine_joint_mask_ ##suffix \ + }; + +typedef uint8_t (*COMBINE_JOINT_FUNC_U)(uint8_t a, uint8_t b, uint8_t io_flag); + +DEF_FUNC_ZERO_MASK(uint8_t,zero,u, 0x0) +DEF_FUNC_ZERO_MASK(uint8_t,mask,u, ~0x0) + +DEF_FUNC_COMBINE_JOINT_U(dis, in); +DEF_FUNC_COMBINE_JOINT_U(dis, out); +DEF_COMB_FUNC_ARR(dis,U,u) + +DEF_FUNC_COMBINE_JOINT_U(con, in); +DEF_FUNC_COMBINE_JOINT_U(con, out); +DEF_COMB_FUNC_ARR(con, U, u) + +/* conjoint与disjoint相关函数主体结构一样的,只是四个分支中调用的函数不同,因此设置一个底层函数*/ +static void +mmx_combine_joint_general_u (uint32_t * dest, + const uint32_t *src, + const uint32_t *mask, + int width, + uint8_t comb, + COMBINE_JOINT_FUNC_U *cjf) +{ + COMBINE_JOINT_FUNC_U combine_joint_u[2]; + combine_joint_u[0] = cjf[comb & COMBINE_A]; /* in_part */ + combine_joint_u[1] = cjf[(comb & COMBINE_B)>>2]; /* out_par */ + + uint32_t *end = dest + width; + while (dest < end) + { + __m64 s64 = combine (src, mask); + __m64 d64,sa64,da64; + uint8_t sa, da; + uint32_t tmp; + uint64_t Fa, Fb; + + /* 由于这些函数中有除法指令,因此并未用多媒体指令来优化, */ + /* 从而需要再转存到uint32_t类型变量中,再取出运算.*/ + store8888(&tmp, s64); + sa = tmp >> A_SHIFT; + da = *dest >> A_SHIFT; + + Fa = combine_joint_u[0](sa, da, 0); + Fb = combine_joint_u[1](sa, da, 1); + + d64 = load8888(dest); + sa64 = expand_alpha_rev (*(__m64*)&Fa); + da64 = expand_alpha_rev (*(__m64*)&Fb); + + d64 = pix_add_mul (s64, sa64, d64, da64); + + store8888 (dest, d64); + + ++dest; + ++src; + if (mask) + ++mask; + } +} + + +static void +mmx_combine_disjoint_general_u (uint32_t * dest, + const uint32_t *src, + const uint32_t *mask, + int width, + uint8_t comb) +{ + mmx_combine_joint_general_u (dest, src, mask, width, comb, combine_disjoint_u); +} + +static void +mmx_combine_disjoint_in_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_IN); +} + +static void +mmx_combine_disjoint_in_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_IN); +} + +static void +mmx_combine_disjoint_out_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_OUT); +} + +static void +mmx_combine_disjoint_out_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_OUT); +} + +static void +mmx_combine_disjoint_atop_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP); +} + +static void +mmx_combine_disjoint_atop_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP); +} + +static void +mmx_combine_disjoint_xor_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_XOR); +} + +/* Conjoint */ +static void +mmx_combine_conjoint_general_u(uint32_t * dest, + const uint32_t *src, + const uint32_t *mask, + int width, + uint8_t comb) +{ + mmx_combine_joint_general_u (dest, src, mask, width, comb, combine_conjoint_u); +} + +static void +mmx_combine_conjoint_over_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OVER); +} + +static void +mmx_combine_conjoint_over_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OVER); +} + +static void +mmx_combine_conjoint_in_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_IN); +} + +static void +mmx_combine_conjoint_in_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_IN); +} + +static void +mmx_combine_conjoint_out_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OUT); +} + +static void +mmx_combine_conjoint_out_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OUT); +} + +static void +mmx_combine_conjoint_atop_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP); +} + +static void +mmx_combine_conjoint_atop_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP); +} + +static void +mmx_combine_conjoint_xor_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_XOR); +} + +/* Component alpha combiners */ static void mmx_combine_src_ca (pixman_implementation_t *imp, pixman_op_t op, @@ -1089,6 +1514,415 @@ mmx_combine_src_ca (pixman_implementation_t *imp, } static void +mmx_combine_saturate_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + uint32_t *end = dest + width; + while (dest < end) + { + uint16_t sa, sr, sg, sb; + uint32_t sa32, m32; + __m64 m64, s64, d64, sa64, da64, cmpf, res; + + mmx_combine_mask_ca (src, mask, &s64, &m64); + + d64 = load8888 (dest); + da64 = expand_alpha (negate(d64)); + /*这里涉及到分支,分支中的运算GENERIC实际上是先做乘法,再做ADD,*/ + /*而另一分支是ADD,因此可以无条件地先判断,记下标记位,做乘法,*/ + /*再将不需要做乘法的相应关字恢复,最后做加法. */ + cmpf = _mm_cmpgt_pi16 (m64, da64); + if (cmpf) + { + store8888 (&m32, m64); + sa = (m32 >> (A_SHIFT)); + sr = (m32 >> (R_SHIFT)) & MASK; + sg = (m32 >> (G_SHIFT)) & MASK; + sb = m32 & MASK; + sa32 = (~(*dest) >> A_SHIFT) & MASK; + + /* 可能为0,当为0时,其结果肯定将被恢复,因此这里为其随便赋一值 */ + sa = (sa) ? sa : 0x1; + sr = (sr) ? sr : 0x1; + sg = (sg) ? sg : 0x1; + sb = (sb) ? sb : 0x1; + + sa32 = ((sa32 << G_SHIFT) / sb & MASK) | + ((((sa32 << G_SHIFT) / sg) & MASK) << G_SHIFT) | + ((((sa32 << G_SHIFT) / sr) & MASK) << R_SHIFT) | + ((((sa32 << G_SHIFT) / sa) & MASK) << A_SHIFT); + sa64 = load8888 (&sa32); + da64 = MC (4x00ff); + res = pix_multiply (s64, sa64); + s64 = _mm_or_si64 (_mm_and_si64 (res, cmpf), _mm_and_si64 (s64, negate (cmpf))); + res = pix_multiply (d64, da64); + d64 = _mm_or_si64 (_mm_and_si64 (res, cmpf), _mm_and_si64 (d64, negate (cmpf))); + } + res = _mm_adds_pu8 (s64, d64); + store8888 (dest, res); + + ++dest; + ++src; + if (mask) + ++mask; + } +} + +#define DEF_FUNC_COMBINE_JOINT_CA(cd, io) \ + static uint32_t inline combine_ ##cd## joint_ ##io## _part_ca(uint32_t sa, uint32_t da, uint32_t io_flag) \ + { \ + uint8_t da8 = da >> A_SHIFT; \ + uint32_t m, n, o, p, res; \ + uint8_t i, parm[2][4], shift=0; \ + for (i=0; i<4; i++) \ + { \ + parm[0][i] = (uint8_t)(sa>>shift) * (io_flag ^ 0x1) + da8 * (io_flag ^ 0x0); \ + parm[1][i] = (uint8_t)(sa>>shift) * (io_flag ^ 0x0) + da8 * (io_flag ^ 0x1); \ + shift += G_SHIFT; \ + } \ + m = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][0], parm[1][0]); \ + n = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][1], parm[1][1]) << G_SHIFT; \ + o = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][2], parm[1][2]) << R_SHIFT; \ + p = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][3], parm[1][3]) << A_SHIFT; \ + res = m | n | o | p; \ + return res; \ + } + +typedef uint32_t (*COMBINE_JOINT_FUNC_CA)(uint32_t sa, uint32_t da, uint32_t io_flag); + +DEF_FUNC_ZERO_MASK(uint32_t, zero, ca, 0x0) +DEF_FUNC_ZERO_MASK(uint32_t, mask, ca, ~0x0) + +DEF_FUNC_COMBINE_JOINT_CA(dis, in); +DEF_FUNC_COMBINE_JOINT_CA(dis, out); +DEF_COMB_FUNC_ARR(dis, CA, ca) + +DEF_FUNC_COMBINE_JOINT_CA(con, in); +DEF_FUNC_COMBINE_JOINT_CA(con, out); +DEF_COMB_FUNC_ARR(con, CA, ca) + +static void +mmx_combine_joint_general_ca (uint32_t * dest, + const uint32_t *src, + const uint32_t *mask, + int width, + uint8_t comb, + COMBINE_JOINT_FUNC_CA *cjf) +{ + COMBINE_JOINT_FUNC_CA combine_joint_ca[2]; + combine_joint_ca[0] = cjf[comb & COMBINE_A]; + combine_joint_ca[1] = cjf[(comb & COMBINE_B)>>2]; + + uint32_t *end = dest + width; + while (dest < end) + { + __m64 m64, s64, sa64, da64, d64; + uint32_t m32, Fa, Fb; + + mmx_combine_mask_ca (src, mask, &s64, &m64); + store8888(&m32, m64); + + Fa = combine_joint_ca[0](m32, *dest, 0); + Fb = combine_joint_ca[1](m32, *dest, 1); + + sa64 = load8888 (&Fa); + da64 = load8888 (&Fb); + + d64 = load8888 (dest); + d64 = pix_add_mul(s64, sa64, d64, da64); + + store8888 (dest, d64); + + ++dest; + ++src; + if (mask) + ++mask; + } + +} + +static void +mmx_combine_disjoint_general_ca (uint32_t * dest, + const uint32_t *src, + const uint32_t *mask, + int width, + uint8_t comb) +{ + mmx_combine_joint_general_ca (dest, src, mask, width, comb, combine_disjoint_ca); +} + +static void +mmx_combine_disjoint_over_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER); +} + +static void +mmx_combine_disjoint_in_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_IN); +} + +static void +mmx_combine_disjoint_in_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_IN); +} + +static void +mmx_combine_disjoint_out_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT); +} + +static void +mmx_combine_disjoint_out_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT); +} + +static void +mmx_combine_disjoint_atop_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP); +} + +static void +mmx_combine_disjoint_atop_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP); +} + +static void +mmx_combine_disjoint_xor_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_XOR); +} + +static void +mmx_combine_conjoint_general_ca(uint32_t * dest, + const uint32_t *src, + const uint32_t *mask, + int width, + uint8_t comb) +{ + mmx_combine_joint_general_ca(dest,src,mask,width,comb,combine_conjoint_ca); +} + +/* + * Multiply + * B(Dca, ad, Sca, as) = Dca.Sca + */ + +static void +mmx_combine_multiply_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + __m64 dia, d, sia; + __m64 s = combine (src, mask); + __m64 ss = s; + d = load8888 (dest); + sia = negate (expand_alpha (s)); + dia = negate (expand_alpha (d)); + ss = pix_add_mul (ss, dia, d, sia); + d = pix_multiply (d, s); + d = pix_add (d, ss); + store8888 (dest, d); + + ++dest; + ++src; + if (mask) + mask++; + } + _mm_empty (); +} + +static void +mmx_combine_multiply_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); + __m64 r = d; + __m64 da = negate (expand_alpha (d)); + __m64 sa = expand_alpha (s); + s = pix_multiply (s, a); + a = pix_multiply (a, sa); + a = negate (a); + r = pix_add_mul (r, a, s, da); + d = pix_multiply (d, s); + r = pix_add (r, d); + store8888 (dest, r); + + ++src; + ++dest; + ++mask; + } + _mm_empty (); +} + +static void +mmx_combine_conjoint_over_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER); +} + +static void +mmx_combine_conjoint_over_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OVER); +} + +static void +mmx_combine_conjoint_in_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_IN); +} + +static void +mmx_combine_conjoint_in_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_IN); +} + +static void +mmx_combine_conjoint_out_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT); +} + +static void +mmx_combine_conjoint_out_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT); +} + +static void +mmx_combine_conjoint_atop_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP); +} + +static void +mmx_combine_conjoint_atop_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP); +} + +static void +mmx_combine_conjoint_xor_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_XOR); +} +#endif + +static void mmx_combine_over_ca (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dest, @@ -3989,7 +4823,7 @@ mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \ FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) -static const pixman_iter_info_t mmx_iters[] = +static const pixman_iter_info_t mmx_iters[] = { { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW, _pixman_iter_init_bits_stride, mmx_fetch_x8r8g8b8, NULL @@ -4116,6 +4950,31 @@ _pixman_implementation_create_mmx (pixman_implementation_t *fallback) imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u; imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u; +#ifdef USE_LOONGSON_MMI + /* Disjoint, unified */ + imp->combine_32[PIXMAN_OP_DISJOINT_OVER] = mmx_combine_disjoint_over_u; + imp->combine_32[PIXMAN_OP_DISJOINT_OVER_REVERSE] = mmx_combine_saturate_u; + imp->combine_32[PIXMAN_OP_DISJOINT_IN] = mmx_combine_disjoint_in_u; + imp->combine_32[PIXMAN_OP_DISJOINT_IN_REVERSE] = mmx_combine_disjoint_in_reverse_u; + imp->combine_32[PIXMAN_OP_DISJOINT_OUT] = mmx_combine_disjoint_out_u; + imp->combine_32[PIXMAN_OP_DISJOINT_OUT_REVERSE] = mmx_combine_disjoint_out_reverse_u; + imp->combine_32[PIXMAN_OP_DISJOINT_ATOP] = mmx_combine_disjoint_atop_u; + imp->combine_32[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = mmx_combine_disjoint_atop_reverse_u; + imp->combine_32[PIXMAN_OP_DISJOINT_XOR] = mmx_combine_disjoint_xor_u; + + /* Conjoint, unified */ + imp->combine_32[PIXMAN_OP_CONJOINT_OVER] = mmx_combine_conjoint_over_u; + imp->combine_32[PIXMAN_OP_CONJOINT_OVER_REVERSE] = mmx_combine_conjoint_over_reverse_u; + imp->combine_32[PIXMAN_OP_CONJOINT_IN] = mmx_combine_conjoint_in_u; + imp->combine_32[PIXMAN_OP_CONJOINT_IN_REVERSE] = mmx_combine_conjoint_in_reverse_u; + imp->combine_32[PIXMAN_OP_CONJOINT_OUT] = mmx_combine_conjoint_out_u; + imp->combine_32[PIXMAN_OP_CONJOINT_OUT_REVERSE] = mmx_combine_conjoint_out_reverse_u; + imp->combine_32[PIXMAN_OP_CONJOINT_ATOP] = mmx_combine_conjoint_atop_u; + imp->combine_32[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = mmx_combine_conjoint_atop_reverse_u; + imp->combine_32[PIXMAN_OP_CONJOINT_XOR] = mmx_combine_conjoint_xor_u; + + /* Component alpha combiners */ +#endif imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u; imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u; imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u; @@ -4137,6 +4996,34 @@ _pixman_implementation_create_mmx (pixman_implementation_t *fallback) imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca; imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca; imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca; +#ifdef USE_LOONGSON_MMI + imp->combine_32_ca[PIXMAN_OP_SATURATE] = mmx_combine_saturate_ca; + + /* Disjoint CA */ + imp->combine_32_ca[PIXMAN_OP_DISJOINT_OVER] = mmx_combine_disjoint_over_ca; + imp->combine_32_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = mmx_combine_saturate_ca; + imp->combine_32_ca[PIXMAN_OP_DISJOINT_IN] = mmx_combine_disjoint_in_ca; + imp->combine_32_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = mmx_combine_disjoint_in_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_DISJOINT_OUT] = mmx_combine_disjoint_out_ca; + imp->combine_32_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = mmx_combine_disjoint_out_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_DISJOINT_ATOP] = mmx_combine_disjoint_atop_ca; + imp->combine_32_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = mmx_combine_disjoint_atop_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_DISJOINT_XOR] = mmx_combine_disjoint_xor_ca; + + /* Conjoint CA */ + imp->combine_32_ca[PIXMAN_OP_CONJOINT_OVER] = mmx_combine_conjoint_over_ca; + imp->combine_32_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = mmx_combine_conjoint_over_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_CONJOINT_IN] = mmx_combine_conjoint_in_ca; + imp->combine_32_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = mmx_combine_conjoint_in_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_CONJOINT_OUT] = mmx_combine_conjoint_out_ca; + imp->combine_32_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = mmx_combine_conjoint_out_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_CONJOINT_ATOP] = mmx_combine_conjoint_atop_ca; + imp->combine_32_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = mmx_combine_conjoint_atop_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_CONJOINT_XOR] = mmx_combine_conjoint_xor_ca; + + /* Multiply CA */ + imp->combine_32_ca[PIXMAN_OP_MULTIPLY] = mmx_combine_multiply_ca; +#endif imp->blt = mmx_blt; imp->fill = mmx_fill; |