summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: José Fonseca <jfonseca@vmware.com> 2013-11-20 08:32:52 +0000
committer: Jose Fonseca <jfonseca@vmware.com> 2016-10-04 23:36:20 +0100
commit: e088390c7ddce4e64559a5dad6235ffc430ac736 (patch)
tree: 7edc7823c67dfb5e8db59ea31174b036a1df90ec
parent: add01add1bb91af64ad5cfa671c99bee72016773 (diff)
gallivm: Basic AVX2 support.
v2: pblendb -> pblendvb
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
-rw-r--r-- src/gallium/auxiliary/gallivm/lp_bld_arit.c 104
-rw-r--r-- src/gallium/auxiliary/gallivm/lp_bld_init.c 5
-rw-r--r-- src/gallium/auxiliary/gallivm/lp_bld_logic.c 10
-rw-r--r-- src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c 7
4 files changed, 98 insertions, 28 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index c4e35a21d2..f5cacc460f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -143,6 +143,20 @@ lp_build_min_simple(struct lp_build_context *bld,
intr_size = 128;
}
} else if (HAVE_LLVM < 0x0309 &&
+ util_cpu_caps.has_avx2 && type.length > 4) {
+ intr_size = 256;
+ switch (type.width) {
+ case 8:
+ intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
+ break;
+ case 16:
+ intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
+ break;
+ case 32:
+ intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
+ break;
+ }
+ } else if (HAVE_LLVM < 0x0309 &&
util_cpu_caps.has_sse2 && type.length >= 2) {
intr_size = 128;
if ((type.width == 8 || type.width == 16) &&
@@ -347,6 +361,20 @@ lp_build_max_simple(struct lp_build_context *bld,
intr_size = 128;
}
} else if (HAVE_LLVM < 0x0309 &&
+ util_cpu_caps.has_avx2 && type.length > 4) {
+ intr_size = 256;
+ switch (type.width) {
+ case 8:
+ intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
+ break;
+ case 16:
+ intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
+ break;
+ case 32:
+ intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
+ break;
+ }
+ } else if (HAVE_LLVM < 0x0309 &&
util_cpu_caps.has_sse2 && type.length >= 2) {
intr_size = 128;
if ((type.width == 8 || type.width == 16) &&
@@ -526,18 +554,27 @@ lp_build_add(struct lp_build_context *bld,
if(a == bld->one || b == bld->one)
return bld->one;
- if (type.width * type.length == 128 &&
- !type.floating && !type.fixed) {
- if(util_cpu_caps.has_sse2) {
- if(type.width == 8)
- intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
- if(type.width == 16)
- intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
- } else if (util_cpu_caps.has_altivec) {
- if(type.width == 8)
- intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
- if(type.width == 16)
- intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
+ if (!type.floating && !type.fixed) {
+ if (type.width * type.length == 128) {
+ if(util_cpu_caps.has_sse2) {
+ if(type.width == 8)
+ intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
+ if(type.width == 16)
+ intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
+ } else if (util_cpu_caps.has_altivec) {
+ if(type.width == 8)
+ intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
+ if(type.width == 16)
+ intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
+ }
+ }
+ if (type.width * type.length == 256) {
+ if(util_cpu_caps.has_avx2) {
+ if(type.width == 8)
+ intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
+ if(type.width == 16)
+ intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
+ }
}
}
@@ -818,18 +855,27 @@ lp_build_sub(struct lp_build_context *bld,
if(b == bld->one)
return bld->zero;
- if (type.width * type.length == 128 &&
- !type.floating && !type.fixed) {
- if (util_cpu_caps.has_sse2) {
- if(type.width == 8)
- intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
- if(type.width == 16)
- intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
- } else if (util_cpu_caps.has_altivec) {
- if(type.width == 8)
- intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
- if(type.width == 16)
- intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
+ if (!type.floating && !type.fixed) {
+ if (type.width * type.length == 128) {
+ if (util_cpu_caps.has_sse2) {
+ if(type.width == 8)
+ intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
+ if(type.width == 16)
+ intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
+ } else if (util_cpu_caps.has_altivec) {
+ if(type.width == 8)
+ intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
+ if(type.width == 16)
+ intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
+ }
+ }
+ if (type.width * type.length == 256) {
+ if (util_cpu_caps.has_avx2) {
+ if(type.width == 8)
+ intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
+ if(type.width == 16)
+ intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
+ }
}
}
@@ -1587,6 +1633,16 @@ lp_build_abs(struct lp_build_context *bld,
return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
}
}
+ else if (type.width*type.length == 256 && util_cpu_caps.has_avx2) {
+ switch(type.width) {
+ case 8:
+ return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
+ case 16:
+ return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
+ case 32:
+ return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
+ }
+ }
else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
(gallivm_debug & GALLIVM_DEBUG_PERF) &&
(type.width == 8 || type.width == 16 || type.width == 32)) {
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 22340c081f..7114cde438 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -457,6 +457,11 @@ lp_build_init(void)
util_cpu_caps.has_f16c = 0;
util_cpu_caps.has_fma = 0;
}
+ if (HAVE_LLVM < 0x0304 || !USE_MCJIT) {
+ /* AVX2 support has only been tested with LLVM 3.4, and it requires
+ * MCJIT. */
+ util_cpu_caps.has_avx2 = 0;
+ }
#ifdef PIPE_ARCH_PPC_64
/* Set the NJ bit in VSCR to 0 so denormalized values are handled as
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index 14bf236948..1a50e82c24 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -348,7 +348,9 @@ lp_build_select(struct lp_build_context *bld,
else if (((util_cpu_caps.has_sse4_1 &&
type.width * type.length == 128) ||
(util_cpu_caps.has_avx &&
- type.width * type.length == 256 && type.width >= 32)) &&
+ type.width * type.length == 256 && type.width >= 32) ||
+ (util_cpu_caps.has_avx2 &&
+ type.width * type.length == 256)) &&
!LLVMIsConstant(a) &&
!LLVMIsConstant(b) &&
!LLVMIsConstant(mask)) {
@@ -365,9 +367,13 @@ lp_build_select(struct lp_build_context *bld,
intrinsic = "llvm.x86.avx.blendv.pd.256";
arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 4);
}
- else {
+ else if (type.width == 32) {
intrinsic = "llvm.x86.avx.blendv.ps.256";
arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8);
+ } else {
+ assert(util_cpu_caps.has_avx2);
+ intrinsic = "llvm.x86.avx2.pblendvb";
+ arg_type = LLVMVectorType(LLVMInt8TypeInContext(lc), 32);
}
}
else if (type.floating &&
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index 6bf92c87c4..f91b761dc1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -1409,6 +1409,9 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
LLVMValueRef mipoff1 = NULL;
LLVMValueRef colors0;
LLVMValueRef colors1;
+ boolean use_floats = util_cpu_caps.has_avx &&
+ !util_cpu_caps.has_avx2 &&
+ bld->coord_type.length > 4;
/* sample the first mipmap level */
lp_build_mipmap_level_sizes(bld, ilevel0,
@@ -1423,7 +1426,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
}
- if (util_cpu_caps.has_avx && bld->coord_type.length > 4) {
+ if (use_floats) {
if (img_filter == PIPE_TEX_FILTER_NEAREST) {
lp_build_sample_image_nearest_afloat(bld,
size0,
@@ -1514,7 +1517,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
}
- if (util_cpu_caps.has_avx && bld->coord_type.length > 4) {
+ if (use_floats) {
if (img_filter == PIPE_TEX_FILTER_NEAREST) {
lp_build_sample_image_nearest_afloat(bld,
size1,