summary refs log tree commit diff
diff options
context:
space:
mode:
author L. E. Segovia <amy@centricular.com> 2024-02-11 14:19:26 -0300
committer GStreamer Marge Bot <gitlab-merge-bot@gstreamer-foundation.org> 2024-02-21 13:36:50 +0000
commit 5967409ba5916199ba846e2e4eb59ef1cad6af27 (patch)
tree 4ee6144d0de64c1f08057cd737c6a8d0bd6af73f
parent 486b5454763e730868d39ebbdf89ad261d212bf8 (diff)
neon: Use the real intrinsics for divf and sqrtf
The existing implementation used the reciprocal for the calculations, without windowing out denormals.

Fixes #62

Part-of: <https://gitlab.freedesktop.org/gstreamer/orc/-/merge_requests/169>
-rw-r--r-- orc-test/orctest.c 56
-rw-r--r-- orc/orcrules-neon.c 205
2 files changed, 3 insertions, 258 deletions
diff --git a/orc-test/orctest.c b/orc-test/orctest.c
index 84905f0..c34bd7a 100644
--- a/orc-test/orctest.c
+++ b/orc-test/orctest.c
@@ -619,15 +619,6 @@ print_param_val_float (const int *const var, const int *const var2,
}
}
-static float
-get_array_val_float (OrcArray *array, int i, int j)
-{
- void *ptr = ORC_PTR_OFFSET (array->data,
- i*array->element_size + j*array->stride);
-
- return *(float *)ptr;
-}
-
int
float_compare (OrcArray *array1, OrcArray *array2, int i, int j)
{
@@ -655,49 +646,6 @@ float_compare (OrcArray *array1, OrcArray *array2, int i, int j)
return FALSE;
}
-int
-check_expected_failure (int flags, OrcProgram *p, OrcArray** src, OrcArray** dest_exec, OrcArray** dest_emul, int i, int j) {
- OrcTarget *target;
- unsigned int target_flags;
-
- target = orc_target_get_default ();
- target_flags = orc_target_get_default_flags (target);
-
- if ((flags & ORC_TEST_FLAGS_BACKUP) == 0 && strcmp (orc_target_get_name (target), "neon") == 0 && (target_flags & ORC_TARGET_NEON_NEON)) {
- if (strstr(p->name, "divf")) {
-
- float src_val = get_array_val_float (src[1], i, j);
- float dest_exec_val = get_array_val_float (dest_exec[0], i, j);
-
- // Dividing by a large number in NEON will result in 0
- if (fabs(src_val) > 7e37 && fabs(dest_exec_val) == 0.0f) {
- printf(" NEON divf mismatch expected");
- return TRUE;
- }
-
- } else if (strstr(p->name, "sqrtf")) {
-
- float src_val = get_array_val_float (src[0], i, j);
- float dest_exec_val = get_array_val_float (dest_exec[0], i, j);
- float dest_emul_val = get_array_val_float (dest_emul[0], i, j);
-
- // sqrt of 0 or small numbers in NEON returns NaN because it uses reciprocal estimate
- if (fabs(src_val) < 2e-38) {
- printf(" NEON sqrtf mismatch expected");
- return TRUE;
- }
-
- // sqrt in NEON will sometimes be imprecise because frecps returns 1 on small numbers
- if (fabs(dest_exec_val - dest_emul_val)/dest_emul_val < 2e-7) {
- printf(" NEON sqrtf mismatch expected");
- return TRUE;
- }
- }
- }
-
- return FALSE;
-}
-
OrcTestResult
orc_test_compare_output (OrcProgram *program)
{
@@ -938,9 +886,7 @@ orc_test_compare_output_full_for_target (OrcProgram *program, int flags, const c
print_array_val_float (dest_exec[l-ORC_VAR_D1], i, j);
if (!float_compare (dest_emul[l-ORC_VAR_D1], dest_exec[l-ORC_VAR_D1], i, j)) {
line_bad = TRUE;
- if(!check_expected_failure(flags, program, src, dest_exec, dest_emul, i,j)) {
- n_lines_bad++;
- }
+ n_lines_bad++;
}
} else {
a = print_array_val_hex (dest_emul[l-ORC_VAR_D1], i, j);
diff --git a/orc/orcrules-neon.c b/orc/orcrules-neon.c
index a84457a..d565203 100644
--- a/orc/orcrules-neon.c
+++ b/orc/orcrules-neon.c
@@ -2966,214 +2966,13 @@ BINARY_VFP(addd,"vadd.f64",0xee300b00, "fadd", 0x4e60d400, 0)
BINARY_VFP(subd,"vsub.f64",0xee300b40, "fsub", 0x4ee0d400, 0)
BINARY_VFP(muld,"vmul.f64",0xee200b00, "fmul", 0x6e60dc00, 0)
BINARY_VFP(divd,"vdiv.f64",0xee800b00, "fdiv", 0x6e60fc00, 0)
+BINARY_VFP(divf,"vdiv.f32",0xee800a00, "fdiv", 0x6e20fc00, 0)
UNARY_VFP(sqrtd,"vsqrt.f64",0xeeb10b00, "fsqrt", 0x6ee1f800, 0)
+UNARY_VFP(sqrtf,"vsqrt.f32",0xeeb10ac0, "fsqrt", 0x6ea1f800, 0)
/* BINARY_VFP(cmpeqd,"vcmpe.f64",0xee000000, NULL, 0, 0) */
UNARY_VFP(convdf,"vcvt.f64.f32",0xee200b00, "fcvtzs", 0x4ee1b800, 0)
UNARY_VFP(convfd,"vcvt.f32.f64",0xee200b00, "scvtf", 0x4e61d800, 0)
-#if 1
-#define NUM_ITERS_DIVF 2
-static void
-orc_neon_rule_divf (OrcCompiler *p, void *user, OrcInstruction *insn)
-{
- int vec_shift = 1;
-
- if (p->is_64bit) {
- OrcVariable tmpreg = { .alloc = p->tmpreg, .size = p->vars[insn->src_args[1]].size };
- OrcVariable tmpreg2 = { .alloc = p->tmpreg2, .size = p->vars[insn->src_args[1]].size };
- int i;
-
- orc_neon64_emit_unary (p, "frecpe", 0x0ea1d800,
- tmpreg, p->vars[insn->src_args[1]],
- vec_shift);
- for(i = 0; i < NUM_ITERS_DIVF; i++) {
- orc_neon64_emit_binary (p, "frecps", 0x0e20fc00,
- tmpreg2, /* correction factor */
- tmpreg, /* the last estimate */
- p->vars[insn->src_args[1]], /* the original number */
- vec_shift);
- orc_neon64_emit_binary (p, "fmul", 0x2e20dc00,
- tmpreg, /* revised estimate */
- tmpreg, /* last estimate */
- tmpreg2, /* correction factor */
- vec_shift);
- }
-
- orc_neon64_emit_binary (p, "fmul", 0x2e20dc00,
- p->vars[insn->dest_args[0]],
- p->vars[insn->src_args[0]],
- tmpreg, vec_shift);
- } else {
- if (p->insn_shift <= vec_shift) {
- int i;
- orc_neon_emit_unary (p, "vrecpe.f32", 0xf3bb0500,
- p->tmpreg,
- p->vars[insn->src_args[1]].alloc);
- for(i = 0; i < NUM_ITERS_DIVF; i++) {
- orc_neon_emit_binary (p, "vrecps.f32", 0xf2000f10,
- p->tmpreg2, /* correction factor */
- p->tmpreg, /* the last estimate */
- p->vars[insn->src_args[1]].alloc); /* the original number */
- orc_neon_emit_binary (p, "vmul.f32", 0xf3000d10,
- p->tmpreg, /* revised estimate */
- p->tmpreg, /* last estimate */
- p->tmpreg2); /* correction factor */
- }
-
- orc_neon_emit_binary (p, "vmul.f32", 0xf3000d10,
- p->vars[insn->dest_args[0]].alloc,
- p->vars[insn->src_args[0]].alloc,
- p->tmpreg);
-
- } else if (p->insn_shift == vec_shift + 1) {
- int i;
- orc_neon_emit_unary_quad (p, "vrecpe.f32", 0xf3bb0500,
- p->tmpreg,
- p->vars[insn->src_args[1]].alloc);
- for(i = 0; i < NUM_ITERS_DIVF; i++) {
- orc_neon_emit_binary_quad (p, "vrecps.f32", 0xf2000f10,
- p->tmpreg2, /* correction factor */
- p->tmpreg, /* the last estimate */
- p->vars[insn->src_args[1]].alloc); /* the original number */
- orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10,
- p->tmpreg, /* revised estimate */
- p->tmpreg, /* last estimate */
- p->tmpreg2); /* correction factor */
- }
-
- orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10,
- p->vars[insn->dest_args[0]].alloc,
- p->vars[insn->src_args[0]].alloc,
- p->tmpreg);
-
- } else {
- ORC_COMPILER_ERROR(p, "shift too large");
- }
- }
-}
-#endif
-
-#if 1
-#define NUM_ITERS_SQRTF 2
-static void
-orc_neon_rule_sqrtf (OrcCompiler *p, void *user, OrcInstruction *insn)
-{
- int vec_shift = 1;
-
- if (p->is_64bit) {
- OrcVariable tmpreg = { .alloc = p->tmpreg, .size = p->vars[insn->src_args[0]].size };
- OrcVariable tmpreg2 = { .alloc = p->tmpreg2, .size = p->vars[insn->src_args[0]].size };
- int i;
-
- orc_neon64_emit_unary (p, "frsqrte", 0x2ea1d800,
- tmpreg, p->vars[insn->src_args[0]],
- vec_shift);
-
- for(i = 0; i < NUM_ITERS_SQRTF; i++) {
- orc_neon64_emit_binary (p, "fmul", 0x2e20dc00,
- tmpreg2, tmpreg, p->vars[insn->src_args[0]],
- vec_shift);
- orc_neon64_emit_binary (p, "frsqrts", 0x0ea0fc00,
- tmpreg2, tmpreg, tmpreg2, vec_shift);
- orc_neon64_emit_binary (p, "fmul", 0x2e20dc00,
- tmpreg, tmpreg, tmpreg2,
- vec_shift);
- }
-
- orc_neon64_emit_unary (p, "frecpe", 0x0ea1d800,
- p->vars[insn->dest_args[0]], tmpreg,
- vec_shift);
-
- for(i = 0; i < NUM_ITERS_DIVF; i++) {
- orc_neon64_emit_binary (p, "frecps", 0x0e20fc00,
- tmpreg2, /* correction factor */
- p->vars[insn->dest_args[0]], /* the last estimate */
- tmpreg, /* the original number */
- vec_shift);
- orc_neon64_emit_binary (p, "fmul", 0x2e20dc00,
- p->vars[insn->dest_args[0]], /* revised estimate */
- p->vars[insn->dest_args[0]], /* last estimate */
- tmpreg2, /* correction factor */
- vec_shift);
- }
- } else {
- if (p->insn_shift <= vec_shift) {
- int i;
- orc_neon_emit_unary (p, "vrsqrte.f32", 0xf3bb0580,
- p->tmpreg,
- p->vars[insn->src_args[0]].alloc);
- for(i = 0; i < NUM_ITERS_SQRTF; i++) {
- orc_neon_emit_binary (p, "vmul.f32", 0xf3000d10,
- p->tmpreg2,
- p->tmpreg,
- p->vars[insn->src_args[0]].alloc);
- orc_neon_emit_binary (p, "vrsqrts.f32", 0xf2200f10,
- p->tmpreg2,
- p->tmpreg,
- p->tmpreg2);
- orc_neon_emit_binary (p, "vmul.f32", 0xf3000d10,
- p->tmpreg,
- p->tmpreg,
- p->tmpreg2);
- }
-
- orc_neon_emit_unary(p, "vrecpe.f32", 0xf3bb0500,
- p->vars[insn->dest_args[0]].alloc,
- p->tmpreg);
-
- for(i=0; i < NUM_ITERS_DIVF; i++) {
- orc_neon_emit_binary (p, "vrecps.f32", 0xf2000f10,
- p->tmpreg2, /* correction factor */
- p->vars[insn->dest_args[0]].alloc, /* the last estimate */
- p->tmpreg); /* the original number */
- orc_neon_emit_binary (p, "vmul.f32", 0xf3000d10,
- p->vars[insn->dest_args[0]].alloc, /* revised estimate */
- p->vars[insn->dest_args[0]].alloc, /* last estimate */
- p->tmpreg2); /* correction factor */
- }
-
- } else if (p->insn_shift == vec_shift + 1) {
- int i;
- orc_neon_emit_unary_quad (p, "vrsqrte.f32", 0xf3bb0580,
- p->tmpreg,
- p->vars[insn->src_args[0]].alloc);
- for(i = 0; i < NUM_ITERS_SQRTF; i++) {
- orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10,
- p->tmpreg2,
- p->tmpreg,
- p->vars[insn->src_args[0]].alloc);
- orc_neon_emit_binary_quad (p, "vrsqrts.f32", 0xf2200f10,
- p->tmpreg2,
- p->tmpreg,
- p->tmpreg2);
- orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10,
- p->tmpreg,
- p->tmpreg,
- p->tmpreg2);
- }
-
- orc_neon_emit_unary_quad(p, "vrecpe.f32", 0xf3bb0500,
- p->vars[insn->dest_args[0]].alloc,
- p->tmpreg);
-
- for(i=0; i < NUM_ITERS_DIVF; i++) {
- orc_neon_emit_binary_quad (p, "vrecps.f32", 0xf2000f10,
- p->tmpreg2, /* correction factor */
- p->vars[insn->dest_args[0]].alloc, /* the last estimate */
- p->tmpreg); /* the original number */
- orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10,
- p->vars[insn->dest_args[0]].alloc, /* revised estimate */
- p->vars[insn->dest_args[0]].alloc, /* last estimate */
- p->tmpreg2); /* correction factor */
- }
-
- } else {
- ORC_COMPILER_ERROR(p, "shift too large");
- }
- }
-}
-#endif
-
static void
orc_neon_rule_accw (OrcCompiler *p, void *user, OrcInstruction *insn)
{