diff options
author | L. E. Segovia <amy@centricular.com> | 2024-02-11 14:19:26 -0300 |
---|---|---|
committer | GStreamer Marge Bot <gitlab-merge-bot@gstreamer-foundation.org> | 2024-02-21 13:36:50 +0000 |
commit | 5967409ba5916199ba846e2e4eb59ef1cad6af27 (patch) | |
tree | 4ee6144d0de64c1f08057cd737c6a8d0bd6af73f | |
parent | 486b5454763e730868d39ebbdf89ad261d212bf8 (diff) |
neon: Use the real intrinsics for divf and sqrtf
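The existing implementation computed both operations from reciprocal
estimates (frecpe/frsqrte followed by refinement steps) without filtering
out denormal inputs, so division by very large values returned 0 and the
square root of 0 or very small values returned NaN.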
Fixes #62
Part-of: <https://gitlab.freedesktop.org/gstreamer/orc/-/merge_requests/169>
-rw-r--r-- | orc-test/orctest.c | 56 | ||||
-rw-r--r-- | orc/orcrules-neon.c | 205 |
2 files changed, 3 insertions, 258 deletions
diff --git a/orc-test/orctest.c b/orc-test/orctest.c index 84905f0..c34bd7a 100644 --- a/orc-test/orctest.c +++ b/orc-test/orctest.c @@ -619,15 +619,6 @@ print_param_val_float (const int *const var, const int *const var2, } } -static float -get_array_val_float (OrcArray *array, int i, int j) -{ - void *ptr = ORC_PTR_OFFSET (array->data, - i*array->element_size + j*array->stride); - - return *(float *)ptr; -} - int float_compare (OrcArray *array1, OrcArray *array2, int i, int j) { @@ -655,49 +646,6 @@ float_compare (OrcArray *array1, OrcArray *array2, int i, int j) return FALSE; } -int -check_expected_failure (int flags, OrcProgram *p, OrcArray** src, OrcArray** dest_exec, OrcArray** dest_emul, int i, int j) { - OrcTarget *target; - unsigned int target_flags; - - target = orc_target_get_default (); - target_flags = orc_target_get_default_flags (target); - - if ((flags & ORC_TEST_FLAGS_BACKUP) == 0 && strcmp (orc_target_get_name (target), "neon") == 0 && (target_flags & ORC_TARGET_NEON_NEON)) { - if (strstr(p->name, "divf")) { - - float src_val = get_array_val_float (src[1], i, j); - float dest_exec_val = get_array_val_float (dest_exec[0], i, j); - - // Dividing by a large number in NEON will result in 0 - if (fabs(src_val) > 7e37 && fabs(dest_exec_val) == 0.0f) { - printf(" NEON divf mismatch expected"); - return TRUE; - } - - } else if (strstr(p->name, "sqrtf")) { - - float src_val = get_array_val_float (src[0], i, j); - float dest_exec_val = get_array_val_float (dest_exec[0], i, j); - float dest_emul_val = get_array_val_float (dest_emul[0], i, j); - - // sqrt of 0 or small numbers in NEON returns NaN because it uses reciprocal estimate - if (fabs(src_val) < 2e-38) { - printf(" NEON sqrtf mismatch expected"); - return TRUE; - } - - // sqrt in NEON will sometimes be imprecise because frecps returns 1 on small numbers - if (fabs(dest_exec_val - dest_emul_val)/dest_emul_val < 2e-7) { - printf(" NEON sqrtf mismatch expected"); - return TRUE; - } - } - } - - return 
FALSE; -} - OrcTestResult orc_test_compare_output (OrcProgram *program) { @@ -938,9 +886,7 @@ orc_test_compare_output_full_for_target (OrcProgram *program, int flags, const c print_array_val_float (dest_exec[l-ORC_VAR_D1], i, j); if (!float_compare (dest_emul[l-ORC_VAR_D1], dest_exec[l-ORC_VAR_D1], i, j)) { line_bad = TRUE; - if(!check_expected_failure(flags, program, src, dest_exec, dest_emul, i,j)) { - n_lines_bad++; - } + n_lines_bad++; } } else { a = print_array_val_hex (dest_emul[l-ORC_VAR_D1], i, j); diff --git a/orc/orcrules-neon.c b/orc/orcrules-neon.c index a84457a..d565203 100644 --- a/orc/orcrules-neon.c +++ b/orc/orcrules-neon.c @@ -2966,214 +2966,13 @@ BINARY_VFP(addd,"vadd.f64",0xee300b00, "fadd", 0x4e60d400, 0) BINARY_VFP(subd,"vsub.f64",0xee300b40, "fsub", 0x4ee0d400, 0) BINARY_VFP(muld,"vmul.f64",0xee200b00, "fmul", 0x6e60dc00, 0) BINARY_VFP(divd,"vdiv.f64",0xee800b00, "fdiv", 0x6e60fc00, 0) +BINARY_VFP(divf,"vdiv.f32",0xee800a00, "fdiv", 0x6e20fc00, 0) UNARY_VFP(sqrtd,"vsqrt.f64",0xeeb10b00, "fsqrt", 0x6ee1f800, 0) +UNARY_VFP(sqrtf,"vsqrt.f32",0xeeb10ac0, "fsqrt", 0x6ea1f800, 0) /* BINARY_VFP(cmpeqd,"vcmpe.f64",0xee000000, NULL, 0, 0) */ UNARY_VFP(convdf,"vcvt.f64.f32",0xee200b00, "fcvtzs", 0x4ee1b800, 0) UNARY_VFP(convfd,"vcvt.f32.f64",0xee200b00, "scvtf", 0x4e61d800, 0) -#if 1 -#define NUM_ITERS_DIVF 2 -static void -orc_neon_rule_divf (OrcCompiler *p, void *user, OrcInstruction *insn) -{ - int vec_shift = 1; - - if (p->is_64bit) { - OrcVariable tmpreg = { .alloc = p->tmpreg, .size = p->vars[insn->src_args[1]].size }; - OrcVariable tmpreg2 = { .alloc = p->tmpreg2, .size = p->vars[insn->src_args[1]].size }; - int i; - - orc_neon64_emit_unary (p, "frecpe", 0x0ea1d800, - tmpreg, p->vars[insn->src_args[1]], - vec_shift); - for(i = 0; i < NUM_ITERS_DIVF; i++) { - orc_neon64_emit_binary (p, "frecps", 0x0e20fc00, - tmpreg2, /* correction factor */ - tmpreg, /* the last estimate */ - p->vars[insn->src_args[1]], /* the original number */ - vec_shift); - 
orc_neon64_emit_binary (p, "fmul", 0x2e20dc00, - tmpreg, /* revised estimate */ - tmpreg, /* last estimate */ - tmpreg2, /* correction factor */ - vec_shift); - } - - orc_neon64_emit_binary (p, "fmul", 0x2e20dc00, - p->vars[insn->dest_args[0]], - p->vars[insn->src_args[0]], - tmpreg, vec_shift); - } else { - if (p->insn_shift <= vec_shift) { - int i; - orc_neon_emit_unary (p, "vrecpe.f32", 0xf3bb0500, - p->tmpreg, - p->vars[insn->src_args[1]].alloc); - for(i = 0; i < NUM_ITERS_DIVF; i++) { - orc_neon_emit_binary (p, "vrecps.f32", 0xf2000f10, - p->tmpreg2, /* correction factor */ - p->tmpreg, /* the last estimate */ - p->vars[insn->src_args[1]].alloc); /* the original number */ - orc_neon_emit_binary (p, "vmul.f32", 0xf3000d10, - p->tmpreg, /* revised estimate */ - p->tmpreg, /* last estimate */ - p->tmpreg2); /* correction factor */ - } - - orc_neon_emit_binary (p, "vmul.f32", 0xf3000d10, - p->vars[insn->dest_args[0]].alloc, - p->vars[insn->src_args[0]].alloc, - p->tmpreg); - - } else if (p->insn_shift == vec_shift + 1) { - int i; - orc_neon_emit_unary_quad (p, "vrecpe.f32", 0xf3bb0500, - p->tmpreg, - p->vars[insn->src_args[1]].alloc); - for(i = 0; i < NUM_ITERS_DIVF; i++) { - orc_neon_emit_binary_quad (p, "vrecps.f32", 0xf2000f10, - p->tmpreg2, /* correction factor */ - p->tmpreg, /* the last estimate */ - p->vars[insn->src_args[1]].alloc); /* the original number */ - orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10, - p->tmpreg, /* revised estimate */ - p->tmpreg, /* last estimate */ - p->tmpreg2); /* correction factor */ - } - - orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10, - p->vars[insn->dest_args[0]].alloc, - p->vars[insn->src_args[0]].alloc, - p->tmpreg); - - } else { - ORC_COMPILER_ERROR(p, "shift too large"); - } - } -} -#endif - -#if 1 -#define NUM_ITERS_SQRTF 2 -static void -orc_neon_rule_sqrtf (OrcCompiler *p, void *user, OrcInstruction *insn) -{ - int vec_shift = 1; - - if (p->is_64bit) { - OrcVariable tmpreg = { .alloc = p->tmpreg, .size = 
p->vars[insn->src_args[0]].size }; - OrcVariable tmpreg2 = { .alloc = p->tmpreg2, .size = p->vars[insn->src_args[0]].size }; - int i; - - orc_neon64_emit_unary (p, "frsqrte", 0x2ea1d800, - tmpreg, p->vars[insn->src_args[0]], - vec_shift); - - for(i = 0; i < NUM_ITERS_SQRTF; i++) { - orc_neon64_emit_binary (p, "fmul", 0x2e20dc00, - tmpreg2, tmpreg, p->vars[insn->src_args[0]], - vec_shift); - orc_neon64_emit_binary (p, "frsqrts", 0x0ea0fc00, - tmpreg2, tmpreg, tmpreg2, vec_shift); - orc_neon64_emit_binary (p, "fmul", 0x2e20dc00, - tmpreg, tmpreg, tmpreg2, - vec_shift); - } - - orc_neon64_emit_unary (p, "frecpe", 0x0ea1d800, - p->vars[insn->dest_args[0]], tmpreg, - vec_shift); - - for(i = 0; i < NUM_ITERS_DIVF; i++) { - orc_neon64_emit_binary (p, "frecps", 0x0e20fc00, - tmpreg2, /* correction factor */ - p->vars[insn->dest_args[0]], /* the last estimate */ - tmpreg, /* the original number */ - vec_shift); - orc_neon64_emit_binary (p, "fmul", 0x2e20dc00, - p->vars[insn->dest_args[0]], /* revised estimate */ - p->vars[insn->dest_args[0]], /* last estimate */ - tmpreg2, /* correction factor */ - vec_shift); - } - } else { - if (p->insn_shift <= vec_shift) { - int i; - orc_neon_emit_unary (p, "vrsqrte.f32", 0xf3bb0580, - p->tmpreg, - p->vars[insn->src_args[0]].alloc); - for(i = 0; i < NUM_ITERS_SQRTF; i++) { - orc_neon_emit_binary (p, "vmul.f32", 0xf3000d10, - p->tmpreg2, - p->tmpreg, - p->vars[insn->src_args[0]].alloc); - orc_neon_emit_binary (p, "vrsqrts.f32", 0xf2200f10, - p->tmpreg2, - p->tmpreg, - p->tmpreg2); - orc_neon_emit_binary (p, "vmul.f32", 0xf3000d10, - p->tmpreg, - p->tmpreg, - p->tmpreg2); - } - - orc_neon_emit_unary(p, "vrecpe.f32", 0xf3bb0500, - p->vars[insn->dest_args[0]].alloc, - p->tmpreg); - - for(i=0; i < NUM_ITERS_DIVF; i++) { - orc_neon_emit_binary (p, "vrecps.f32", 0xf2000f10, - p->tmpreg2, /* correction factor */ - p->vars[insn->dest_args[0]].alloc, /* the last estimate */ - p->tmpreg); /* the original number */ - orc_neon_emit_binary (p, 
"vmul.f32", 0xf3000d10, - p->vars[insn->dest_args[0]].alloc, /* revised estimate */ - p->vars[insn->dest_args[0]].alloc, /* last estimate */ - p->tmpreg2); /* correction factor */ - } - - } else if (p->insn_shift == vec_shift + 1) { - int i; - orc_neon_emit_unary_quad (p, "vrsqrte.f32", 0xf3bb0580, - p->tmpreg, - p->vars[insn->src_args[0]].alloc); - for(i = 0; i < NUM_ITERS_SQRTF; i++) { - orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10, - p->tmpreg2, - p->tmpreg, - p->vars[insn->src_args[0]].alloc); - orc_neon_emit_binary_quad (p, "vrsqrts.f32", 0xf2200f10, - p->tmpreg2, - p->tmpreg, - p->tmpreg2); - orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10, - p->tmpreg, - p->tmpreg, - p->tmpreg2); - } - - orc_neon_emit_unary_quad(p, "vrecpe.f32", 0xf3bb0500, - p->vars[insn->dest_args[0]].alloc, - p->tmpreg); - - for(i=0; i < NUM_ITERS_DIVF; i++) { - orc_neon_emit_binary_quad (p, "vrecps.f32", 0xf2000f10, - p->tmpreg2, /* correction factor */ - p->vars[insn->dest_args[0]].alloc, /* the last estimate */ - p->tmpreg); /* the original number */ - orc_neon_emit_binary_quad (p, "vmul.f32", 0xf3000d10, - p->vars[insn->dest_args[0]].alloc, /* revised estimate */ - p->vars[insn->dest_args[0]].alloc, /* last estimate */ - p->tmpreg2); /* correction factor */ - } - - } else { - ORC_COMPILER_ERROR(p, "shift too large"); - } - } -} -#endif - static void orc_neon_rule_accw (OrcCompiler *p, void *user, OrcInstruction *insn) { |