glsl, spirv: Improve accuracy of asin() and acos()
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

The polynomial used for asin_expr() was suboptimal (and its source was
not documented).

A better approximation is found in the _Handbook_of_Mathematical_Functions_
by Abramowitz and Stegun, which is used in Nvidia's Cg toolkit. However,
while this approximation gives a good absolute error bound, its relative
error exceeds the 4096 ulp allowed by the Vulkan spec. Taking a page
from the spirv implementation of asin(), we implement a piecewise
approximation where a Taylor series is used for small values of |x|.
This patch also harmonizes the GLSL and Vulkan implementations by moving
the implementation to common code (nir_builder).

Running tests on asin() with a grid of 64000 samples between 0.0 and +1.0,
the original asin() at 32 bits has:
```
                       glsl                       spirv
  RMSE:            1.756451e-04                 1.609091e-04
  worst abs error: 3.904104e-04 at 0.937001     3.904104e-04 at 0.937001
  worst ulp error: 11800 at 6.2499e-05          3826 at 0.841331
```
whereas the new implementation has for both:
```
  RMSE:            2.528056e-05
  worst abs error: 4.962087e-05 at 0.451149
  worst ulp error: 2379 at 0.215106
```

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Acked-by: Mel Henning <mhenning@darkrefraction.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40862>
This commit is contained in:
Eric R. Smith 2026-04-03 14:38:44 -03:00 committed by Marge Bot
parent fa784fffd0
commit 4ae192a3d9
13 changed files with 121 additions and 131 deletions

View file

@ -13,7 +13,7 @@ traces:
checksum: c2264076c2f0ca6b48c3bab152a65a36
angle/libangle_restricted_traces_candy_crush_soda_saga.so:
vk-radv-raven:
checksum: 3370be7bbc95e6d2ec5b1b3ed36c56a0
checksum: a3e8f7c78583fb0e6c5d13f576dad5d2
angle/libangle_restricted_traces_free_fire.so:
vk-radv-raven:
checksum: 62307dd7a10d656733bca7e0f065ac99
@ -28,7 +28,7 @@ traces:
checksum: 47c004fed88bed8d3d387295399f0810
angle/libangle_restricted_traces_pubg_mobile_battle_royale.so:
vk-radv-raven:
checksum: 95092db8df12af91de8de5463e4e4bb6
checksum: f028e6aeb48690e5a36744be55a56773
angle/libangle_restricted_traces_temple_run_300.so:
vk-radv-raven:
checksum: 4fa6a73dad7d9dd747ba7cbf82aae42e

View file

@ -1209,8 +1209,6 @@ private:
ir_swizzle *matrix_elt(ir_variable *var, int col, int row);
ir_dereference_record *record_ref(ir_variable *var, const char *field);
ir_expression *asin_expr(ir_variable *x, float p0, float p1);
/**
* Call function \param f with parameters specified as the linked
* list \param params of \c ir_variable objects. \param ret should
@ -6507,6 +6505,8 @@ builtin_builder::_degrees(builtin_available_predicate avail,
UNOPA(sin, ir_unop_sin)
UNOPA(cos, ir_unop_cos)
UNOPA(asin, ir_unop_asin)
UNOPA(acos, ir_unop_acos)
ir_function_signature *
builtin_builder::_tan(builtin_available_predicate avail,
@ -6518,20 +6518,6 @@ builtin_builder::_tan(builtin_available_predicate avail,
return sig;
}
ir_expression *
builtin_builder::asin_expr(ir_variable *x, float p0, float p1)
{
return mul(sign(x),
sub(IMM_FP(x->type, M_PI_2f),
mul(sqrt(sub(IMM_FP(x->type, 1.0f), abs(x))),
add(IMM_FP(x->type, M_PI_2f),
mul(abs(x),
add(IMM_FP(x->type, (M_PI_4f - 1.0f)),
mul(abs(x),
add(IMM_FP(x->type, p0),
mul(abs(x), IMM_FP(x->type, p1))))))))));
}
/**
* Generate a ir_call to a function with a set of parameters
*
@ -6568,30 +6554,6 @@ builtin_builder::call(ir_function *f, ir_variable *ret, ir_exec_list params)
return new(linalloc) ir_call(sig, deref, &actual_params);
}
ir_function_signature *
builtin_builder::_asin(builtin_available_predicate avail,
const glsl_type *type)
{
ir_variable *x = in_var(type, "x");
MAKE_SIG(type, avail, 1, x);
body.emit(ret(asin_expr(x, 0.086566724f, -0.03102955f)));
return sig;
}
ir_function_signature *
builtin_builder::_acos(builtin_available_predicate avail,
const glsl_type *type)
{
ir_variable *x = in_var(type, "x");
MAKE_SIG(type, avail, 1, x);
body.emit(ret(sub(IMM_FP(type, M_PI_2f), asin_expr(x, 0.08132463f, -0.02363318f))));
return sig;
}
ir_function_signature *
builtin_builder::_sinh(builtin_available_predicate avail,
const glsl_type *type)

View file

@ -2358,6 +2358,12 @@ nir_visitor::visit(ir_expression *ir)
return;
}
case ir_unop_asin:
result = nir_asin(&b, srcs[0]);
break;
case ir_unop_acos:
result = nir_acos(&b, srcs[0]);
break;
case ir_unop_atan:
result = nir_atan(&b, srcs[0]);
break;

View file

@ -236,6 +236,8 @@ ir_expression::ir_expression(int op, ir_rvalue *op0)
case ir_unop_round_even:
case ir_unop_sin:
case ir_unop_cos:
case ir_unop_asin:
case ir_unop_acos:
case ir_unop_dFdx:
case ir_unop_dFdx_coarse:
case ir_unop_dFdx_fine:

View file

@ -568,6 +568,8 @@ ir_expression_operation = [
operation("sin", 1, source_types=(float_type,), c_expression="sinf({src0})"),
operation("cos", 1, source_types=(float_type,), c_expression="cosf({src0})"),
operation("atan", 1, source_types=(float_type,), c_expression="atan({src0})"),
operation("acos", 1, source_types=(float_type,), c_expression="acosf({src0})"),
operation("asin", 1, source_types=(float_type,), c_expression="asinf({src0})"),
# Partial derivatives.
operation("dFdx", 1, source_types=(float_type,), c_expression="0.0f"),

View file

@ -515,6 +515,8 @@ ir_validate::visit_leave(ir_expression *ir)
break;
case ir_unop_sin:
case ir_unop_cos:
case ir_unop_asin:
case ir_unop_acos:
case ir_unop_dFdx:
case ir_unop_dFdx_coarse:
case ir_unop_dFdx_fine:

View file

@ -28,6 +28,13 @@
#include "nir_builder.h"
#include "nir_builtin_builder.h"
#ifndef M_PIf
#define M_PIf ((float) M_PI)
#endif
#ifndef M_PI_2f
#define M_PI_2f ((float) M_PI_2)
#endif
nir_def *
nir_cross3(nir_builder *b, nir_def *x, nir_def *y)
{
@ -159,6 +166,81 @@ nir_upsample(nir_builder *b, nir_def *hi, nir_def *lo)
return nir_vec(b, res, lo->num_components);
}
/**
* Approximate asin(x) by formula 4.45 from Abramowitz & Stegun, "Handbook
* of Mathematical Functions":
*
* asin~(x) = (π/2 - sqrt(1 - |x|) * ( a0 + a1 * |x| + a2 * |x|^2 + a3 * |x|^3 )
*
* where a0 = 1.5707288 a1 = -0.2121144 a2 = 0.0742610 a3 = -0.0187203
*
* This has a very small absolute error, but the relative error can become
* large when |x| is small. For small |x| the Taylor series makes a good
* approximation, so when the relative error matters (i.e. for asin rather
* than acos) we do a piecewise approximation with the Taylor series for
* |x| < 0.21502245 and formula 4.45 elsewhere. The crossover point is
* the value in [0.1, 0.7071] where the two approximations are equal.
*/
static nir_def *
build_asin(nir_builder *b, nir_def *x, bool piecewise)
{
if (x->bit_size == 16) {
/* The polynomial approximation may not be precise enough to meet half-float
* precision requirements. Alternatively, we could implement this using
* the formula:
*
* asin(x) = atan2(x, sqrt(1 - x*x))
*
* But that is very expensive, so instead we just do the polynomial
* approximation in 32-bit math and then we convert the result back to
* 16-bit.
*/
return nir_f2f16(b, build_asin(b, nir_f2f32(b, x), piecewise));
}
nir_def *abs_x = nir_fabs(b, x);
nir_def *p0_plus_xp1 = nir_ffma_imm12(b, abs_x, -0.0187293, 0.0742610);
nir_def *expr_tail =
nir_ffma_imm2(b, abs_x,
nir_ffma_imm2(b, abs_x, p0_plus_xp1, -0.2121144),
1.5707288);
nir_def *result0 = nir_fmul(b, nir_fsign(b, x),
nir_a_minus_bc(b, nir_imm_floatN_t(b, M_PI_2f, x->bit_size),
nir_fsqrt(b,
nir_fsub_imm(b, 1.0, abs_x)),
expr_tail));
if (piecewise) {
/* use taylor approximation for |x| < 0.21502245 */
nir_def *x2 = nir_fmul(b, x, x);
nir_def *result1 = nir_fmul(b,
x,
nir_ffma_imm12(b, x2, (1.0/6.0), 1.0));
return nir_bcsel(b,
nir_flt_imm(b, abs_x, 0.21502245),
result1,
result0);
} else {
return result0;
}
}
nir_def *
nir_asin(nir_builder *b, nir_def *x)
{
/* use piecewise approximation to keep low relative error near 0 */
return build_asin(b, x, true);
}
nir_def *
nir_acos(nir_builder *b, nir_def *x)
{
/* piecewise approximation not needed to keep low relative error */
return nir_fsub_imm(b, M_PI_2f, build_asin(b, x, false));
}
nir_def *
nir_atan(nir_builder *b, nir_def *y_over_x)
{

View file

@ -44,9 +44,12 @@ nir_def *nir_normalize(nir_builder *b, nir_def *vec);
nir_def *nir_smoothstep(nir_builder *b, nir_def *edge0,
nir_def *edge1, nir_def *x);
nir_def *nir_upsample(nir_builder *b, nir_def *hi, nir_def *lo);
nir_def *nir_acos(nir_builder *b, nir_def *x);
nir_def *nir_asin(nir_builder *b, nir_def *x);
nir_def *nir_atan(nir_builder *b, nir_def *y_over_x);
nir_def *nir_atan2(nir_builder *b, nir_def *y, nir_def *x);
nir_def *
nir_build_texture_query(nir_builder *b, nir_tex_instr *tex, nir_texop texop,
unsigned components, nir_alu_type dest_type,

View file

@ -120,73 +120,6 @@ matrix_inverse(struct vtn_builder *b, struct vtn_ssa_value *src)
return val;
}
/**
* Approximate asin(x) by the piecewise formula:
* for |x| < 0.5, asin~(x) = x * (1 + x²(pS0 + x²(pS1 + x²*pS2)) / (1 + x²*qS1))
* for |x| 0.5, asin~(x) = sign(x) * (π/2 - sqrt(1 - |x|) * (π/2 + |x|(π/4 - 1 + |x|(p0 + |x|p1))))
*
* The latter is correct to first order at x=0 and x=±1 regardless of the p
* coefficients but can be made second-order correct at both ends by selecting
* the fit coefficients appropriately. Different p coefficients can be used
* in the asin and acos implementation to minimize some relative error metric
* in each case.
*/
static nir_def *
build_asin(nir_builder *b, nir_def *x, float p0, float p1, bool piecewise)
{
if (x->bit_size == 16) {
/* The polynomial approximation isn't precise enough to meet half-float
* precision requirements. Alternatively, we could implement this using
* the formula:
*
* asin(x) = atan2(x, sqrt(1 - x*x))
*
* But that is very expensive, so instead we just do the polynomial
* approximation in 32-bit math and then we convert the result back to
* 16-bit.
*/
nir_def *result =
nir_f2f16(b, build_asin(b, nir_f2f32(b, x), p0, p1, piecewise));
return result;
}
nir_def *one = nir_imm_floatN_t(b, 1.0f, x->bit_size);
nir_def *half = nir_imm_floatN_t(b, 0.5f, x->bit_size);
nir_def *abs_x = nir_fabs(b, x);
nir_def *p0_plus_xp1 = nir_ffma_imm12(b, abs_x, p1, p0);
nir_def *expr_tail =
nir_ffma_imm2(b, abs_x,
nir_ffma_imm2(b, abs_x, p0_plus_xp1, M_PI_4f - 1.0f),
M_PI_2f);
nir_def *result0 = nir_fmul(b, nir_fsign(b, x),
nir_a_minus_bc(b, nir_imm_floatN_t(b, M_PI_2f, x->bit_size),
nir_fsqrt(b, nir_fsub(b, one, abs_x)),
expr_tail));
if (piecewise) {
/* approximation for |x| < 0.5 */
const float pS0 = 1.6666586697e-01f;
const float pS1 = -4.2743422091e-02f;
const float pS2 = -8.6563630030e-03f;
const float qS1 = -7.0662963390e-01f;
nir_def *x2 = nir_fmul(b, x, x);
nir_def *p = nir_fmul(b,
x2,
nir_ffma_imm2(b, x2,
nir_ffma_imm12(b, x2, pS2, pS1),
pS0));
nir_def *q = nir_ffma_imm1(b, x2, qS1, one);
nir_def *result1 = nir_ffma(b, x, nir_fdiv(b, p, q), x);
return nir_bcsel(b, nir_flt(b, abs_x, half), result1, result0);
} else {
return result0;
}
}
static nir_op
vtn_nir_alu_op_for_spirv_glsl_opcode(struct vtn_builder *b,
enum GLSLstd450 opcode,
@ -567,13 +500,11 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint,
}
case GLSLstd450Asin:
dest->def = build_asin(nb, src[0], 0.086566724, -0.03102955, true);
dest->def = nir_asin(nb, src[0]);
break;
case GLSLstd450Acos:
dest->def =
nir_fsub_imm(nb, M_PI_2f,
build_asin(nb, src[0], 0.08132463, -0.02363318, false));
dest->def = nir_acos(nb, src[0]);
break;
case GLSLstd450Atan:

View file

@ -109,7 +109,7 @@ traces:
label: [crash]
godot/Material Testers.x86_64_2020.04.08_13.38_frame799.rdc:
gl-virgl:
checksum: 7ba4f0c52a719d94b805653d31ce22d7
checksum: 465410a06d966b377bc8c41e52b21b0f
ror/ror-default.trace:
gl-virgl:
label: [crash]

View file

@ -44,17 +44,17 @@ traces:
checksum: 58e2d6ea105ca2735b05ce06709691fb
text: Clear color for the background can be black or grey
zink-radv-gfx1201:
checksum: c65527b7db55687e3c2741c0286b70e2
checksum: 9c55251c2979a107ba852719d7e41889
blender/blender-demo-ellie_pose.trace:
gl-zink-anv-adl:
checksum: ca8df31308386674816c0e801f220565
checksum: ddae9bcbff102e46739454eaa09772c0
gl-zink-anv-tgl:
checksum: ca8df31308386674816c0e801f220565
checksum: ddae9bcbff102e46739454eaa09772c0
zink-radv-vangogh:
checksum: 4670acc17f40e2d20ecf355a64b5612e
checksum: c6a14ab73815748450f4feaf50fa9f2e
zink-radv-gfx1201:
checksum: 4670acc17f40e2d20ecf355a64b5612e
checksum: c6a14ab73815748450f4feaf50fa9f2e
glxgears/glxgears-2-v2.trace:
gl-zink-anv-adl:
@ -97,9 +97,9 @@ traces:
checksum: dbe1de4e2e812413f173ea6c423117ff
text: "'egl_platform.cpp( 227) - Error - Couldn't find a suitable EGL config' -- revisit when we can turn on X11?"
zink-radv-vangogh:
checksum: 0615d3d7598d0088393bfc8fa24f73c8
checksum: 65e86206d1ba3ed8365f8b95dc9e9e71
zink-radv-gfx1201:
checksum: 07f0e057eeea449544e16c7ff65e965f
checksum: f22ee8433dd7b689c46816d576bf3c12
gputest/pixmark-julia-fp32-v2.trace:
gl-zink-anv-adl:

View file

@ -13,7 +13,7 @@ traces:
checksum: e038ef82470fff7b94cf958fa869d130
angle/libangle_restricted_traces_candy_crush_soda_saga.so:
vk-anv-adl:
checksum: dcfc2dbc1e383d10b86a9c3a52452aa8
checksum: dc5d96a70af6165d88f882b96fb73e73
angle/libangle_restricted_traces_free_fire.so:
vk-anv-adl:
checksum: 1a9f03ed30ee254842afe7ab7792a493
@ -28,7 +28,7 @@ traces:
checksum: ae8371870059d87d475da217a453acfc
angle/libangle_restricted_traces_pubg_mobile_battle_royale.so:
vk-anv-adl:
checksum: b1970da0e8d29498078e53d2b6bbff71
checksum: 692350e4315d58f2644f5901b954fa15
angle/libangle_restricted_traces_temple_run_300.so:
vk-anv-adl:
checksum: 72610fdadbfb7d99aa5de13d78598ff2

View file

@ -557,7 +557,7 @@ traces:
checksum: 85b4f3279fde206dace85ffd727057ea
gl-intel-kbl:
label: [no-perf]
checksum: efab5903f88f7f34e7c948cfcfb54cd9
checksum: 46555aa91e3f9b3f12f93a28b4ff069b
gl-intel-whl:
label: [no-perf]
checksum: 6a0e17d98eca33e80810b6158e998aa1
@ -569,19 +569,19 @@ traces:
checksum: 85b4f3279fde206dace85ffd727057ea
blender/blender-demo-ellie_pose.trace:
gl-intel-apl:
checksum: 7d0da76cdf72ff785a1ecc6d4e404e0c
checksum: 61d144ac9cd8cbba11b19bb1174855bc
gl-intel-glx:
checksum: 7d0da76cdf72ff785a1ecc6d4e404e0c
checksum: 61d144ac9cd8cbba11b19bb1174855bc
gl-intel-amly:
checksum: 7d0da76cdf72ff785a1ecc6d4e404e0c
checksum: 61d144ac9cd8cbba11b19bb1174855bc
gl-intel-kbl:
checksum: 7d0da76cdf72ff785a1ecc6d4e404e0c
checksum: 61d144ac9cd8cbba11b19bb1174855bc
gl-intel-whl:
checksum: 7d0da76cdf72ff785a1ecc6d4e404e0c
checksum: 61d144ac9cd8cbba11b19bb1174855bc
gl-intel-cml:
checksum: 7d0da76cdf72ff785a1ecc6d4e404e0c
checksum: 61d144ac9cd8cbba11b19bb1174855bc
gl-intel-adl:
checksum: 7d0da76cdf72ff785a1ecc6d4e404e0c
checksum: 61d144ac9cd8cbba11b19bb1174855bc
unvanquished/unvanquished-lowest.trace:
gl-intel-apl:
checksum: 884096b6dd40e2f500d0837501287fac