glsl, spirv: Improve accuracy of asin() and acos()

The polynomial used for asin_expr() was suboptimal (and its source was not documented). A better approximation is found in the _Handbook_of_Mathematical_Functions_ by Abramowitz and Stegun, which is used in Nvidia's Cg toolkit. However, while this approximation gives a good absolute error bound, its relative error exceeds the 4096 ulp allowed by the Vulkan spec. Taking a page from the spirv implementation of asin(), we implement a piecewise approximation where a Taylor series is used for small values of |x|. This patch also harmonizes the GLSL and Vulkan implementations by moving the implementation to common code (nir_builder). Running tests on asin() with a grid of 64000 samples between 0.0 and +1.0, the original asin() at 32 bits has: ``` glsl spirv RMSE: 1.756451e-04 1.609091e-04 worst abs error: 3.904104e-04 at 0.937001 3.904104e-04 at 0.937001 worst ulp error: 11800 at 6.2499e-05 3826 at 0.841331 ``` whereas the new implementation has for both: ``` RMSE: 2.528056e-05 worst abs error: 4.962087e-05 at 0.451149 worst ulp error: 2379 at 0.215106 ``` Reviewed-by: Marek Olšák <marek.olsak@amd.com> Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Acked-by: Mel Henning <mhenning@darkrefraction.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40862>
2026-05-07 02:48:06 +02:00 · 2026-04-03 14:38:44 -03:00 · 2026-04-03 14:38:44 -03:00 · 4ae192a3d9
commit 4ae192a3d9
parent fa784fffd0
13 changed files with 121 additions and 131 deletions
--- a/src/amd/ci/restricted-traces-amd.yml
+++ b/src/amd/ci/restricted-traces-amd.yml
@ -13,7 +13,7 @@ traces:
      checksum: c2264076c2f0ca6b48c3bab152a65a36
  angle/libangle_restricted_traces_candy_crush_soda_saga.so:
    vk-radv-raven:
-      checksum: 3370be7bbc95e6d2ec5b1b3ed36c56a0
+      checksum: a3e8f7c78583fb0e6c5d13f576dad5d2
  angle/libangle_restricted_traces_free_fire.so:
    vk-radv-raven:
      checksum: 62307dd7a10d656733bca7e0f065ac99
@ -28,7 +28,7 @@ traces:
      checksum: 47c004fed88bed8d3d387295399f0810
  angle/libangle_restricted_traces_pubg_mobile_battle_royale.so:
    vk-radv-raven:
-      checksum: 95092db8df12af91de8de5463e4e4bb6
+      checksum: f028e6aeb48690e5a36744be55a56773
  angle/libangle_restricted_traces_temple_run_300.so:
    vk-radv-raven:
      checksum: 4fa6a73dad7d9dd747ba7cbf82aae42e
--- a/src/compiler/glsl/builtin_functions.cpp
+++ b/src/compiler/glsl/builtin_functions.cpp
@ -1209,8 +1209,6 @@ private:
   ir_swizzle *matrix_elt(ir_variable *var, int col, int row);
   ir_dereference_record *record_ref(ir_variable *var, const char *field);

-   ir_expression *asin_expr(ir_variable *x, float p0, float p1);
-
   /**
    * Call function \param f with parameters specified as the linked
    * list \param params of \c ir_variable objects.  \param ret should
@ -6507,6 +6505,8 @@ builtin_builder::_degrees(builtin_available_predicate avail,

 UNOPA(sin, ir_unop_sin)
 UNOPA(cos, ir_unop_cos)
+UNOPA(asin, ir_unop_asin)
+UNOPA(acos, ir_unop_acos)

 ir_function_signature *
 builtin_builder::_tan(builtin_available_predicate avail,
@ -6518,20 +6518,6 @@ builtin_builder::_tan(builtin_available_predicate avail,
   return sig;
 }

-ir_expression *
-builtin_builder::asin_expr(ir_variable *x, float p0, float p1)
-{
-   return mul(sign(x),
-              sub(IMM_FP(x->type, M_PI_2f),
-                  mul(sqrt(sub(IMM_FP(x->type, 1.0f), abs(x))),
-                      add(IMM_FP(x->type, M_PI_2f),
-                          mul(abs(x),
-                              add(IMM_FP(x->type, (M_PI_4f - 1.0f)),
-                                  mul(abs(x),
-                                      add(IMM_FP(x->type, p0),
-                                          mul(abs(x), IMM_FP(x->type, p1))))))))));
-}
-
 /**
 * Generate a ir_call to a function with a set of parameters
 *
@ -6568,30 +6554,6 @@ builtin_builder::call(ir_function *f, ir_variable *ret, ir_exec_list params)
   return new(linalloc) ir_call(sig, deref, &actual_params);
 }

-ir_function_signature *
-builtin_builder::_asin(builtin_available_predicate avail,
-                       const glsl_type *type)
-{
-   ir_variable *x = in_var(type, "x");
-   MAKE_SIG(type, avail, 1, x);
-
-   body.emit(ret(asin_expr(x, 0.086566724f, -0.03102955f)));
-
-   return sig;
-}
-
-ir_function_signature *
-builtin_builder::_acos(builtin_available_predicate avail,
-                       const glsl_type *type)
-{
-   ir_variable *x = in_var(type, "x");
-   MAKE_SIG(type, avail, 1, x);
-
-   body.emit(ret(sub(IMM_FP(type, M_PI_2f), asin_expr(x, 0.08132463f, -0.02363318f))));
-
-   return sig;
-}
-
 ir_function_signature *
 builtin_builder::_sinh(builtin_available_predicate avail,
                       const glsl_type *type)
--- a/src/compiler/glsl/glsl_to_nir.cpp
+++ b/src/compiler/glsl/glsl_to_nir.cpp
@ -2358,6 +2358,12 @@ nir_visitor::visit(ir_expression *ir)
      return;
   }

+   case ir_unop_asin:
+      result = nir_asin(&b, srcs[0]);
+      break;
+   case ir_unop_acos:
+      result = nir_acos(&b, srcs[0]);
+      break;
   case ir_unop_atan:
      result = nir_atan(&b, srcs[0]);
      break;
--- a/src/compiler/glsl/ir.cpp
+++ b/src/compiler/glsl/ir.cpp
@ -236,6 +236,8 @@ ir_expression::ir_expression(int op, ir_rvalue *op0)
   case ir_unop_round_even:
   case ir_unop_sin:
   case ir_unop_cos:
+   case ir_unop_asin:
+   case ir_unop_acos:
   case ir_unop_dFdx:
   case ir_unop_dFdx_coarse:
   case ir_unop_dFdx_fine:
--- a/src/compiler/glsl/ir_expression_operation.py
+++ b/src/compiler/glsl/ir_expression_operation.py
@ -568,6 +568,8 @@ ir_expression_operation = [
   operation("sin", 1, source_types=(float_type,), c_expression="sinf({src0})"),
   operation("cos", 1, source_types=(float_type,), c_expression="cosf({src0})"),
   operation("atan", 1, source_types=(float_type,), c_expression="atan({src0})"),
+   operation("acos", 1, source_types=(float_type,), c_expression="acosf({src0})"),
+   operation("asin", 1, source_types=(float_type,), c_expression="asinf({src0})"),

   # Partial derivatives.
   operation("dFdx", 1, source_types=(float_type,), c_expression="0.0f"),
--- a/src/compiler/glsl/ir_validate.cpp
+++ b/src/compiler/glsl/ir_validate.cpp
@ -515,6 +515,8 @@ ir_validate::visit_leave(ir_expression *ir)
      break;
   case ir_unop_sin:
   case ir_unop_cos:
+   case ir_unop_asin:
+   case ir_unop_acos:
   case ir_unop_dFdx:
   case ir_unop_dFdx_coarse:
   case ir_unop_dFdx_fine:
--- a/src/compiler/nir/nir_builtin_builder.c
+++ b/src/compiler/nir/nir_builtin_builder.c
@ -28,6 +28,13 @@
 #include "nir_builder.h"
 #include "nir_builtin_builder.h"

+#ifndef M_PIf
+#define M_PIf   ((float) M_PI)
+#endif
+#ifndef M_PI_2f
+#define M_PI_2f ((float) M_PI_2)
+#endif
+
 nir_def *
 nir_cross3(nir_builder *b, nir_def *x, nir_def *y)
 {
@ -159,6 +166,81 @@ nir_upsample(nir_builder *b, nir_def *hi, nir_def *lo)
   return nir_vec(b, res, lo->num_components);
 }

+/**
+ * Approximate asin(x) by formula 4.45 from Abramowitz & Stegun, "Handbook
+ * of Mathematical Functions":
+ *
+ * asin~(x) = (π/2 - sqrt(1 - |x|) * ( a0 + a1 * |x| + a2 * |x|^2 + a3 * |x|^3 )
+ *
+ * where a0 = 1.5707288 a1 = -0.2121144 a2 = 0.0742610 a3 = -0.0187203
+ *
+ * This has a very small absolute error, but the relative error can become
+ * large when |x| is small. For small |x| the Taylor series makes a good
+ * approximation, so when the relative error matters (i.e. for asin rather
+ * than acos) we do a piecewise approximation with the Taylor series for
+ * |x| < 0.21502245 and formula 4.45 elsewhere. The crossover point is
+ * the value in [0.1, 0.7071] where the two approximations are equal.
+ */
+static nir_def *
+build_asin(nir_builder *b, nir_def *x, bool piecewise)
+{
+   if (x->bit_size == 16) {
+      /* The polynomial approximation may not be precise enough to meet half-float
+       * precision requirements. Alternatively, we could implement this using
+       * the formula:
+       *
+       * asin(x) = atan2(x, sqrt(1 - x*x))
+       *
+       * But that is very expensive, so instead we just do the polynomial
+       * approximation in 32-bit math and then we convert the result back to
+       * 16-bit.
+       */
+      return nir_f2f16(b, build_asin(b, nir_f2f32(b, x), piecewise));
+   }
+   nir_def *abs_x = nir_fabs(b, x);
+
+   nir_def *p0_plus_xp1 = nir_ffma_imm12(b, abs_x, -0.0187293, 0.0742610);
+
+   nir_def *expr_tail =
+      nir_ffma_imm2(b, abs_x,
+                       nir_ffma_imm2(b, abs_x, p0_plus_xp1, -0.2121144),
+                       1.5707288);
+
+   nir_def *result0 = nir_fmul(b, nir_fsign(b, x),
+                      nir_a_minus_bc(b, nir_imm_floatN_t(b, M_PI_2f, x->bit_size),
+                                        nir_fsqrt(b,
+                                                  nir_fsub_imm(b, 1.0, abs_x)),
+                                        expr_tail));
+   if (piecewise) {
+      /* use taylor approximation for |x| < 0.21502245 */
+
+      nir_def *x2 = nir_fmul(b, x, x);
+      nir_def *result1 = nir_fmul(b,
+                                  x,
+                                  nir_ffma_imm12(b, x2, (1.0/6.0), 1.0));
+      return nir_bcsel(b,
+                       nir_flt_imm(b, abs_x, 0.21502245),
+                       result1,
+                       result0);
+   } else {
+      return result0;
+   }
+}
+
+nir_def *
+nir_asin(nir_builder *b, nir_def *x)
+{
+   /* use piecewise approximation to keep low relative error near 0 */
+   return build_asin(b, x, true);
+}
+
+nir_def *
+nir_acos(nir_builder *b, nir_def *x)
+{
+   /* piecewise approximation not needed to keep low relative error */
+   return nir_fsub_imm(b, M_PI_2f, build_asin(b, x, false));
+}
+
 nir_def *
 nir_atan(nir_builder *b, nir_def *y_over_x)
 {
--- a/src/compiler/nir/nir_builtin_builder.h
+++ b/src/compiler/nir/nir_builtin_builder.h
@ -44,9 +44,12 @@ nir_def *nir_normalize(nir_builder *b, nir_def *vec);
 nir_def *nir_smoothstep(nir_builder *b, nir_def *edge0,
                        nir_def *edge1, nir_def *x);
 nir_def *nir_upsample(nir_builder *b, nir_def *hi, nir_def *lo);
+nir_def *nir_acos(nir_builder *b, nir_def *x);
+nir_def *nir_asin(nir_builder *b, nir_def *x);
 nir_def *nir_atan(nir_builder *b, nir_def *y_over_x);
 nir_def *nir_atan2(nir_builder *b, nir_def *y, nir_def *x);

+
 nir_def *
 nir_build_texture_query(nir_builder *b, nir_tex_instr *tex, nir_texop texop,
                        unsigned components, nir_alu_type dest_type,
--- a/src/compiler/spirv/vtn_glsl450.c
+++ b/src/compiler/spirv/vtn_glsl450.c
@ -120,73 +120,6 @@ matrix_inverse(struct vtn_builder *b, struct vtn_ssa_value *src)
   return val;
 }

-/**
- * Approximate asin(x) by the piecewise formula:
- * for |x| < 0.5, asin~(x) = x * (1 + x²(pS0 + x²(pS1 + x²*pS2)) / (1 + x²*qS1))
- * for |x| ≥ 0.5, asin~(x) = sign(x) * (π/2 - sqrt(1 - |x|) * (π/2 + |x|(π/4 - 1 + |x|(p0 + |x|p1))))
- *
- * The latter is correct to first order at x=0 and x=±1 regardless of the p
- * coefficients but can be made second-order correct at both ends by selecting
- * the fit coefficients appropriately.  Different p coefficients can be used
- * in the asin and acos implementation to minimize some relative error metric
- * in each case.
- */
-static nir_def *
-build_asin(nir_builder *b, nir_def *x, float p0, float p1, bool piecewise)
-{
-   if (x->bit_size == 16) {
-      /* The polynomial approximation isn't precise enough to meet half-float
-       * precision requirements. Alternatively, we could implement this using
-       * the formula:
-       *
-       * asin(x) = atan2(x, sqrt(1 - x*x))
-       *
-       * But that is very expensive, so instead we just do the polynomial
-       * approximation in 32-bit math and then we convert the result back to
-       * 16-bit.
-       */
-      nir_def *result =
-         nir_f2f16(b, build_asin(b, nir_f2f32(b, x), p0, p1, piecewise));
-
-      return result;
-   }
-   nir_def *one = nir_imm_floatN_t(b, 1.0f, x->bit_size);
-   nir_def *half = nir_imm_floatN_t(b, 0.5f, x->bit_size);
-   nir_def *abs_x = nir_fabs(b, x);
-
-   nir_def *p0_plus_xp1 = nir_ffma_imm12(b, abs_x, p1, p0);
-
-   nir_def *expr_tail =
-      nir_ffma_imm2(b, abs_x,
-                       nir_ffma_imm2(b, abs_x, p0_plus_xp1, M_PI_4f - 1.0f),
-                       M_PI_2f);
-
-   nir_def *result0 = nir_fmul(b, nir_fsign(b, x),
-                      nir_a_minus_bc(b, nir_imm_floatN_t(b, M_PI_2f, x->bit_size),
-                                        nir_fsqrt(b, nir_fsub(b, one, abs_x)),
-                                        expr_tail));
-   if (piecewise) {
-      /* approximation for |x| < 0.5 */
-      const float pS0 =  1.6666586697e-01f;
-      const float pS1 = -4.2743422091e-02f;
-      const float pS2 = -8.6563630030e-03f;
-      const float qS1 = -7.0662963390e-01f;
-
-      nir_def *x2 = nir_fmul(b, x, x);
-      nir_def *p = nir_fmul(b,
-                                x2,
-                                nir_ffma_imm2(b, x2,
-                                                 nir_ffma_imm12(b, x2, pS2, pS1),
-                                                 pS0));
-
-      nir_def *q = nir_ffma_imm1(b, x2, qS1, one);
-      nir_def *result1 = nir_ffma(b, x, nir_fdiv(b, p, q), x);
-      return nir_bcsel(b, nir_flt(b, abs_x, half), result1, result0);
-   } else {
-      return result0;
-   }
-}
-
 static nir_op
 vtn_nir_alu_op_for_spirv_glsl_opcode(struct vtn_builder *b,
                                     enum GLSLstd450 opcode,
@ -567,13 +500,11 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint,
   }

   case GLSLstd450Asin:
-      dest->def = build_asin(nb, src[0], 0.086566724, -0.03102955, true);
+      dest->def = nir_asin(nb, src[0]);
      break;

   case GLSLstd450Acos:
-      dest->def =
-         nir_fsub_imm(nb, M_PI_2f,
-                          build_asin(nb, src[0], 0.08132463, -0.02363318, false));
+      dest->def = nir_acos(nb, src[0]);
      break;

   case GLSLstd450Atan:
--- a/src/gallium/drivers/virgl/ci/traces-virgl.yml
+++ b/src/gallium/drivers/virgl/ci/traces-virgl.yml
@ -109,7 +109,7 @@ traces:
      label: [crash]
  godot/Material Testers.x86_64_2020.04.08_13.38_frame799.rdc:
    gl-virgl:
-      checksum: 7ba4f0c52a719d94b805653d31ce22d7
+      checksum: 465410a06d966b377bc8c41e52b21b0f
  ror/ror-default.trace:
    gl-virgl:
      label: [crash]
--- a/src/gallium/drivers/zink/ci/traces-zink.yml
+++ b/src/gallium/drivers/zink/ci/traces-zink.yml
@ -44,17 +44,17 @@ traces:
      checksum: 58e2d6ea105ca2735b05ce06709691fb
      text: Clear color for the background can be black or grey
    zink-radv-gfx1201:
-      checksum: c65527b7db55687e3c2741c0286b70e2
+      checksum: 9c55251c2979a107ba852719d7e41889

  blender/blender-demo-ellie_pose.trace:
    gl-zink-anv-adl:
-      checksum: ca8df31308386674816c0e801f220565
+      checksum: ddae9bcbff102e46739454eaa09772c0
    gl-zink-anv-tgl:
-      checksum: ca8df31308386674816c0e801f220565
+      checksum: ddae9bcbff102e46739454eaa09772c0
    zink-radv-vangogh:
-      checksum: 4670acc17f40e2d20ecf355a64b5612e
+      checksum: c6a14ab73815748450f4feaf50fa9f2e
    zink-radv-gfx1201:
-      checksum: 4670acc17f40e2d20ecf355a64b5612e
+      checksum: c6a14ab73815748450f4feaf50fa9f2e

  glxgears/glxgears-2-v2.trace:
    gl-zink-anv-adl:
@ -97,9 +97,9 @@ traces:
      checksum: dbe1de4e2e812413f173ea6c423117ff
      text: "'egl_platform.cpp( 227) - Error   - Couldn't find a suitable EGL config' -- revisit when we can turn on X11?"
    zink-radv-vangogh:
-      checksum: 0615d3d7598d0088393bfc8fa24f73c8
+      checksum: 65e86206d1ba3ed8365f8b95dc9e9e71
    zink-radv-gfx1201:
-      checksum: 07f0e057eeea449544e16c7ff65e965f
+      checksum: f22ee8433dd7b689c46816d576bf3c12

  gputest/pixmark-julia-fp32-v2.trace:
    gl-zink-anv-adl:
--- a/src/intel/ci/restricted-traces-intel.yml
+++ b/src/intel/ci/restricted-traces-intel.yml
@ -13,7 +13,7 @@ traces:
      checksum: e038ef82470fff7b94cf958fa869d130
  angle/libangle_restricted_traces_candy_crush_soda_saga.so:
    vk-anv-adl:
-      checksum: dcfc2dbc1e383d10b86a9c3a52452aa8
+      checksum: dc5d96a70af6165d88f882b96fb73e73
  angle/libangle_restricted_traces_free_fire.so:
    vk-anv-adl:
      checksum: 1a9f03ed30ee254842afe7ab7792a493
@ -28,7 +28,7 @@ traces:
      checksum: ae8371870059d87d475da217a453acfc
  angle/libangle_restricted_traces_pubg_mobile_battle_royale.so:
    vk-anv-adl:
-      checksum: b1970da0e8d29498078e53d2b6bbff71
+      checksum: 692350e4315d58f2644f5901b954fa15
  angle/libangle_restricted_traces_temple_run_300.so:
    vk-anv-adl:
      checksum: 72610fdadbfb7d99aa5de13d78598ff2
--- a/src/intel/ci/traces-iris.yml
+++ b/src/intel/ci/traces-iris.yml
@ -557,7 +557,7 @@ traces:
      checksum: 85b4f3279fde206dace85ffd727057ea
    gl-intel-kbl:
      label: [no-perf]
-      checksum: efab5903f88f7f34e7c948cfcfb54cd9
+      checksum: 46555aa91e3f9b3f12f93a28b4ff069b
    gl-intel-whl:
      label: [no-perf]
      checksum: 6a0e17d98eca33e80810b6158e998aa1
@ -569,19 +569,19 @@ traces:
      checksum: 85b4f3279fde206dace85ffd727057ea
  blender/blender-demo-ellie_pose.trace:
    gl-intel-apl:
-      checksum: 7d0da76cdf72ff785a1ecc6d4e404e0c
+      checksum: 61d144ac9cd8cbba11b19bb1174855bc
    gl-intel-glx:
-      checksum: 7d0da76cdf72ff785a1ecc6d4e404e0c
+      checksum: 61d144ac9cd8cbba11b19bb1174855bc
    gl-intel-amly:
-      checksum: 7d0da76cdf72ff785a1ecc6d4e404e0c
+      checksum: 61d144ac9cd8cbba11b19bb1174855bc
    gl-intel-kbl:
-      checksum: 7d0da76cdf72ff785a1ecc6d4e404e0c
+      checksum: 61d144ac9cd8cbba11b19bb1174855bc
    gl-intel-whl:
-      checksum: 7d0da76cdf72ff785a1ecc6d4e404e0c
+      checksum: 61d144ac9cd8cbba11b19bb1174855bc
    gl-intel-cml:
-      checksum: 7d0da76cdf72ff785a1ecc6d4e404e0c
+      checksum: 61d144ac9cd8cbba11b19bb1174855bc
    gl-intel-adl:
-      checksum: 7d0da76cdf72ff785a1ecc6d4e404e0c
+      checksum: 61d144ac9cd8cbba11b19bb1174855bc
  unvanquished/unvanquished-lowest.trace:
    gl-intel-apl:
      checksum: 884096b6dd40e2f500d0837501287fac