diff --git a/src/asahi/compiler/agx_compile.c b/src/asahi/compiler/agx_compile.c
index db249d27058..c428e718b1c 100644
--- a/src/asahi/compiler/agx_compile.c
+++ b/src/asahi/compiler/agx_compile.c
@@ -20,6 +20,7 @@
 #include "agx_nir.h"
 #include "glsl_types.h"
 #include "nir.h"
+#include "nir_builtin_builder.h"
 #include "nir_intrinsics.h"
 #include "nir_intrinsics_indices.h"
 #include "shader_enums.h"
@@ -3360,6 +3361,55 @@ agx_link_libagx(nir_shader *nir, const nir_shader *libagx)
             glsl_get_cl_type_size_align);
 }
 
+/*
+ * The hardware frcp instruction is sometimes off by 1 ULP. For correctly
+ * rounded frcp, a refinement step is required. This routine has been
+ * exhaustively tested with a modified math_bruteforce.
+ *
+ * While Khronos APIs allow 2.5 ULP error for divides, nir_lower_idiv relies on
+ * correctly rounded frcp. This is therefore load bearing for integer division
+ * on all APIs.
+ */
+static nir_def *
+libagx_frcp(nir_builder *b, nir_def *x)
+{
+   nir_def *u = nir_frcp(b, x);
+
+   /* Do 1 Newton-Raphson refinement step.
+    *
+    * Define f(u) = xu - 1. Then f(u) = 0 iff u = 1/x. Newton's method gives:
+    *
+    *    u_2 = u - f(u) / f'(u) = u - (xu - 1) / x
+    *
+    * Our original guess is close, so we approximate (1 / x) by u:
+    *
+    *    u_2 = u - u(xu - 1) = u + u(1 - xu)
+    *        = fma(fma(-x, u, 1), u, u)
+    */
+   nir_def *one = nir_imm_float(b, 1.0);
+   nir_def *u_2 = nir_ffma(b, nir_ffma(b, nir_fneg(b, x), u, one), u, u);
+
+   /* If the original value was infinite, frcp will generate the correct zero.
+    * However, the Newton-Raphson step would multiply 0 * Inf and get a NaN. So
+    * skip the refinement step for infinite inputs. We do this backwards,
+    * checking whether the refined result is NaN, since we can implement this
+    * check in a single fcmpsel instruction. The other case where the refinement
+    * is NaN is a NaN input, in which case skipping refinement is acceptable.
+    */
+   return nir_bcsel(b, nir_fisnan(b, u_2), u, u_2);
+}
+
+static bool
+agx_nir_lower_fdiv(nir_builder *b, nir_alu_instr *alu, void *_)
+{
+   if (alu->op != nir_op_frcp || !alu->exact || alu->def.bit_size != 32)
+      return false;
+
+   b->cursor = nir_before_instr(&alu->instr);
+   nir_def_replace(&alu->def, libagx_frcp(b, nir_ssa_for_alu_src(b, alu, 0)));
+   return true;
+}
+
 /* Preprocess NIR independent of shader state */
 void
 agx_preprocess_nir(nir_shader *nir, const nir_shader *libagx)
@@ -3404,6 +3454,8 @@ agx_preprocess_nir(nir_shader *nir, const nir_shader *libagx)
             nir_metadata_control_flow, NULL);
    NIR_PASS(_, nir, agx_nir_lower_subgroups);
    NIR_PASS(_, nir, nir_lower_phis_to_scalar, true);
+   NIR_PASS(_, nir, nir_shader_alu_pass, agx_nir_lower_fdiv,
+            nir_metadata_control_flow, NULL);
 
    /* After lowering, run through the standard suite of NIR optimizations. We
    * will run through the loop later, once we have the shader key, but if we
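
For reference, the Newton-Raphson algebra in the comment above can be checked outside the compiler with a small scalar model like the one below. This is illustrative only and not part of the patch; approx_rcp() is a hypothetical stand-in for the hardware frcp estimate, simulated here by perturbing the exact reciprocal by 1 ULP.

/* Standalone scalar model of the refinement (assumptions noted above). */
#include <math.h>
#include <stdio.h>

static float
approx_rcp(float x)
{
   /* Simulate a hardware estimate that is off by 1 ULP (toward zero). */
   return nextafterf(1.0f / x, 0.0f);
}

static float
refined_rcp(float x)
{
   float u = approx_rcp(x);

   /* One Newton-Raphson step: u_2 = fma(fma(-x, u, 1), u, u). */
   float u_2 = fmaf(fmaf(-x, u, 1.0f), u, u);

   /* For an infinite input, u is the correct zero but the step computes
    * 0 * Inf = NaN, so keep the unrefined estimate when refinement is NaN.
    */
   return isnan(u_2) ? u : u_2;
}

int
main(void)
{
   printf("%a\n", approx_rcp(3.0f));      /* 1 ULP low: 0x1.555554p-2 */
   printf("%a\n", refined_rcp(3.0f));     /* correctly rounded: 0x1.555556p-2 */
   printf("%a\n", refined_rcp(INFINITY)); /* 0x0p+0 */
   return 0;
}

With fmaf available, the refined result lands on the correctly rounded reciprocal while infinite inputs still map to zero through the NaN-guarded select, mirroring the fcmpsel-based fallback the pass emits.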