mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 22:08:26 +02:00
agx: lower exact frcp
the hardware is off by 1 ULP. fixes nir_lower_idiv brokenness. Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30633>
This commit is contained in:
parent
6405153a07
commit
f690592692
1 changed files with 52 additions and 0 deletions
|
|
@ -20,6 +20,7 @@
|
|||
#include "agx_nir.h"
|
||||
#include "glsl_types.h"
|
||||
#include "nir.h"
|
||||
#include "nir_builtin_builder.h"
|
||||
#include "nir_intrinsics.h"
|
||||
#include "nir_intrinsics_indices.h"
|
||||
#include "shader_enums.h"
|
||||
|
|
@ -3360,6 +3361,55 @@ agx_link_libagx(nir_shader *nir, const nir_shader *libagx)
|
|||
glsl_get_cl_type_size_align);
|
||||
}
|
||||
|
||||
/*
|
||||
* The hardware frcp instruction is sometimes off by 1 ULP. For correctly
|
||||
* rounded frcp, a refinement step is required. This routine has been
|
||||
* exhaustively tested with a modified math_bruteforce.
|
||||
*
|
||||
* While Khronos APIs allow 2.5 ULP error for divides, nir_lower_idiv relies on
|
||||
* correctly rounded frcp. This is therefore load bearing for integer division
|
||||
* on all APIs.
|
||||
*/
|
||||
static nir_def *
|
||||
libagx_frcp(nir_builder *b, nir_def *x)
|
||||
{
|
||||
nir_def *u = nir_frcp(b, x);
|
||||
|
||||
/* Do 1 Newton-Raphson refinement step.
|
||||
*
|
||||
* Define f(u) = xu - 1. Then f(u) = 0 iff u = 1/x. Newton's method gives:
|
||||
*
|
||||
* u_2 = u - f(u) / f'(u) = u - (xu - 1) / x
|
||||
*
|
||||
* Our original guess is close, so we approximate (1 / x) by u:
|
||||
*
|
||||
* u_2 = u - u(xu - 1) = u + u(1 - xu)
|
||||
* = fma(fma(-x, u, 1), u, u)
|
||||
*/
|
||||
nir_def *one = nir_imm_float(b, 1.0);
|
||||
nir_def *u_2 = nir_ffma(b, nir_ffma(b, nir_fneg(b, x), u, one), u, u);
|
||||
|
||||
/* If the original value was infinite, frcp will generate the correct zero.
|
||||
* However, the Newton-Raphson step would multiply 0 * Inf and get a NaN. So
|
||||
* skip the refinement step for infinite inputs. We do this backwards,
|
||||
* checking whether the refined result is NaN, since we can implement this
|
||||
* check in a single fcmpsel instruction. The other case where the refinement
|
||||
* is NaN is a NaN input, in which skipping refinement is acceptable.
|
||||
*/
|
||||
return nir_bcsel(b, nir_fisnan(b, u_2), u, u_2);
|
||||
}
|
||||
|
||||
static bool
|
||||
agx_nir_lower_fdiv(nir_builder *b, nir_alu_instr *alu, void *_)
|
||||
{
|
||||
if (alu->op != nir_op_frcp || !alu->exact || alu->def.bit_size != 32)
|
||||
return false;
|
||||
|
||||
b->cursor = nir_before_instr(&alu->instr);
|
||||
nir_def_replace(&alu->def, libagx_frcp(b, nir_ssa_for_alu_src(b, alu, 0)));
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Preprocess NIR independent of shader state */
|
||||
void
|
||||
agx_preprocess_nir(nir_shader *nir, const nir_shader *libagx)
|
||||
|
|
@ -3404,6 +3454,8 @@ agx_preprocess_nir(nir_shader *nir, const nir_shader *libagx)
|
|||
nir_metadata_control_flow, NULL);
|
||||
NIR_PASS(_, nir, agx_nir_lower_subgroups);
|
||||
NIR_PASS(_, nir, nir_lower_phis_to_scalar, true);
|
||||
NIR_PASS(_, nir, nir_shader_alu_pass, agx_nir_lower_fdiv,
|
||||
nir_metadata_control_flow, NULL);
|
||||
|
||||
/* After lowering, run through the standard suite of NIR optimizations. We
|
||||
* will run through the loop later, once we have the shader key, but if we
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue