mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-21 02:50:09 +01:00
nir/lower_idiv: add new llvm-based path
v2: make variable names snake_case; minor cleanups in emit_udiv(); fix Panfrost build failure. v3: use an enum instead of a boolean flag in nir_lower_idiv()'s signature. v4: remove nir_op_urcp. v5: drop nv50 path; rebase. v6: add back nv50 path; add comment for nir_lower_idiv_path enum. v7: rename _nv50/_llvm to _fast/_precise. v8: fix etnaviv build failure. Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
This commit is contained in:
parent
f729ecefef
commit
8b98d0954e
8 changed files with 136 additions and 17 deletions
|
|
@ -1328,7 +1328,7 @@ setup_isel_context(Program* program,
|
||||||
nir_lower_iabs64));
|
nir_lower_iabs64));
|
||||||
|
|
||||||
nir_opt_idiv_const(nir, 32);
|
nir_opt_idiv_const(nir, 32);
|
||||||
nir_lower_idiv(nir); // TODO: use the LLVM path once !1239 is merged
|
nir_lower_idiv(nir, nir_lower_idiv_fast); // TODO: use the LLVM path once !1239 is merged
|
||||||
|
|
||||||
/* optimize the lowered ALU operations */
|
/* optimize the lowered ALU operations */
|
||||||
nir_copy_prop(nir);
|
nir_copy_prop(nir);
|
||||||
|
|
|
||||||
|
|
@ -932,7 +932,7 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
|
||||||
NIR_PASS_V(c->s, v3d_nir_lower_io, c);
|
NIR_PASS_V(c->s, v3d_nir_lower_io, c);
|
||||||
NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c);
|
NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c);
|
||||||
NIR_PASS_V(c->s, v3d_nir_lower_image_load_store);
|
NIR_PASS_V(c->s, v3d_nir_lower_image_load_store);
|
||||||
NIR_PASS_V(c->s, nir_lower_idiv);
|
NIR_PASS_V(c->s, nir_lower_idiv, nir_lower_idiv_fast);
|
||||||
|
|
||||||
v3d_optimize_nir(c->s);
|
v3d_optimize_nir(c->s);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3917,7 +3917,19 @@ enum nir_lower_non_uniform_access_type {
|
||||||
bool nir_lower_non_uniform_access(nir_shader *shader,
|
bool nir_lower_non_uniform_access(nir_shader *shader,
|
||||||
enum nir_lower_non_uniform_access_type);
|
enum nir_lower_non_uniform_access_type);
|
||||||
|
|
||||||
bool nir_lower_idiv(nir_shader *shader);
|
/* Selects which expansion nir_lower_idiv() uses to lower integer division
 * opcodes that the hardware lacks native instructions for.
 */
enum nir_lower_idiv_path {
   /* This path is based on NV50LegalizeSSA::handleDIV(). It is the faster of
    * the two but it is not exact in some cases (for example, 1091317713u /
    * 1034u gives 5209173 instead of 1055432).
    */
   nir_lower_idiv_fast,
   /* This path is based on AMDGPUTargetLowering::LowerUDIVREM() and
    * AMDGPUTargetLowering::LowerSDIVREM(). It requires more instructions than
    * the nv50 path and many of them are integer multiplications, so it is
    * probably slower. It should always return the correct result, though.
    */
   nir_lower_idiv_precise,
};
|
||||||
|
|
||||||
|
bool nir_lower_idiv(nir_shader *shader, enum nir_lower_idiv_path path);
|
||||||
|
|
||||||
bool nir_lower_input_attachments(nir_shader *shader, bool use_fragcoord_sysval);
|
bool nir_lower_input_attachments(nir_shader *shader, bool use_fragcoord_sysval);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -27,13 +27,17 @@
|
||||||
#include "nir.h"
|
#include "nir.h"
|
||||||
#include "nir_builder.h"
|
#include "nir_builder.h"
|
||||||
|
|
||||||
/* Lowers idiv/udiv/umod
|
/* Has two paths
|
||||||
* Based on NV50LegalizeSSA::handleDIV()
|
* One (nir_lower_idiv_fast) lowers idiv/udiv/umod and is based on
|
||||||
|
* NV50LegalizeSSA::handleDIV()
|
||||||
*
|
*
|
||||||
* Note that this is probably not enough precision for compute shaders.
|
* Note that this path probably does not have not enough precision for
|
||||||
* Perhaps we want a second higher precision (looping) version of this?
|
* compute shaders. Perhaps we want a second higher precision (looping)
|
||||||
* Or perhaps we assume if you can do compute shaders you can also
|
* version of this? Or perhaps we assume if you can do compute shaders you
|
||||||
* branch out to a pre-optimized shader library routine..
|
* can also branch out to a pre-optimized shader library routine..
|
||||||
|
*
|
||||||
|
* The other path (nir_lower_idiv_precise) is based off of code used by LLVM's
|
||||||
|
* AMDGPU target. It should handle 32-bit idiv/irem/imod/udiv/umod exactly.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
|
|
@ -130,8 +134,109 @@ convert_instr(nir_builder *bld, nir_alu_instr *alu)
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ported from LLVM's AMDGPUTargetLowering::LowerUDIVREM */
|
||||||
|
static nir_ssa_def *
|
||||||
|
emit_udiv(nir_builder *bld, nir_ssa_def *numer, nir_ssa_def *denom, bool modulo)
|
||||||
|
{
|
||||||
|
nir_ssa_def *rcp = nir_frcp(bld, nir_u2f32(bld, denom));
|
||||||
|
rcp = nir_f2u32(bld, nir_fmul_imm(bld, rcp, 4294967296.0));
|
||||||
|
nir_ssa_def *rcp_lo = nir_imul(bld, rcp, denom);
|
||||||
|
nir_ssa_def *rcp_hi = nir_umul_high(bld, rcp, denom);
|
||||||
|
nir_ssa_def *rcp_hi_ne_zero = nir_ine(bld, rcp_hi, nir_imm_int(bld, 0));
|
||||||
|
nir_ssa_def *neg_rcp_lo = nir_ineg(bld, rcp_lo);
|
||||||
|
nir_ssa_def *abs_rcp_lo = nir_bcsel(bld, rcp_hi_ne_zero, rcp_lo, neg_rcp_lo);
|
||||||
|
nir_ssa_def *e = nir_umul_high(bld, abs_rcp_lo, rcp);
|
||||||
|
nir_ssa_def *rcp_plus_e = nir_iadd(bld, rcp, e);
|
||||||
|
nir_ssa_def *rcp_minus_e = nir_isub(bld, rcp, e);
|
||||||
|
nir_ssa_def *tmp0 = nir_bcsel(bld, rcp_hi_ne_zero, rcp_minus_e, rcp_plus_e);
|
||||||
|
nir_ssa_def *quotient = nir_umul_high(bld, tmp0, numer);
|
||||||
|
nir_ssa_def *num_s_remainder = nir_imul(bld, quotient, denom);
|
||||||
|
nir_ssa_def *remainder = nir_isub(bld, numer, num_s_remainder);
|
||||||
|
nir_ssa_def *remainder_ge_den = nir_uge(bld, remainder, denom);
|
||||||
|
nir_ssa_def *remainder_ge_zero = nir_uge(bld, numer, num_s_remainder);
|
||||||
|
nir_ssa_def *tmp1 = nir_iand(bld, remainder_ge_den, remainder_ge_zero);
|
||||||
|
|
||||||
|
if (modulo) {
|
||||||
|
nir_ssa_def *rem = nir_bcsel(bld, tmp1,
|
||||||
|
nir_isub(bld, remainder, denom), remainder);
|
||||||
|
return nir_bcsel(bld, remainder_ge_zero,
|
||||||
|
rem, nir_iadd(bld, remainder, denom));
|
||||||
|
} else {
|
||||||
|
nir_ssa_def *one = nir_imm_int(bld, 1);
|
||||||
|
nir_ssa_def *div = nir_bcsel(bld, tmp1,
|
||||||
|
nir_iadd(bld, quotient, one), quotient);
|
||||||
|
return nir_bcsel(bld, remainder_ge_zero,
|
||||||
|
div, nir_isub(bld, quotient, one));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ported from LLVM's AMDGPUTargetLowering::LowerSDIVREM */
|
||||||
|
static nir_ssa_def *
|
||||||
|
emit_idiv(nir_builder *bld, nir_ssa_def *numer, nir_ssa_def *denom, nir_op op)
|
||||||
|
{
|
||||||
|
nir_ssa_def *lh_sign = nir_ilt(bld, numer, nir_imm_int(bld, 0));
|
||||||
|
nir_ssa_def *rh_sign = nir_ilt(bld, denom, nir_imm_int(bld, 0));
|
||||||
|
lh_sign = nir_bcsel(bld, lh_sign, nir_imm_int(bld, -1), nir_imm_int(bld, 0));
|
||||||
|
rh_sign = nir_bcsel(bld, rh_sign, nir_imm_int(bld, -1), nir_imm_int(bld, 0));
|
||||||
|
|
||||||
|
nir_ssa_def *lhs = nir_iadd(bld, numer, lh_sign);
|
||||||
|
nir_ssa_def *rhs = nir_iadd(bld, denom, rh_sign);
|
||||||
|
lhs = nir_ixor(bld, lhs, lh_sign);
|
||||||
|
rhs = nir_ixor(bld, rhs, rh_sign);
|
||||||
|
|
||||||
|
if (op == nir_op_idiv) {
|
||||||
|
nir_ssa_def *d_sign = nir_ixor(bld, lh_sign, rh_sign);
|
||||||
|
nir_ssa_def *res = emit_udiv(bld, lhs, rhs, false);
|
||||||
|
res = nir_ixor(bld, res, d_sign);
|
||||||
|
return nir_isub(bld, res, d_sign);
|
||||||
|
} else {
|
||||||
|
nir_ssa_def *res = emit_udiv(bld, lhs, rhs, true);
|
||||||
|
res = nir_ixor(bld, res, lh_sign);
|
||||||
|
res = nir_isub(bld, res, lh_sign);
|
||||||
|
if (op == nir_op_imod) {
|
||||||
|
nir_ssa_def *cond = nir_ieq(bld, res, nir_imm_int(bld, 0));
|
||||||
|
cond = nir_ior(bld, nir_ieq(bld, lh_sign, rh_sign), cond);
|
||||||
|
res = nir_bcsel(bld, cond, res, nir_iadd(bld, res, denom));
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
convert_impl(nir_function_impl *impl)
|
convert_instr_precise(nir_builder *bld, nir_alu_instr *alu)
|
||||||
|
{
|
||||||
|
nir_op op = alu->op;
|
||||||
|
|
||||||
|
if ((op != nir_op_idiv) &&
|
||||||
|
(op != nir_op_imod) &&
|
||||||
|
(op != nir_op_irem) &&
|
||||||
|
(op != nir_op_udiv) &&
|
||||||
|
(op != nir_op_umod))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (alu->dest.dest.ssa.bit_size != 32)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
bld->cursor = nir_before_instr(&alu->instr);
|
||||||
|
|
||||||
|
nir_ssa_def *numer = nir_ssa_for_alu_src(bld, alu, 0);
|
||||||
|
nir_ssa_def *denom = nir_ssa_for_alu_src(bld, alu, 1);
|
||||||
|
|
||||||
|
nir_ssa_def *res = NULL;
|
||||||
|
|
||||||
|
if (op == nir_op_udiv || op == nir_op_umod)
|
||||||
|
res = emit_udiv(bld, numer, denom, op == nir_op_umod);
|
||||||
|
else
|
||||||
|
res = emit_idiv(bld, numer, denom, op);
|
||||||
|
|
||||||
|
assert(alu->dest.dest.is_ssa);
|
||||||
|
nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(res));
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
convert_impl(nir_function_impl *impl, enum nir_lower_idiv_path path)
|
||||||
{
|
{
|
||||||
nir_builder b;
|
nir_builder b;
|
||||||
nir_builder_init(&b, impl);
|
nir_builder_init(&b, impl);
|
||||||
|
|
@ -139,7 +244,9 @@ convert_impl(nir_function_impl *impl)
|
||||||
|
|
||||||
nir_foreach_block(block, impl) {
|
nir_foreach_block(block, impl) {
|
||||||
nir_foreach_instr_safe(instr, block) {
|
nir_foreach_instr_safe(instr, block) {
|
||||||
if (instr->type == nir_instr_type_alu)
|
if (instr->type == nir_instr_type_alu && path == nir_lower_idiv_precise)
|
||||||
|
progress |= convert_instr_precise(&b, nir_instr_as_alu(instr));
|
||||||
|
else if (instr->type == nir_instr_type_alu)
|
||||||
progress |= convert_instr(&b, nir_instr_as_alu(instr));
|
progress |= convert_instr(&b, nir_instr_as_alu(instr));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -151,13 +258,13 @@ convert_impl(nir_function_impl *impl)
|
||||||
}
|
}
|
||||||
|
|
||||||
bool
|
bool
|
||||||
nir_lower_idiv(nir_shader *shader)
|
nir_lower_idiv(nir_shader *shader, enum nir_lower_idiv_path path)
|
||||||
{
|
{
|
||||||
bool progress = false;
|
bool progress = false;
|
||||||
|
|
||||||
nir_foreach_function(function, shader) {
|
nir_foreach_function(function, shader) {
|
||||||
if (function->impl)
|
if (function->impl)
|
||||||
progress |= convert_impl(function->impl);
|
progress |= convert_impl(function->impl, path);
|
||||||
}
|
}
|
||||||
|
|
||||||
return progress;
|
return progress;
|
||||||
|
|
|
||||||
|
|
@ -277,7 +277,7 @@ ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
|
||||||
* NOTE that UBO analysis pass should only be done once, before variants
|
* NOTE that UBO analysis pass should only be done once, before variants
|
||||||
*/
|
*/
|
||||||
const bool ubo_progress = !key && OPT(s, ir3_nir_analyze_ubo_ranges, shader);
|
const bool ubo_progress = !key && OPT(s, ir3_nir_analyze_ubo_ranges, shader);
|
||||||
const bool idiv_progress = OPT(s, nir_lower_idiv);
|
const bool idiv_progress = OPT(s, nir_lower_idiv, nir_lower_idiv_fast);
|
||||||
if (ubo_progress || idiv_progress)
|
if (ubo_progress || idiv_progress)
|
||||||
ir3_optimize_loop(s);
|
ir3_optimize_loop(s);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -762,7 +762,7 @@ etna_compile_shader_nir(struct etna_shader_variant *v)
|
||||||
OPT_V(s, nir_opt_algebraic);
|
OPT_V(s, nir_opt_algebraic);
|
||||||
OPT_V(s, nir_lower_bool_to_float);
|
OPT_V(s, nir_lower_bool_to_float);
|
||||||
} else {
|
} else {
|
||||||
OPT_V(s, nir_lower_idiv);
|
OPT_V(s, nir_lower_idiv, nir_lower_idiv_fast);
|
||||||
OPT_V(s, nir_lower_bool_to_int32);
|
OPT_V(s, nir_lower_bool_to_int32);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2321,7 +2321,7 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
|
||||||
|
|
||||||
NIR_PASS_V(c->s, vc4_nir_lower_io, c);
|
NIR_PASS_V(c->s, vc4_nir_lower_io, c);
|
||||||
NIR_PASS_V(c->s, vc4_nir_lower_txf_ms, c);
|
NIR_PASS_V(c->s, vc4_nir_lower_txf_ms, c);
|
||||||
NIR_PASS_V(c->s, nir_lower_idiv);
|
NIR_PASS_V(c->s, nir_lower_idiv, nir_lower_idiv_fast);
|
||||||
|
|
||||||
vc4_optimize_nir(c->s);
|
vc4_optimize_nir(c->s);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -482,7 +482,7 @@ optimise_nir(nir_shader *nir)
|
||||||
|
|
||||||
NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
|
NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
|
||||||
NIR_PASS(progress, nir, midgard_nir_lower_fdot2);
|
NIR_PASS(progress, nir, midgard_nir_lower_fdot2);
|
||||||
NIR_PASS(progress, nir, nir_lower_idiv);
|
NIR_PASS(progress, nir, nir_lower_idiv, nir_lower_idiv_fast);
|
||||||
|
|
||||||
nir_lower_tex_options lower_tex_options = {
|
nir_lower_tex_options lower_tex_options = {
|
||||||
.lower_txs_lod = true,
|
.lower_txs_lod = true,
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue