mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-20 07:20:10 +01:00
nir: Optimize uniform iadd, fadd, and ixor reduction operations
This adds optimizations for iadd, fadd, and ixor with reduce, inclusive scan, and exclusive scan.

NOTE: The fadd and ixor optimizations had no shader-db or fossil-db changes on any Intel platform.

NOTE 2: This change "fixes" arb_compute_variable_group_size-local-size and base-local-size.shader_test on DG2 and MTL. This is just changing the code path taken to not use whatever path was not working properly before.

This is a subset of the things optimized by ACO. See also https://gitlab.freedesktop.org/mesa/mesa/-/issues/3731#note_682802.

The min, max, iand, and ior exclusive_scan optimizations are not implemented.

Broadwell on shader-db is not happy. I have not investigated.

v2: Silence some warnings about discarding const.

v3: Rename mbcnt to count_active_invocations. Add a big comment explaining the differences between the two paths. Suggested by Rhys.

shader-db:

All Gfx9 and newer platforms had similar results. (Ice Lake shown)
total instructions in shared programs: 20300384 -> 20299545 (<.01%)
instructions in affected programs: 19167 -> 18328 (-4.38%)
helped: 35 / HURT: 0

total cycles in shared programs: 842809750 -> 842766381 (<.01%)
cycles in affected programs: 2160249 -> 2116880 (-2.01%)
helped: 33 / HURT: 2

total spills in shared programs: 4632 -> 4626 (-0.13%)
spills in affected programs: 206 -> 200 (-2.91%)
helped: 3 / HURT: 0

total fills in shared programs: 5594 -> 5581 (-0.23%)
fills in affected programs: 664 -> 651 (-1.96%)
helped: 3 / HURT: 1

fossil-db results:

All Intel platforms had similar results. (Ice Lake shown)
Totals:
Instrs: 165551893 -> 165513303 (-0.02%)
Cycles: 15132539132 -> 15125314947 (-0.05%); split: -0.05%, +0.00%
Spill count: 45258 -> 45204 (-0.12%)
Fill count: 74286 -> 74157 (-0.17%)
Scratch Memory Size: 2467840 -> 2451456 (-0.66%)

Totals from 712 (0.11% of 656120) affected shaders:
Instrs: 598931 -> 560341 (-6.44%)
Cycles: 184650167 -> 177425982 (-3.91%); split: -3.95%, +0.04%
Spill count: 983 -> 929 (-5.49%)
Fill count: 2274 -> 2145 (-5.67%)
Scratch Memory Size: 52224 -> 35840 (-31.37%)

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27044>
This commit is contained in:
parent
c63ea755fe
commit
535caaf3e0
3 changed files with 98 additions and 16 deletions
|
|
@ -6390,7 +6390,8 @@ bool nir_lower_undef_to_zero(nir_shader *shader);
|
|||
|
||||
bool nir_opt_uniform_atomics(nir_shader *shader);
|
||||
|
||||
bool nir_opt_uniform_subgroup(nir_shader *shader);
|
||||
bool nir_opt_uniform_subgroup(nir_shader *shader,
|
||||
const nir_lower_subgroups_options *);
|
||||
|
||||
bool nir_opt_vectorize(nir_shader *shader, nir_vectorize_cb filter,
|
||||
void *data);
|
||||
|
|
|
|||
|
|
@ -32,6 +32,7 @@ opt_uniform_subgroup_filter(const nir_instr *instr, const void *_state)
|
|||
return !nir_src_is_divergent(intrin->src[0]);
|
||||
|
||||
case nir_intrinsic_reduce:
|
||||
case nir_intrinsic_exclusive_scan:
|
||||
case nir_intrinsic_inclusive_scan: {
|
||||
if (nir_src_is_divergent(intrin->src[0]))
|
||||
return false;
|
||||
|
|
@ -39,6 +40,11 @@ opt_uniform_subgroup_filter(const nir_instr *instr, const void *_state)
|
|||
const nir_op reduction_op = (nir_op) nir_intrinsic_reduction_op(intrin);
|
||||
|
||||
switch (reduction_op) {
|
||||
case nir_op_iadd:
|
||||
case nir_op_fadd:
|
||||
case nir_op_ixor:
|
||||
return true;
|
||||
|
||||
case nir_op_imin:
|
||||
case nir_op_umin:
|
||||
case nir_op_fmin:
|
||||
|
|
@ -47,9 +53,8 @@ opt_uniform_subgroup_filter(const nir_instr *instr, const void *_state)
|
|||
case nir_op_fmax:
|
||||
case nir_op_iand:
|
||||
case nir_op_ior:
|
||||
return true;
|
||||
return intrin->intrinsic != nir_intrinsic_exclusive_scan;
|
||||
|
||||
/* FINISHME: iadd, ixor, and fadd are also possible. */
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
|
@ -60,21 +65,94 @@ opt_uniform_subgroup_filter(const nir_instr *instr, const void *_state)
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Emit NIR that counts the bits set in "value" for the invocations ordered
 * before the current one, or up to and including the current one when
 * "inclusive" is true.
 *
 * b             - builder used to emit the new instructions.
 * value         - ballot bit mask; the comment in the body assumes it is the
 *                 result of ballot(true).
 * inclusive     - true to also count the current invocation.
 * has_mbcnt_amd - true to emit mbcnt_amd, false to emit the
 *                 subgroup-mask + bit_count sequence.
 *
 * Returns the SSA value holding the count.
 *
 * NOTE(review): the bit_count path loads the LE/LT mask as one 32-bit
 * component, so "value" presumably has to be a single 32-bit ballot on that
 * path -- confirm before calling with other ballot sizes.
 */
static nir_def *
|
||||
count_active_invocations(nir_builder *b, nir_def *value, bool inclusive,
|
||||
bool has_mbcnt_amd)
|
||||
{
|
||||
/* For the non-inclusive case, the two paths are functionally the same.
|
||||
* For the inclusive case, they are similar but very subtly different.
|
||||
*
|
||||
* The bit_count path will mask "value" with the subgroup LE mask instead
|
||||
* of the subgroup LT mask. This is the definition of the inclusive count.
|
||||
*
|
||||
* AMD's mbcnt instruction always uses the subgroup LT mask. To perform the
|
||||
* inclusive count using mbcnt, two assumptions are made. First, trivially,
|
||||
* the current invocation is active. Second, the bit for the current
|
||||
* invocation in "value" is set. Since "value" is assumed to be the result
|
||||
* of ballot(true), the second condition will also be met.
|
||||
*
|
||||
* When those conditions are met, the inclusive count is the exclusive
|
||||
* count plus one.
|
||||
*/
|
||||
if (has_mbcnt_amd) {
|
||||
/* The second operand is 0 (exclusive) or 1 (inclusive); it supplies the
 * "plus one" described above.
 */
return nir_mbcnt_amd(b, value, nir_imm_int(b, (int) inclusive));
|
||||
} else {
|
||||
/* Keep only the bits at or below (LE) or strictly below (LT) the current
 * invocation, then count what is left.
 */
nir_def *mask = inclusive
|
||||
? nir_load_subgroup_le_mask(b, 1, 32)
|
||||
: nir_load_subgroup_lt_mask(b, 1, 32);
|
||||
|
||||
return nir_bit_count(b, nir_iand(b, value, mask));
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Lowering callback: build the replacement value for a subgroup intrinsic
 * that was selected for this optimization.
 *
 * For reduce, inclusive_scan and exclusive_scan with an iadd, fadd or ixor
 * reduction op, the result is computed from the source value and a count of
 * active invocations. For every other instruction the first source is
 * returned unchanged.
 *
 * b      - builder used to emit the replacement instructions.
 * instr  - the instruction being replaced; it is converted with
 *          nir_instr_as_intrinsic without any check here.
 * _state - pointer to the nir_lower_subgroups_options that supply the ballot
 *          component count and bit size.
 *
 * Returns the SSA value that replaces the intrinsic's result.
 */
static nir_def *
|
||||
opt_uniform_subgroup_instr(nir_builder *b, nir_instr *instr, void *_state)
|
||||
{
|
||||
const nir_lower_subgroups_options *options = (nir_lower_subgroups_options *) _state;
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
|
||||
if (intrin->intrinsic == nir_intrinsic_reduce ||
|
||||
intrin->intrinsic == nir_intrinsic_inclusive_scan ||
|
||||
intrin->intrinsic == nir_intrinsic_exclusive_scan) {
|
||||
const nir_op reduction_op = (nir_op) nir_intrinsic_reduction_op(intrin);
|
||||
|
||||
if (reduction_op == nir_op_iadd ||
|
||||
reduction_op == nir_op_fadd ||
|
||||
reduction_op == nir_op_ixor) {
|
||||
nir_def *count;
|
||||
|
||||
/* ballot(true): the mask that count_active_invocations expects. */
nir_def *ballot = nir_ballot(b, options->ballot_components,
|
||||
options->ballot_bit_size, nir_imm_true(b));
|
||||
|
||||
if (intrin->intrinsic == nir_intrinsic_reduce) {
|
||||
/* A reduction covers every bit set in the ballot. */
count = nir_bit_count(b, ballot);
|
||||
} else {
|
||||
/* Scans only cover invocations up to the current one. has_mbcnt_amd
 * is passed as false, so the mask + bit_count path is always used
 * here.
 */
count = count_active_invocations(b, ballot,
|
||||
intrin->intrinsic == nir_intrinsic_inclusive_scan,
|
||||
false);
|
||||
}
|
||||
|
||||
const unsigned bit_size = intrin->src[0].ssa->bit_size;
|
||||
|
||||
if (reduction_op == nir_op_iadd) {
|
||||
/* Adding the same value "count" times is count * value. */
return nir_imul(b,
|
||||
nir_u2uN(b, count, bit_size),
|
||||
intrin->src[0].ssa);
|
||||
} else if (reduction_op == nir_op_fadd) {
|
||||
/* Same idea in floating point: float(count) * value.
 *
 * NOTE(review): with count == 0 and an infinite or NaN source the
 * product is NaN rather than zero -- confirm this is acceptable.
 */
return nir_fmul(b,
|
||||
nir_u2fN(b, count, bit_size),
|
||||
intrin->src[0].ssa);
|
||||
} else {
|
||||
/* XOR of a value with itself an even number of times is zero, and an
 * odd number of times is the value, so only the low bit of the count
 * matters.
 */
return nir_imul(b,
|
||||
nir_u2uN(b,
|
||||
nir_iand(b, count, nir_imm_int(b, 1)),
|
||||
bit_size),
|
||||
intrin->src[0].ssa);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Everything else is replaced by its own first source. */
return intrin->src[0].ssa;
|
||||
}
|
||||
|
||||
bool
|
||||
nir_opt_uniform_subgroup(nir_shader *shader)
|
||||
nir_opt_uniform_subgroup(nir_shader *shader,
|
||||
const nir_lower_subgroups_options *options)
|
||||
{
|
||||
bool progress = nir_shader_lower_instructions(shader,
|
||||
opt_uniform_subgroup_filter,
|
||||
opt_uniform_subgroup_instr,
|
||||
NULL);
|
||||
(void *) options);
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1699,12 +1699,14 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
|
|||
NIR_PASS(_, nir, nir_convert_to_lcssa, true, true);
|
||||
NIR_PASS_V(nir, nir_divergence_analysis);
|
||||
|
||||
static const nir_lower_subgroups_options subgroups_options = {
|
||||
.ballot_bit_size = 32,
|
||||
.ballot_components = 1,
|
||||
.lower_elect = true,
|
||||
.lower_subgroup_masks = true,
|
||||
};
|
||||
|
||||
if (OPT(nir_opt_uniform_atomics)) {
|
||||
const nir_lower_subgroups_options subgroups_options = {
|
||||
.ballot_bit_size = 32,
|
||||
.ballot_components = 1,
|
||||
.lower_elect = true,
|
||||
};
|
||||
OPT(nir_lower_subgroups, &subgroups_options);
|
||||
|
||||
if (OPT(nir_lower_int64))
|
||||
|
|
@ -1716,12 +1718,13 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
|
|||
/* nir_opt_uniform_subgroup can create some operations (e.g.,
|
||||
* load_subgroup_lt_mask) that need to be lowered again.
|
||||
*/
|
||||
if (OPT(nir_opt_uniform_subgroup)) {
|
||||
const nir_lower_subgroups_options subgroups_options = {
|
||||
.ballot_bit_size = 32,
|
||||
.ballot_components = 1,
|
||||
.lower_subgroup_masks = true,
|
||||
};
|
||||
if (OPT(nir_opt_uniform_subgroup, &subgroups_options)) {
|
||||
/* Some of the optimizations can generate 64-bit integer multiplication
|
||||
* that must be lowered.
|
||||
*/
|
||||
if (OPT(nir_lower_int64))
|
||||
brw_nir_optimize(nir, is_scalar, devinfo);
|
||||
|
||||
OPT(nir_lower_subgroups, &subgroups_options);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue