mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-20 03:00:11 +01:00
nir/opt_uniform_subgroup: optimize add/xor reduce of bcsel(div, con, con)
Foz-DB Navi48: Totals from 12 (0.01% of 97623) affected shaders: Instrs: 9207 -> 8973 (-2.54%) CodeSize: 54192 -> 52832 (-2.51%) VGPRs: 768 -> 480 (-37.50%) Latency: 39516 -> 38507 (-2.55%) InvThroughput: 10155 -> 9859 (-2.91%) PreSGPRs: 329 -> 332 (+0.91%) PreVGPRs: 268 -> 263 (-1.87%) VALU: 4393 -> 4257 (-3.10%) SALU: 1037 -> 1019 (-1.74%) VOPD: 602 -> 599 (-0.50%) Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38974>
This commit is contained in:
parent
0e5e1cb9b0
commit
71f0c0d6a6
1 changed files with 69 additions and 35 deletions
|
|
@ -94,6 +94,14 @@ parse_select_of_con_values(nir_builder *b, nir_def *def, struct select_info *inf
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static nir_def *
|
||||||
|
get_ballot(nir_builder *b, nir_def *cond,
|
||||||
|
const nir_lower_subgroups_options *options)
|
||||||
|
{
|
||||||
|
return nir_ballot(b, options->ballot_components,
|
||||||
|
options->ballot_bit_size, cond ? cond : nir_imm_true(b));
|
||||||
|
}
|
||||||
|
|
||||||
static nir_def *
|
static nir_def *
|
||||||
ballot_bit_count(nir_builder *b, nir_def *ballot)
|
ballot_bit_count(nir_builder *b, nir_def *ballot)
|
||||||
{
|
{
|
||||||
|
|
@ -103,25 +111,34 @@ ballot_bit_count(nir_builder *b, nir_def *ballot)
|
||||||
}
|
}
|
||||||
|
|
||||||
static nir_def *
|
static nir_def *
|
||||||
count_active_invocations(nir_builder *b, nir_def *value, bool inclusive,
|
count_active_invocations(nir_builder *b, nir_def *cond, bool inclusive,
|
||||||
const nir_lower_subgroups_options *options)
|
const nir_lower_subgroups_options *options)
|
||||||
{
|
{
|
||||||
|
nir_def *value = get_ballot(b, cond, options);
|
||||||
|
|
||||||
/* For the non-inclusive case, the two paths are functionally the same.
|
/* For the non-inclusive case, the two paths are functionally the same.
|
||||||
* For the inclusive case, they are similar but very subtly different.
|
* For the inclusive case, they are similar but very subtly different.
|
||||||
*
|
*
|
||||||
* The bit_count path will mask "value" with the subgroup LE mask instead
|
* The bit_count path will mask "value" with the subgroup LE mask instead
|
||||||
* of the subgroup LT mask. This is the definition of the inclusive count.
|
* of the subgroup LT mask. This is the definition of the inclusive count.
|
||||||
*
|
*
|
||||||
* AMD's mbcnt instruction always uses the subgroup LT mask. To perform the
|
* AMD's mbcnt instruction always uses the subgroup LT mask.
|
||||||
* inclusive count using mbcnt, two assumptions are made. First, trivially,
|
|
||||||
* the current invocation is active. Second, the bit for the current
|
|
||||||
* invocation in "value" is set. Since "value" is assumed to be the result
|
|
||||||
* of ballot(true), the second condition will also be met.
|
|
||||||
*
|
*
|
||||||
* When those conditions are met, the inclusive count is the exclusive
|
* When we know the condition is true, the bit for the current
|
||||||
* count plus one.
|
* invocation value[N] is 1. Therefore we can count value[0:N-1] and
|
||||||
|
* only need to add 1 for the inclusive count.
|
||||||
|
*
|
||||||
|
* When we can't make any assumption about the active invocations' bits
|
||||||
|
* because the condition is not known true, transform the inclusive case
|
||||||
|
* to an exclusive count by counting value[1:N] and adding value[0]
|
||||||
|
* in the accumulator.
|
||||||
|
* The additional operations here can use the uniform ALU.
|
||||||
*/
|
*/
|
||||||
if (options->lower_ballot_bit_count_to_mbcnt_amd) {
|
if (options->lower_ballot_bit_count_to_mbcnt_amd && inclusive && cond) {
|
||||||
|
nir_def *first_bit = nir_iand_imm(b, nir_u2u32(b, value), 1);
|
||||||
|
value = nir_ushr_imm(b, value, 1);
|
||||||
|
return nir_mbcnt_amd(b, value, first_bit);
|
||||||
|
} else if (options->lower_ballot_bit_count_to_mbcnt_amd) {
|
||||||
return nir_mbcnt_amd(b, value, nir_imm_int(b, (int)inclusive));
|
return nir_mbcnt_amd(b, value, nir_imm_int(b, (int)inclusive));
|
||||||
} else {
|
} else {
|
||||||
nir_def *mask =
|
nir_def *mask =
|
||||||
|
|
@ -134,6 +151,31 @@ count_active_invocations(nir_builder *b, nir_def *value, bool inclusive,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static nir_def *
|
||||||
|
conditional_add_xor_reduce(nir_builder *b, nir_intrinsic_instr *intrin, nir_def *cond, nir_def *src,
|
||||||
|
const nir_lower_subgroups_options *options)
|
||||||
|
{
|
||||||
|
const nir_op reduction_op = (nir_op)nir_intrinsic_reduction_op(intrin);
|
||||||
|
nir_def *count;
|
||||||
|
|
||||||
|
if (intrin->intrinsic == nir_intrinsic_reduce) {
|
||||||
|
count = ballot_bit_count(b, get_ballot(b, cond, options));
|
||||||
|
} else {
|
||||||
|
count = count_active_invocations(b, cond,
|
||||||
|
intrin->intrinsic == nir_intrinsic_inclusive_scan,
|
||||||
|
options);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (reduction_op == nir_op_iadd) {
|
||||||
|
return nir_imul(b, nir_u2uN(b, count, src->bit_size), src);
|
||||||
|
} else if (reduction_op == nir_op_fadd) {
|
||||||
|
return nir_fmul(b, nir_u2fN(b, count, src->bit_size), src);
|
||||||
|
} else {
|
||||||
|
count = nir_iand(b, count, nir_imm_int(b, 1));
|
||||||
|
return nir_imul(b, nir_u2uN(b, count, src->bit_size), src);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
opt_uniform_subgroup_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *_state)
|
opt_uniform_subgroup_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *_state)
|
||||||
{
|
{
|
||||||
|
|
@ -188,38 +230,30 @@ opt_uniform_subgroup_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *_s
|
||||||
case nir_op_iadd:
|
case nir_op_iadd:
|
||||||
case nir_op_fadd:
|
case nir_op_fadd:
|
||||||
case nir_op_ixor: {
|
case nir_op_ixor: {
|
||||||
if (nir_src_is_divergent(&intrin->src[0]))
|
|
||||||
return false;
|
|
||||||
if (nir_intrinsic_has_cluster_size(intrin) && nir_intrinsic_cluster_size(intrin))
|
if (nir_intrinsic_has_cluster_size(intrin) && nir_intrinsic_cluster_size(intrin))
|
||||||
return false;
|
return false;
|
||||||
nir_def *count;
|
|
||||||
|
|
||||||
nir_def *ballot = nir_ballot(b, options->ballot_components,
|
if (!nir_src_is_divergent(&intrin->src[0])) {
|
||||||
options->ballot_bit_size, nir_imm_true(b));
|
replacement = conditional_add_xor_reduce(b, intrin, NULL, intrin->src[0].ssa, options);
|
||||||
|
|
||||||
if (intrin->intrinsic == nir_intrinsic_reduce) {
|
|
||||||
count = ballot_bit_count(b, ballot);
|
|
||||||
} else {
|
} else {
|
||||||
count = count_active_invocations(b, ballot,
|
/* Ballot must be scalar. */
|
||||||
intrin->intrinsic == nir_intrinsic_inclusive_scan,
|
if (intrin->def.num_components != 1)
|
||||||
options);
|
return false;
|
||||||
|
|
||||||
|
struct select_info sel;
|
||||||
|
if (!parse_select_of_con_values(b, intrin->src[0].ssa, &sel))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
nir_def *parts[2];
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < 2; i++) {
|
||||||
|
nir_def *cond = i ? nir_inot(b, sel.cond) : sel.cond;
|
||||||
|
parts[i] = conditional_add_xor_reduce(b, intrin, cond, sel.values[i], options);
|
||||||
|
}
|
||||||
|
|
||||||
|
replacement = nir_build_alu2(b, reduction_op, parts[0], parts[1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
const unsigned bit_size = intrin->src[0].ssa->bit_size;
|
|
||||||
|
|
||||||
if (reduction_op == nir_op_iadd) {
|
|
||||||
replacement = nir_imul(b, nir_u2uN(b, count, bit_size),
|
|
||||||
intrin->src[0].ssa);
|
|
||||||
} else if (reduction_op == nir_op_fadd) {
|
|
||||||
replacement = nir_fmul(b, nir_u2fN(b, count, bit_size),
|
|
||||||
intrin->src[0].ssa);
|
|
||||||
} else {
|
|
||||||
replacement = nir_imul(b,
|
|
||||||
nir_u2uN(b,
|
|
||||||
nir_iand(b, count, nir_imm_int(b, 1)),
|
|
||||||
bit_size),
|
|
||||||
intrin->src[0].ssa);
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue