/*
 * Copyright © 2020 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

/*
 * Optimizes atomics (with uniform offsets) using subgroup operations to ensure
 * only one atomic operation is done per subgroup. So res = atomicAdd(addr, 1)
 * would become something like:
 *
 * uint tmp = subgroupAdd(1);
 * uint res;
 * if (subgroupElect())
 *    res = atomicAdd(addr, tmp);
 * res = subgroupBroadcastFirst(res) + subgroupExclusiveAdd(1);
 *
 * This pass requires and preserves LCSSA and divergence information.
 */

#include "nir/nir.h"
#include "nir/nir_builder.h"

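/* Maps an atomic op to the ALU opcode used for the subgroup reduction/scan,
 * or nir_num_opcodes if we can't optimize the atomic.
 */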
static nir_op
atomic_op_to_alu(nir_atomic_op op)
{
   switch (op) {
   case nir_atomic_op_iadd:
      return nir_op_iadd;
   case nir_atomic_op_imin:
      return nir_op_imin;
   case nir_atomic_op_umin:
      return nir_op_umin;
   case nir_atomic_op_imax:
      return nir_op_imax;
   case nir_atomic_op_umax:
      return nir_op_umax;
   case nir_atomic_op_iand:
      return nir_op_iand;
   case nir_atomic_op_ior:
      return nir_op_ior;
   case nir_atomic_op_ixor:
      return nir_op_ixor;
   case nir_atomic_op_fadd:
      return nir_op_fadd;
   case nir_atomic_op_fmin:
      return nir_op_fmin;
   case nir_atomic_op_fmax:
      return nir_op_fmax;

   /* We don't handle exchanges or wraps */
   case nir_atomic_op_xchg:
   case nir_atomic_op_cmpxchg:
   case nir_atomic_op_fcmpxchg:
   case nir_atomic_op_inc_wrap:
   case nir_atomic_op_dec_wrap:
      return nir_num_opcodes;
   }

   unreachable("Unknown atomic op");
}

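/* For a handled atomic intrinsic, reports which sources hold the offset(s)
 * and the data operand, and returns the corresponding reduction ALU opcode
 * (nir_num_opcodes if the intrinsic isn't handled).
 */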
static nir_op
parse_atomic_op(nir_intrinsic_instr *intr, unsigned *offset_src,
                unsigned *data_src, unsigned *offset2_src)
{
   switch (intr->intrinsic) {
   case nir_intrinsic_ssbo_atomic:
      *offset_src = 1;
      *data_src = 2;
      *offset2_src = *offset_src;
      return atomic_op_to_alu(nir_intrinsic_atomic_op(intr));
   case nir_intrinsic_shared_atomic:
   case nir_intrinsic_global_atomic:
   case nir_intrinsic_deref_atomic:
      *offset_src = 0;
      *data_src = 1;
      *offset2_src = *offset_src;
      return atomic_op_to_alu(nir_intrinsic_atomic_op(intr));
   case nir_intrinsic_global_atomic_amd:
      *offset_src = 0;
      *data_src = 1;
      *offset2_src = 2;
      return atomic_op_to_alu(nir_intrinsic_atomic_op(intr));
   case nir_intrinsic_image_deref_atomic:
   case nir_intrinsic_image_atomic:
   case nir_intrinsic_bindless_image_atomic:
      *offset_src = 1;
      *data_src = 3;
      *offset2_src = *offset_src;
      return atomic_op_to_alu(nir_intrinsic_atomic_op(intr));

   default:
      return nir_num_opcodes;
   }
}

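/* Returns a bitmask describing which invocation ID dimensions a divergent
 * scalar is derived from: bits 0-2 stand for the x/y/z components of the
 * local/global invocation ID (a flat invocation index sets all three) and
 * bit 3 (0x8) stands for the subgroup invocation ID. Returns 0 if the value
 * can't be traced back to only these sources.
 */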
static unsigned
get_dim(nir_ssa_scalar scalar)
{
   if (!scalar.def->divergent)
      return 0;

   if (scalar.def->parent_instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(scalar.def->parent_instr);
      if (intrin->intrinsic == nir_intrinsic_load_subgroup_invocation)
         return 0x8;
      else if (intrin->intrinsic == nir_intrinsic_load_local_invocation_index)
         return 0x7;
      else if (intrin->intrinsic == nir_intrinsic_load_local_invocation_id)
         return 1 << scalar.comp;
      else if (intrin->intrinsic == nir_intrinsic_load_global_invocation_index)
         return 0x7;
      else if (intrin->intrinsic == nir_intrinsic_load_global_invocation_id)
         return 1 << scalar.comp;
   } else if (nir_ssa_scalar_is_alu(scalar)) {
      if (nir_ssa_scalar_alu_op(scalar) == nir_op_iadd ||
          nir_ssa_scalar_alu_op(scalar) == nir_op_imul) {
         nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0);
         nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1);

         unsigned src0_dim = get_dim(src0);
         if (!src0_dim && src0.def->divergent)
            return 0;
         unsigned src1_dim = get_dim(src1);
         if (!src1_dim && src1.def->divergent)
            return 0;

         return src0_dim | src1_dim;
      } else if (nir_ssa_scalar_alu_op(scalar) == nir_op_ishl) {
         nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(scalar, 0);
         nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(scalar, 1);
         return src1.def->divergent ? 0 : get_dim(src0);
      }
   }

   return 0;
}

/* Returns a bitmask of invocation indices that are compared against a subgroup
 * uniform value.
 */
static unsigned
match_invocation_comparison(nir_ssa_scalar scalar)
{
   bool is_alu = nir_ssa_scalar_is_alu(scalar);
   if (is_alu && nir_ssa_scalar_alu_op(scalar) == nir_op_iand) {
      return match_invocation_comparison(nir_ssa_scalar_chase_alu_src(scalar, 0)) |
             match_invocation_comparison(nir_ssa_scalar_chase_alu_src(scalar, 1));
   } else if (is_alu && nir_ssa_scalar_alu_op(scalar) == nir_op_ieq) {
      if (!nir_ssa_scalar_chase_alu_src(scalar, 0).def->divergent)
         return get_dim(nir_ssa_scalar_chase_alu_src(scalar, 1));
      if (!nir_ssa_scalar_chase_alu_src(scalar, 1).def->divergent)
         return get_dim(nir_ssa_scalar_chase_alu_src(scalar, 0));
   } else if (scalar.def->parent_instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(scalar.def->parent_instr);
      if (intrin->intrinsic == nir_intrinsic_elect)
         return 0x8;
   }

   return 0;
}

/* Returns true if the intrinsic is already conditional so that at most one
 * invocation in the subgroup does the atomic.
 */
static bool
is_atomic_already_optimized(nir_shader *shader, nir_intrinsic_instr *instr)
{
   unsigned dims = 0;
   for (nir_cf_node *cf = &instr->instr.block->cf_node; cf; cf = cf->parent) {
      if (cf->type == nir_cf_node_if) {
         nir_block *first_then = nir_if_first_then_block(nir_cf_node_as_if(cf));
         nir_block *last_then = nir_if_last_then_block(nir_cf_node_as_if(cf));
         bool within_then = instr->instr.block->index >= first_then->index;
         within_then = within_then && instr->instr.block->index <= last_then->index;
         if (!within_then)
            continue;

         nir_ssa_scalar cond = { nir_cf_node_as_if(cf)->condition.ssa, 0 };
         dims |= match_invocation_comparison(cond);
      }
   }

   if (gl_shader_stage_uses_workgroup(shader->info.stage)) {
      unsigned dims_needed = 0;
      for (unsigned i = 0; i < 3; i++)
         dims_needed |= (shader->info.workgroup_size_variable ||
                         shader->info.workgroup_size[i] > 1)
                        << i;
      if ((dims & dims_needed) == dims_needed)
         return true;
   }

   return dims & 0x8;
}

/* Perform a reduction and/or exclusive scan. */
static void
reduce_data(nir_builder *b, nir_op op, nir_ssa_def *data,
            nir_ssa_def **reduce, nir_ssa_def **scan)
{
   if (scan) {
      *scan = nir_exclusive_scan(b, data, .reduction_op = op);
      if (reduce) {
         nir_ssa_def *last_lane = nir_last_invocation(b);
         nir_ssa_def *res = nir_build_alu(b, op, *scan, data, NULL, NULL);
         *reduce = nir_read_invocation(b, res, last_lane);
      }
   } else {
      *reduce = nir_reduce(b, data, .reduction_op = op);
   }
}

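/* Rewrites the atomic so that only one elected invocation applies the
 * subgroup-wide reduction of the data. If return_prev is set, the previous
 * value each invocation would have observed is reconstructed from the
 * broadcast atomic result combined with an exclusive scan.
 */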
static nir_ssa_def *
optimize_atomic(nir_builder *b, nir_intrinsic_instr *intrin, bool return_prev)
{
   unsigned offset_src = 0;
   unsigned data_src = 0;
   unsigned offset2_src = 0;
   nir_op op = parse_atomic_op(intrin, &offset_src, &data_src, &offset2_src);
   nir_ssa_def *data = intrin->src[data_src].ssa;

   /* Separate uniform reduction and scan is faster than doing a combined scan+reduce */
   bool combined_scan_reduce = return_prev && data->divergent;
   nir_ssa_def *reduce = NULL, *scan = NULL;
   reduce_data(b, op, data, &reduce, combined_scan_reduce ? &scan : NULL);

   nir_instr_rewrite_src(&intrin->instr, &intrin->src[data_src], nir_src_for_ssa(reduce));
   nir_update_instr_divergence(b->shader, &intrin->instr);

   nir_ssa_def *cond = nir_elect(b, 1);

   nir_if *nif = nir_push_if(b, cond);

   nir_instr_remove(&intrin->instr);
   nir_builder_instr_insert(b, &intrin->instr);

   if (return_prev) {
      nir_push_else(b, nif);

      nir_ssa_def *undef = nir_ssa_undef(b, 1, intrin->dest.ssa.bit_size);

      nir_pop_if(b, nif);
      nir_ssa_def *result = nir_if_phi(b, &intrin->dest.ssa, undef);
      result = nir_read_first_invocation(b, result);

      if (!combined_scan_reduce)
         reduce_data(b, op, data, NULL, &scan);

      return nir_build_alu(b, op, result, scan, NULL, NULL);
   } else {
      nir_pop_if(b, nif);
      return NULL;
   }
}

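/* Wraps optimize_atomic(): in fragment shaders, the atomic is first guarded
 * so helper invocations don't participate, and all uses of the old result
 * are rewritten to the optimized one.
 */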
static void
optimize_and_rewrite_atomic(nir_builder *b, nir_intrinsic_instr *intrin)
{
   nir_if *helper_nif = NULL;
   if (b->shader->info.stage == MESA_SHADER_FRAGMENT) {
      nir_ssa_def *helper = nir_is_helper_invocation(b, 1);
      helper_nif = nir_push_if(b, nir_inot(b, helper));
   }

   ASSERTED bool original_result_divergent = intrin->dest.ssa.divergent;
   bool return_prev = !nir_ssa_def_is_unused(&intrin->dest.ssa);

   nir_ssa_def old_result = intrin->dest.ssa;
   list_replace(&intrin->dest.ssa.uses, &old_result.uses);
   nir_ssa_dest_init(&intrin->instr, &intrin->dest, 1,
                     intrin->dest.ssa.bit_size);

   nir_ssa_def *result = optimize_atomic(b, intrin, return_prev);

   if (helper_nif) {
      nir_push_else(b, helper_nif);
      nir_ssa_def *undef = result ? nir_ssa_undef(b, 1, result->bit_size) : NULL;
      nir_pop_if(b, helper_nif);
      if (result)
         result = nir_if_phi(b, result, undef);
   }

   if (result) {
      assert(result->divergent == original_result_divergent);
      nir_ssa_def_rewrite_uses(&old_result, result);
   }
}

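/* Per-impl worker: finds atomics whose offsets are subgroup-uniform and that
 * aren't already guarded to a single invocation, and rewrites them.
 */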
static bool
opt_uniform_atomics(nir_function_impl *impl)
{
   bool progress = false;
   nir_builder b = nir_builder_create(impl);
   b.update_divergence = true;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
         unsigned offset_src, data_src, offset2_src;
         if (parse_atomic_op(intrin, &offset_src, &data_src, &offset2_src) ==
             nir_num_opcodes)
            continue;

         if (nir_src_is_divergent(intrin->src[offset_src]))
            continue;
         if (nir_src_is_divergent(intrin->src[offset2_src]))
            continue;

         if (is_atomic_already_optimized(b.shader, intrin))
            continue;

         b.cursor = nir_before_instr(instr);
         optimize_and_rewrite_atomic(&b, intrin);
         progress = true;
      }
   }

   return progress;
}

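/* Shader-level entry point. Because the pass requires LCSSA and up-to-date
 * divergence information, a caller would typically run it along these lines
 * (an illustrative sketch, not a sequence this file prescribes):
 *
 *    nir_convert_to_lcssa(nir, true, true);
 *    nir_divergence_analysis(nir);
 *    NIR_PASS(progress, nir, nir_opt_uniform_atomics);
 */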
bool
nir_opt_uniform_atomics(nir_shader *shader)
{
   bool progress = false;

   /* A 1x1x1 workgroup only ever has one active lane, so there's no point in
    * optimizing any atomics.
    */
   if (gl_shader_stage_uses_workgroup(shader->info.stage) &&
       !shader->info.workgroup_size_variable &&
       shader->info.workgroup_size[0] == 1 && shader->info.workgroup_size[1] == 1 &&
       shader->info.workgroup_size[2] == 1)
      return false;

   nir_foreach_function_impl(impl, shader) {
      if (opt_uniform_atomics(impl)) {
         progress = true;
         nir_metadata_preserve(impl, nir_metadata_none);
      } else {
         nir_metadata_preserve(impl, nir_metadata_all);
      }
   }

   return progress;
}