From cfa217ee0413e0478743365128f3a9ca013640fd Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Mon, 22 Apr 2024 18:49:00 +0100
Subject: [PATCH] nir/opt_offsets: don't check NUW for unswizzled buffer_amd
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This isn't necessary.

fossil-db (navi21):
Totals from 13 (0.02% of 79377) affected shaders:
Instrs: 18070 -> 18042 (-0.15%); split: -0.17%, +0.01%
CodeSize: 98336 -> 98012 (-0.33%)
Latency: 72735 -> 72992 (+0.35%); split: -0.02%, +0.38%
InvThroughput: 13157 -> 13105 (-0.40%)
VClause: 334 -> 324 (-2.99%)
SClause: 563 -> 564 (+0.18%)
Copies: 1194 -> 1197 (+0.25%)
VALU: 12330 -> 12297 (-0.27%)

fossil-db (polaris10):
Totals from 10 (0.02% of 61794) affected shaders:
Instrs: 4543 -> 4441 (-2.25%)
CodeSize: 30196 -> 29388 (-2.68%)
Latency: 64290 -> 64272 (-0.03%); split: -0.05%, +0.02%
InvThroughput: 20371 -> 20362 (-0.04%); split: -0.08%, +0.04%
VClause: 195 -> 135 (-30.77%)
Copies: 97 -> 100 (+3.09%)
PreSGPRs: 178 -> 177 (-0.56%)
VALU: 1765 -> 1666 (-5.61%)
VMEM: 2448 -> 2445 (-0.12%)

Signed-off-by: Rhys Perry
Reviewed-by: Timur Kristóf
Reviewed-by: Georg Lehmann
Part-of:
---
 src/compiler/nir/nir_opt_offsets.c | 36 +++++++++++++++++-------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/src/compiler/nir/nir_opt_offsets.c b/src/compiler/nir/nir_opt_offsets.c
index dc740a90be4..b64c609564d 100644
--- a/src/compiler/nir/nir_opt_offsets.c
+++ b/src/compiler/nir/nir_opt_offsets.c
@@ -35,7 +35,8 @@ typedef struct
 } opt_offsets_state;
 
 static nir_scalar
-try_extract_const_addition(nir_builder *b, nir_scalar val, opt_offsets_state *state, unsigned *out_const, uint32_t max)
+try_extract_const_addition(nir_builder *b, nir_scalar val, opt_offsets_state *state, unsigned *out_const,
+                           uint32_t max, bool need_nuw)
 {
    val = nir_scalar_chase_movs(val);
 
@@ -56,7 +57,8 @@ try_extract_const_addition(nir_builder *b, nir_scalar val, opt_offsets_state *st
     * Ignored for ints-as-floats (lower_bitops is a proxy for that), where
     * unsigned wrapping doesn't make sense.
     */
-   if (!state->options->allow_offset_wrap && !alu->no_unsigned_wrap && !b->shader->options->lower_bitops) {
+   if (!state->options->allow_offset_wrap && need_nuw && !alu->no_unsigned_wrap &&
+       !b->shader->options->lower_bitops) {
       if (!state->range_ht) {
          /* Cache for nir_unsigned_upper_bound */
          state->range_ht = _mesa_pointer_hash_table_create(NULL);
@@ -79,14 +81,14 @@ try_extract_const_addition(nir_builder *b, nir_scalar val, opt_offsets_state *st
          uint32_t offset = nir_scalar_as_uint(src[i]);
          if (offset + *out_const <= max) {
             *out_const += offset;
-            return try_extract_const_addition(b, src[1 - i], state, out_const, max);
+            return try_extract_const_addition(b, src[1 - i], state, out_const, max, need_nuw);
          }
       }
    }
 
    uint32_t orig_offset = *out_const;
-   src[0] = try_extract_const_addition(b, src[0], state, out_const, max);
-   src[1] = try_extract_const_addition(b, src[1], state, out_const, max);
+   src[0] = try_extract_const_addition(b, src[0], state, out_const, max, need_nuw);
+   src[1] = try_extract_const_addition(b, src[1], state, out_const, max, need_nuw);
    if (*out_const == orig_offset)
       return val;
 
@@ -102,7 +104,8 @@ try_fold_load_store(nir_builder *b,
                     nir_intrinsic_instr *intrin,
                     opt_offsets_state *state,
                     unsigned offset_src_idx,
-                    uint32_t max)
+                    uint32_t max,
+                    bool need_nuw)
 {
    /* Assume that BASE is the constant offset of a load/store.
     * Try to constant-fold additions to the offset source
@@ -122,7 +125,7 @@ try_fold_load_store(nir_builder *b,
    if (!nir_src_is_const(*off_src)) {
       uint32_t add_offset = 0;
       nir_scalar val = { .def = off_src->ssa, .comp = 0 };
-      val = try_extract_const_addition(b, val, state, &add_offset, max - off_const);
+      val = try_extract_const_addition(b, val, state, &add_offset, max - off_const, need_nuw);
       if (add_offset == 0)
          return false;
       off_const += add_offset;
@@ -198,29 +201,32 @@ process_instr(nir_builder *b, nir_instr *instr, void *s)
    switch (intrin->intrinsic) {
    case nir_intrinsic_load_uniform:
    case nir_intrinsic_load_const_ir3:
-      return try_fold_load_store(b, intrin, state, 0, get_max(state, intrin, state->options->uniform_max));
+      return try_fold_load_store(b, intrin, state, 0, get_max(state, intrin, state->options->uniform_max), true);
    case nir_intrinsic_load_ubo_vec4:
-      return try_fold_load_store(b, intrin, state, 1, get_max(state, intrin, state->options->ubo_vec4_max));
+      return try_fold_load_store(b, intrin, state, 1, get_max(state, intrin, state->options->ubo_vec4_max), true);
    case nir_intrinsic_shared_atomic:
    case nir_intrinsic_shared_atomic_swap:
-      return try_fold_load_store(b, intrin, state, 0, get_max(state, intrin, state->options->shared_atomic_max));
+      return try_fold_load_store(b, intrin, state, 0, get_max(state, intrin, state->options->shared_atomic_max), true);
    case nir_intrinsic_load_shared:
    case nir_intrinsic_load_shared_ir3:
-      return try_fold_load_store(b, intrin, state, 0, get_max(state, intrin, state->options->shared_max));
+      return try_fold_load_store(b, intrin, state, 0, get_max(state, intrin, state->options->shared_max), true);
    case nir_intrinsic_store_shared:
    case nir_intrinsic_store_shared_ir3:
-      return try_fold_load_store(b, intrin, state, 1, get_max(state, intrin, state->options->shared_max));
+      return try_fold_load_store(b, intrin, state, 1, get_max(state, intrin, state->options->shared_max), true);
    case nir_intrinsic_load_shared2_amd:
       return try_fold_shared2(b, intrin, state, 0);
    case nir_intrinsic_store_shared2_amd:
       return try_fold_shared2(b, intrin, state, 1);
    case nir_intrinsic_load_buffer_amd:
-      return try_fold_load_store(b, intrin, state, 1, state->options->buffer_max);
+      return try_fold_load_store(b, intrin, state, 1, get_max(state, intrin, state->options->buffer_max),
+                                 nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD);
    case nir_intrinsic_store_buffer_amd:
+      return try_fold_load_store(b, intrin, state, 2, get_max(state, intrin, state->options->buffer_max),
+                                 nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD);
    case nir_intrinsic_load_ssbo_ir3:
-      return try_fold_load_store(b, intrin, state, 2, get_max(state, intrin, state->options->buffer_max));
+      return try_fold_load_store(b, intrin, state, 2, get_max(state, intrin, state->options->buffer_max), true);
    case nir_intrinsic_store_ssbo_ir3:
-      return try_fold_load_store(b, intrin, state, 3, get_max(state, intrin, state->options->buffer_max));
+      return try_fold_load_store(b, intrin, state, 3, get_max(state, intrin, state->options->buffer_max), true);
    default:
       return false;
    }