From 115958b6f01d3967c86ac43c9188bea1bf460c2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timur=20Krist=C3=B3f?=
Date: Thu, 30 Mar 2023 23:44:18 +0200
Subject: [PATCH] ac/nir/ngg: Slightly improve attribute ring offset calculation.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Inspired by Nicolai Hähnle's commit in LLPC.
Instead of using a SALU instruction to add to the scalar offset,
rely on the buffer swizzling and use constant offset.

Fossil DB stats on GFX1100:

Totals from 47910 (35.51% of 134913) affected shaders:
CodeSize: 87927612 -> 86968136 (-1.09%)
Instrs: 17584007 -> 17440094 (-0.82%)
Latency: 97232173 -> 97126311 (-0.11%)
InvThroughput: 9904586 -> 9905288 (+0.01%); split: -0.02%, +0.02%
VClause: 544430 -> 542566 (-0.34%)

Signed-off-by: Timur Kristóf
Reviewed-by: Marek Olšák
Reviewed-by: Rhys Perry
Part-of:
---
 src/amd/common/ac_nir_lower_ngg.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c
index b7deb08aa97..c2ed88318b5 100644
--- a/src/amd/common/ac_nir_lower_ngg.c
+++ b/src/amd/common/ac_nir_lower_ngg.c
@@ -2229,12 +2229,11 @@ export_vertex_params_gfx11(nir_builder *b, nir_ssa_def *export_tid, nir_ssa_def
       if (exported_params & BITFIELD_BIT(offset))
          continue;
 
-      nir_ssa_def *soffset = nir_iadd_imm(b, attr_offset, offset * 16 * 32);
-
       nir_ssa_def *comp[4];
       for (unsigned j = 0; j < 4; j++)
          comp[j] = outputs[i].chan[j] ? outputs[i].chan[j] : undef;
-      nir_store_buffer_amd(b, nir_vec(b, comp, 4), attr_rsrc, voffset, soffset, vindex,
+      nir_store_buffer_amd(b, nir_vec(b, comp, 4), attr_rsrc, voffset, attr_offset, vindex,
+                           .base = offset * 16,
                            .memory_modes = nir_var_shader_out,
                            .access = ACCESS_COHERENT | ACCESS_IS_SWIZZLED_AMD);
       exported_params |= BITFIELD_BIT(offset);
@@ -3750,10 +3749,11 @@ ms_store_arrayed_output_intrin(nir_builder *b,
        * (Also much better than storing and reloading from the scratch ring.)
        */
       const nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
+      unsigned param_offset = s->vs_output_param_offset[io_sem.location];
       nir_ssa_def *ring = nir_load_ring_attr_amd(b);
       nir_ssa_def *soffset = nir_load_ring_attr_offset_amd(b);
-      soffset = nir_iadd_imm(b, soffset, s->vs_output_param_offset[io_sem.location] * 16 * 32);
-      nir_store_buffer_amd(b, store_val, ring, base_addr_off, soffset, arr_index, .base = const_off,
+      nir_store_buffer_amd(b, store_val, ring, base_addr_off, soffset, arr_index,
+                           .base = const_off + param_offset * 16,
                            .memory_modes = nir_var_shader_out,
                            .access = ACCESS_COHERENT | ACCESS_IS_SWIZZLED_AMD);
    } else if (out_mode == ms_out_mode_var) {