ac/nir: allow 8/16-bit smem loads

fossil-db (gfx1201):
Totals from 295 (0.37% of 79377) affected shaders:
Instrs: 314018 -> 313355 (-0.21%); split: -0.22%, +0.00%
CodeSize: 1697996 -> 1696528 (-0.09%); split: -0.11%, +0.02%
Latency: 4197719 -> 4197106 (-0.01%)
InvThroughput: 1258891 -> 1258744 (-0.01%)
PreSGPRs: 12232 -> 12230 (-0.02%)
SALU: 66762 -> 66269 (-0.74%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34162>
This commit is contained in:
Rhys Perry 2025-03-11 12:29:10 +00:00 committed by Marge Bot
parent 5b116c4de9
commit 3b42626973

View file

@@ -35,7 +35,13 @@ use_smem_for_load(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data_)
return false;
}
if (intrin->def.divergent || (cb_data->after_lowering && intrin->def.bit_size < 32))
if (intrin->def.divergent)
return false;
/* ACO doesn't support instruction selection for multi-component 8/16-bit SMEM loads. */
const bool supports_scalar_subdword = cb_data->gfx_level >= GFX12 && !cb_data->use_llvm;
if (cb_data->after_lowering && intrin->def.bit_size < 32 &&
(intrin->def.num_components > 1 || !supports_scalar_subdword))
return false;
enum gl_access_qualifier access = nir_intrinsic_access(intrin);
@@ -83,12 +89,14 @@ lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, ui
if (!is_load)
return res;
/* Lower 8/16-bit loads to 32-bit, unless it's a VMEM scalar load. */
/* Lower 8/16-bit loads to 32-bit, unless it's a VMEM (or SMEM on GFX12+) scalar load. */
const bool support_subdword = res.num_components == 1 && !is_smem &&
(!cb_data->use_llvm || intrin != nir_intrinsic_load_ubo);
const bool supports_scalar_subdword =
!is_smem || (cb_data->gfx_level >= GFX12 && intrin != nir_intrinsic_load_push_constant);
const bool supported_subdword = res.num_components == 1 && supports_scalar_subdword &&
(!cb_data->use_llvm || intrin != nir_intrinsic_load_ubo);
if (res.bit_size >= 32 || support_subdword)
if (res.bit_size >= 32 || supported_subdword)
return res;
const uint32_t max_pad = 4 - MIN2(combined_align, 4);
@@ -143,4 +151,4 @@ ac_nir_lower_mem_access_bit_sizes(nir_shader *shader, enum amd_gfx_level gfx_lev
.cb_data = &cb_data,
};
return nir_lower_mem_access_bit_sizes(shader, &lower_mem_access_options);
}
}