panvk: Use LD_VAR_BUF[_IMM] when possible

If we determine that the number of varyings will fit within the 8-bit
offset of LD_VAR_BUF[_IMM], instruct the compiler to use it for varyings
and skip setting up Attribute Descriptors.

This should save a bit of memory and overhead in reading varyings.

Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32969>
This commit is contained in:
Lars-Ivar Hesselberg Simonsen 2025-01-20 12:07:04 +01:00 committed by Marge Bot
parent de86641d3f
commit 7b949dd8c4
5 changed files with 32 additions and 10 deletions

View file

@ -594,9 +594,7 @@ bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
bool immediate = bi_is_imm_var_desc_handle(b, instr, &imm_index); bool immediate = bi_is_imm_var_desc_handle(b, instr, &imm_index);
unsigned base = nir_intrinsic_base(instr); unsigned base = nir_intrinsic_base(instr);
/* LD_VAR_BUF[_IMM] takes an 8-bit offset, limiting its use to 64 or less /* Only use LD_VAR_BUF[_IMM] if explicitly told by the driver
* varying components, assuming F32.
* Therefore, only use LD_VAR_BUF[_IMM] if explicitly told by the driver
* through a compiler input value, falling back to LD_VAR[_IMM] + * through a compiler input value, falling back to LD_VAR[_IMM] +
* Attribute Descriptors otherwise. */ * Attribute Descriptors otherwise. */
bool use_ld_var_buf = bool use_ld_var_buf =

View file

@ -239,7 +239,10 @@ prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader; const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
const struct panvk_descriptor_state *desc_state = const struct panvk_descriptor_state *desc_state =
&cmdbuf->state.gfx.desc_state; &cmdbuf->state.gfx.desc_state;
uint32_t num_varying_attr_descs = fs->desc_info.max_varying_loads; /* If the shader is using LD_VAR_BUF[_IMM], we do not have to set up
* Attribute Descriptors for varying loads. */
uint32_t num_varying_attr_descs =
panvk_use_ld_var_buf(fs) ? 0 : fs->desc_info.max_varying_loads;
uint32_t desc_count = uint32_t desc_count =
fs->desc_info.dyn_bufs.count + num_varying_attr_descs + 1; fs->desc_info.dyn_bufs.count + num_varying_attr_descs + 1;
struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem( struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
@ -249,7 +252,8 @@ prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
if (desc_count && !driver_set.gpu) if (desc_count && !driver_set.gpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY; return VK_ERROR_OUT_OF_DEVICE_MEMORY;
emit_varying_descs(cmdbuf, (struct mali_attribute_packed *)(&descs[0])); if (num_varying_attr_descs > 0)
emit_varying_descs(cmdbuf, (struct mali_attribute_packed *)(&descs[0]));
/* Dummy sampler always comes right after the varyings. */ /* Dummy sampler always comes right after the varyings. */
pan_cast_and_pack(&descs[num_varying_attr_descs], SAMPLER, cfg) { pan_cast_and_pack(&descs[num_varying_attr_descs], SAMPLER, cfg) {

View file

@ -353,6 +353,18 @@ struct panvk_internal_shader {
#endif #endif
}; };
#if PAN_ARCH >= 9
static inline bool
panvk_use_ld_var_buf(const struct panvk_shader *shader)
{
   /* Decide whether varyings of this shader may be read with
    * LD_VAR_BUF[_IMM].
    *
    * The instruction only has an 8-bit offset, which is enough for at most
    * 16 varyings when each one is a highp vec4. Returns true when the
    * shader's max_varying_loads stays within that limit, false otherwise. */
   const bool fits_in_offset = shader->desc_info.max_varying_loads <= 16;

   return fits_in_offset;
}
#endif
VK_DEFINE_NONDISP_HANDLE_CASTS(panvk_internal_shader, vk.base, VkShaderEXT, VK_DEFINE_NONDISP_HANDLE_CASTS(panvk_internal_shader, vk.base, VkShaderEXT,
VK_OBJECT_TYPE_SHADER_EXT) VK_OBJECT_TYPE_SHADER_EXT)

View file

@ -1258,7 +1258,13 @@ panvk_per_arch(nir_lower_descriptors)(
goto out; goto out;
#if PAN_ARCH >= 9 #if PAN_ARCH >= 9
ctx.desc_info.num_varying_attr_descs = shader->desc_info.max_varying_loads; ctx.desc_info.num_varying_attr_descs = 0;
/* We require Attribute Descriptors if we cannot use LD_VAR_BUF[_IMM] for
* varyings. */
if (shader->info.stage == MESA_SHADER_FRAGMENT &&
!panvk_use_ld_var_buf(shader))
ctx.desc_info.num_varying_attr_descs =
shader->desc_info.max_varying_loads;
#endif #endif
create_copy_table(nir, &ctx); create_copy_table(nir, &ctx);
upload_shader_desc_info(dev, shader, &ctx.desc_info); upload_shader_desc_info(dev, shader, &ctx.desc_info);

View file

@ -1053,10 +1053,6 @@ panvk_compile_shader(struct panvk_device *dev,
.gpu_id = phys_dev->kmod.props.gpu_prod_id, .gpu_id = phys_dev->kmod.props.gpu_prod_id,
.no_ubo_to_push = true, .no_ubo_to_push = true,
.view_mask = (state && state->rp) ? state->rp->view_mask : 0, .view_mask = (state && state->rp) ? state->rp->view_mask : 0,
#if PAN_ARCH >= 9
/* LD_VAR_BUF does not support maxVertexOutputComponents (128) */
.valhall.use_ld_var_buf = false,
#endif
}; };
if (info->stage == MESA_SHADER_FRAGMENT && state != NULL && if (info->stage == MESA_SHADER_FRAGMENT && state != NULL &&
@ -1066,6 +1062,12 @@ panvk_compile_shader(struct panvk_device *dev,
panvk_lower_nir(dev, nir, info->set_layout_count, info->set_layouts, panvk_lower_nir(dev, nir, info->set_layout_count, info->set_layouts,
info->robustness, noperspective_varyings, &inputs, shader); info->robustness, noperspective_varyings, &inputs, shader);
#if PAN_ARCH >= 9
if (info->stage == MESA_SHADER_FRAGMENT)
/* Use LD_VAR_BUF[_IMM] for varyings if possible. */
inputs.valhall.use_ld_var_buf = panvk_use_ld_var_buf(shader);
#endif
result = panvk_compile_nir(dev, nir, info->flags, &inputs, shader); result = panvk_compile_nir(dev, nir, info->flags, &inputs, shader);
/* We need to update info.push.count because it's used to initialize the /* We need to update info.push.count because it's used to initialize the