panvk: Use LD_VAR[_IMM] + ADs for varyings

The current implementation uses LD_VAR_BUF[_IMM] to look up varyings,
which limits the number of varying components to 64 (assuming F32)
because the instruction takes an 8-bit offset given in bytes.

As this falls short of maxVertexOutputComponents (128), this change
replaces the use of LD_VAR_BUF[_IMM] with LD_VAR[_IMM] + Attribute
Descriptors, which do not have this limitation.

As allocating Attribute Descriptors is potentially expensive, this can
be further optimized by falling back to LD_VAR_BUF[_IMM] in cases where
we can ensure we do not use more than 64 varying components.

This change currently does not change behavior for gallium/panfrost,
though that should be done as well.

Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32969>
This commit is contained in:
Lars-Ivar Hesselberg Simonsen 2025-01-07 16:52:53 +01:00 committed by Marge Bot
parent 7881d19d01
commit 6d5ae5b3af
7 changed files with 146 additions and 57 deletions

View file

@ -132,6 +132,10 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir,
.gpu_id = panfrost_device_gpu_id(dev),
};
if (dev->arch >= 9)
/* Use LD_VAR_BUF for varying lookups. */
inputs.valhall.use_ld_var_buf = true;
/* Lower this early so the backends don't have to worry about it */
if (s->info.stage == MESA_SHADER_FRAGMENT) {
inputs.fixed_varying_mask = key->fs.fixed_varying_mask;

View file

@ -273,19 +273,10 @@ dEQP-VK.api.device_init.create_device_global_priority_query_khr.basic,Fail
dEQP-VK.renderpass.dedicated_allocation.attachment_allocation.input_output.63,Fail
dEQP-VK.renderpass2.dedicated_allocation.attachment_allocation.input_output.63,Fail
dEQP-VK.glsl.limits.near_max.fragment_input.components_123,Fail
dEQP-VK.glsl.limits.near_max.fragment_input.components_124,Fail
dEQP-VK.pipeline.monolithic.max_varyings.test_vertex_io_between_vertex_fragment,Fail
dEQP-VK.pipeline.pipeline_library.max_varyings.test_vertex_io_between_vertex_fragment,Fail
dEQP-VK.renderpass.suballocation.attachment_allocation.input_output.63,Fail
dEQP-VK.renderpass.multiple_subpasses_multiple_command_buffers.test,Fail
dEQP-VK.renderpass2.suballocation.attachment_allocation.input_output.63,Fail
dEQP-VK.pipeline.fast_linked_library.max_varyings.test_vertex_io_between_vertex_fragment,Fail
dEQP-VK.glsl.loops.special.do_while_dynamic_iterations.dowhile_trap_vertex,Crash
dEQP-VK.rasterization.rasterization_order_attachment_access.depth.samples_1.multi_draw_barriers,Crash

View file

@ -589,45 +589,30 @@ bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
b->shader->info.bifrost->uses_flat_shading = true;
}
enum bi_source_format source_format =
smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32;
nir_src *offset = nir_get_io_offset_src(instr);
unsigned imm_index = 0;
bool immediate = bi_is_imm_var_desc_handle(b, instr, &imm_index);
unsigned base = nir_intrinsic_base(instr);
/* On Valhall, ensure the table and index are valid for usage with immediate
* form when IDVS isn't used */
if (b->shader->arch >= 9 && !b->shader->malloc_idvs)
immediate &= va_is_valid_const_table(pan_res_handle_get_table(base)) &&
pan_res_handle_get_index(base) < 256;
/* LD_VAR_BUF[_IMM] takes an 8-bit offset, limiting its use to 64 or less
* varying components, assuming F32.
* Therefore, only use LD_VAR_BUF[_IMM] if explicitly told by the driver
* through a compiler input value, falling back to LD_VAR[_IMM] +
* Attribute Descriptors otherwise. */
bool use_ld_var_buf =
b->shader->malloc_idvs && b->shader->inputs->valhall.use_ld_var_buf;
if (b->shader->malloc_idvs && immediate) {
/* Immediate index given in bytes. */
bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
update, vecsize,
bi_varying_offset(b->shader, instr));
} else if (immediate) {
bi_instr *I;
if (use_ld_var_buf) {
enum bi_source_format source_format =
smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32;
if (smooth) {
I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, vecsize,
pan_res_handle_get_index(imm_index));
if (immediate) {
/* Immediate index given in bytes. */
bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
update, vecsize,
bi_varying_offset(b->shader, instr));
} else {
I = bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, vecsize,
pan_res_handle_get_index(imm_index));
}
/* Valhall usually uses machine-allocated IDVS. If this is disabled,
* use a simple Midgard-style ABI.
*/
if (b->shader->arch >= 9)
I->table = va_res_fold_table_idx(pan_res_handle_get_table(base));
} else {
bi_index idx = bi_src_index(offset);
if (b->shader->malloc_idvs) {
bi_index idx = bi_src_index(offset);
/* Index needs to be in bytes, but NIR gives the index
* in slots. For now assume 16 bytes per element.
*/
@ -639,7 +624,33 @@ bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
bi_ld_var_buf_to(b, sz, dest, src0, idx_bytes, regfmt, sample,
source_format, update, vecsize);
}
} else {
/* On Valhall, ensure the table and index are valid for usage with
* immediate form when IDVS isn't used */
if (b->shader->arch >= 9)
immediate &= va_is_valid_const_table(pan_res_handle_get_table(base)) &&
pan_res_handle_get_index(base) < 256;
if (immediate) {
bi_instr *I;
if (smooth) {
I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, vecsize,
pan_res_handle_get_index(imm_index));
} else {
I =
bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, vecsize,
pan_res_handle_get_index(imm_index));
}
/* Valhall usually uses LD_VAR_BUF. If this is disabled, use a simple
* Midgard-style ABI. */
if (b->shader->arch >= 9)
I->table = va_res_fold_table_idx(pan_res_handle_get_table(base));
} else {
bi_index idx = bi_src_index(offset);
if (base != 0)
idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);

View file

@ -121,6 +121,10 @@ struct panfrost_compile_inputs {
struct {
uint32_t rt_conv[8];
} bifrost;
struct {
/* Use LD_VAR_BUF[_IMM] instead of LD_VAR[_IMM] to load varyings. */
bool use_ld_var_buf;
} valhall;
};
};

View file

@ -165,6 +165,73 @@ prepare_vs_driver_set(struct panvk_cmd_buffer *cmdbuf)
return VK_SUCCESS;
}
/* Returns the number of varying slots for the currently bound graphics
 * shaders: the larger of the vertex shader's output count and the fragment
 * shader's input count, or 0 when get_fs() returns no fragment shader. */
static uint32_t
get_varying_slots(const struct panvk_cmd_buffer *cmdbuf)
{
   const struct panvk_shader *frag = get_fs(cmdbuf);

   /* No fragment shader: report zero slots. */
   if (!frag)
      return 0;

   const struct panvk_shader *vert = cmdbuf->state.gfx.vs.shader;
   unsigned out_count = vert->info.varyings.output_count;
   unsigned in_count = frag->info.varyings.input_count;

   return MAX2(out_count, in_count);
}
static void
emit_varying_descs(const struct panvk_cmd_buffer *cmdbuf,
struct mali_attribute_packed *descs)
{
uint32_t varying_slots = get_varying_slots(cmdbuf);
/* Assumes 16 byte slots. We could do better. */
uint32_t varying_size = varying_slots * 16;
const struct panvk_shader *fs = get_fs(cmdbuf);
for (uint32_t i = 0; i < varying_slots; i++) {
const struct pan_shader_varying *var = &fs->info.varyings.input[i];
/* Skip special varyings. */
if (var->location < VARYING_SLOT_VAR0)
continue;
/* We currently always write out F32 in the vertex shaders, so the format
* needs to reflect this. */
enum pipe_format f = var->format;
switch (f) {
case PIPE_FORMAT_R16_FLOAT:
f = PIPE_FORMAT_R32_FLOAT;
break;
case PIPE_FORMAT_R16G16_FLOAT:
f = PIPE_FORMAT_R32G32_FLOAT;
break;
case PIPE_FORMAT_R16G16B16_FLOAT:
f = PIPE_FORMAT_R32G32B32_FLOAT;
break;
case PIPE_FORMAT_R16G16B16A16_FLOAT:
f = PIPE_FORMAT_R32G32B32A32_FLOAT;
break;
default:
break;
}
uint32_t loc = var->location - VARYING_SLOT_VAR0;
pan_pack(&descs[i], ATTRIBUTE, cfg) {
cfg.attribute_type = MALI_ATTRIBUTE_TYPE_VERTEX_PACKET;
cfg.offset_enable = false;
cfg.format = GENX(panfrost_format_from_pipe_format)(f)->hw;
cfg.table = 61;
cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
cfg.offset = 1024 + (loc * 16);
cfg.buffer_index = 0;
cfg.attribute_stride = varying_size;
cfg.packet_stride = varying_size + 16;
}
}
}
static VkResult
prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
{
@ -172,7 +239,7 @@ prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
const struct panvk_descriptor_state *desc_state =
&cmdbuf->state.gfx.desc_state;
uint32_t desc_count = fs->desc_info.dyn_bufs.count + 1;
uint32_t desc_count = fs->desc_info.dyn_bufs.count + MAX_VARYING + 1;
struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
struct panvk_opaque_desc *descs = driver_set.cpu;
@ -180,13 +247,15 @@ prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
if (desc_count && !driver_set.gpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
/* Dummy sampler always comes first. */
pan_cast_and_pack(&descs[0], SAMPLER, cfg) {
emit_varying_descs(cmdbuf, (struct mali_attribute_packed *)(&descs[0]));
/* Dummy sampler always comes right after the varyings. */
pan_cast_and_pack(&descs[MAX_VARYING], SAMPLER, cfg) {
cfg.clamp_integer_array_indices = false;
}
panvk_per_arch(cmd_fill_dyn_bufs)(desc_state, fs,
(struct mali_buffer_packed *)(&descs[1]));
panvk_per_arch(cmd_fill_dyn_bufs)(
desc_state, fs, (struct mali_buffer_packed *)(&descs[1 + MAX_VARYING]));
fs_desc_state->driver_set.dev_addr = driver_set.gpu;
fs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
@ -1650,16 +1719,8 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
if (result != VK_SUCCESS)
return result;
uint32_t varying_size = 0;
if (fs) {
unsigned vs_vars = vs->info.varyings.output_count;
unsigned fs_vars = fs->info.varyings.input_count;
unsigned var_slots = MAX2(vs_vars, fs_vars);
/* Assumes 16 byte slots. We could do better. */
varying_size = var_slots * 16;
}
/* Assumes 16 byte slots. We could do better. */
uint32_t varying_size = get_varying_slots(cmdbuf) * 16;
cs_update_vt_ctx(b) {
/* We don't use the resource dep system yet. */

View file

@ -1025,8 +1025,22 @@ create_copy_table(nir_shader *nir, struct lower_desc_ctx *ctx)
for (uint32_t i = 0; i < PANVK_BIFROST_DESC_TABLE_COUNT; i++)
copy_count += desc_info->others[i].count;
#else
/* Dummy sampler comes after the vertex attributes. */
uint32_t dummy_sampler_idx = nir->info.stage == MESA_SHADER_VERTEX ? 16 : 0;
uint32_t dummy_sampler_idx;
switch (nir->info.stage) {
case MESA_SHADER_VERTEX:
/* Dummy sampler comes after the vertex attributes. */
dummy_sampler_idx = 16;
break;
case MESA_SHADER_FRAGMENT:
/* Dummy sampler comes after the varyings. */
dummy_sampler_idx = MAX_VARYING;
break;
case MESA_SHADER_COMPUTE:
dummy_sampler_idx = 0;
break;
default:
unreachable("unexpected stage");
}
desc_info->dummy_sampler_handle = pan_res_handle(0, dummy_sampler_idx);
copy_count = desc_info->dyn_bufs.count + desc_info->dyn_bufs.count;

View file

@ -1041,6 +1041,10 @@ panvk_compile_shader(struct panvk_device *dev,
.gpu_id = phys_dev->kmod.props.gpu_prod_id,
.no_ubo_to_push = true,
.view_mask = (state && state->rp) ? state->rp->view_mask : 0,
#if PAN_ARCH >= 9
/* LD_VAR_BUF does not support maxVertexOutputComponents (128) */
.valhall.use_ld_var_buf = false,
#endif
};
if (info->stage == MESA_SHADER_FRAGMENT && state != NULL &&