panvk: Implement multiDrawIndirect for v10+
Some checks are pending
macOS-CI / macOS-CI (dri) (push) Waiting to run
macOS-CI / macOS-CI (xlib) (push) Waiting to run

Signed-off-by: Ryan Mckeever <ryan.mckeever@collabora.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Olivia Lee <benjamin.lee@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34617>
This commit is contained in:
Ryan Mckeever 2025-04-18 20:24:53 -07:00 committed by Marge Bot
parent 2bbfcaf01f
commit a5a0dd3ccc
4 changed files with 221 additions and 151 deletions

View file

@ -12,3 +12,4 @@ VK_EXT_direct_mode_display on panvk
VK_EXT_extended_dynamic_state[2] on panvk
Vulkan 1.2 on panvk/v10+
VK_KHR_shader_quad_control on panvk/v10+
multiDrawIndirect on panvk/v10+

View file

@ -110,7 +110,8 @@ vs_driver_set_is_dirty(struct panvk_cmd_buffer *cmdbuf)
}
static VkResult
prepare_vs_driver_set(struct panvk_cmd_buffer *cmdbuf)
prepare_vs_driver_set(struct panvk_cmd_buffer *cmdbuf,
const struct panvk_draw_info *draw)
{
if (!vs_driver_set_is_dirty(cmdbuf))
return VK_SUCCESS;
@ -139,15 +140,23 @@ prepare_vs_driver_set(struct panvk_cmd_buffer *cmdbuf)
uint32_t vb_offset = vs->desc_info.dyn_bufs.count + MAX_VS_ATTRIBS + 1;
uint32_t desc_count = vb_offset + vb_count;
uint32_t repeat_count = 1;
if (draw->indirect.draw_count > 1 &&
cmdbuf->state.gfx.vi.attribs_changing_on_base_instance != 0)
repeat_count = draw->indirect.draw_count;
const struct panvk_descriptor_state *desc_state =
&cmdbuf->state.gfx.desc_state;
struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
cmdbuf, desc, repeat_count * desc_count * PANVK_DESCRIPTOR_SIZE,
PANVK_DESCRIPTOR_SIZE);
struct panvk_opaque_desc *descs = driver_set.cpu;
if (!driver_set.gpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
for (uint32_t r = 0; r < repeat_count; r++) {
for (uint32_t i = 0; i < MAX_VS_ATTRIBS; i++) {
if (vi->attributes_valid & BITFIELD_BIT(i)) {
emit_vs_attrib(cmdbuf, i, vb_offset,
@ -180,6 +189,9 @@ prepare_vs_driver_set(struct panvk_cmd_buffer *cmdbuf)
}
}
descs += desc_count;
}
vs_desc_state->driver_set.dev_addr = driver_set.gpu;
vs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
gfx_state_set_dirty(cmdbuf, DESC_STATE);
@ -1326,7 +1338,7 @@ get_render_ctx(struct panvk_cmd_buffer *cmdbuf)
}
static VkResult
prepare_vs(struct panvk_cmd_buffer *cmdbuf)
prepare_vs(struct panvk_cmd_buffer *cmdbuf, const struct panvk_draw_info *draw)
{
struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc;
@ -1335,14 +1347,20 @@ prepare_vs(struct panvk_cmd_buffer *cmdbuf)
panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
bool upd_res_table = false;
VkResult result = prepare_vs_driver_set(cmdbuf);
VkResult result = prepare_vs_driver_set(cmdbuf, draw);
if (result != VK_SUCCESS)
return result;
if (gfx_state_dirty(cmdbuf, VS) || gfx_state_dirty(cmdbuf, DESC_STATE) ||
vs_driver_set_is_dirty(cmdbuf)) {
uint32_t repeat_count = 1;
if (draw->indirect.draw_count > 1 &&
cmdbuf->state.gfx.vi.attribs_changing_on_base_instance != 0)
repeat_count = draw->indirect.draw_count;
result = panvk_per_arch(cmd_prepare_shader_res_table)(
cmdbuf, desc_state, vs, vs_desc_state, 1);
cmdbuf, desc_state, vs, vs_desc_state, repeat_count);
if (result != VK_SUCCESS)
return result;
@ -1406,7 +1424,8 @@ prepare_fs(struct panvk_cmd_buffer *cmdbuf)
}
static VkResult
prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf)
prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf,
const struct panvk_draw_info *draw)
{
struct cs_builder *b =
panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
@ -1415,7 +1434,15 @@ prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf)
VkResult result;
if (gfx_state_dirty(cmdbuf, VS_PUSH_UNIFORMS)) {
result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, vs, 1);
uint32_t repeat_count = 1;
if (draw->indirect.draw_count > 1 &&
(shader_uses_sysval(vs, graphics, vs.first_vertex) ||
shader_uses_sysval(vs, graphics, vs.base_instance)))
repeat_count = draw->indirect.draw_count;
result =
panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, vs, repeat_count);
if (result != VK_SUCCESS)
return result;
@ -1961,11 +1988,11 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
panvk_per_arch(cmd_prepare_draw_sysvals)(cmdbuf, draw);
result = prepare_push_uniforms(cmdbuf);
result = prepare_push_uniforms(cmdbuf, draw);
if (result != VK_SUCCESS)
return result;
result = prepare_vs(cmdbuf);
result = prepare_vs(cmdbuf, draw);
if (result != VK_SUCCESS)
return result;
@ -2197,9 +2224,6 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
assert(cmdbuf->state.gfx.render.layer_count <= 1 ||
cmdbuf->state.gfx.render.view_mask);
/* MultiDrawIndirect (.maxDrawIndirectCount) needs additional changes. */
assert(draw->indirect.draw_count == 1);
/* Force a new push uniform block to be allocated */
gfx_state_set_dirty(cmdbuf, VS_PUSH_UNIFORMS);
@ -2207,15 +2231,42 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
if (result != VK_SUCCESS)
return;
struct panvk_shader_desc_state *vs_desc_state =
&cmdbuf->state.gfx.vs.desc;
const struct vk_dynamic_graphics_state *dyns =
&cmdbuf->vk.dynamic_graphics_state;
const struct vk_vertex_input_state *vi = dyns->vi;
struct mali_primitive_flags_packed flags_override =
get_tiler_flags_override(draw);
uint32_t patch_attribs =
cmdbuf->state.gfx.vi.attribs_changing_on_base_instance;
uint32_t vs_res_table_size =
(util_last_bit(vs->desc_info.used_set_mask) + 1) * pan_size(RESOURCE);
bool patch_faus = shader_uses_sysval(vs, graphics, vs.first_vertex) ||
shader_uses_sysval(vs, graphics, vs.base_instance);
struct cs_index draw_params_addr = cs_scratch_reg64(b, 0);
struct cs_index vs_drv_set = cs_scratch_reg64(b, 2);
struct cs_index attrib_offset = cs_scratch_reg32(b, 4);
struct cs_index multiplicand = cs_scratch_reg32(b, 5);
struct cs_index draw_count = cs_scratch_reg32(b, 6);
struct cs_index draw_id = cs_scratch_reg32(b, 7);
struct cs_index vs_fau_addr = cs_scratch_reg64(b, 8);
struct cs_index tracing_scratch_regs = cs_scratch_reg_tuple(b, 10, 4);
uint32_t vs_fau_count = BITSET_COUNT(vs->fau.used_sysvals) +
BITSET_COUNT(vs->fau.used_push_consts);
if (patch_faus)
cs_move64_to(b, vs_fau_addr, cmdbuf->state.gfx.vs.push_uniforms);
cs_move64_to(b, draw_params_addr, draw->indirect.buffer_dev_addr);
cs_move32_to(b, draw_id, 0);
cs_move32_to(b, draw_count, draw->indirect.draw_count);
cs_req_res(b, CS_IDVS_RES);
cs_while(b, MALI_CS_CONDITION_GREATER, draw_count) {
cs_update_vt_ctx(b) {
cs_move32_to(b, cs_sr_reg32(b, IDVS, GLOBAL_ATTRIBUTE_OFFSET), 0);
/* Load SR33-37 from indirect buffer. */
@ -2227,35 +2278,25 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
/* Wait for the SR33-37 indirect buffer load. */
cs_wait_slot(b, SB_ID(LS), false);
if (shader_uses_sysval(vs, graphics, vs.first_vertex) ||
shader_uses_sysval(vs, graphics, vs.base_instance)) {
struct cs_index fau_block_addr = cs_scratch_reg64(b, 2);
cs_move64_to(b, fau_block_addr, cmdbuf->state.gfx.vs.push_uniforms);
if (patch_faus) {
if (shader_uses_sysval(vs, graphics, vs.first_vertex)) {
cs_store32(b, cs_sr_reg32(b, IDVS, VERTEX_OFFSET), fau_block_addr,
cs_store32(b, cs_sr_reg32(b, IDVS, VERTEX_OFFSET), vs_fau_addr,
shader_remapped_sysval_offset(
vs, sysval_offset(graphics, vs.first_vertex)));
}
if (shader_uses_sysval(vs, graphics, vs.base_instance)) {
cs_store32(b, cs_sr_reg32(b, IDVS, INSTANCE_OFFSET), fau_block_addr,
cs_store32(b, cs_sr_reg32(b, IDVS, INSTANCE_OFFSET), vs_fau_addr,
shader_remapped_sysval_offset(
vs, sysval_offset(graphics, vs.base_instance)));
}
/* Wait for the store using SR-37 as src to finish, so we can overwrite
* it. */
/* Wait for the store using SR-37 as src to finish, so we can
* overwrite it. */
cs_wait_slot(b, SB_ID(LS), false);
}
if (patch_attribs != 0) {
struct panvk_shader_desc_state *vs_desc_state =
&cmdbuf->state.gfx.vs.desc;
const struct vk_dynamic_graphics_state *dyns =
&cmdbuf->vk.dynamic_graphics_state;
const struct vk_vertex_input_state *vi = dyns->vi;
cs_move64_to(b, vs_drv_set, vs_desc_state->driver_set.dev_addr);
/* If firstInstance=0, skip the offset adjustment. */
@ -2274,7 +2315,8 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
/* Emulated immediate multiply: we walk the bits in
* base_instance, and accumulate (stride << bit_pos) if the bit
* is present. This is sub-optimal, but it's simple :-). */
cs_add32(b, multiplicand, cs_sr_reg32(b, IDVS, INSTANCE_OFFSET), 0);
cs_add32(b, multiplicand,
cs_sr_reg32(b, IDVS, INSTANCE_OFFSET), 0);
for (uint32_t i = 31; i > 0; i--) {
uint32_t add = stride << i;
@ -2304,27 +2346,50 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
}
}
/* NIR expects zero-based instance ID, but even if it did have an intrinsic to
* load the absolute instance ID, we'd want to keep it zero-based to work around
* Mali's limitation on non-zero firstInstance when an instance divisor is used.
/* NIR expects zero-based instance ID, but even if it did have an
* intrinsic to load the absolute instance ID, we'd want to keep it
* zero-based to work around Mali's limitation on non-zero firstInstance
* when an instance divisor is used.
*/
cs_update_vt_ctx(b)
cs_move32_to(b, cs_sr_reg32(b, IDVS, INSTANCE_OFFSET), 0);
struct mali_primitive_flags_packed flags_override =
get_tiler_flags_override(draw);
cs_req_res(b, CS_IDVS_RES);
#if PAN_ARCH >= 12
cs_trace_run_idvs2(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
flags_override.opaque[0], false, true, cs_undef(),
cs_trace_run_idvs2(b, tracing_ctx, tracing_scratch_regs,
flags_override.opaque[0], false, true, draw_id,
MALI_IDVS_SHADING_MODE_EARLY);
#else
cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
cs_trace_run_idvs(b, tracing_ctx, tracing_scratch_regs,
flags_override.opaque[0], false, true,
cs_shader_res_sel(0, 0, 1, 0),
cs_shader_res_sel(2, 2, 2, 0), cs_undef());
cs_shader_res_sel(2, 2, 2, 0), draw_id);
#endif
cs_add32(b, draw_count, draw_count, -1);
cs_add32(b, draw_id, draw_id, 1);
cs_add64(b, draw_params_addr, draw_params_addr,
draw->indirect.stride);
if (patch_faus) {
cs_add64(b, vs_fau_addr, vs_fau_addr, vs_fau_count * sizeof(uint64_t));
cs_update_vt_ctx(b) {
cs_add64(b, cs_sr_reg64(b, IDVS, VERTEX_FAU),
cs_sr_reg64(b, IDVS, VERTEX_FAU),
vs_fau_count * sizeof(uint64_t));
}
}
if (patch_attribs != 0) {
cs_add64(b, vs_drv_set, vs_drv_set,
vs_desc_state->driver_set.size);
cs_update_vt_ctx(b) {
cs_add64(b, cs_sr_reg64(b, IDVS, VERTEX_SRT),
cs_sr_reg64(b, IDVS, VERTEX_SRT), vs_res_table_size);
}
}
}
cs_req_res(b, 0);
}

View file

@ -374,6 +374,7 @@ get_features(const struct panvk_physical_device *device,
.independentBlend = true,
.sampleRateShading = true,
.logicOp = true,
.multiDrawIndirect = arch >= 10,
.wideLines = true,
.largePoints = true,
.occlusionQueryPrecise = true,
@ -809,8 +810,7 @@ get_device_properties(const struct panvk_instance *instance,
.mipmapPrecisionBits = 8,
/* Software limit. */
.maxDrawIndexedIndexValue = UINT32_MAX,
/* Make it one for now. */
.maxDrawIndirectCount = 1,
.maxDrawIndirectCount = arch >= 10 ? UINT32_MAX : 1,
.maxSamplerLodBias = (float)INT16_MAX / 256.0f,
.maxSamplerAnisotropy = 16,
.maxViewports = 1,

View file

@ -119,6 +119,10 @@ panvk_lower_sysvals(nir_builder *b, nir_instr *instr, void *data)
#endif
case nir_intrinsic_load_draw_id:
/* Multidraw is supported on v10+. */
if (PAN_ARCH >= 10)
return false;
/* TODO: We only implement single-draw direct and indirect draws, so this
* is sufficient. We'll revisit this when we get around to implementing
* multidraw. */