panvk: Implement indirect dispatch on JM

Signed-off-by: Mary Guillemard <mary.guillemard@collabora.com>
Acked-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32839>
Mary Guillemard 2025-01-02 16:35:00 +01:00 committed by Marge Bot
parent bf9331b3fe
commit 8b4e515ca2
3 changed files with 96 additions and 49 deletions

@@ -283,26 +283,6 @@ spec@ext_image_dma_buf_import@ext_image_dma_buf_import-refcount-multithread,Crash
dEQP-VK.api.object_management.max_concurrent.device,Fail
dEQP-VK.api.object_management.max_concurrent.device_group,Fail
# CmdDispatchIndirect not supported yet
dEQP-VK.compute.pipeline.indirect_dispatch.gen_in_compute.empty_command,Crash
dEQP-VK.compute.pipeline.indirect_dispatch.gen_in_compute.large_offset,Crash
dEQP-VK.compute.pipeline.indirect_dispatch.gen_in_compute.large_offset_multiple_invocations,Crash
dEQP-VK.compute.pipeline.indirect_dispatch.gen_in_compute.multi_dispatch,Crash
dEQP-VK.compute.pipeline.indirect_dispatch.gen_in_compute.multi_dispatch_reuse_command,Crash
dEQP-VK.compute.pipeline.indirect_dispatch.gen_in_compute.multiple_groups,Crash
dEQP-VK.compute.pipeline.indirect_dispatch.gen_in_compute.multiple_groups_multiple_invocations,Crash
dEQP-VK.compute.pipeline.indirect_dispatch.gen_in_compute.single_invocation,Crash
dEQP-VK.compute.pipeline.indirect_dispatch.gen_in_compute.small_offset,Crash
dEQP-VK.compute.pipeline.indirect_dispatch.upload_buffer.empty_command,Crash
dEQP-VK.compute.pipeline.indirect_dispatch.upload_buffer.large_offset,Crash
dEQP-VK.compute.pipeline.indirect_dispatch.upload_buffer.large_offset_multiple_invocations,Crash
dEQP-VK.compute.pipeline.indirect_dispatch.upload_buffer.multi_dispatch,Crash
dEQP-VK.compute.pipeline.indirect_dispatch.upload_buffer.multi_dispatch_reuse_command,Crash
dEQP-VK.compute.pipeline.indirect_dispatch.upload_buffer.multiple_groups,Crash
dEQP-VK.compute.pipeline.indirect_dispatch.upload_buffer.multiple_groups_multiple_invocations,Crash
dEQP-VK.compute.pipeline.indirect_dispatch.upload_buffer.single_invocation,Crash
dEQP-VK.compute.pipeline.indirect_dispatch.upload_buffer.small_offset,Crash
# CTS bug, see https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/5296
dEQP-VK.api.device_init.create_device_unsupported_features.protected_memory_features,Fail

@@ -71,7 +71,7 @@ shaders@glsl-bug-110796
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed*
# indirect dispatch and draw not supported yet
# indirect draw not supported yet
dEQP-VK.synchronization.*indirect*
dEQP-VK.synchronization2.*indirect*
dEQP-VK.draw.renderpass.indirect_draw.*

@@ -9,11 +9,14 @@
* SPDX-License-Identifier: MIT
*/
#include <stdint.h>
#include "genxml/gen_macros.h"
#include "panvk_buffer.h"
#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_precomp.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
#include "panvk_macros.h"
@@ -36,7 +39,6 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(struct panvk_cmd_buffer *cmdbuf,
struct panvk_batch *batch = cmdbuf->cur_batch;
assert(batch);
assert(!indirect && "Indirect not supported yet!");
struct panvk_physical_device *phys_dev =
to_panvk_physical_device(cmdbuf->vk.base.device->physical);
@@ -45,6 +47,7 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(struct panvk_cmd_buffer *cmdbuf,
batch->tlsinfo.tls.size = shader->info.tls_size;
batch->tlsinfo.wls.size = shader->info.wls_size;
if (batch->tlsinfo.wls.size) {
unsigned core_id_range;
@@ -58,29 +61,16 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(struct panvk_cmd_buffer *cmdbuf,
return batch->tls.gpu;
}
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
uint32_t baseGroupX, uint32_t baseGroupY,
uint32_t baseGroupZ, uint32_t groupCountX,
uint32_t groupCountY, uint32_t groupCountZ)
static void
cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
{
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
const struct panvk_shader *shader = cmdbuf->state.compute.shader;
VkResult result;
if (groupCountX == 0 || groupCountY == 0 || groupCountZ == 0)
return;
/* If there's no compute shader, we can skip the dispatch. */
if (!panvk_priv_mem_dev_addr(shader->rsd))
return;
struct panvk_dispatch_info info = {
.wg_base = {baseGroupX, baseGroupY, baseGroupZ},
.direct.wg_count = {groupCountX, groupCountY, groupCountZ},
};
struct pan_compute_dim wg_count = {groupCountX, groupCountY, groupCountZ};
panvk_per_arch(cmd_close_batch)(cmdbuf);
struct panvk_batch *batch = panvk_per_arch(cmd_open_batch)(cmdbuf);
@@ -89,8 +79,14 @@ panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
struct panvk_shader_desc_state *cs_desc_state =
&cmdbuf->state.compute.cs.desc;
struct pan_compute_dim wg_count = {
info->direct.wg_count.x,
info->direct.wg_count.y,
info->direct.wg_count.z,
};
bool indirect = info->indirect.buffer_dev_addr != 0;
uint64_t tsd = panvk_per_arch(cmd_dispatch_prepare_tls)(cmdbuf, shader,
&wg_count, false);
&wg_count, indirect);
result = panvk_per_arch(cmd_prepare_push_descs)(
cmdbuf, desc_state, shader->desc_info.used_set_mask);
@@ -105,7 +101,7 @@ panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
return;
}
panvk_per_arch(cmd_prepare_dispatch_sysvals)(cmdbuf, &info);
panvk_per_arch(cmd_prepare_dispatch_sysvals)(cmdbuf, info);
result = panvk_per_arch(cmd_prepare_push_uniforms)(
cmdbuf, cmdbuf->state.compute.shader, 1);
@@ -135,10 +131,12 @@ panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
util_dynarray_append(&batch->jobs, void *, job.cpu);
pan_pack_work_groups_compute(
pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION), wg_count.x, wg_count.y,
wg_count.z, shader->cs.local_size.x, shader->cs.local_size.y,
shader->cs.local_size.z, false, false);
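/* Direct dispatches pack the workgroup counts into the job right here;
 * for indirect dispatches the invocation info is filled on the GPU by the
 * indirect-dispatch helper job emitted further down. */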
if (!indirect) {
pan_pack_work_groups_compute(
pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION), wg_count.x,
wg_count.y, wg_count.z, shader->cs.local_size.x,
shader->cs.local_size.y, shader->cs.local_size.z, false, false);
}
pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
cfg.job_task_split = util_logbase2_ceil(shader->cs.local_size.x + 1) +
@@ -158,22 +156,91 @@ panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
cfg.samplers = cs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_SAMPLER];
}
unsigned indirect_dep = 0;
if (indirect) {
/* Default the sysval addresses to the memory sink so that writes for
 * unused sysvals are discarded. */
uint64_t num_work_groups_x_sysval_addr = 0x8ull << 60;
uint64_t num_work_groups_y_sysval_addr = 0x8ull << 60;
uint64_t num_work_groups_z_sysval_addr = 0x8ull << 60;
if (shader_uses_sysval(shader, compute, num_work_groups.x)) {
num_work_groups_x_sysval_addr =
cmdbuf->state.compute.push_uniforms +
shader_remapped_sysval_offset(
shader, sysval_offset(compute, num_work_groups.x));
}
if (shader_uses_sysval(shader, compute, num_work_groups.y)) {
num_work_groups_y_sysval_addr =
cmdbuf->state.compute.push_uniforms +
shader_remapped_sysval_offset(
shader, sysval_offset(compute, num_work_groups.y));
}
if (shader_uses_sysval(shader, compute, num_work_groups.z)) {
num_work_groups_z_sysval_addr =
cmdbuf->state.compute.push_uniforms +
shader_remapped_sysval_offset(
shader, sysval_offset(compute, num_work_groups.z));
}
struct panvk_precomp_ctx precomp_ctx = panvk_per_arch(precomp_cs)(cmdbuf);
enum panlib_barrier precomp_barrier =
copy_desc_job.gpu == 0 ? PANLIB_BARRIER_JM_SUPPRESS_PREFETCH
: PANLIB_BARRIER_NONE;
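/* Launch the precompiled indirect-dispatch helper: it reads the
 * VkDispatchIndirectCommand from the indirect buffer, patches the compute
 * job's invocation info, and writes the num_work_groups sysvals (or the
 * memory sink when a sysval is unused). */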
panlib_indirect_dispatch(
&precomp_ctx, panlib_1d(1), precomp_barrier,
info->indirect.buffer_dev_addr, shader->cs.local_size.x,
shader->cs.local_size.y, shader->cs.local_size.z, job.gpu,
num_work_groups_x_sysval_addr, num_work_groups_y_sysval_addr,
num_work_groups_z_sysval_addr);
indirect_dep = batch->vtc_jc.job_index;
}
util_dynarray_append(&batch->jobs, void *, job.cpu);
unsigned copy_desc_dep =
copy_desc_job.gpu
? pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, false, false,
0, 0, &copy_desc_job, false)
: 0;
? pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, false,
indirect, 0, 0, &copy_desc_job, false)
: indirect_dep;
pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, false, false, 0,
copy_desc_dep, &job, false);
pan_jc_add_job(&batch->vtc_jc,
indirect ? MALI_JOB_TYPE_NOT_STARTED : MALI_JOB_TYPE_COMPUTE,
indirect, false, 0, copy_desc_dep, &job, false);
panvk_per_arch(cmd_close_batch)(cmdbuf);
clear_dirty_after_dispatch(cmdbuf);
}
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
uint32_t baseGroupX, uint32_t baseGroupY,
uint32_t baseGroupZ, uint32_t groupCountX,
uint32_t groupCountY, uint32_t groupCountZ)
{
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
if (groupCountX == 0 || groupCountY == 0 || groupCountZ == 0)
return;
struct panvk_dispatch_info info = {
.wg_base = {baseGroupX, baseGroupY, baseGroupZ},
.direct.wg_count = {groupCountX, groupCountY, groupCountZ},
};
cmd_dispatch(cmdbuf, &info);
}
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDispatchIndirect)(VkCommandBuffer commandBuffer,
VkBuffer _buffer, VkDeviceSize offset)
{
panvk_stub();
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
VK_FROM_HANDLE(panvk_buffer, buffer, _buffer);
uint64_t buffer_gpu = panvk_buffer_gpu_ptr(buffer, offset);
struct panvk_dispatch_info info = {
.indirect.buffer_dev_addr = buffer_gpu,
};
cmd_dispatch(cmdbuf, &info);
}
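
For reference, this is the application-side pattern the new path serves. An illustrative sketch only, not part of the change; handle creation and memory binding are omitted, and names like cmd, pipeline and indirect_buf are placeholders.

/* The GPU buffer holds a VkDispatchIndirectCommand {x, y, z}; it can be
 * written by the CPU or by a previous GPU pass. */
VkDispatchIndirectCommand counts = { .x = 64, .y = 1, .z = 1 };
/* indirect_buf is assumed to be a VkBuffer created with
 * VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT and filled with 'counts'. */
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
vkCmdDispatchIndirect(cmd, indirect_buf, /*offset=*/0);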