diff --git a/src/panfrost/ci/panfrost-g52-fails.txt b/src/panfrost/ci/panfrost-g52-fails.txt
index e81bdbd1a8d..a29929f3117 100644
--- a/src/panfrost/ci/panfrost-g52-fails.txt
+++ b/src/panfrost/ci/panfrost-g52-fails.txt
@@ -283,26 +283,6 @@ spec@ext_image_dma_buf_import@ext_image_dma_buf_import-refcount-multithread,Cras
 dEQP-VK.api.object_management.max_concurrent.device,Fail
 dEQP-VK.api.object_management.max_concurrent.device_group,Fail
 
-# CmdDispatchIndirect not supported yet
-dEQP-VK.compute.pipeline.indirect_dispatch.gen_in_compute.empty_command,Crash
-dEQP-VK.compute.pipeline.indirect_dispatch.gen_in_compute.large_offset,Crash
-dEQP-VK.compute.pipeline.indirect_dispatch.gen_in_compute.large_offset_multiple_invocations,Crash
-dEQP-VK.compute.pipeline.indirect_dispatch.gen_in_compute.multi_dispatch,Crash
-dEQP-VK.compute.pipeline.indirect_dispatch.gen_in_compute.multi_dispatch_reuse_command,Crash
-dEQP-VK.compute.pipeline.indirect_dispatch.gen_in_compute.multiple_groups,Crash
-dEQP-VK.compute.pipeline.indirect_dispatch.gen_in_compute.multiple_groups_multiple_invocations,Crash
-dEQP-VK.compute.pipeline.indirect_dispatch.gen_in_compute.single_invocation,Crash
-dEQP-VK.compute.pipeline.indirect_dispatch.gen_in_compute.small_offset,Crash
-dEQP-VK.compute.pipeline.indirect_dispatch.upload_buffer.empty_command,Crash
-dEQP-VK.compute.pipeline.indirect_dispatch.upload_buffer.large_offset,Crash
-dEQP-VK.compute.pipeline.indirect_dispatch.upload_buffer.large_offset_multiple_invocations,Crash
-dEQP-VK.compute.pipeline.indirect_dispatch.upload_buffer.multi_dispatch,Crash
-dEQP-VK.compute.pipeline.indirect_dispatch.upload_buffer.multi_dispatch_reuse_command,Crash
-dEQP-VK.compute.pipeline.indirect_dispatch.upload_buffer.multiple_groups,Crash
-dEQP-VK.compute.pipeline.indirect_dispatch.upload_buffer.multiple_groups_multiple_invocations,Crash
-dEQP-VK.compute.pipeline.indirect_dispatch.upload_buffer.single_invocation,Crash
-dEQP-VK.compute.pipeline.indirect_dispatch.upload_buffer.small_offset,Crash
-
 # CTS bug, see https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/5296
 dEQP-VK.api.device_init.create_device_unsupported_features.protected_memory_features,Fail
 
diff --git a/src/panfrost/ci/panfrost-g52-skips.txt b/src/panfrost/ci/panfrost-g52-skips.txt
index aea5cfdcfad..bf71218aaf6 100644
--- a/src/panfrost/ci/panfrost-g52-skips.txt
+++ b/src/panfrost/ci/panfrost-g52-skips.txt
@@ -71,7 +71,7 @@ shaders@glsl-bug-110796
 
 dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed*
 
-# indirect dispatch and draw not supported yet
+# indirect draw not supported yet
 dEQP-VK.synchronization.*indirect*
 dEQP-VK.synchronization2.*indirect*
 dEQP-VK.draw.renderpass.indirect_draw.*
diff --git a/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c
index 47a98bc07c8..27b0e907693 100644
--- a/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c
+++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_dispatch.c
@@ -9,11 +9,14 @@
  * SPDX-License-Identifier: MIT
  */
 
+#include 
 #include "genxml/gen_macros.h"
 
+#include "panvk_buffer.h"
 #include "panvk_cmd_alloc.h"
 #include "panvk_cmd_buffer.h"
 #include "panvk_cmd_desc_state.h"
+#include "panvk_cmd_precomp.h"
 #include "panvk_device.h"
 #include "panvk_entrypoints.h"
 #include "panvk_macros.h"
@@ -36,7 +39,6 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(struct panvk_cmd_buffer *cmdbuf,
    struct panvk_batch *batch = cmdbuf->cur_batch;
 
    assert(batch);
-   assert(!indirect && "Indirect not supported yet!");
 
    struct panvk_physical_device *phys_dev =
       to_panvk_physical_device(cmdbuf->vk.base.device->physical);
@@ -45,6 +47,7 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(struct panvk_cmd_buffer *cmdbuf,
 
    batch->tlsinfo.tls.size = shader->info.tls_size;
    batch->tlsinfo.wls.size = shader->info.wls_size;
+
    if (batch->tlsinfo.wls.size) {
       unsigned core_id_range;
 
@@ -58,29 +61,16 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(struct panvk_cmd_buffer *cmdbuf,
    return batch->tls.gpu;
 }
 
-VKAPI_ATTR void VKAPI_CALL
-panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
-                                uint32_t baseGroupX, uint32_t baseGroupY,
-                                uint32_t baseGroupZ, uint32_t groupCountX,
-                                uint32_t groupCountY, uint32_t groupCountZ)
+static void
+cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
 {
-   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
    const struct panvk_shader *shader = cmdbuf->state.compute.shader;
    VkResult result;
 
-   if (groupCountX == 0 || groupCountY == 0 || groupCountZ == 0)
-      return;
-
    /* If there's no compute shader, we can skip the dispatch. */
   if (!panvk_priv_mem_dev_addr(shader->rsd))
      return;
 
-   struct panvk_dispatch_info info = {
-      .wg_base = {baseGroupX, baseGroupY, baseGroupZ},
-      .direct.wg_count = {groupCountX, groupCountY, groupCountZ},
-   };
-   struct pan_compute_dim wg_count = {groupCountX, groupCountY, groupCountZ};
-
    panvk_per_arch(cmd_close_batch)(cmdbuf);
 
    struct panvk_batch *batch = panvk_per_arch(cmd_open_batch)(cmdbuf);
@@ -89,8 +79,14 @@ panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
    struct panvk_shader_desc_state *cs_desc_state =
       &cmdbuf->state.compute.cs.desc;
 
+   struct pan_compute_dim wg_count = {
+      info->direct.wg_count.x,
+      info->direct.wg_count.y,
+      info->direct.wg_count.z,
+   };
+   bool indirect = info->indirect.buffer_dev_addr != 0;
    uint64_t tsd = panvk_per_arch(cmd_dispatch_prepare_tls)(cmdbuf, shader,
-                                                           &wg_count, false);
+                                                           &wg_count, indirect);
 
    result = panvk_per_arch(cmd_prepare_push_descs)(
       cmdbuf, desc_state, shader->desc_info.used_set_mask);
@@ -105,7 +101,7 @@ panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
       return;
    }
 
-   panvk_per_arch(cmd_prepare_dispatch_sysvals)(cmdbuf, &info);
+   panvk_per_arch(cmd_prepare_dispatch_sysvals)(cmdbuf, info);
 
    result = panvk_per_arch(cmd_prepare_push_uniforms)(
      cmdbuf, cmdbuf->state.compute.shader, 1);
@@ -135,10 +131,12 @@ panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
 
    util_dynarray_append(&batch->jobs, void *, job.cpu);
 
-   pan_pack_work_groups_compute(
-      pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION), wg_count.x, wg_count.y,
-      wg_count.z, shader->cs.local_size.x, shader->cs.local_size.y,
-      shader->cs.local_size.z, false, false);
+   if (!indirect) {
+      pan_pack_work_groups_compute(
+         pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION), wg_count.x,
+         wg_count.y, wg_count.z, shader->cs.local_size.x,
+         shader->cs.local_size.y, shader->cs.local_size.z, false, false);
+   }
 
    pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = util_logbase2_ceil(shader->cs.local_size.x + 1) +
@@ -158,22 +156,91 @@ panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
       cfg.samplers = cs_desc_state->tables[PANVK_BIFROST_DESC_TABLE_SAMPLER];
    }
 
+   unsigned indirect_dep = 0;
+   if (indirect) {
+      /* We redirect write to memory sink for null pointers */
+      uint64_t num_work_groups_x_sysval_addr = 0x8ull << 60;
+      uint64_t num_work_groups_y_sysval_addr = 0x8ull << 60;
+      uint64_t num_work_groups_z_sysval_addr = 0x8ull << 60;
+
+      if (shader_uses_sysval(shader, compute, num_work_groups.x)) {
+         num_work_groups_x_sysval_addr =
+            cmdbuf->state.compute.push_uniforms +
+            shader_remapped_sysval_offset(
+               shader, sysval_offset(compute, num_work_groups.x));
+      }
+
+      if (shader_uses_sysval(shader, compute, num_work_groups.y)) {
+         num_work_groups_y_sysval_addr =
+            cmdbuf->state.compute.push_uniforms +
+            shader_remapped_sysval_offset(
+               shader, sysval_offset(compute, num_work_groups.y));
+      }
+
+      if (shader_uses_sysval(shader, compute, num_work_groups.z)) {
+         num_work_groups_z_sysval_addr =
+            cmdbuf->state.compute.push_uniforms +
+            shader_remapped_sysval_offset(
+               shader, sysval_offset(compute, num_work_groups.z));
+      }
+
+      struct panvk_precomp_ctx precomp_ctx = panvk_per_arch(precomp_cs)(cmdbuf);
+      enum panlib_barrier precomp_barrier =
+         copy_desc_job.gpu == 0 ? PANLIB_BARRIER_JM_SUPPRESS_PREFETCH
+                                : PANLIB_BARRIER_NONE;
+
+      panlib_indirect_dispatch(
+         &precomp_ctx, panlib_1d(1), precomp_barrier,
+         info->indirect.buffer_dev_addr, shader->cs.local_size.x,
+         shader->cs.local_size.y, shader->cs.local_size.z, job.gpu,
+         num_work_groups_x_sysval_addr, num_work_groups_y_sysval_addr,
+         num_work_groups_z_sysval_addr);
+      indirect_dep = batch->vtc_jc.job_index;
+   }
+
+   util_dynarray_append(&batch->jobs, void *, job.cpu);
+
    unsigned copy_desc_dep =
       copy_desc_job.gpu
-         ? pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, false, false,
-                          0, 0, &copy_desc_job, false)
-         : 0;
+         ? pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, false,
+                          indirect, 0, 0, &copy_desc_job, false)
+         : indirect_dep;
 
-   pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, false, false, 0,
-                  copy_desc_dep, &job, false);
+   pan_jc_add_job(&batch->vtc_jc,
+                  indirect ? MALI_JOB_TYPE_NOT_STARTED : MALI_JOB_TYPE_COMPUTE,
+                  indirect, false, 0, copy_desc_dep, &job, false);
 
    panvk_per_arch(cmd_close_batch)(cmdbuf);
    clear_dirty_after_dispatch(cmdbuf);
 }
 
+VKAPI_ATTR void VKAPI_CALL
+panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
+                                uint32_t baseGroupX, uint32_t baseGroupY,
+                                uint32_t baseGroupZ, uint32_t groupCountX,
+                                uint32_t groupCountY, uint32_t groupCountZ)
+{
+   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
+
+   if (groupCountX == 0 || groupCountY == 0 || groupCountZ == 0)
+      return;
+
+   struct panvk_dispatch_info info = {
+      .wg_base = {baseGroupX, baseGroupY, baseGroupZ},
+      .direct.wg_count = {groupCountX, groupCountY, groupCountZ},
+   };
+   cmd_dispatch(cmdbuf, &info);
+}
+
 VKAPI_ATTR void VKAPI_CALL
 panvk_per_arch(CmdDispatchIndirect)(VkCommandBuffer commandBuffer,
                                     VkBuffer _buffer, VkDeviceSize offset)
 {
-   panvk_stub();
+   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
+   VK_FROM_HANDLE(panvk_buffer, buffer, _buffer);
+   uint64_t buffer_gpu = panvk_buffer_gpu_ptr(buffer, offset);
+   struct panvk_dispatch_info info = {
+      .indirect.buffer_dev_addr = buffer_gpu,
+   };
+   cmd_dispatch(cmdbuf, &info);
 }
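For reference, the parameters consumed by the new path are a VkDispatchIndirectCommand that the GPU reads from the indirect buffer at execution time, which is why the workgroup-count sysvals can no longer be filled in on the CPU: the panlib_indirect_dispatch precomp job writes them to the remapped push-uniform addresses (or to the 0x8ull << 60 memory sink when a component is unused). A minimal host-side usage sketch in plain core Vulkan, not part of the patch; buffer creation with VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT, memory binding, flushing and synchronization are assumed to happen elsewhere, and all names below are illustrative:

#include <string.h>
#include <vulkan/vulkan.h>

/* Illustrative only: `cmd` is in the recording state with a compute pipeline
 * and descriptors bound, and `mapped` points at the host-visible memory
 * backing `buffer`. */
static void
record_indirect_dispatch(VkCommandBuffer cmd, VkBuffer buffer,
                         VkDeviceSize offset, void *mapped)
{
   /* The driver never sees these counts on the CPU; the precomp job reads
    * them from `buffer` when the queue executes the command buffer. */
   const VkDispatchIndirectCommand wg = {.x = 4, .y = 2, .z = 1};

   memcpy((char *)mapped + offset, &wg, sizeof(wg));

   vkCmdDispatchIndirect(cmd, buffer, offset);
}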