panvk: Move TLS preparation logic to cmd_dispatch_prepare_tls

This will be reused for precomp.

Signed-off-by: Mary Guillemard <mary.guillemard@collabora.com>
Acked-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32720>
This commit is contained in:
Mary Guillemard 2024-12-18 08:43:26 +01:00 committed by Marge Bot
parent 3d2cc01f8a
commit 005703e5b5
3 changed files with 91 additions and 46 deletions

View file

@@ -20,6 +20,7 @@
#include "panvk_cmd_push_constant.h" #include "panvk_cmd_push_constant.h"
#include "panvk_device.h" #include "panvk_device.h"
#include "panvk_entrypoints.h" #include "panvk_entrypoints.h"
#include "panvk_macros.h"
#include "panvk_meta.h" #include "panvk_meta.h"
#include "panvk_physical_device.h" #include "panvk_physical_device.h"
@@ -128,28 +129,18 @@ calculate_workgroups_per_task(const struct panvk_shader *shader,
return wg_per_task; return wg_per_task;
} }
static void uint64_t
cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info) panvk_per_arch(cmd_dispatch_prepare_tls)(struct panvk_cmd_buffer *cmdbuf,
const struct panvk_shader *shader,
const struct pan_compute_dim *dim,
bool indirect)
{ {
const struct panvk_shader *shader = cmdbuf->state.compute.shader;
VkResult result;
/* If there's no compute shader, we can skip the dispatch. */
if (!panvk_priv_mem_dev_addr(shader->spd))
return;
struct panvk_physical_device *phys_dev = struct panvk_physical_device *phys_dev =
to_panvk_physical_device(cmdbuf->vk.base.device->physical); to_panvk_physical_device(cmdbuf->vk.base.device->physical);
struct panvk_descriptor_state *desc_state =
&cmdbuf->state.compute.desc_state;
struct panvk_shader_desc_state *cs_desc_state =
&cmdbuf->state.compute.cs.desc;
const struct cs_tracing_ctx *tracing_ctx =
&cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].tracing;
struct panfrost_ptr tsd = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE); struct panfrost_ptr tsd = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
if (!tsd.gpu) if (!tsd.gpu)
return; return tsd.gpu;
struct pan_tls_info tlsinfo = { struct pan_tls_info tlsinfo = {
.tls.size = shader->info.tls_size, .tls.size = shader->info.tls_size,
@@ -159,8 +150,6 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
unsigned core_count = unsigned core_count =
panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range); panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);
bool indirect = info->indirect.buffer_dev_addr != 0;
/* Only used for indirect dispatch */ /* Only used for indirect dispatch */
unsigned wg_per_task = 0; unsigned wg_per_task = 0;
if (indirect) if (indirect)
@@ -180,13 +169,7 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
/* TODO: Similar to what we are doing for indirect this should change /* TODO: Similar to what we are doing for indirect this should change
* to calculate the maximum number of workgroups we can execute * to calculate the maximum number of workgroups we can execute
* concurrently. */ * concurrently. */
struct pan_compute_dim dim = { tlsinfo.wls.instances = pan_wls_instances(dim);
info->direct.wg_count.x,
info->direct.wg_count.y,
info->direct.wg_count.z,
};
tlsinfo.wls.instances = pan_wls_instances(&dim);
} }
/* TODO: Clamp WLS instance to some maximum WLS budget. */ /* TODO: Clamp WLS instance to some maximum WLS budget. */
@@ -201,7 +184,7 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
tlsinfo.wls.ptr = tlsinfo.wls.ptr =
panvk_cmd_alloc_dev_mem(cmdbuf, tls, wls_total_size, 4096).gpu; panvk_cmd_alloc_dev_mem(cmdbuf, tls, wls_total_size, 4096).gpu;
if (!tlsinfo.wls.ptr) if (!tlsinfo.wls.ptr)
return; return 0;
} }
cmdbuf->state.tls.info.tls.size = cmdbuf->state.tls.info.tls.size =
@@ -210,11 +193,50 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
if (!cmdbuf->state.tls.desc.gpu) { if (!cmdbuf->state.tls.desc.gpu) {
cmdbuf->state.tls.desc = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE); cmdbuf->state.tls.desc = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
if (!cmdbuf->state.tls.desc.gpu) if (!cmdbuf->state.tls.desc.gpu)
return; return 0;
} }
GENX(pan_emit_tls)(&tlsinfo, tsd.cpu); GENX(pan_emit_tls)(&tlsinfo, tsd.cpu);
return tsd.gpu;
}
static void
cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
{
const struct panvk_shader *shader = cmdbuf->state.compute.shader;
VkResult result;
/* If there's no compute shader, we can skip the dispatch. */
if (!panvk_priv_mem_dev_addr(shader->spd))
return;
struct panvk_physical_device *phys_dev =
to_panvk_physical_device(cmdbuf->vk.base.device->physical);
struct panvk_descriptor_state *desc_state =
&cmdbuf->state.compute.desc_state;
struct panvk_shader_desc_state *cs_desc_state =
&cmdbuf->state.compute.cs.desc;
const struct cs_tracing_ctx *tracing_ctx =
&cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].tracing;
struct pan_compute_dim dim = {
info->direct.wg_count.x,
info->direct.wg_count.y,
info->direct.wg_count.z,
};
bool indirect = info->indirect.buffer_dev_addr != 0;
uint64_t tsd =
panvk_per_arch(cmd_dispatch_prepare_tls)(cmdbuf, shader, &dim, indirect);
if (!tsd)
return;
/* Only used for indirect dispatch */
unsigned wg_per_task = 0;
if (indirect)
wg_per_task = calculate_workgroups_per_task(shader, phys_dev);
if (compute_state_dirty(cmdbuf, DESC_STATE) || if (compute_state_dirty(cmdbuf, DESC_STATE) ||
compute_state_dirty(cmdbuf, CS)) { compute_state_dirty(cmdbuf, CS)) {
result = panvk_per_arch(cmd_prepare_push_descs)( result = panvk_per_arch(cmd_prepare_push_descs)(
@@ -245,11 +267,11 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE); struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE);
/* Copy the global TLS pointer to the per-job TSD. */ /* Copy the global TLS pointer to the per-job TSD. */
if (tlsinfo.tls.size) { if (shader->info.tls_size) {
cs_move64_to(b, cs_scratch_reg64(b, 0), cmdbuf->state.tls.desc.gpu); cs_move64_to(b, cs_scratch_reg64(b, 0), cmdbuf->state.tls.desc.gpu);
cs_load64_to(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8); cs_load64_to(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
cs_wait_slot(b, SB_ID(LS), false); cs_wait_slot(b, SB_ID(LS), false);
cs_move64_to(b, cs_scratch_reg64(b, 0), tsd.gpu); cs_move64_to(b, cs_scratch_reg64(b, 0), tsd);
cs_store64(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8); cs_store64(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
cs_wait_slot(b, SB_ID(LS), false); cs_wait_slot(b, SB_ID(LS), false);
} }
@@ -269,7 +291,7 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
cs_move64_to(b, cs_sr_reg64(b, 16), cs_move64_to(b, cs_sr_reg64(b, 16),
panvk_priv_mem_dev_addr(shader->spd)); panvk_priv_mem_dev_addr(shader->spd));
cs_move64_to(b, cs_sr_reg64(b, 24), tsd.gpu); cs_move64_to(b, cs_sr_reg64(b, 24), tsd);
/* Global attribute offset */ /* Global attribute offset */
cs_move32_to(b, cs_sr_reg32(b, 32), 0); cs_move32_to(b, cs_sr_reg32(b, 32), 0);

View file

@@ -16,6 +16,7 @@
#include "panvk_cmd_desc_state.h" #include "panvk_cmd_desc_state.h"
#include "panvk_device.h" #include "panvk_device.h"
#include "panvk_entrypoints.h" #include "panvk_entrypoints.h"
#include "panvk_macros.h"
#include "panvk_meta.h" #include "panvk_meta.h"
#include "panvk_physical_device.h" #include "panvk_physical_device.h"
@@ -26,6 +27,36 @@
#include <vulkan/vulkan_core.h> #include <vulkan/vulkan_core.h>
/*
 * Prepare the thread-local (TLS) and workgroup-local (WLS) storage state on
 * the current batch for a compute dispatch, and return the GPU address of the
 * batch's TLS descriptor.
 *
 * cmdbuf:   command buffer whose current batch is updated (must have one,
 *           see the assert below).
 * shader:   compute shader providing the per-thread tls_size and per-workgroup
 *           wls_size requirements.
 * dim:      workgroup-count dimensions, used to size the number of WLS
 *           instances for a direct dispatch.
 * indirect: must be false — indirect dispatch is rejected by an assert on
 *           this backend for now.
 *
 * Returns batch->tls.gpu after panvk_per_arch(cmd_alloc_tls_desc)() has been
 * called. NOTE(review): whether that address can be 0 on allocation failure
 * depends on cmd_alloc_tls_desc(), which is not visible here — callers
 * appear to use the value without checking; confirm against that helper.
 */
uint64_t
panvk_per_arch(cmd_dispatch_prepare_tls)(struct panvk_cmd_buffer *cmdbuf,
                                         const struct panvk_shader *shader,
                                         const struct pan_compute_dim *dim,
                                         bool indirect)
{
   struct panvk_batch *batch = cmdbuf->cur_batch;

   /* A batch must already be open; this helper only fills in its TLS info. */
   assert(batch);
   assert(!indirect && "Indirect not supported yet!");

   struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(cmdbuf->vk.base.device->physical);

   /* Make sure the batch has a TLS descriptor allocated (false: not for
    * fragment work). */
   panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, false);

   batch->tlsinfo.tls.size = shader->info.tls_size;
   batch->tlsinfo.wls.size = shader->info.wls_size;

   if (batch->tlsinfo.wls.size) {
      unsigned core_id_range;

      /* Only the core-ID range is needed here; the returned core count is
       * deliberately ignored. */
      panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);

      /* Total WLS = adjusted per-instance size * instances derived from the
       * workgroup dimensions * number of core IDs that may run them. */
      batch->tlsinfo.wls.instances = pan_wls_instances(dim);
      batch->wls_total_size = pan_wls_adjust_size(batch->tlsinfo.wls.size) *
                              batch->tlsinfo.wls.instances * core_id_range;
   }

   return batch->tls.gpu;
}
VKAPI_ATTR void VKAPI_CALL VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer, panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
uint32_t baseGroupX, uint32_t baseGroupY, uint32_t baseGroupX, uint32_t baseGroupY,
@@ -47,9 +78,6 @@ panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
.wg_base = {baseGroupX, baseGroupY, baseGroupZ}, .wg_base = {baseGroupX, baseGroupY, baseGroupZ},
.direct.wg_count = {groupCountX, groupCountY, groupCountZ}, .direct.wg_count = {groupCountX, groupCountY, groupCountZ},
}; };
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
struct panvk_physical_device *phys_dev =
to_panvk_physical_device(dev->vk.physical);
struct pan_compute_dim wg_count = {groupCountX, groupCountY, groupCountZ}; struct pan_compute_dim wg_count = {groupCountX, groupCountY, groupCountZ};
panvk_per_arch(cmd_close_batch)(cmdbuf); panvk_per_arch(cmd_close_batch)(cmdbuf);
@@ -60,8 +88,8 @@ panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
struct panvk_shader_desc_state *cs_desc_state = struct panvk_shader_desc_state *cs_desc_state =
&cmdbuf->state.compute.cs.desc; &cmdbuf->state.compute.cs.desc;
panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, false); uint64_t tsd = panvk_per_arch(cmd_dispatch_prepare_tls)(
uint64_t tsd = batch->tls.gpu; cmdbuf, shader, &wg_count, false);
result = panvk_per_arch(cmd_prepare_push_descs)( result = panvk_per_arch(cmd_prepare_push_descs)(
cmdbuf, desc_state, shader->desc_info.used_set_mask); cmdbuf, desc_state, shader->desc_info.used_set_mask);
@@ -138,17 +166,6 @@ panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, false, false, 0, pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_COMPUTE, false, false, 0,
copy_desc_dep, &job, false); copy_desc_dep, &job, false);
batch->tlsinfo.tls.size = shader->info.tls_size;
batch->tlsinfo.wls.size = shader->info.wls_size;
if (batch->tlsinfo.wls.size) {
unsigned core_id_range;
panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);
batch->tlsinfo.wls.instances = pan_wls_instances(&wg_count);
batch->wls_total_size = pan_wls_adjust_size(batch->tlsinfo.wls.size) *
batch->tlsinfo.wls.instances * core_id_range;
}
panvk_per_arch(cmd_close_batch)(cmdbuf); panvk_per_arch(cmd_close_batch)(cmdbuf);
clear_dirty_after_dispatch(cmdbuf); clear_dirty_after_dispatch(cmdbuf);
} }

View file

@@ -10,6 +10,8 @@
#error "PAN_ARCH must be defined" #error "PAN_ARCH must be defined"
#endif #endif
#include "pan_desc.h"
enum panvk_cmd_compute_dirty_state { enum panvk_cmd_compute_dirty_state {
PANVK_CMD_COMPUTE_DIRTY_CS, PANVK_CMD_COMPUTE_DIRTY_CS,
PANVK_CMD_COMPUTE_DIRTY_DESC_STATE, PANVK_CMD_COMPUTE_DIRTY_DESC_STATE,
@@ -74,4 +76,8 @@ struct panvk_dispatch_info {
void panvk_per_arch(cmd_prepare_dispatch_sysvals)( void panvk_per_arch(cmd_prepare_dispatch_sysvals)(
struct panvk_cmd_buffer *cmdbuf, const struct panvk_dispatch_info *info); struct panvk_cmd_buffer *cmdbuf, const struct panvk_dispatch_info *info);
uint64_t panvk_per_arch(cmd_dispatch_prepare_tls)(
struct panvk_cmd_buffer *cmdbuf, const struct panvk_shader *shader,
const struct pan_compute_dim *dim, bool indirect);
#endif #endif