panvk: Parallelize min max index search on JM

This parallelizes the min/max index search and avoids running that logic per
layer.

This should speed up indexed draws a bit.

Signed-off-by: Mary Guillemard <mary.guillemard@collabora.com>
Reviewed-by: Olivia Lee <olivia.lee@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35724>
This commit is contained in:
Mary Guillemard 2025-07-01 00:54:45 +02:00 committed by Marge Bot
parent d936bb496c
commit f729bedf89
3 changed files with 118 additions and 42 deletions

View file

@ -10,36 +10,6 @@
#include "draw_helper.h"
#if (PAN_ARCH == 6 || PAN_ARCH == 7)
/*
 * Sequentially scan `count` indices, beginning at element `start` of the
 * index buffer, and report the lowest and highest value seen.
 *
 * index_bit_size selects the element width (8, 16 or 32 bits); any other
 * value hits the assert, leaving *min_ptr unwritten and *max_ptr at 0.
 * With primitive_restart set, elements equal to the all-ones value of the
 * selected width are skipped. When no element is taken into account,
 * *min_ptr is the all-ones value of the selected width and *max_ptr is 0.
 */
static void
panlib_index_minmax_search(global uint8_t *index_buffer_ptr,
                           uint32_t index_bit_size, uint32_t start,
                           uint32_t count, bool primitive_restart,
                           uint32_t *min_ptr, uint32_t *max_ptr)
{
   /* Published up front so the maximum is defined on every path */
   *max_ptr = 0;

   switch (index_bit_size) {
#define SCAN_INDICES(bits)                                                     \
   case bits: {                                                                \
      global uint##bits##_t *typed_indices =                                   \
         (global uint##bits##_t *)index_buffer_ptr;                            \
      uint32_t lowest = UINT##bits##_MAX;                                      \
      uint32_t highest = 0;                                                    \
      for (uint32_t n = 0; n < count; n++) {                                   \
         const uint32_t value = (uint32_t)typed_indices[n + start];            \
         if (!primitive_restart || value != UINT##bits##_MAX) {                \
            lowest = min(value, lowest);                                       \
            highest = max(value, highest);                                     \
         }                                                                     \
      }                                                                        \
      *min_ptr = lowest;                                                       \
      *max_ptr = highest;                                                      \
      break;                                                                   \
   }

      SCAN_INDICES(32)
      SCAN_INDICES(16)
      SCAN_INDICES(8)

#undef SCAN_INDICES

   default:
      assert(0 && "Invalid index size");
   }
}
struct panlib_draw_info {
struct {
uint32_t size;
@ -494,6 +464,7 @@ panlib_draw_indirect_helper(
KERNEL(1)
panlib_draw_indexed_indirect_helper(
global VkDrawIndexedIndirectCommand *cmd, global uint8_t *index_buffer_ptr,
global struct libpan_draw_helper_index_min_max_result *index_min_max_res,
uint32_t index_size, uint32_t primitive_vertex_count,
uint32_t attrib_bufs_valid, uint32_t attribs_valid,
global struct mali_attribute_buffer_packed *varying_bufs_descs,
@ -504,21 +475,15 @@ panlib_draw_indexed_indirect_helper(
global struct libpan_draw_helper_attrib_info *attribs_infos,
global uint32_t *first_vertex_sysval, global uint32_t *first_instance_sysval,
global uint32_t *raw_vertex_offset_sysval, global uint8_t *idvs_job,
global uint8_t *vertex_job, global uint8_t *tiler_job,
uint8_t primitive_restart)
global uint8_t *vertex_job, global uint8_t *tiler_job)
{
const uint32_t index_count = cmd->indexCount;
const uint32_t first_index = cmd->firstIndex;
const uint32_t first_instance = cmd->firstInstance;
const uint32_t instance_count = cmd->instanceCount;
const int32_t vertex_offset = cmd->vertexOffset;
/* First compute the min and max range */
uint32_t min_vertex, max_vertex;
panlib_index_minmax_search(index_buffer_ptr, index_size * 8, first_index,
index_count, primitive_restart, &min_vertex,
&max_vertex);
const uint32_t min_vertex = index_min_max_res->min;
const uint32_t max_vertex = index_min_max_res->max;
const uint32_t vertex_range = max_vertex - min_vertex + 1;
struct panlib_draw_info draw = {
@ -558,4 +523,60 @@ panlib_draw_indexed_indirect_helper(
*first_instance_sysval = draw.instance.base;
*raw_vertex_offset_sysval = draw.vertex.raw_offset;
}
/*
 * Min/max reduction over the index range described by an indexed indirect
 * draw command.
 *
 * Every thread walks at most 1024 consecutive indices, the chunk of thread
 * `t` starting at element cmd->firstIndex + t * 1024 of the index buffer,
 * and folds its private result into *min_ptr / *max_ptr with atomic min/max.
 * Since the merge can only lower *min_ptr and raise *max_ptr, both words
 * must hold suitable starting values before this kernel runs.
 *
 * index_bytes_log2__3 is log2 of the index size in bytes. When
 * primitive_restart__2 is non-zero, indices equal to the all-ones value of
 * the index width are ignored.
 */
KERNEL(64)
panlib_draw_index_minmax_search_helper(global uint8_t *index_buffer_ptr,
                                       global VkDrawIndexedIndirectCommand *cmd,
                                       global atomic_uint *min_ptr,
                                       global atomic_uint *max_ptr,
                                       uint32_t index_bytes_log2__3,
                                       uint8_t primitive_restart__2)
{
   /* Upper bound on the number of indices a single thread walks */
   const uint32_t chunk_len = 1024;

   const uint32_t index_bit_size = (1 << index_bytes_log2__3) * 8;
   const uint32_t first = cmd->firstIndex;
   const uint32_t total = cmd->indexCount;
   const uint32_t chunk_begin = cl_global_id.x * chunk_len;

   /* Threads whose chunk begins past the end of the draw contribute nothing */
   if (chunk_begin >= total)
      return;

   /* The last chunk of the draw may be shorter than chunk_len */
   const uint32_t remaining = total - chunk_begin;
   const uint32_t todo = chunk_len < remaining ? chunk_len : remaining;

   /* The chunk must never run past the indices of the draw */
   assert(chunk_begin + todo <= total);

   /* Neutral elements: all-ones of the index width for min, zero for max */
   uint32_t lowest = ((uint64_t)1 << index_bit_size) - 1;
   uint32_t highest = 0;

   switch (index_bit_size) {
#define REDUCE_CHUNK(bits)                                                     \
   case bits: {                                                                \
      global uint##bits##_t *typed_indices =                                   \
         (global uint##bits##_t *)index_buffer_ptr;                            \
      for (uint32_t n = 0; n < todo; n++) {                                    \
         const uint32_t value =                                                \
            (uint32_t)typed_indices[first + chunk_begin + n];                  \
         if (!primitive_restart__2 || value != UINT##bits##_MAX) {             \
            lowest = min(lowest, value);                                       \
            highest = max(highest, value);                                     \
         }                                                                     \
      }                                                                        \
      break;                                                                   \
   }

      REDUCE_CHUNK(32)
      REDUCE_CHUNK(16)
      REDUCE_CHUNK(8)

#undef REDUCE_CHUNK

   default:
      assert(0 && "Invalid index size");
   }

   /* Merge this thread's partial result into the shared one */
   atomic_fetch_min(min_ptr, lowest);
   atomic_fetch_max(max_ptr, highest);
}
#endif

View file

@ -21,6 +21,11 @@ enum panlib_varying_buf_id {
PANLIB_VARY_BUF_MAX,
};
/*
 * Result of the min/max index search for an indexed indirect draw.
 *
 * The search kernel updates both words atomically and the indexed indirect
 * draw helper reads them back to derive the vertex range. The host seeds the
 * two fields with a single 64-bit write, so the field order and sizes must
 * not change.
 */
struct libpan_draw_helper_index_min_max_result {
/* Lowest index value found; seeded with the all-ones value of the index width */
uint32_t min;
/* Highest index value found; seeded with 0 */
uint32_t max;
};
struct libpan_draw_helper_varying_buf_info {
uint64_t address;
uint32_t size;

View file

@ -1588,6 +1588,57 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
: cmdbuf->state.gfx.render.layer_count;
const struct panvk_shader_variant *fs = panvk_shader_only_variant(get_fs(cmdbuf));
struct panvk_precomp_ctx precomp_ctx = panvk_per_arch(precomp_cs)(cmdbuf);
uint64_t index_min_max_res_ptr = 0;
uint32_t job_before_indirect_helper = copy_desc_job_id;
if (draw->info.index.size) {
index_min_max_res_ptr =
panvk_cmd_alloc_dev_mem(
cmdbuf, desc,
sizeof(struct libpan_draw_helper_index_min_max_result), 8)
.gpu;
const struct panlib_draw_index_minmax_search_helper_args args = {
.index_buffer_ptr = cmdbuf->state.gfx.ib.dev_addr,
.cmd = draw->info.indirect.buffer_dev_addr,
.min_ptr =
index_min_max_res_ptr +
offsetof(struct libpan_draw_helper_index_min_max_result, min),
.max_ptr =
index_min_max_res_ptr +
offsetof(struct libpan_draw_helper_index_min_max_result, max),
};
struct libpan_draw_helper_index_min_max_result val = {
.min = ((uint64_t)1 << (draw->info.index.size * 8)) - 1,
.max = 0,
};
uint64_t *raw_val = (uint64_t *)&val;
struct pan_ptr write_job =
pan_pool_alloc_desc(&cmdbuf->desc_pool.base, WRITE_VALUE_JOB);
pan_section_pack(write_job.cpu, WRITE_VALUE_JOB, PAYLOAD, payload) {
payload.type = MALI_WRITE_VALUE_TYPE_IMMEDIATE_64;
payload.address = index_min_max_res_ptr;
payload.immediate_value = *raw_val;
};
unsigned write_job_id =
pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_WRITE_VALUE, false, false,
0, copy_desc_job_id, &write_job, false);
util_dynarray_append(&batch->jobs, void *, write_job.cpu);
uint32_t index_count = cmdbuf->state.gfx.ib.size / draw->info.index.size;
uint32_t wg_count = DIV_ROUND_UP(index_count, 65536);
assert(wg_count <= 65536);
panlib_draw_index_minmax_search_helper_struct(
&precomp_ctx, panlib_1d_with_jm_deps(wg_count, 0, write_job_id),
PANLIB_BARRIER_NONE, args, util_logbase2(draw->info.index.size),
ia->primitive_restart_enable);
job_before_indirect_helper = batch->vtc_jc.job_index;
}
for (uint32_t i = 0; i < enabled_layer_count; i++) {
/* Force a new push uniform block to be allocated */
gfx_state_set_dirty(cmdbuf, VS_PUSH_UNIFORMS);
@ -1669,20 +1720,19 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
vs, sysval_offset(graphics, vs.raw_vertex_offset));
}
struct panvk_precomp_ctx precomp_ctx = panvk_per_arch(precomp_cs)(cmdbuf);
enum panlib_barrier indirect_barrier =
PANLIB_BARRIER_JM_SUPPRESS_PREFETCH;
struct panlib_precomp_grid indirect_grid =
panlib_1d_with_jm_deps(1, 0, copy_desc_job_id);
panlib_1d_with_jm_deps(1, 0, job_before_indirect_helper);
if (draw->info.indirect.buffer_dev_addr != 0 && draw->info.index.size) {
const struct panlib_draw_indexed_indirect_helper_args args = {
.cmd = draw->info.indirect.buffer_dev_addr,
.index_buffer_ptr = cmdbuf->state.gfx.ib.dev_addr,
.index_min_max_res = index_min_max_res_ptr,
.index_size = draw->info.index.size,
.primitive_vertex_count = primitive_vertex_count(
translate_prim_topology(ia->primitive_topology)),
.primitive_restart = ia->primitive_restart_enable,
.varying_bufs_descs = draw->varying_bufs,
.varying_bufs_info = draw->indirect_info.varying_bufs,
.attrib_bufs_descs = draw->vs.attribute_bufs,