mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 09:38:07 +02:00
panvk: Parallelize min max index search on JM
This parallelizes the min/max index search and avoids running that logic once per layer. This should speed up indexed draws a bit. Signed-off-by: Mary Guillemard <mary.guillemard@collabora.com> Reviewed-by: Olivia Lee <olivia.lee@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35724>
This commit is contained in:
parent
d936bb496c
commit
f729bedf89
3 changed files with 118 additions and 42 deletions
|
|
@ -10,36 +10,6 @@
|
|||
#include "draw_helper.h"
|
||||
|
||||
#if (PAN_ARCH == 6 || PAN_ARCH == 7)
|
||||
static void
|
||||
panlib_index_minmax_search(global uint8_t *index_buffer_ptr,
|
||||
uint32_t index_bit_size, uint32_t start,
|
||||
uint32_t count, bool primitive_restart,
|
||||
uint32_t *min_ptr, uint32_t *max_ptr)
|
||||
{
|
||||
*max_ptr = 0;
|
||||
|
||||
switch (index_bit_size) {
|
||||
#define MINMAX_SEARCH_CASE(sz) \
|
||||
case sz: { \
|
||||
global uint##sz##_t *indices = (global uint##sz##_t *)index_buffer_ptr; \
|
||||
*min_ptr = UINT##sz##_MAX; \
|
||||
for (uint32_t i = 0; i < count; i++) { \
|
||||
if (primitive_restart && indices[i + start] == UINT##sz##_MAX) \
|
||||
continue; \
|
||||
*min_ptr = min((uint32_t)indices[i + start], *min_ptr); \
|
||||
*max_ptr = max((uint32_t)indices[i + start], *max_ptr); \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
MINMAX_SEARCH_CASE(32)
|
||||
MINMAX_SEARCH_CASE(16)
|
||||
MINMAX_SEARCH_CASE(8)
|
||||
#undef MINMAX_SEARCH_CASE
|
||||
default:
|
||||
assert(0 && "Invalid index size");
|
||||
}
|
||||
}
|
||||
|
||||
struct panlib_draw_info {
|
||||
struct {
|
||||
uint32_t size;
|
||||
|
|
@ -494,6 +464,7 @@ panlib_draw_indirect_helper(
|
|||
KERNEL(1)
|
||||
panlib_draw_indexed_indirect_helper(
|
||||
global VkDrawIndexedIndirectCommand *cmd, global uint8_t *index_buffer_ptr,
|
||||
global struct libpan_draw_helper_index_min_max_result *index_min_max_res,
|
||||
uint32_t index_size, uint32_t primitive_vertex_count,
|
||||
uint32_t attrib_bufs_valid, uint32_t attribs_valid,
|
||||
global struct mali_attribute_buffer_packed *varying_bufs_descs,
|
||||
|
|
@ -504,21 +475,15 @@ panlib_draw_indexed_indirect_helper(
|
|||
global struct libpan_draw_helper_attrib_info *attribs_infos,
|
||||
global uint32_t *first_vertex_sysval, global uint32_t *first_instance_sysval,
|
||||
global uint32_t *raw_vertex_offset_sysval, global uint8_t *idvs_job,
|
||||
global uint8_t *vertex_job, global uint8_t *tiler_job,
|
||||
uint8_t primitive_restart)
|
||||
global uint8_t *vertex_job, global uint8_t *tiler_job)
|
||||
{
|
||||
const uint32_t index_count = cmd->indexCount;
|
||||
const uint32_t first_index = cmd->firstIndex;
|
||||
const uint32_t first_instance = cmd->firstInstance;
|
||||
const uint32_t instance_count = cmd->instanceCount;
|
||||
const int32_t vertex_offset = cmd->vertexOffset;
|
||||
|
||||
/* First compute the min and max range */
|
||||
uint32_t min_vertex, max_vertex;
|
||||
panlib_index_minmax_search(index_buffer_ptr, index_size * 8, first_index,
|
||||
index_count, primitive_restart, &min_vertex,
|
||||
&max_vertex);
|
||||
|
||||
const uint32_t min_vertex = index_min_max_res->min;
|
||||
const uint32_t max_vertex = index_min_max_res->max;
|
||||
const uint32_t vertex_range = max_vertex - min_vertex + 1;
|
||||
|
||||
struct panlib_draw_info draw = {
|
||||
|
|
@ -558,4 +523,60 @@ panlib_draw_indexed_indirect_helper(
|
|||
*first_instance_sysval = draw.instance.base;
|
||||
*raw_vertex_offset_sysval = draw.vertex.raw_offset;
|
||||
}
|
||||
|
||||
/*
 * Parallel min/max index search for an indexed indirect draw.
 *
 * Each thread scans one slice of at most 1024 indices of the range described
 * by `cmd` (firstIndex / indexCount) and merges its partial result into
 * *min_ptr and *max_ptr with atomic min/max operations.
 *
 * index_bytes_log2__3 is log2 of the index size in bytes. When
 * primitive_restart__2 is non-zero, indices equal to the all-ones value of
 * the index width are skipped.
 *
 * This kernel only ever merges into *min_ptr / *max_ptr, so the caller is
 * expected to seed them beforehand (min with the all-ones value of the index
 * width, max with 0).
 */
KERNEL(64)
panlib_draw_index_minmax_search_helper(global uint8_t *index_buffer_ptr,
                                       global VkDrawIndexedIndirectCommand *cmd,
                                       global atomic_uint *min_ptr,
                                       global atomic_uint *max_ptr,
                                       uint32_t index_bytes_log2__3,
                                       uint8_t primitive_restart__2)
{
   /* Upper bound on the number of indices a single thread walks */
   const uint32_t slice_len = 1024;

   const uint32_t bits = (1 << index_bytes_log2__3) * 8;
   const uint32_t first_index = cmd->firstIndex;
   const uint32_t total = cmd->indexCount;

   const uint32_t slice_base = cl_global_id.x * slice_len;

   /* Threads whose slice starts past the end of the draw have nothing to do */
   if (slice_base >= total)
      return;

   /* The last slice may be shorter than slice_len */
   const uint32_t slice_count = MIN2(slice_len, total - slice_base);

   /* Sanity check: the slice must stay inside the draw's index range */
   assert(slice_base + slice_count <= total);

   /* Neutral elements for min/max: all-ones of the index width, and 0 */
   uint32_t lo = ((uint64_t)1 << bits) - 1;
   uint32_t hi = 0;

#define MINMAX_SEARCH_CASE(sz) \
   case sz: { \
      global uint##sz##_t *elems = (global uint##sz##_t *)index_buffer_ptr; \
      const uint32_t offset = first_index + slice_base; \
      for (uint32_t n = 0; n != slice_count; ++n) { \
         const uint32_t idx = (uint32_t)elems[offset + n]; \
         /* Restart markers are not real vertices, keep them out of the range */ \
         if (!primitive_restart__2 || idx != UINT##sz##_MAX) { \
            lo = min(lo, idx); \
            hi = max(hi, idx); \
         } \
      } \
      break; \
   }

   switch (bits) {
   MINMAX_SEARCH_CASE(8)
   MINMAX_SEARCH_CASE(16)
   MINMAX_SEARCH_CASE(32)
   default:
      assert(0 && "Invalid index size");
   }

#undef MINMAX_SEARCH_CASE

   /* Fold this thread's partial result into the shared one */
   atomic_fetch_min(min_ptr, lo);
   atomic_fetch_max(max_ptr, hi);
}
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -21,6 +21,11 @@ enum panlib_varying_buf_id {
|
|||
PANLIB_VARY_BUF_MAX,
|
||||
};
|
||||
|
||||
/*
 * Result of the min/max index search for an indexed draw.
 *
 * The two 32-bit fields are laid out back to back, `min` first, so the whole
 * result fits in a single 64-bit word.
 */
struct libpan_draw_helper_index_min_max_result {
   /* Smallest index value found */
   uint32_t min;
   /* Largest index value found */
   uint32_t max;
};
|
||||
|
||||
struct libpan_draw_helper_varying_buf_info {
|
||||
uint64_t address;
|
||||
uint32_t size;
|
||||
|
|
|
|||
|
|
@ -1588,6 +1588,57 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
|
|||
: cmdbuf->state.gfx.render.layer_count;
|
||||
const struct panvk_shader_variant *fs = panvk_shader_only_variant(get_fs(cmdbuf));
|
||||
|
||||
struct panvk_precomp_ctx precomp_ctx = panvk_per_arch(precomp_cs)(cmdbuf);
|
||||
uint64_t index_min_max_res_ptr = 0;
|
||||
uint32_t job_before_indirect_helper = copy_desc_job_id;
|
||||
if (draw->info.index.size) {
|
||||
index_min_max_res_ptr =
|
||||
panvk_cmd_alloc_dev_mem(
|
||||
cmdbuf, desc,
|
||||
sizeof(struct libpan_draw_helper_index_min_max_result), 8)
|
||||
.gpu;
|
||||
const struct panlib_draw_index_minmax_search_helper_args args = {
|
||||
.index_buffer_ptr = cmdbuf->state.gfx.ib.dev_addr,
|
||||
.cmd = draw->info.indirect.buffer_dev_addr,
|
||||
.min_ptr =
|
||||
index_min_max_res_ptr +
|
||||
offsetof(struct libpan_draw_helper_index_min_max_result, min),
|
||||
.max_ptr =
|
||||
index_min_max_res_ptr +
|
||||
offsetof(struct libpan_draw_helper_index_min_max_result, max),
|
||||
};
|
||||
|
||||
struct libpan_draw_helper_index_min_max_result val = {
|
||||
.min = ((uint64_t)1 << (draw->info.index.size * 8)) - 1,
|
||||
.max = 0,
|
||||
};
|
||||
uint64_t *raw_val = (uint64_t *)&val;
|
||||
|
||||
struct pan_ptr write_job =
|
||||
pan_pool_alloc_desc(&cmdbuf->desc_pool.base, WRITE_VALUE_JOB);
|
||||
|
||||
pan_section_pack(write_job.cpu, WRITE_VALUE_JOB, PAYLOAD, payload) {
|
||||
payload.type = MALI_WRITE_VALUE_TYPE_IMMEDIATE_64;
|
||||
payload.address = index_min_max_res_ptr;
|
||||
payload.immediate_value = *raw_val;
|
||||
};
|
||||
|
||||
unsigned write_job_id =
|
||||
pan_jc_add_job(&batch->vtc_jc, MALI_JOB_TYPE_WRITE_VALUE, false, false,
|
||||
0, copy_desc_job_id, &write_job, false);
|
||||
util_dynarray_append(&batch->jobs, void *, write_job.cpu);
|
||||
|
||||
uint32_t index_count = cmdbuf->state.gfx.ib.size / draw->info.index.size;
|
||||
uint32_t wg_count = DIV_ROUND_UP(index_count, 65536);
|
||||
assert(wg_count <= 65536);
|
||||
|
||||
panlib_draw_index_minmax_search_helper_struct(
|
||||
&precomp_ctx, panlib_1d_with_jm_deps(wg_count, 0, write_job_id),
|
||||
PANLIB_BARRIER_NONE, args, util_logbase2(draw->info.index.size),
|
||||
ia->primitive_restart_enable);
|
||||
job_before_indirect_helper = batch->vtc_jc.job_index;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < enabled_layer_count; i++) {
|
||||
/* Force a new push uniform block to be allocated */
|
||||
gfx_state_set_dirty(cmdbuf, VS_PUSH_UNIFORMS);
|
||||
|
|
@ -1669,20 +1720,19 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
|
|||
vs, sysval_offset(graphics, vs.raw_vertex_offset));
|
||||
}
|
||||
|
||||
struct panvk_precomp_ctx precomp_ctx = panvk_per_arch(precomp_cs)(cmdbuf);
|
||||
enum panlib_barrier indirect_barrier =
|
||||
PANLIB_BARRIER_JM_SUPPRESS_PREFETCH;
|
||||
struct panlib_precomp_grid indirect_grid =
|
||||
panlib_1d_with_jm_deps(1, 0, copy_desc_job_id);
|
||||
panlib_1d_with_jm_deps(1, 0, job_before_indirect_helper);
|
||||
|
||||
if (draw->info.indirect.buffer_dev_addr != 0 && draw->info.index.size) {
|
||||
const struct panlib_draw_indexed_indirect_helper_args args = {
|
||||
.cmd = draw->info.indirect.buffer_dev_addr,
|
||||
.index_buffer_ptr = cmdbuf->state.gfx.ib.dev_addr,
|
||||
.index_min_max_res = index_min_max_res_ptr,
|
||||
.index_size = draw->info.index.size,
|
||||
.primitive_vertex_count = primitive_vertex_count(
|
||||
translate_prim_topology(ia->primitive_topology)),
|
||||
.primitive_restart = ia->primitive_restart_enable,
|
||||
.varying_bufs_descs = draw->varying_bufs,
|
||||
.varying_bufs_info = draw->indirect_info.varying_bufs,
|
||||
.attrib_bufs_descs = draw->vs.attribute_bufs,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue