panvk/csf: Fix multi-layer rendering

We assumed a tiler descriptor could handle 256 layers at a time, but
it's actually limited to 8 on v10, so let's adjust the code to take
that into account.

Fixes: 5544d39f44 ("panvk: Add a CSF backend for panvk_queue/cmd_buffer")
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/11882
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Tested-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31227>
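
For a sense of the new math: a render pass now needs one framebuffer descriptor per layer plus one tiler descriptor per group of eight layers. Below is a minimal host-side sketch of the sizing logic in calc_render_descs_size(), with made-up byte sizes (TILER_CONTEXT_SIZE, FBD_SIZE) standing in for pan_size(TILER_CONTEXT) and calc_fbd_size():

#include <stdint.h>
#include <stdio.h>

#define MAX_LAYERS_PER_TILER_DESC 8
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Made-up sizes; the real numbers come from the generated pan_pack
 * headers and the render-pass configuration. */
#define TILER_CONTEXT_SIZE 160
#define FBD_SIZE 512

int main(void)
{
   for (uint32_t layers = 1; layers <= 25; layers += 8) {
      uint32_t fbd_count = layers;
      uint32_t td_count = DIV_ROUND_UP(layers, MAX_LAYERS_PER_TILER_DESC);

      /* New formula: one FBD per layer, one tiler desc per 8 layers. */
      printf("layers=%2u -> fbds=%2u tds=%u size=%u\n", layers, fbd_count,
             td_count, fbd_count * FBD_SIZE + td_count * TILER_CONTEXT_SIZE);
   }
   return 0;
}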
@@ -587,11 +587,17 @@ calc_fbd_size(struct panvk_cmd_buffer *cmdbuf)
return fbd_size;
}
#define MAX_LAYERS_PER_TILER_DESC 8
static uint32_t
calc_render_descs_size(struct panvk_cmd_buffer *cmdbuf)
{
return (calc_fbd_size(cmdbuf) * cmdbuf->state.gfx.render.layer_count) +
pan_size(TILER_CONTEXT);
uint32_t fbd_count = cmdbuf->state.gfx.render.layer_count;
uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
MAX_LAYERS_PER_TILER_DESC);
return (calc_fbd_size(cmdbuf) * fbd_count) +
(td_count * pan_size(TILER_CONTEXT));
}
static void
@@ -668,19 +674,16 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
struct panfrost_ptr tiler_desc = {0};
struct mali_tiler_context_packed tiler_tmpl;
uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
MAX_LAYERS_PER_TILER_DESC);
if (!simul_use) {
tiler_desc = panvk_cmd_alloc_desc(cmdbuf, TILER_CONTEXT);
tiler_desc = panvk_cmd_alloc_desc_array(cmdbuf, td_count, TILER_CONTEXT);
if (!tiler_desc.gpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
} else {
/* If the tiler descriptor is allocated from the ring buffer, we set a
* dummy non-zero address so the is-tiler-acquired test passes. */
tiler_desc.cpu = &tiler_tmpl;
tiler_desc.gpu = 0xdeadbeefdeadbeefull;
}
pan_pack(tiler_desc.cpu, TILER_CONTEXT, cfg) {
pan_pack(&tiler_tmpl, TILER_CONTEXT, cfg) {
unsigned max_levels = tiler_features.max_levels;
assert(max_levels >= 2);
@@ -701,11 +704,13 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
/* TODO: revisit for VK_EXT_provoking_vertex. */
cfg.first_provoking_vertex = true;
cfg.layer_count = cmdbuf->state.gfx.render.layer_count;
/* This will be overwritten by the CS below. */
cfg.layer_count = 1;
cfg.layer_offset = 0;
}
cmdbuf->state.gfx.render.tiler = tiler_desc.gpu;
cmdbuf->state.gfx.render.tiler =
simul_use ? 0xdeadbeefdeadbeefull : tiler_desc.gpu;
struct cs_index tiler_ctx_addr = cs_sr_reg64(b, 40);
@@ -722,12 +727,6 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
}
cs_render_desc_ringbuf_move_ptr(b, descs_sz);
/* Lay out words 2:5, so they can be stored along with the other updates. */
cs_move64_to(b, cs_scratch_reg64(b, 2),
tiler_tmpl.opaque[2] | (uint64_t)tiler_tmpl.opaque[3] << 32);
cs_move64_to(b, cs_scratch_reg64(b, 4),
tiler_tmpl.opaque[4] | (uint64_t)tiler_tmpl.opaque[5] << 32);
} else {
cs_update_vt_ctx(b) {
cs_move64_to(b, tiler_ctx_addr, tiler_desc.gpu);
@@ -737,40 +736,114 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
/* Reset the polygon list. */
cs_move64_to(b, cs_scratch_reg64(b, 0), 0);
/* Lay out words 2, 3 and 5, so they can be stored along with the other
* updates. Word 4 contains layer information and will be updated in the loop. */
cs_move64_to(b, cs_scratch_reg64(b, 2),
tiler_tmpl.opaque[2] | (uint64_t)tiler_tmpl.opaque[3] << 32);
cs_move32_to(b, cs_scratch_reg32(b, 5), tiler_tmpl.opaque[5]);
/* Load the tiler_heap and geom_buf from the context. */
cs_load_to(b, cs_scratch_reg_tuple(b, 6, 4), cs_subqueue_ctx_reg(b),
BITFIELD_MASK(4),
offsetof(struct panvk_cs_subqueue_context, render.tiler_heap));
/* Reset the completed chain. */
/* Fill extra fields with zeroes so we can reset the completed
* top/bottom and private states. */
cs_move64_to(b, cs_scratch_reg64(b, 10), 0);
cs_move64_to(b, cs_scratch_reg64(b, 12), 0);
cs_wait_slot(b, SB_ID(LS), false);
/* Update the first half of the tiler desc. */
if (simul_use) {
cs_store(b, cs_scratch_reg_tuple(b, 0, 14), tiler_ctx_addr,
BITFIELD_MASK(14), 0);
} else {
cs_store(b, cs_scratch_reg_tuple(b, 0, 2), tiler_ctx_addr,
BITFIELD_MASK(2), 0);
cs_store(b, cs_scratch_reg_tuple(b, 6, 8), tiler_ctx_addr,
BITFIELD_MASK(8), 24);
}
cs_wait_slot(b, SB_ID(LS), false);
/* r10:13 are already zero, fill r8:9 and r14:15 with zeros so we can reset
* the private state in one store. */
cs_move64_to(b, cs_scratch_reg64(b, 8), 0);
cs_move64_to(b, cs_scratch_reg64(b, 14), 0);
/* Update the second half of the tiler descriptor. */
cs_store(b, cs_scratch_reg_tuple(b, 8, 8), tiler_ctx_addr, BITFIELD_MASK(8),
96);
cs_wait_slot(b, SB_ID(LS), false);
/* Take care of the tiler desc with layer_offset=0 outside of the loop. */
cs_move32_to(b, cs_scratch_reg32(b, 4),
MIN2(cmdbuf->state.gfx.render.layer_count - 1,
MAX_LAYERS_PER_TILER_DESC - 1));
/* Replace words 0:13 and 24:31. */
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
BITFIELD_MASK(16), 0);
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 64);
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
cs_wait_slot(b, SB_ID(LS), false);
uint32_t remaining_layers =
td_count > 1
? cmdbuf->state.gfx.render.layer_count % MAX_LAYERS_PER_TILER_DESC
: 0;
uint32_t full_td_count =
cmdbuf->state.gfx.render.layer_count / MAX_LAYERS_PER_TILER_DESC;
if (remaining_layers) {
int32_t layer_offset =
-(cmdbuf->state.gfx.render.layer_count - remaining_layers) &
BITFIELD_MASK(9);
/* If the last tiler descriptor is not full, we emit it outside of the
* loop to pass the right layer count. All this would be a lot simpler
* if we had OR/AND instructions, but here we are. */
cs_update_vt_ctx(b)
cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
pan_size(TILER_CONTEXT) * full_td_count);
cs_move32_to(b, cs_scratch_reg32(b, 4),
(layer_offset << 8) | (remaining_layers - 1));
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
BITFIELD_MASK(16), 0);
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 64);
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
cs_wait_slot(b, SB_ID(LS), false);
cs_update_vt_ctx(b)
cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
-pan_size(TILER_CONTEXT));
} else if (full_td_count > 1) {
cs_update_vt_ctx(b)
cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
pan_size(TILER_CONTEXT) * (full_td_count - 1));
}
if (full_td_count > 1) {
struct cs_index counter_reg = cs_scratch_reg32(b, 17);
uint32_t layer_offset =
(-MAX_LAYERS_PER_TILER_DESC * (full_td_count - 1)) & BITFIELD_MASK(9);
cs_move32_to(b, counter_reg, full_td_count - 1);
cs_move32_to(b, cs_scratch_reg32(b, 4),
(layer_offset << 8) | (MAX_LAYERS_PER_TILER_DESC - 1));
/* We iterate the remaining full tiler descriptors in reverse order, so we
* can start from the smallest layer offset, and increment it by
* MAX_LAYERS_PER_TILER_DESC << 8 at each iteration. Again, the split is
mostly due to the lack of AND instructions, and the fact that layer_offset
* is a 9-bit signed integer inside a 32-bit word, which ADD32 can't deal
* with unless the number we add is positive.
*/
cs_while(b, MALI_CS_CONDITION_GREATER, counter_reg) {
/* Replace words 0:13 and 24:31. */
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
BITFIELD_MASK(16), 0);
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 64);
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
cs_wait_slot(b, SB_ID(LS), false);
cs_add32(b, cs_scratch_reg32(b, 4), cs_scratch_reg32(b, 4),
MAX_LAYERS_PER_TILER_DESC << 8);
cs_add32(b, counter_reg, counter_reg, -1);
cs_update_vt_ctx(b)
cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
-pan_size(TILER_CONTEXT));
}
}
/* Then we change the scoreboard slot used for iterators. */
panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
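
To make the patched values concrete: word 4 of each tiler descriptor packs a 9-bit two's-complement layer offset (shifted left by 8) with the descriptor's layer count minus one in the low bits. A standalone sketch, with word4() as a hypothetical helper rather than driver code, that reproduces what the CS sequence above writes for a 20-layer render:

#include <stdint.h>
#include <stdio.h>

#define MAX_LAYERS_PER_TILER_DESC 8
#define BITFIELD_MASK(n) ((1u << (n)) - 1)

/* Reproduces the value the CS writes into word 4 of tiler descriptor
 * 'i': a 9-bit two's-complement layer offset shifted left by 8, OR'd
 * with (layer_count_in_desc - 1). Descriptor i covers layers
 * [8*i, 8*i + count), and the offset rebases those layers back to
 * zero, hence the negated value. */
static uint32_t word4(uint32_t i, uint32_t total_layers)
{
   uint32_t first = i * MAX_LAYERS_PER_TILER_DESC;
   uint32_t count = total_layers - first;

   if (count > MAX_LAYERS_PER_TILER_DESC)
      count = MAX_LAYERS_PER_TILER_DESC;

   uint32_t layer_offset = -(int32_t)first & BITFIELD_MASK(9);

   return (layer_offset << 8) | (count - 1);
}

int main(void)
{
   /* 20 layers -> 3 descriptors covering 8 + 8 + 4 layers. */
   for (uint32_t i = 0; i < 3; i++)
      printf("desc %u: word4 = 0x%06x\n", i, word4(i, 20));
   return 0;
}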
@@ -788,9 +861,6 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
uint32_t fbds_sz =
calc_fbd_size(cmdbuf) * cmdbuf->state.gfx.render.layer_count;
memset(&cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds, 0,
sizeof(cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds));
cmdbuf->state.gfx.render.fbds = panvk_cmd_alloc_dev_mem(
cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER));
if (!cmdbuf->state.gfx.render.fbds.gpu)
@@ -1244,12 +1314,7 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
cs_move32_to(b, cs_sr_reg32(b, 34), draw->instance.count);
cs_move32_to(b, cs_sr_reg32(b, 35), draw->index.offset);
cs_move32_to(b, cs_sr_reg32(b, 36), draw->index.vertex_offset);
/* Instance ID is assumed to be zero-based for now. See if we can
* extend nir_lower_system_values() and the lower options to make
* instance-ID non-zero based, or if it's fine to always return
* zero for the instance base. */
cs_move32_to(b, cs_sr_reg32(b, 37), 0);
cs_move32_to(b, cs_sr_reg32(b, 37), draw->instance.base);
/* We don't use the resource dep system yet. */
cs_move32_to(b, cs_sr_reg32(b, 38), 0);
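
Also visible in this hunk: the base instance is now taken from the draw info rather than hard-coded to zero. A tiny standalone mock, with the struct layout assumed from the two fields used above:

#include <stdint.h>
#include <stdio.h>

/* Mock of the two panvk_draw_info fields used above (the real layout
 * lives in the panvk headers); shows what now lands in SR 37. */
struct draw_info {
   struct {
      uint32_t count;
      uint32_t base;
   } instance;
};

int main(void)
{
   /* e.g. vkCmdDraw(..., instanceCount = 4, firstInstance = 2) */
   struct draw_info draw = {.instance = {.count = 4, .base = 2}};

   printf("SR 34 (instance count) = %u\n", draw.instance.count);
   printf("SR 37 (base instance)  = %u, previously hard-coded to 0\n",
          draw.instance.base);
   return 0;
}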
@@ -1284,10 +1349,37 @@ panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
clear_dirty(cmdbuf, draw);
uint32_t idvs_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
MAX_LAYERS_PER_TILER_DESC);
cs_req_res(b, CS_IDVS_RES);
cs_run_idvs(b, tiler_idvs_flags.opaque[0], false, true,
cs_shader_res_sel(0, 0, 1, 0), cs_shader_res_sel(2, 2, 2, 0),
cs_undef());
if (idvs_count > 1) {
struct cs_index counter_reg = cs_scratch_reg32(b, 17);
struct cs_index tiler_ctx_addr = cs_sr_reg64(b, 40);
cs_move32_to(b, counter_reg, idvs_count);
cs_while(b, MALI_CS_CONDITION_GREATER, counter_reg) {
cs_run_idvs(b, tiler_idvs_flags.opaque[0], false, true,
cs_shader_res_sel(0, 0, 1, 0),
cs_shader_res_sel(2, 2, 2, 0), cs_undef());
cs_add32(b, counter_reg, counter_reg, -1);
cs_update_vt_ctx(b) {
cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
pan_size(TILER_CONTEXT));
}
}
cs_update_vt_ctx(b) {
cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
-(idvs_count * pan_size(TILER_CONTEXT)));
}
} else {
cs_run_idvs(b, tiler_idvs_flags.opaque[0], false, true,
cs_shader_res_sel(0, 0, 1, 0), cs_shader_res_sel(2, 2, 2, 0),
cs_undef());
}
cs_req_res(b, 0);
}
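
Modeled on the host for clarity: the loop above launches one IDVS job per tiler descriptor, bumping the tiler context address between launches and rewinding it at the end so the next draw starts from the first descriptor again. A runnable approximation, with TILER_CONTEXT_SIZE a made-up stand-in for pan_size(TILER_CONTEXT):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_LAYERS_PER_TILER_DESC 8
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define TILER_CONTEXT_SIZE 160 /* placeholder for pan_size(TILER_CONTEXT) */

/* Stub standing in for cs_run_idvs(): just logs which descriptor the
 * vertex/tiler work would be bound to. */
static void run_idvs(uint64_t tiler_ctx_addr)
{
   printf("IDVS run, tiler ctx @ 0x%" PRIx64 "\n", tiler_ctx_addr);
}

int main(void)
{
   uint64_t tiler_ctx_addr = 0x1000; /* made-up base address */
   uint32_t layer_count = 20;
   uint32_t idvs_count = DIV_ROUND_UP(layer_count, MAX_LAYERS_PER_TILER_DESC);

   for (uint32_t i = 0; i < idvs_count; i++) {
      run_idvs(tiler_ctx_addr);
      tiler_ctx_addr += TILER_CONTEXT_SIZE;
   }

   /* Rewind so a subsequent draw starts from the first descriptor,
    * mirroring the final cs_add64() above. */
   tiler_ctx_addr -= (uint64_t)idvs_count * TILER_CONTEXT_SIZE;
   return 0;
}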
@@ -1763,8 +1855,9 @@ static uint8_t
prepare_fb_desc(struct panvk_cmd_buffer *cmdbuf, uint32_t layer, void *fbd)
{
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
bool simul_use =
cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
memset(&cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds, 0,
sizeof(cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds));
if (cmdbuf->state.tls.desc.gpu) {
ASSERTED unsigned num_preload_jobs =
@@ -1779,7 +1872,7 @@ prepare_fb_desc(struct panvk_cmd_buffer *cmdbuf, uint32_t layer, void *fbd)
}
struct pan_tiler_context tiler_ctx = {
.valhall.desc = !simul_use ? cmdbuf->state.gfx.render.tiler : 0,
.valhall.layer_offset = layer - (layer % MAX_LAYERS_PER_TILER_DESC),
};
return GENX(pan_emit_fbd)(&cmdbuf->state.gfx.render.fb.info, layer, NULL,
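
A quick worked example of the new valhall.layer_offset expression, assuming pan_emit_fbd() uses it to rebase the layer index onto the tiler descriptor covering it:

#include <stdint.h>
#include <stdio.h>

#define MAX_LAYERS_PER_TILER_DESC 8

int main(void)
{
   for (uint32_t layer = 0; layer < 20; layer += 5) {
      /* Same expression as above: rebase the layer onto the first layer
       * of its 8-layer group, i.e. layers 0 and 5 -> offset 0, layers 10
       * and 15 -> offset 8. */
      uint32_t layer_offset = layer - (layer % MAX_LAYERS_PER_TILER_DESC);

      printf("layer %2u -> layer_offset %2u (tiler desc %u)\n", layer,
             layer_offset, layer / MAX_LAYERS_PER_TILER_DESC);
   }
   return 0;
}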
@@ -1929,25 +2022,31 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
struct cs_index layer_count = cs_sr_reg32(b, 47);
struct cs_index fbd_ptr = cs_sr_reg64(b, 48);
struct cs_index tiler_ptr = cs_sr_reg64(b, 50);
struct cs_index src_fbd_ptr = cs_undef();
struct cs_index cur_tiler = cs_sr_reg64(b, 52);
struct cs_index remaining_layers_in_td = cs_sr_reg32(b, 54);
struct cs_index src_fbd_ptr = cs_sr_reg64(b, 56);
uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
MAX_LAYERS_PER_TILER_DESC);
if (copy_fbds) {
src_fbd_ptr = cs_sr_reg64(b, 52);
cs_move32_to(b, layer_count, cmdbuf->state.gfx.render.layer_count);
cs_load64_to(
b, tiler_ptr, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr));
cs_wait_slot(b, SB_ID(LS), false);
cs_add64(b, fbd_ptr, tiler_ptr, pan_size(TILER_CONTEXT));
cs_add64(b, fbd_ptr, tiler_ptr, pan_size(TILER_CONTEXT) * td_count);
cs_move64_to(b, src_fbd_ptr, fbds.gpu);
} else if (cmdbuf->state.gfx.render.tiler) {
cs_move64_to(b, fbd_ptr, fbds.gpu);
cs_move64_to(b, tiler_ptr, cmdbuf->state.gfx.render.tiler);
}
cs_add64(b, cur_tiler, tiler_ptr, 0);
cs_move32_to(b, layer_count, cmdbuf->state.gfx.render.layer_count);
cs_move32_to(b, remaining_layers_in_td, MAX_LAYERS_PER_TILER_DESC);
cs_req_res(b, CS_FRAG_RES);
cs_while(b, MALI_CS_CONDITION_GREATER, layer_count) {
if (copy_fbds) {
for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) {
@@ -1963,19 +2062,23 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
}
if (cmdbuf->state.gfx.render.tiler) {
cs_store64(b, tiler_ptr, fbd_ptr, 56);
cs_store64(b, cur_tiler, fbd_ptr, 56);
cs_wait_slot(b, SB_ID(LS), false);
}
cs_update_frag_ctx(b)
cs_add64(b, cs_sr_reg64(b, 40), fbd_ptr, fbd_flags);
cs_req_res(b, CS_FRAG_RES);
cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
cs_req_res(b, 0);
cs_add64(b, fbd_ptr, fbd_ptr, fbd_sz);
cs_add32(b, layer_count, layer_count, -1);
cs_add32(b, remaining_layers_in_td, remaining_layers_in_td, -1);
cs_if(b, MALI_CS_CONDITION_LEQUAL, remaining_layers_in_td) {
cs_add64(b, cur_tiler, cur_tiler, pan_size(TILER_CONTEXT));
cs_move32_to(b, remaining_layers_in_td, MAX_LAYERS_PER_TILER_DESC);
}
}
cs_req_res(b, 0);
struct cs_index sync_addr = cs_scratch_reg64(b, 0);
struct cs_index iter_sb = cs_scratch_reg32(b, 2);
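
The same grouping drives the fragment loop above: each iteration consumes one layer, and after every eighth layer the current tiler descriptor pointer advances. A host-side model of that sequencing, with TILER_CONTEXT_SIZE again a placeholder for pan_size(TILER_CONTEXT):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_LAYERS_PER_TILER_DESC 8
#define TILER_CONTEXT_SIZE 160 /* placeholder for pan_size(TILER_CONTEXT) */

int main(void)
{
   uint64_t cur_tiler = 0x1000; /* made-up tiler_ptr */
   uint32_t layer_count = 20;
   uint32_t remaining_layers_in_td = MAX_LAYERS_PER_TILER_DESC;

   while (layer_count > 0) {
      printf("fragment run, tiler @ 0x%" PRIx64 "\n", cur_tiler);

      layer_count--;
      /* Every 8 layers, hop to the next tiler descriptor, as the
       * cs_if(LEQUAL) block above does on the GPU. */
      if (--remaining_layers_in_td == 0) {
         cur_tiler += TILER_CONTEXT_SIZE;
         remaining_layers_in_td = MAX_LAYERS_PER_TILER_DESC;
      }
   }
   return 0;
}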
@@ -1986,6 +2089,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
struct cs_index completed = cs_scratch_reg_tuple(b, 10, 4);
struct cs_index completed_top = cs_scratch_reg64(b, 10);
struct cs_index completed_bottom = cs_scratch_reg64(b, 12);
struct cs_index tiler_count = cs_sr_reg32(b, 47);
cs_move64_to(b, add_val, 1);
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
@@ -1999,21 +2103,26 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
render.desc_ringbuf.syncobj));
}
if (cmdbuf->state.gfx.render.tiler)
cs_load_to(b, completed, tiler_ptr, BITFIELD_MASK(4), 40);
cs_wait_slot(b, SB_ID(LS), false);
cs_add64(b, sync_addr, sync_addr,
PANVK_SUBQUEUE_FRAGMENT * sizeof(struct panvk_cs_sync64));
cs_move32_to(b, tiler_count, td_count);
cs_add64(b, cur_tiler, tiler_ptr, 0);
cs_match(b, iter_sb, cmp_scratch) {
#define CASE(x) \
cs_case(b, x) { \
if (cmdbuf->state.gfx.render.tiler) { \
cs_finish_fragment(b, true, completed_top, completed_bottom, \
cs_defer(SB_WAIT_ITER(x), \
SB_ID(DEFERRED_SYNC))); \
cs_while(b, MALI_CS_CONDITION_GREATER, tiler_count) { \
cs_load_to(b, completed, cur_tiler, BITFIELD_MASK(4), 40); \
cs_wait_slot(b, SB_ID(LS), false); \
cs_finish_fragment(b, true, completed_top, completed_bottom, \
cs_defer(SB_WAIT_ITER(x), \
SB_ID(DEFERRED_SYNC))); \
cs_add64(b, cur_tiler, cur_tiler, pan_size(TILER_CONTEXT)); \
cs_add32(b, tiler_count, tiler_count, -1); \
} \
} \
if (copy_fbds) { \
cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_CSG, \