panvk/csf: Fix cross command buffer render pass suspend/resume

Fix cross command buffer render pass suspend/resume by
emitting a render context (tiler+framebuffer descriptors)
on suspend that we can re-use on resume.

This involves splitting the issue_fragment_jobs() logic to
decouple the framebuffer descriptor initialization from the
run_fragment emission. It also requires patching a few
places where we were testing the tiler/fbd values to
determine whether we are in a render pass, a check that no
longer works when a render pass is resumed.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
Reviewed-by: Chia-I Wu <olvaffe@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32213>
Author: Boris Brezillon
Date: 2024-11-18 18:40:48 +01:00 (committed by Marge Bot)
Commit: 3d5d6327be
Parent: 5a6e992048

5 changed files with 276 additions and 254 deletions
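For context, this is the application-side pattern the fix targets (cf. the dEQP *_resuming cases listed below). The helper name record_suspend_resume and the elided attachment setup are purely illustrative; only the Vulkan entry points and flags are real:

#include <vulkan/vulkan.h>

/* Record a dynamic render pass that is suspended in one command buffer and
 * resumed in another. Both command buffers must be submitted in the same
 * vkQueueSubmit2() call (see the VUIDs quoted in the driver change below). */
static void
record_suspend_resume(VkCommandBuffer cmdbuf_a, VkCommandBuffer cmdbuf_b,
                      const VkRenderingInfo *base_info)
{
   VkRenderingInfo info = *base_info;

   info.flags = VK_RENDERING_SUSPENDING_BIT;
   vkCmdBeginRendering(cmdbuf_a, &info);
   /* ... draws recorded in cmdbuf_a ... */
   vkCmdEndRendering(cmdbuf_a); /* panvk now emits the render context here */

   info.flags = VK_RENDERING_RESUMING_BIT;
   vkCmdBeginRendering(cmdbuf_b, &info);
   /* ... more draws, reusing the tiler/FB descriptors from cmdbuf_a ... */
   vkCmdEndRendering(cmdbuf_b); /* tiling flushed, fragment jobs issued */
}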


@@ -491,13 +491,6 @@ dEQP-VK.draw.dynamic_rendering.primary_cmd_buff.multiple_interpolation.separate.
dEQP-VK.draw.dynamic_rendering.primary_cmd_buff.multiple_interpolation.structured.no_sample_decoration.1_sample,Fail
dEQP-VK.draw.dynamic_rendering.primary_cmd_buff.multiple_interpolation.structured.no_sample_decoration.4_samples,Fail
dEQP-VK.dynamic_rendering.primary_cmd_buff.basic.2_cmdbuffers_resuming,Fail
dEQP-VK.dynamic_rendering.primary_cmd_buff.basic.2_secondary_2_primary_cmdbuffers_resuming,Fail
dEQP-VK.dynamic_rendering.primary_cmd_buff.basic.2_secondary_cmdbuffers_resuming,Fail
dEQP-VK.dynamic_rendering.primary_cmd_buff.basic.contents_2_primary_secondary_cmdbuffers_resuming,Fail
dEQP-VK.dynamic_rendering.primary_cmd_buff.basic.contents_2_secondary_2_primary_cmdbuffers_resuming,Fail
dEQP-VK.dynamic_rendering.primary_cmd_buff.basic.contents_secondary_2_primary_cmdbuffers_resuming,Fail
dEQP-VK.glsl.limits.near_max.fragment_input.components_123,Fail
dEQP-VK.glsl.limits.near_max.fragment_input.components_124,Fail
@@ -756,107 +749,6 @@ dEQP-VK.spirv_assembly.instruction.graphics.opquantize.too_small_vert,Fail
dEQP-VK.api.command_buffers.record_many_draws_primary_2,Crash
dEQP-VK.api.command_buffers.record_many_draws_secondary_2,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed0,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed1,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed10,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed11,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed12,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed13,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed14,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed15,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed16,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed17,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed18,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed19,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed2,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed20,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed21,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed22,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed23,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed24,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed25,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed26,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed27,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed28,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed29,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed3,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed30,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed31,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed32,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed33,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed34,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed35,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed36,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed37,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed38,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed39,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed4,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed40,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed41,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed42,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed43,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed44,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed45,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed46,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed47,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed48,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed49,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed5,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed50,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed51,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed52,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed53,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed54,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed55,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed56,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed57,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed58,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed59,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed6,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed60,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed61,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed62,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed63,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed64,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed65,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed66,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed67,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed68,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed69,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed7,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed70,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed71,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed72,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed73,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed74,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed75,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed76,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed77,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed78,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed79,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed8,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed80,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed81,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed82,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed83,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed84,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed85,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed86,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed87,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed88,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed89,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed9,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed90,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed91,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed92,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed93,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed94,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed95,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed96,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed97,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed98,Crash
dEQP-VK.dynamic_rendering.primary_cmd_buff.random.seed99,Crash
dEQP-VK.pipeline.fast_linked_library.max_varyings.test_vertex_io_between_vertex_fragment,Fail
dEQP-VK.pipeline.fast_linked_library.multisample.alpha_to_coverage_no_color_attachment.samples_4.alpha_opaque,Fail
dEQP-VK.pipeline.fast_linked_library.multisample.alpha_to_coverage_unused_attachment.samples_4.alpha_invisible,Fail


@@ -108,8 +108,12 @@ enum panvk_cs_regs {
PANVK_CS_REG_RUN_IDVS_SR_START = 0,
PANVK_CS_REG_RUN_IDVS_SR_END = 60,
/* RUN_FRAGMENT staging regs. */
PANVK_CS_REG_RUN_FRAGMENT_SR_START = 40,
/* RUN_FRAGMENT staging regs.
* SW ABI:
* - r38:39 contain the pointer to the first tiler descriptor. This is
* needed to gather completed heap chunks after a run_fragment.
*/
PANVK_CS_REG_RUN_FRAGMENT_SR_START = 38,
PANVK_CS_REG_RUN_FRAGMENT_SR_END = 46,
/* RUN_COMPUTE staging regs. */
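To make the new SW ABI concrete, here is an illustrative view of how the rest of the patch uses the RUN_FRAGMENT staging-register window. Only the r38:39 and r40:41 roles are spelled out by this change; the enum and its names are not driver identifiers:

enum run_fragment_sw_abi {
   /* r38:39: pointer to the first tiler descriptor, kept across the
    * RUN_FRAGMENT loop so completed heap chunks can be gathered afterwards. */
   RUN_FRAGMENT_SR_TILER_PTR = 38,
   /* r40:41: framebuffer descriptor pointer for the current layer, with the
    * FBD flags OR'ed into the low bits; advanced by the FBD size per layer. */
   RUN_FRAGMENT_SR_FBD_PTR = 40,
};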


@@ -161,8 +161,14 @@ finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue)
}
}
/* If this is a secondary command buffer, we don't poison the reg file to
* preserve the render pass context. We also don't poison the reg file if the
* last render pass was suspended. In practice we could preserve only the
* registers that matter, but this is a debug feature so let's keep things
* simple with this all-or-nothing approach. */
if ((instance->debug_flags & PANVK_DEBUG_CS) &&
cmdbuf->vk.level != VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
cmdbuf->vk.level != VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
!(cmdbuf->state.gfx.render.flags & VK_RENDERING_SUSPENDING_BIT)) {
cs_update_cmdbuf_regs(b) {
/* Poison all cmdbuf registers to make sure we don't inherit state from
* a previously executed cmdbuf. */
@@ -902,6 +908,23 @@ panvk_per_arch(CmdExecuteCommands)(VkCommandBuffer commandBuffer,
cs_call(prim_b, addr, size);
}
}
/* We need to propagate the suspending state of the secondary command
* buffer if we want to avoid poisoning the reg file when the secondary
* command buffer suspended the render pass. */
if (secondary->state.gfx.render.flags & VK_RENDERING_SUSPENDING_BIT)
primary->state.gfx.render.flags = secondary->state.gfx.render.flags;
/* If the render context we passed to the secondary command buffer got
* invalidated, reset the FB/tiler descs and treat things as if we
* suspended the render pass, since those descriptors have been
* re-emitted by the secondary command buffer already. */
if (secondary->state.gfx.render.invalidate_inherited_ctx) {
memset(&primary->state.gfx.render.fbds, 0,
sizeof(primary->state.gfx.render.fbds));
primary->state.gfx.render.tiler = 0;
primary->state.gfx.render.flags |= VK_RENDERING_RESUMING_BIT;
}
}
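/* Simplified view of the invalidation handshake added here; it works
 * together with cmd_flush_draws() further down:
 *
 *   secondary: cmd_flush_draws()
 *     -> flushes tiling, issues fragment jobs
 *     -> re-emits the tiler/FB descriptors if it had inherited them
 *     -> sets state.gfx.render.invalidate_inherited_ctx = true
 *
 *   primary: CmdExecuteCommands()
 *     -> propagates the suspending flag so finish_cs() won't poison the regs
 *     -> on invalidate_inherited_ctx, drops its stale fbds/tiler and ORs in
 *        VK_RENDERING_RESUMING_BIT so later draws pick the render context
 *        back up from the CS registers instead of re-emitting it
 */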
/* From the Vulkan 1.3.275 spec:


@@ -723,9 +723,21 @@ cs_render_desc_ringbuf_move_ptr(struct cs_builder *b, uint32_t size)
cs_wait_slot(b, SB_ID(LS), false);
}
static bool
inherits_render_ctx(struct panvk_cmd_buffer *cmdbuf)
{
return (cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
(cmdbuf->flags &
VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) ||
(cmdbuf->state.gfx.render.flags & VK_RENDERING_RESUMING_BIT);
}
static VkResult
get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
{
assert(cmdbuf->state.gfx.render.invalidate_inherited_ctx ||
!inherits_render_ctx(cmdbuf));
if (cmdbuf->state.gfx.render.tiler)
return VK_SUCCESS;
@@ -910,24 +922,183 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
return VK_SUCCESS;
}
static uint8_t
prepare_fb_desc(struct panvk_cmd_buffer *cmdbuf, uint32_t layer, void *fbd)
{
struct pan_tiler_context tiler_ctx = {
.valhall.layer_offset = layer - (layer % MAX_LAYERS_PER_TILER_DESC),
};
if (!(cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) {
uint32_t td_idx = layer / MAX_LAYERS_PER_TILER_DESC;
tiler_ctx.valhall.desc =
cmdbuf->state.gfx.render.tiler + (td_idx * pan_size(TILER_CONTEXT));
}
return GENX(pan_emit_fbd)(&cmdbuf->state.gfx.render.fb.info, layer, NULL,
&tiler_ctx, fbd);
}
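/* Worked example of the layer -> tiler-descriptor mapping used above,
 * assuming (purely for illustration) MAX_LAYERS_PER_TILER_DESC == 4 and
 * layer_count == 6:
 *
 *   layers 0..3 -> td_idx 0, valhall.layer_offset 0
 *   layers 4..5 -> td_idx 1, valhall.layer_offset 4
 *
 * td_count = DIV_ROUND_UP(6, 4) = 2, and the second tiler descriptor is only
 * partially used, which is why the copy path below distinguishes
 * full_td_count (= 1 here) from td_count. */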
static VkResult
get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
{
assert(cmdbuf->state.gfx.render.invalidate_inherited_ctx ||
!inherits_render_ctx(cmdbuf));
if (cmdbuf->state.gfx.render.fbds.gpu ||
!cmdbuf->state.gfx.render.layer_count)
return VK_SUCCESS;
uint32_t fbds_sz =
calc_fbd_size(cmdbuf) * cmdbuf->state.gfx.render.layer_count;
uint32_t fbd_sz = calc_fbd_size(cmdbuf);
uint32_t fbds_sz = fbd_sz * cmdbuf->state.gfx.render.layer_count;
cmdbuf->state.gfx.render.fbds = panvk_cmd_alloc_dev_mem(
cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER));
if (!cmdbuf->state.gfx.render.fbds.gpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
bool simul_use =
cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
/* The only bit we patch in FBDs is the tiler pointer. If tiler is not
* involved (clear job) or if the update can happen in place (not
* simultaneous use of the command buffer), we can avoid the
* copy.
*
* According to VUID-VkSubmitInfo2KHR-commandBuffer-06192 and
* VUID-VkSubmitInfo2KHR-commandBuffer-06010, suspend/resume operations
* can't cross the vkQueueSubmit2() boundary, so no need to dynamically
* allocate descriptors in that case:
* "
* If any commandBuffer member of an element of pCommandBufferInfos
* contains any suspended render pass instances, they must be resumed by a
* render pass instance later in submission order within
* pCommandBufferInfos.
*
* If any commandBuffer member of an element of pCommandBufferInfos
* contains any resumed render pass instances, they must be suspended by a
* render pass instance earlier in submission order within
* pCommandBufferInfos.
* "
*/
bool copy_fbds = simul_use && cmdbuf->state.gfx.render.tiler;
struct panfrost_ptr fbds = cmdbuf->state.gfx.render.fbds;
uint32_t fbd_flags = 0;
fbinfo->sample_positions =
dev->sample_positions->addr.dev +
panfrost_sample_positions_offset(pan_sample_pattern(fbinfo->nr_samples));
VkResult result = panvk_per_arch(cmd_fb_preload)(cmdbuf);
if (result != VK_SUCCESS)
return result;
/* We prepare all FB descriptors upfront. */
for (uint32_t i = 0; i < cmdbuf->state.gfx.render.layer_count; i++) {
uint32_t new_fbd_flags =
prepare_fb_desc(cmdbuf, i, fbds.cpu + (fbd_sz * i));
/* Make sure all FBDs have the same flags. */
assert(i == 0 || new_fbd_flags == fbd_flags);
fbd_flags = new_fbd_flags;
}
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
if (copy_fbds) {
struct cs_index cur_tiler = cs_sr_reg64(b, 38);
struct cs_index dst_fbd_ptr = cs_sr_reg64(b, 40);
struct cs_index layer_count = cs_sr_reg32(b, 47);
struct cs_index src_fbd_ptr = cs_sr_reg64(b, 48);
struct cs_index remaining_layers_in_td = cs_sr_reg32(b, 50);
uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
MAX_LAYERS_PER_TILER_DESC);
cs_update_frag_ctx(b) {
cs_load64_to(b, cur_tiler, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context,
render.desc_ringbuf.ptr));
cs_wait_slot(b, SB_ID(LS), false);
cs_add64(b, dst_fbd_ptr, cur_tiler,
pan_size(TILER_CONTEXT) * td_count);
}
cs_move64_to(b, src_fbd_ptr, fbds.gpu);
cs_move32_to(b, remaining_layers_in_td, MAX_LAYERS_PER_TILER_DESC);
cs_move32_to(b, layer_count, cmdbuf->state.gfx.render.layer_count);
cs_while(b, MALI_CS_CONDITION_GREATER, layer_count) {
/* Our loop is copying 64-bytes at a time, so make sure the
* framebuffer size is aligned on 64-bytes. */
assert(fbd_sz == ALIGN_POT(fbd_sz, 64));
for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) {
if (fbd_off == 0) {
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 14), src_fbd_ptr,
BITFIELD_MASK(14), fbd_off);
cs_add64(b, cs_scratch_reg64(b, 14), cur_tiler, 0);
} else {
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 16), src_fbd_ptr,
BITFIELD_MASK(16), fbd_off);
}
cs_wait_slot(b, SB_ID(LS), false);
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), dst_fbd_ptr,
BITFIELD_MASK(16), fbd_off);
cs_wait_slot(b, SB_ID(LS), false);
}
cs_add64(b, src_fbd_ptr, src_fbd_ptr, fbd_sz);
cs_update_frag_ctx(b)
cs_add64(b, dst_fbd_ptr, dst_fbd_ptr, fbd_sz);
cs_add32(b, remaining_layers_in_td, remaining_layers_in_td, -1);
cs_add32(b, layer_count, layer_count, -1);
cs_if(b, MALI_CS_CONDITION_LEQUAL, remaining_layers_in_td) {
cs_update_frag_ctx(b)
cs_add64(b, cur_tiler, cur_tiler, pan_size(TILER_CONTEXT));
cs_move32_to(b, remaining_layers_in_td,
MAX_LAYERS_PER_TILER_DESC);
}
}
cs_update_frag_ctx(b) {
uint32_t full_td_count =
cmdbuf->state.gfx.render.layer_count / MAX_LAYERS_PER_TILER_DESC;
/* If the last tiler descriptor is not full, cur_tiler points to the
* last tiler descriptor, not the FBD that follows. */
if (full_td_count < td_count)
cs_add64(b, dst_fbd_ptr, cur_tiler,
fbd_flags + pan_size(TILER_CONTEXT));
else
cs_add64(b, dst_fbd_ptr, cur_tiler, fbd_flags);
cs_add64(b, cur_tiler, cur_tiler,
-(full_td_count * pan_size(TILER_CONTEXT)));
}
} else {
cs_update_frag_ctx(b) {
cs_move64_to(b, cs_sr_reg64(b, 40), fbds.gpu | fbd_flags);
cs_move64_to(b, cs_sr_reg64(b, 38), cmdbuf->state.gfx.render.tiler);
}
}
return VK_SUCCESS;
}
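/* Layout produced by the copy_fbds path above in the descriptor ring buffer
 * (illustrative; tiler_sz stands for pan_size(TILER_CONTEXT) and fbd_sz for
 * calc_fbd_size(cmdbuf)):
 *
 *   render.desc_ringbuf.ptr
 *     +0                              TILER_CONTEXT #0
 *     ...
 *     + (td_count - 1) * tiler_sz     TILER_CONTEXT #(td_count - 1)
 *     + td_count * tiler_sz           FBD for layer 0 (64-byte aligned)
 *     + td_count * tiler_sz + fbd_sz  FBD for layer 1
 *     ...
 *
 * Each copied FBD gets its tiler pointer patched to the TILER_CONTEXT that
 * covers its layer, and the whole region is released back to the ring buffer
 * once the fragment jobs have completed. */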
static VkResult
get_render_ctx(struct panvk_cmd_buffer *cmdbuf)
{
VkResult result = get_tiler_desc(cmdbuf);
if (result != VK_SUCCESS)
return result;
return get_fb_descs(cmdbuf);
}
static VkResult
prepare_vs(struct panvk_cmd_buffer *cmdbuf)
{
@@ -1308,13 +1479,8 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
if (result != VK_SUCCESS)
return result;
if (cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ||
!(cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
result = get_tiler_desc(cmdbuf);
if (result != VK_SUCCESS)
return result;
result = get_fb_descs(cmdbuf);
if (!inherits_render_ctx(cmdbuf)) {
result = get_render_ctx(cmdbuf);
if (result != VK_SUCCESS)
return result;
}
@@ -1479,12 +1645,9 @@ panvk_per_arch(cmd_prepare_exec_cmd_for_draws)(
{
VkResult result;
if (secondary->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
result = get_tiler_desc(primary);
if (result != VK_SUCCESS)
return;
result = get_fb_descs(primary);
if ((secondary->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) &&
!inherits_render_ctx(primary)) {
result = get_render_ctx(primary);
if (result != VK_SUCCESS)
return;
}
@@ -1730,42 +1893,25 @@ panvk_per_arch(CmdBeginRendering)(VkCommandBuffer commandBuffer,
struct panvk_cmd_graphics_state *state = &cmdbuf->state.gfx;
bool resuming = pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT;
/* When resuming from a suspended pass, the state should be unchanged. */
if (resuming)
state->render.flags = pRenderingInfo->flags;
else
panvk_per_arch(cmd_init_render_state)(cmdbuf, pRenderingInfo);
panvk_per_arch(cmd_init_render_state)(cmdbuf, pRenderingInfo);
/* If we're not resuming, the FBD should be NULL. */
assert(!state->render.fbds.gpu || resuming);
if (!resuming)
panvk_per_arch(cmd_preload_render_area_border)(cmdbuf, pRenderingInfo);
}
static uint8_t
prepare_fb_desc(struct panvk_cmd_buffer *cmdbuf, uint32_t layer, void *fbd)
{
struct pan_tiler_context tiler_ctx = {
.valhall.layer_offset = layer - (layer % MAX_LAYERS_PER_TILER_DESC),
};
return GENX(pan_emit_fbd)(&cmdbuf->state.gfx.render.fb.info, layer, NULL,
&tiler_ctx, fbd);
}
static void
flush_tiling(struct panvk_cmd_buffer *cmdbuf)
{
if (!cmdbuf->state.gfx.render.fbds.gpu)
return;
struct cs_builder *b =
panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
struct cs_index render_ctx = cs_scratch_reg64(b, 2);
if (cmdbuf->state.gfx.render.tiler) {
if (cmdbuf->state.gfx.render.tiler || inherits_render_ctx(cmdbuf)) {
/* Flush the tiling operations and signal the internal sync object. */
cs_req_res(b, CS_TILER_RES);
cs_finish_tiling(b, false);
@@ -1824,9 +1970,6 @@ flush_tiling(struct panvk_cmd_buffer *cmdbuf)
static void
wait_finish_tiling(struct panvk_cmd_buffer *cmdbuf)
{
if (!cmdbuf->state.gfx.render.tiler)
return;
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
struct cs_index vt_sync_addr = cs_scratch_reg64(b, 0);
struct cs_index vt_sync_point = cs_scratch_reg64(b, 2);
@@ -1847,10 +1990,6 @@ wait_finish_tiling(struct panvk_cmd_buffer *cmdbuf)
static VkResult
issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
{
if (!cmdbuf->state.gfx.render.fbds.gpu)
return VK_SUCCESS;
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
@@ -1868,10 +2007,6 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
(fbinfo->extent.maxy << 16) | fbinfo->extent.maxx);
}
fbinfo->sample_positions =
dev->sample_positions->addr.dev +
panfrost_sample_positions_offset(pan_sample_pattern(fbinfo->nr_samples));
bool simul_use =
cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
@@ -1879,93 +2014,34 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
* involved (clear job) or if the update can happen in place (not
* simultaneous use of the command buffer), we can avoid the
* copy. */
bool copy_fbds = simul_use && cmdbuf->state.gfx.render.tiler;
bool needs_tiling =
cmdbuf->state.gfx.render.tiler || inherits_render_ctx(cmdbuf);
/* If the command buffer can run in parallel on different queues, we need
* to make sure each instance has its own descriptors, unless tiling is
* not needed (AKA RUN_FRAGMENT used for clears), because then the FBD
* descriptors are constant (no need to patch them at runtime). */
bool free_render_descs = simul_use && needs_tiling;
uint32_t fbd_sz = calc_fbd_size(cmdbuf);
struct panfrost_ptr fbds = cmdbuf->state.gfx.render.fbds;
uint8_t fbd_flags = 0;
VkResult result = panvk_per_arch(cmd_fb_preload)(cmdbuf);
if (result != VK_SUCCESS)
return result;
/* We prepare all FB descriptors upfront. */
for (uint32_t i = 0; i < cmdbuf->state.gfx.render.layer_count; i++) {
uint32_t new_fbd_flags =
prepare_fb_desc(cmdbuf, i, fbds.cpu + (fbd_sz * i));
/* Make sure all FBDs have the same flags. */
assert(i == 0 || new_fbd_flags == fbd_flags);
fbd_flags = new_fbd_flags;
}
struct cs_index layer_count = cs_sr_reg32(b, 47);
struct cs_index fbd_ptr = cs_sr_reg64(b, 48);
struct cs_index tiler_ptr = cs_sr_reg64(b, 50);
struct cs_index cur_tiler = cs_sr_reg64(b, 52);
struct cs_index remaining_layers_in_td = cs_sr_reg32(b, 54);
struct cs_index src_fbd_ptr = cs_sr_reg64(b, 56);
uint32_t td_count = 0;
if (cmdbuf->state.gfx.render.tiler) {
if (needs_tiling) {
td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
MAX_LAYERS_PER_TILER_DESC);
}
if (copy_fbds) {
cs_load64_to(
b, tiler_ptr, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr));
cs_wait_slot(b, SB_ID(LS), false);
cs_add64(b, fbd_ptr, tiler_ptr, pan_size(TILER_CONTEXT) * td_count);
cs_move64_to(b, src_fbd_ptr, fbds.gpu);
} else {
cs_move64_to(b, fbd_ptr, fbds.gpu);
if (cmdbuf->state.gfx.render.tiler)
cs_move64_to(b, tiler_ptr, cmdbuf->state.gfx.render.tiler);
}
if (cmdbuf->state.gfx.render.tiler) {
cs_add64(b, cur_tiler, tiler_ptr, 0);
cs_move32_to(b, remaining_layers_in_td, MAX_LAYERS_PER_TILER_DESC);
}
cs_move32_to(b, layer_count, cmdbuf->state.gfx.render.layer_count);
cs_req_res(b, CS_FRAG_RES);
cs_while(b, MALI_CS_CONDITION_GREATER, layer_count) {
if (copy_fbds) {
for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) {
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 16), src_fbd_ptr,
BITFIELD_MASK(16), fbd_off);
cs_wait_slot(b, SB_ID(LS), false);
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), fbd_ptr,
BITFIELD_MASK(16), fbd_off);
cs_wait_slot(b, SB_ID(LS), false);
}
if (cmdbuf->state.gfx.render.layer_count > 1) {
struct cs_index layer_count = cs_sr_reg32(b, 47);
cs_add64(b, src_fbd_ptr, src_fbd_ptr, fbd_sz);
cs_move32_to(b, layer_count, cmdbuf->state.gfx.render.layer_count);
cs_while(b, MALI_CS_CONDITION_GREATER, layer_count) {
cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
cs_add32(b, layer_count, layer_count, -1);
cs_update_frag_ctx(b)
cs_add64(b, cs_sr_reg64(b, 40), cs_sr_reg64(b, 40), fbd_sz);
}
if (cmdbuf->state.gfx.render.tiler) {
cs_store64(b, cur_tiler, fbd_ptr, 56);
cs_wait_slot(b, SB_ID(LS), false);
}
cs_update_frag_ctx(b)
cs_add64(b, cs_sr_reg64(b, 40), fbd_ptr, fbd_flags);
} else {
cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
cs_add64(b, fbd_ptr, fbd_ptr, fbd_sz);
cs_add32(b, layer_count, layer_count, -1);
if (cmdbuf->state.gfx.render.tiler) {
cs_add32(b, remaining_layers_in_td, remaining_layers_in_td, -1);
cs_if(b, MALI_CS_CONDITION_LEQUAL, remaining_layers_in_td) {
cs_add64(b, cur_tiler, cur_tiler, pan_size(TILER_CONTEXT));
cs_move32_to(b, remaining_layers_in_td, MAX_LAYERS_PER_TILER_DESC);
}
}
}
cs_req_res(b, 0);
@@ -1975,9 +2051,11 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
struct cs_index add_val = cs_scratch_reg64(b, 4);
struct cs_index release_sz = cs_scratch_reg32(b, 5);
struct cs_index ringbuf_sync_addr = cs_scratch_reg64(b, 6);
struct cs_index completed = cs_scratch_reg_tuple(b, 10, 4);
struct cs_index completed_top = cs_scratch_reg64(b, 10);
struct cs_index completed_bottom = cs_scratch_reg64(b, 12);
struct cs_index cur_tiler = cs_sr_reg64(b, 38);
struct cs_index tiler_count = cs_sr_reg32(b, 47);
cs_move64_to(b, add_val, 1);
@@ -1985,7 +2063,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
BITFIELD_MASK(3),
offsetof(struct panvk_cs_subqueue_context, syncobjs));
if (copy_fbds) {
if (free_render_descs) {
cs_move32_to(b, release_sz, calc_render_descs_size(cmdbuf));
cs_load64_to(b, ringbuf_sync_addr, cs_subqueue_ctx_reg(b),
offsetof(struct panvk_cs_subqueue_context,
@@ -1997,7 +2075,6 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
cs_add64(b, sync_addr, sync_addr,
PANVK_SUBQUEUE_FRAGMENT * sizeof(struct panvk_cs_sync64));
cs_move32_to(b, tiler_count, td_count);
cs_add64(b, cur_tiler, tiler_ptr, 0);
cs_match(b, iter_sb, cmp_scratch) {
#define CASE(x) \
@@ -2014,12 +2091,13 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
cs_wait_slot(b, SB_ID(LS), false); \
cs_finish_fragment(b, false, completed_top, completed_bottom, \
async); \
cs_add64(b, cur_tiler, cur_tiler, pan_size(TILER_CONTEXT)); \
cs_update_frag_ctx(b) \
cs_add64(b, cur_tiler, cur_tiler, pan_size(TILER_CONTEXT)); \
cs_add32(b, tiler_count, tiler_count, -1); \
} \
cs_frag_end(b, async); \
} \
if (copy_fbds) { \
if (free_render_descs) { \
cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_CSG, release_sz, \
ringbuf_sync_addr, async); \
} \
@@ -2041,15 +2119,12 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
cs_wait_slot(b, SB_ID(LS), false);
/* Update the ring buffer position. */
if (copy_fbds)
if (free_render_descs)
cs_render_desc_ringbuf_move_ptr(b, calc_render_descs_size(cmdbuf));
/* Update the frag seqno. */
++cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].relative_sync_point;
memset(&cmdbuf->state.gfx.render.fbds, 0,
sizeof(cmdbuf->state.gfx.render.fbds));
cmdbuf->state.gfx.render.tiler = 0;
return VK_SUCCESS;
}
@@ -2058,37 +2133,61 @@ void
panvk_per_arch(cmd_flush_draws)(struct panvk_cmd_buffer *cmdbuf)
{
/* If there was no draw queued, we don't need to force a preload. */
if (!cmdbuf->state.gfx.render.fbds.gpu)
return;
if (cmdbuf->state.gfx.render.fbds.gpu || inherits_render_ctx(cmdbuf)) {
flush_tiling(cmdbuf);
issue_fragment_jobs(cmdbuf);
memset(&cmdbuf->state.gfx.render.fbds, 0,
sizeof(cmdbuf->state.gfx.render.fbds));
cmdbuf->state.gfx.render.tiler = 0;
flush_tiling(cmdbuf);
issue_fragment_jobs(cmdbuf);
memset(&cmdbuf->state.gfx.render.fbds, 0,
sizeof(cmdbuf->state.gfx.render.fbds));
cmdbuf->state.gfx.render.tiler = 0;
panvk_per_arch(cmd_force_fb_preload)(cmdbuf, NULL);
panvk_per_arch(cmd_force_fb_preload)(cmdbuf, NULL);
/* We inherited the render context, and need to let the primary command
* buffer know that it's changed. */
cmdbuf->state.gfx.render.invalidate_inherited_ctx = true;
/* Re-emit the FB/Tiler descs if we inherited them. */
if (inherits_render_ctx(cmdbuf))
get_render_ctx(cmdbuf);
}
}
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdEndRendering)(VkCommandBuffer commandBuffer)
{
VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
bool suspending = cmdbuf->state.gfx.render.flags & VK_RENDERING_SUSPENDING_BIT;
if (!(cmdbuf->state.gfx.render.flags & VK_RENDERING_SUSPENDING_BIT)) {
if (!suspending) {
struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
bool clear = fbinfo->zs.clear.z | fbinfo->zs.clear.s;
for (unsigned i = 0; i < fbinfo->rt_count; i++)
clear |= fbinfo->rts[i].clear;
if (clear) {
if (clear && !inherits_render_ctx(cmdbuf)) {
VkResult result = get_fb_descs(cmdbuf);
if (result != VK_SUCCESS)
return;
}
flush_tiling(cmdbuf);
issue_fragment_jobs(cmdbuf);
panvk_per_arch(cmd_resolve_attachments)(cmdbuf);
if (cmdbuf->state.gfx.render.fbds.gpu || inherits_render_ctx(cmdbuf)) {
flush_tiling(cmdbuf);
issue_fragment_jobs(cmdbuf);
}
} else if (!inherits_render_ctx(cmdbuf)) {
/* If we're suspending the render pass and we didn't inherit the render
* context, we need to emit it now, so it's available when the render pass
* is resumed. */
VkResult result = get_render_ctx(cmdbuf);
if (result != VK_SUCCESS)
return;
}
memset(&cmdbuf->state.gfx.render.fbds, 0,
sizeof(cmdbuf->state.gfx.render.fbds));
cmdbuf->state.gfx.render.tiler = 0;
/* If we're not suspending, we need to resolve attachments. */
if (!suspending)
panvk_per_arch(cmd_resolve_attachments)(cmdbuf);
}


@@ -70,6 +70,10 @@ struct panvk_rendering_state {
#if PAN_ARCH >= 10
struct panfrost_ptr fbds;
mali_ptr tiler;
/* When a secondary command buffer has to flush draws, it disturbs the
* inherited context, and the primary command buffer needs to know. */
bool invalidate_inherited_ctx;
#endif
};