tu: Add support for "unresolve" ops

These ops replicate the single-sampled source attachment to the
multi-sampled destination attachment before the start of a subpass. This
is the new hardware feature for
VK_EXT_multisample_render_to_single_sampled, and the actual
implementation of the extension emulates everything on top of these.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37919>
This commit is contained in:
Connor Abbott 2025-10-16 09:20:36 -04:00 committed by Marge Bot
parent 7542d5068c
commit 9c5012b03c
5 changed files with 138 additions and 12 deletions

View file

@ -1253,10 +1253,11 @@ r3d_src_stencil(struct tu_cmd_buffer *cmd,
}
static void
r3d_src_gmem_load(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
const struct tu_image_view *iview,
uint32_t layer)
r3d_src_load(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
const struct tu_image_view *iview,
uint32_t layer,
bool override_swap)
{
uint32_t desc[A6XX_TEX_CONST_DWORDS];
@ -1281,8 +1282,9 @@ r3d_src_gmem_load(struct tu_cmd_buffer *cmd,
* GMEM, so we need to fixup the swizzle and swap.
*/
desc[0] &= ~(A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
A6XX_TEX_CONST_0_SWAP__MASK);
A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
if (override_swap)
desc[0] &= ~A6XX_TEX_CONST_0_SWAP__MASK;
desc[0] |= A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
@ -1294,6 +1296,24 @@ r3d_src_gmem_load(struct tu_cmd_buffer *cmd,
VK_FILTER_NEAREST);
}
/* GMEM-load flavour of r3d_src_load().
 *
 * Forwards all arguments and requests the swap override, so the SWAP field
 * of the source texture descriptor is cleared along with the swizzle fixup.
 */
static void
r3d_src_gmem_load(struct tu_cmd_buffer *cmd,
                  struct tu_cs *cs,
                  const struct tu_image_view *iview,
                  uint32_t layer)
{
   const bool override_swap = true;

   r3d_src_load(cmd, cs, iview, layer, override_swap);
}
/* Sysmem-load flavour of r3d_src_load().
 *
 * Forwards all arguments without requesting the swap override, so the SWAP
 * field of the source texture descriptor is left as the image view set it;
 * only the swizzle fixup is applied.
 */
static void
r3d_src_sysmem_load(struct tu_cmd_buffer *cmd,
                    struct tu_cs *cs,
                    const struct tu_image_view *iview,
                    uint32_t layer)
{
   const bool override_swap = false;

   r3d_src_load(cmd, cs, iview, layer, override_swap);
}
template <chip CHIP>
static void
r3d_src_gmem(struct tu_cmd_buffer *cmd,
@ -3576,6 +3596,11 @@ resolve_sysmem(struct tu_cmd_buffer *cmd,
{
const struct blit_ops *ops = &r2d_ops<CHIP>;
/* A2D does not support "unresolve". */
if (dst->image->layout[0].nr_samples > 1) {
ops = &r3d_ops<CHIP>;
}
trace_start_sysmem_resolve(&cmd->rp_trace, cs, cmd, vk_dst_format);
enum pipe_format src_format = vk_format_to_pipe_format(vk_src_format);
@ -3595,7 +3620,11 @@ resolve_sysmem(struct tu_cmd_buffer *cmd,
ops->src_stencil(cmd, cs, src, i, VK_FILTER_NEAREST);
}
} else {
ops->src(cmd, cs, &src->view, i, VK_FILTER_NEAREST, dst_format);
if (ops == &r3d_ops<CHIP>) {
r3d_src_sysmem_load(cmd, cs, src, i);
} else {
ops->src(cmd, cs, &src->view, i, VK_FILTER_NEAREST, dst_format);
}
}
if (dst_separate_ds) {
@ -5081,12 +5110,13 @@ tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_resolve_group *resolve_group,
uint32_t a,
uint32_t gmem_a,
bool cond_exec_allowed,
bool force_load)
{
const struct tu_image_view *iview = cmd->state.attachments[a];
const struct tu_render_pass_attachment *attachment =
&cmd->state.pass->attachments[a];
&cmd->state.pass->attachments[gmem_a];
bool load_common = attachment->load || force_load;
bool load_stencil =
@ -5110,7 +5140,10 @@ tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
tu_begin_load_store_cond_exec(cmd, cs, true);
if (TU_DEBUG(3D_LOAD) ||
cmd->state.pass->has_fdm) {
cmd->state.pass->has_fdm ||
/* Replicating unresolve seems to not work and the blob never uses it.
*/
(a != gmem_a)) {
if (load_common || load_stencil)
tu_disable_draw_states(cmd, cs);

View file

@ -70,6 +70,7 @@ tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_resolve_group *resolve_group,
uint32_t a,
uint32_t gmem_a,
bool cond_exec_allowed,
bool force_load);

View file

@ -1521,6 +1521,62 @@ tu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd,
}
}
/* Emit one sysmem "unresolve": copy attachment `a` into attachment `gmem_a`.
 *
 * Both image views are looked up in cmd->state.attachments and handed to
 * tu_resolve_sysmem() together with the caller's layer mask, the
 * framebuffer layer count and the current render area.
 */
template <chip CHIP>
static void
tu6_emit_sysmem_unresolve(struct tu_cmd_buffer *cmd,
                          struct tu_cs *cs,
                          uint32_t layer_mask,
                          uint32_t a,
                          uint32_t gmem_a)
{
   /* Destination of the unresolve (the attachment being replicated into). */
   const struct tu_image_view *replicated_dst = cmd->state.attachments[gmem_a];
   /* Source of the unresolve. */
   const struct tu_image_view *replicated_src = cmd->state.attachments[a];
   const struct tu_framebuffer *framebuffer = cmd->state.framebuffer;

   tu_resolve_sysmem<CHIP>(cmd, cs, replicated_src, replicated_dst,
                           layer_mask, framebuffer->layers,
                           &cmd->state.render_area);
}
template <chip CHIP>
static void
tu6_emit_sysmem_unresolves(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
const struct tu_subpass *subpass)
{
if (subpass->unresolve_count) {
/* Similar to above, we need to explicitly flush afterwards to keep this
* in sync with draw commands. However we also don't currently insert
* dependencies when a resolve is followed by an unresolve so we also
* need to manually flush for that case.
*/
tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
/* Wait for the flushes to land before using the 2D engine */
tu_cs_emit_wfi(cs);
bool unresolve_ds = false;
for (unsigned i = 0; i < subpass->unresolve_count; i++) {
uint32_t a = subpass->unresolve_attachments[i].attachment;
if (a == VK_ATTACHMENT_UNUSED)
continue;
if (vk_format_is_depth_or_stencil(cmd->state.pass->attachments[a].format))
unresolve_ds = true;
uint32_t gmem_a = tu_subpass_get_attachment_to_unresolve(subpass, i);
tu6_emit_sysmem_unresolve<CHIP>(cmd, cs, subpass->multiview_mask, a, gmem_a);
}
tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_INVALIDATE_COLOR);
if (unresolve_ds) {
tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_DEPTH);
tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_INVALIDATE_DEPTH);
}
tu_cs_emit_wfi(cs);
}
}
template <chip CHIP>
static void
tu6_emit_gmem_resolves(struct tu_cmd_buffer *cmd,
@ -1552,7 +1608,7 @@ tu6_emit_gmem_resolves(struct tu_cmd_buffer *cmd,
"TODO: missing GMEM->GMEM resolve path\n");
if (CHIP >= A7XX)
tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);
tu_load_gmem_attachment<CHIP>(cmd, cs, resolve_group, a, false, true);
tu_load_gmem_attachment<CHIP>(cmd, cs, resolve_group, a, a, false, true);
}
}
}
@ -5607,11 +5663,27 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r
tu6_emit_blit_scissor(cmd, cs, true, false);
emitted_scissor = true;
}
tu_load_gmem_attachment<CHIP>(cmd, cs, resolve_group, i,
tu_load_gmem_attachment<CHIP>(cmd, cs, resolve_group, i, i,
cond_load_allowed, false);
}
}
/* Emit unresolves that replicate single-sampled attachments into
* multisampled GMEM attachments.
*/
for (uint32_t i = 0; i < cmd->state.subpass->unresolve_count; ++i) {
uint32_t a = cmd->state.subpass->unresolve_attachments[i].attachment;
if (a == VK_ATTACHMENT_UNUSED)
continue;
uint32_t gmem_a =
tu_subpass_get_attachment_to_unresolve(cmd->state.subpass, i);
tu_load_gmem_attachment<CHIP>(cmd, cs, resolve_group, a, gmem_a,
cond_load_allowed, true);
}
if (!cmd->device->physical_device->info->a7xx.has_generic_clear) {
/* Emit gmem clears that are first used in this subpass. */
emitted_scissor = false;
@ -5640,18 +5712,23 @@ template <chip CHIP>
/* Emit the sysmem-only work needed at the start of the current subpass.
 *
 * Returns without emitting anything when the device has generic clear and
 * the subpass has no unresolve ops. Otherwise records into cmd->draw_cs,
 * inside a CP_COND_EXEC_0_RENDER_MODE_SYSMEM conditional block:
 *   1. the subpass's sysmem unresolves, then
 *   2. a sysmem clear for every attachment that has a clear mask and is
 *      first used in this subpass.
 *
 * NOTE(review): this listing is a diff rendering without +/- markers; the
 * first single-line `if` below appears to be the pre-change condition and
 * the two-line `if` after it the replacement. Confirm against the tree.
 */
static void
tu_emit_subpass_begin_sysmem(struct tu_cmd_buffer *cmd)
{
if (cmd->device->physical_device->info->a7xx.has_generic_clear)
if (cmd->device->physical_device->info->a7xx.has_generic_clear &&
!cmd->state.subpass->unresolve_count)
return;
struct tu_cs *cs = &cmd->draw_cs;
/* Index of the current subpass within the render pass. */
uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses;
tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
/* Unresolves are emitted ahead of the clears below. */
tu6_emit_sysmem_unresolves<CHIP>(cmd, cs, cmd->state.subpass);
for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) {
struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[i];
/* Only clear attachments whose first use is this subpass. */
if (att->clear_mask && att->first_subpass_idx == subpass_idx)
tu_clear_sysmem_attachment<CHIP>(cmd, cs, i);
}
tu_cond_exec_end(cs); /* sysmem */
}

View file

@ -1491,3 +1491,13 @@ tu_subpass_get_attachment_to_resolve(const struct tu_subpass *subpass, uint32_t
return subpass->color_attachments[index].attachment;
}
/* Map an unresolve slot index to the attachment it unresolves into.
 *
 * The slot directly after the color attachments (index == color_count),
 * when it is also the last unresolve slot, refers to the depth/stencil
 * attachment. Every other index refers to the color attachment with the
 * same index.
 */
uint32_t
tu_subpass_get_attachment_to_unresolve(const struct tu_subpass *subpass, uint32_t index)
{
   const uint32_t last_slot = subpass->unresolve_count - 1;
   const bool is_depth_stencil_slot =
      index == subpass->color_count && index == last_slot;

   return is_depth_stencil_slot
             ? subpass->depth_stencil_attachment.attachment
             : subpass->color_attachments[index].attachment;
}

View file

@ -48,6 +48,7 @@ struct tu_subpass
uint32_t input_count;
uint32_t color_count;
uint32_t resolve_count;
uint32_t unresolve_count;
bool resolve_depth_stencil;
bool legacy_dithering_enabled;
@ -64,6 +65,7 @@ struct tu_subpass
struct tu_subpass_attachment *input_attachments;
struct tu_subpass_attachment *color_attachments;
struct tu_subpass_attachment *resolve_attachments;
struct tu_subpass_attachment *unresolve_attachments;
struct tu_subpass_attachment depth_stencil_attachment;
uint32_t fsr_attachment;
@ -157,4 +159,7 @@ void tu_setup_dynamic_inheritance(struct tu_cmd_buffer *cmd_buffer,
uint32_t
tu_subpass_get_attachment_to_resolve(const struct tu_subpass *subpass, uint32_t index);
/* Return the attachment index that unresolve slot `index` of `subpass`
 * writes to: the depth/stencil attachment when `index` equals color_count
 * and is the last unresolve slot, otherwise color attachment `index`.
 */
uint32_t
tu_subpass_get_attachment_to_unresolve(const struct tu_subpass *subpass, uint32_t index);
#endif /* TU_PASS_H */