tu/a7xx: use concurrent resolve groups

Establish groups of resolve and unresolve operations that the a7xx
hardware can execute more efficiently. Grouping these operations lets
command stream processing continue while the grouped (un)resolves are
in progress, as long as subsequent operations don't depend on them.

To enable concurrent resolves and unresolves, the corresponding fields
in the RB_CCU_CNTL register have to be set appropriately.

Resolve groups are tracked through a locally scoped struct that records
any pending resolve operations. Once the group is complete, the emit
helper writes the CCU_END_RESOLVE_GROUP event to the command stream.
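
In practice the tracking brackets a batch of blit events with a
stack-allocated group; a minimal sketch of the pattern, lifted from the
tu_CmdClearColorImage() hunk below:

  struct tu_resolve_group resolve_group = {};
  for (unsigned i = 0; i < rangeCount; i++) {
     /* Reserves a buffer ID slot and marks the group as having
      * pending resolves. */
     uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(
        &resolve_group, TU_RESOLVE_GROUP_COLOR_BUFFER);
     /* ... emit the blit event for this range, tagged with buffer_id ... */
  }
  /* Emits CCU_END_RESOLVE_GROUP only if any resolves were grouped. */
  tu_emit_resolve_group<CHIP>(cmd, &cmd->cs, &resolve_group);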

The buffer ID field in the RB_BLIT_INFO register is used to spread
resolve operations across the available slots in the resolve engine.
IDs 0x8 and 0x9 are reserved for depth and stencil buffers, while the
0x0-0x7 range is used for color buffers. A simple incrementing counter
assigns IDs to the color buffers within a resolve group. Two color or
depth/stencil buffers inside the same resolve group can end up with
identical IDs, but the hardware doesn't seem to have a problem
handling that.
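
For example, a group that clears three color attachments plus the depth
and stencil aspects of a D32_SFLOAT_S8_UINT attachment would tag them
0x0, 0x1, 0x2, 0x8 and 0x9; the color counter only wraps back to 0x0
once more than eight color buffers join the same group.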

Two TU_DEBUG options are provided: 'noconcurrentresolves' and
'noconcurrentunresolves' disable the respective operation by adjusting
the mode set through RB_CCU_CNTL.
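
For example, both modes can be disabled at once to check whether
concurrent (un)resolves are implicated in a rendering issue
(hypothetical invocation; any Vulkan application works):

  TU_DEBUG=noconcurrentresolves,noconcurrentunresolves ./my_vulkan_app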

Signed-off-by: Zan Dobersek <zdobersek@igalia.com>
Reviewed-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31190>

@@ -1516,6 +1516,8 @@ aspect_write_mask_generic_clear(enum pipe_format format, VkImageAspectFlags aspe
mask = 0x1;
if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
mask = 0x2;
if (aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))
mask = 0x3;
}
return mask;
}
@@ -1882,6 +1884,7 @@ pack_blit_event_clear_value(const VkClearValue *val, enum pipe_format format, ui
static void
event_blit_setup(struct tu_cs *cs,
uint32_t buffer_id,
const struct tu_render_pass_attachment *att,
enum a6xx_blit_event_type blit_event_type,
uint32_t clear_mask)
@@ -1899,7 +1902,8 @@ event_blit_setup(struct tu_cs *cs,
vk_format_is_int(att->format) ||
vk_format_is_depth_or_stencil(att->format),
.depth = vk_format_is_depth_or_stencil(att->format),
.clear_mask = clear_mask, ));
.clear_mask = clear_mask,
.buffer_id = buffer_id));
}
struct event_blit_dst_view {
@@ -1984,6 +1988,7 @@ event_blit_run(struct tu_cmd_buffer *cmd,
static void
tu7_generic_layer_clear(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t buffer_id,
enum pipe_format format,
uint8_t clear_mask,
bool separate_stencil,
@@ -2003,7 +2008,7 @@ tu7_generic_layer_clear(struct tu_cmd_buffer *cmd,
event_blit_dst_view blt_view = blt_view_from_tu_view(iview, layer);
event_blit_setup(cs, att, BLIT_EVENT_CLEAR, clear_mask);
event_blit_setup(cs, buffer_id, att, BLIT_EVENT_CLEAR, clear_mask);
event_blit_run<A7XX>(cmd, cs, att, &blt_view, separate_stencil);
}
@@ -3476,6 +3481,70 @@ tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
}
TU_GENX(tu_resolve_sysmem);
enum tu_resolve_group_buffer_type {
TU_RESOLVE_GROUP_COLOR_BUFFER,
TU_RESOLVE_GROUP_DEPTH_BUFFER,
TU_RESOLVE_GROUP_STENCIL_BUFFER,
};
template <chip CHIP>
static uint32_t
tu_resolve_group_include_buffer(struct tu_resolve_group *resolve_group,
enum tu_resolve_group_buffer_type buffer_type)
{
/* Resolve groups are not usable on a6xx, so no pending resolve is
* established. The default value of 0 is returned as the buffer ID.
*/
if (CHIP == A6XX)
return 0;
resolve_group->pending_resolves = true;
if (buffer_type == TU_RESOLVE_GROUP_DEPTH_BUFFER)
return 0x8;
if (buffer_type == TU_RESOLVE_GROUP_STENCIL_BUFFER)
return 0x9;
const uint32_t max_color_buffers = 8;
uint32_t buffer_id = resolve_group->color_buffer_id++;
return buffer_id % max_color_buffers;
}
template <chip CHIP>
static uint32_t
tu_resolve_group_include_buffer_for_format(struct tu_resolve_group *resolve_group,
VkFormat format)
{
enum tu_resolve_group_buffer_type buffer_type = TU_RESOLVE_GROUP_COLOR_BUFFER;
/* D24_UNORM_S8_UINT should be assigned the depth buffer type, regardless of
* whether depth, stencil or both are being resolved.
*/
if (format == VK_FORMAT_D24_UNORM_S8_UINT)
buffer_type = TU_RESOLVE_GROUP_DEPTH_BUFFER;
return tu_resolve_group_include_buffer<CHIP>(resolve_group, buffer_type);
}
template <chip CHIP>
void
tu_emit_resolve_group(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_resolve_group *resolve_group)
{
/* Resolve groups are not usable on A6XX, so that template instantiation
* should behave as a no-op.
*/
if (CHIP == A6XX || !resolve_group->pending_resolves)
return;
resolve_group->color_buffer_id = 0;
resolve_group->pending_resolves = false;
tu_emit_raw_event_write<CHIP>(cmd, cs, CCU_END_RESOLVE_GROUP, false);
}
TU_GENX(tu_emit_resolve_group);
template <chip CHIP>
static void
clear_image_cp_blit(struct tu_cmd_buffer *cmd,
@@ -3538,6 +3607,7 @@ clear_image_cp_blit(struct tu_cmd_buffer *cmd,
static void
clear_image_event_blit(struct tu_cmd_buffer *cmd,
struct tu_image *image,
uint32_t buffer_id,
const VkClearValue *clear_value,
const VkImageSubresourceRange *range,
VkImageAspectFlags aspect_mask)
@@ -3573,7 +3643,8 @@ clear_image_event_blit(struct tu_cmd_buffer *cmd,
.sample_0 = vk_format_is_int(vk_format) ||
vk_format_is_depth_or_stencil(vk_format),
.depth = vk_format_is_depth_or_stencil(vk_format),
.clear_mask = aspect_write_mask_generic_clear(format, aspect_mask)));
.clear_mask = aspect_write_mask_generic_clear(format, aspect_mask),
.buffer_id = buffer_id));
uint32_t clear_vals[4] = {};
pack_blit_event_clear_value(clear_value, format, clear_vals);
@@ -3656,12 +3727,13 @@ template <chip CHIP>
static void
clear_image(struct tu_cmd_buffer *cmd,
struct tu_image *image,
uint32_t buffer_id,
const VkClearValue *clear_value,
const VkImageSubresourceRange *range,
VkImageAspectFlags aspect_mask)
{
if (use_generic_clear_for_image_clear(cmd, image)) {
clear_image_event_blit(cmd, image, clear_value, range, aspect_mask);
clear_image_event_blit(cmd, image, buffer_id, clear_value, range, aspect_mask);
} else {
clear_image_cp_blit<CHIP>(cmd, image, clear_value, range, aspect_mask);
}
@@ -3686,9 +3758,14 @@ tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
tu_emit_cache_flush<CHIP>(cmd);
}
struct tu_resolve_group resolve_group = {};
for (unsigned i = 0; i < rangeCount; i++) {
clear_image<CHIP>(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(&resolve_group, TU_RESOLVE_GROUP_COLOR_BUFFER);
clear_image<CHIP>(cmd, image, buffer_id, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
}
tu_emit_resolve_group<CHIP>(cmd, &cmd->cs, &resolve_group);
}
TU_GENX(tu_CmdClearColorImage);
@@ -3712,19 +3789,31 @@ tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
tu_emit_cache_flush<CHIP>(cmd);
}
struct tu_resolve_group resolve_group = {};
for (unsigned i = 0; i < rangeCount; i++) {
const VkImageSubresourceRange *range = &pRanges[i];
if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
/* can't clear both depth and stencil at once, split up the aspect mask */
u_foreach_bit(b, range->aspectMask)
clear_image<CHIP>(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b));
u_foreach_bit(b, range->aspectMask) {
uint32_t buffer_id = 0;
if (BIT(b) == VK_IMAGE_ASPECT_DEPTH_BIT)
buffer_id = tu_resolve_group_include_buffer<CHIP>(&resolve_group, TU_RESOLVE_GROUP_DEPTH_BUFFER);
if (BIT(b) == VK_IMAGE_ASPECT_STENCIL_BIT)
buffer_id = tu_resolve_group_include_buffer<CHIP>(&resolve_group, TU_RESOLVE_GROUP_STENCIL_BUFFER);
clear_image<CHIP>(cmd, image, buffer_id, (const VkClearValue*) pDepthStencil, range, BIT(b));
}
continue;
}
clear_image<CHIP>(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
uint32_t buffer_id = tu_resolve_group_include_buffer_for_format<CHIP>(&resolve_group, image->vk.format);
clear_image<CHIP>(cmd, image, buffer_id, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
}
tu_emit_resolve_group<CHIP>(cmd, &cmd->cs, &resolve_group);
tu_lrz_clear_depth_image<CHIP>(cmd, image, pDepthStencil, rangeCount, pRanges);
}
TU_GENX(tu_CmdClearDepthStencilImage);
@@ -3933,6 +4022,7 @@ template <chip CHIP>
static void
clear_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t buffer_id,
enum pipe_format format,
uint8_t clear_mask,
uint32_t gmem_offset,
@@ -3943,7 +4033,8 @@ clear_gmem_attachment(struct tu_cmd_buffer *cmd,
blit_base_format<CHIP>(format, false, true)));
tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.type = BLIT_EVENT_CLEAR,
.clear_mask = clear_mask));
.clear_mask = clear_mask,
.buffer_id = buffer_id));
tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
tu_cs_emit(cs, gmem_offset);
@@ -3964,6 +4055,7 @@ template <chip CHIP>
static void
tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_resolve_group *resolve_group,
uint32_t attachment,
uint32_t base_layer,
uint32_t layers,
@@ -3984,15 +4076,18 @@ tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
uint32_t layer = i + base_layer;
if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
if (mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
clear_gmem_attachment<CHIP>(cmd, cs, PIPE_FORMAT_Z32_FLOAT, 0xf,
uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, TU_RESOLVE_GROUP_DEPTH_BUFFER);
clear_gmem_attachment<CHIP>(cmd, cs, buffer_id, PIPE_FORMAT_Z32_FLOAT, 0xf,
tu_attachment_gmem_offset(cmd, att, layer), value);
}
if (mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
clear_gmem_attachment<CHIP>(cmd, cs, PIPE_FORMAT_S8_UINT, 0xf,
uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, TU_RESOLVE_GROUP_STENCIL_BUFFER);
clear_gmem_attachment<CHIP>(cmd, cs, buffer_id, PIPE_FORMAT_S8_UINT, 0xf,
tu_attachment_gmem_offset_stencil(cmd, att, layer), value);
}
} else {
clear_gmem_attachment<CHIP>(cmd, cs, format, aspect_write_mask(format, mask),
uint32_t buffer_id = tu_resolve_group_include_buffer_for_format<CHIP>(resolve_group, att->format);
clear_gmem_attachment<CHIP>(cmd, cs, buffer_id, format, aspect_write_mask(format, mask),
tu_attachment_gmem_offset(cmd, att, layer), value);
}
}
@@ -4016,6 +4111,8 @@ tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
if (rect_count > 1)
perf_debug(cmd->device, "TODO: Swap tu_clear_gmem_attachments() loop for smaller command stream");
struct tu_resolve_group resolve_group = {};
for (unsigned i = 0; i < rect_count; i++) {
unsigned x1 = rects[i].rect.offset.x;
unsigned y1 = rects[i].rect.offset.y;
@@ -4036,13 +4133,16 @@ tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
if (a == VK_ATTACHMENT_UNUSED)
continue;
tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, a, rects[i].baseArrayLayer,
tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, &resolve_group, a,
rects[i].baseArrayLayer,
rects[i].layerCount,
subpass->multiview_mask,
attachments[j].aspectMask,
&attachments[j].clearValue);
}
}
tu_emit_resolve_group<CHIP>(cmd, cs, &resolve_group);
}
template <chip CHIP>
@@ -4109,6 +4209,7 @@ static void
tu7_clear_attachment_generic_single_rect(
struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_resolve_group *resolve_group,
const struct tu_render_pass_attachment *att,
const VkClearAttachment *clear_att,
uint32_t a,
@@ -4136,15 +4237,18 @@ tu7_clear_attachment_generic_single_rect(
if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
if (clear_att->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
tu7_generic_layer_clear(cmd, cs, PIPE_FORMAT_Z32_FLOAT, mask,
uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, TU_RESOLVE_GROUP_DEPTH_BUFFER);
tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_Z32_FLOAT, mask,
false, layer, value, a);
}
if (clear_att->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
tu7_generic_layer_clear(cmd, cs, PIPE_FORMAT_S8_UINT, mask, true,
uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, TU_RESOLVE_GROUP_STENCIL_BUFFER);
tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_S8_UINT, mask, true,
layer, value, a);
}
} else {
tu7_generic_layer_clear(cmd, cs, format, mask, false, layer, value, a);
uint32_t buffer_id = tu_resolve_group_include_buffer_for_format<A7XX>(resolve_group, att->format);
tu7_generic_layer_clear(cmd, cs, buffer_id, format, mask, false, layer, value, a);
}
}
}
@@ -4178,6 +4282,8 @@ tu_clear_attachments_generic(struct tu_cmd_buffer *cmd,
tu_cs_emit_wfi(cs);
tu_cond_exec_end(cs);
struct tu_resolve_group resolve_group = {};
const struct tu_subpass *subpass = cmd->state.subpass;
for (uint32_t i = 0; i < attachmentCount; i++) {
uint32_t a;
@@ -4194,11 +4300,13 @@ tu_clear_attachments_generic(struct tu_cmd_buffer *cmd,
iview->view.ubwc_enabled, att->samples);
for (unsigned j = 0; j < rectCount; j++) {
tu7_clear_attachment_generic_single_rect(
cmd, cs, att, &pAttachments[i], a, &pRects[j]);
cmd, cs, &resolve_group, att, &pAttachments[i], a, &pRects[j]);
}
trace_end_generic_clear(&cmd->trace, cs);
}
}
tu_emit_resolve_group<A7XX>(cmd, cs, &resolve_group);
}
template <chip CHIP>
@@ -4330,6 +4438,7 @@ template <chip CHIP>
void
tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_resolve_group *resolve_group,
uint32_t a)
{
const struct tu_render_pass_attachment *attachment =
@@ -4338,7 +4447,8 @@ tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
if (!attachment->clear_mask)
return;
tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, a, 0, cmd->state.framebuffer->layers,
tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, resolve_group, a, 0,
cmd->state.framebuffer->layers,
attachment->clear_views,
attachment->clear_mask,
&cmd->state.clear_values[a]);
@@ -4346,7 +4456,10 @@ tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
TU_GENX(tu_clear_gmem_attachment);
void
tu7_generic_clear_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a)
tu7_generic_clear_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_resolve_group *resolve_group,
uint32_t a)
{
const struct tu_render_pass_attachment *att =
&cmd->state.pass->attachments[a];
@@ -4363,15 +4476,18 @@ tu7_generic_clear_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32
aspect_write_mask_generic_clear(format, att->clear_mask);
if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
if (att->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
tu7_generic_layer_clear(cmd, cs, PIPE_FORMAT_Z32_FLOAT, mask,
uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, TU_RESOLVE_GROUP_DEPTH_BUFFER);
tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_Z32_FLOAT, mask,
false, layer, value, a);
}
if (att->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
tu7_generic_layer_clear(cmd, cs, PIPE_FORMAT_S8_UINT, mask, true,
uint32_t buffer_id = tu_resolve_group_include_buffer<A7XX>(resolve_group, TU_RESOLVE_GROUP_STENCIL_BUFFER);
tu7_generic_layer_clear(cmd, cs, buffer_id, PIPE_FORMAT_S8_UINT, mask, true,
layer, value, a);
}
} else {
tu7_generic_layer_clear(cmd, cs, format, mask, false, layer, value, a);
uint32_t buffer_id = tu_resolve_group_include_buffer_for_format<A7XX>(resolve_group, att->format);
tu7_generic_layer_clear(cmd, cs, buffer_id, format, mask, false, layer, value, a);
}
}
@@ -4385,6 +4501,7 @@ template <chip CHIP>
static void
tu_emit_blit(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_resolve_group *resolve_group,
const struct tu_image_view *iview,
const struct tu_render_pass_attachment *attachment,
const VkClearValue *clear_value,
@@ -4426,7 +4543,18 @@ tu_emit_blit(struct tu_cmd_buffer *cmd,
tu_cs_emit_array(cs, clear_vals, 4);
}
event_blit_setup(cs, attachment, blit_event_type, clear_mask);
enum tu_resolve_group_buffer_type buffer_type = TU_RESOLVE_GROUP_COLOR_BUFFER;
if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
if (!separate_stencil)
buffer_type = TU_RESOLVE_GROUP_DEPTH_BUFFER;
else
buffer_type = TU_RESOLVE_GROUP_STENCIL_BUFFER;
} else if (attachment->format == VK_FORMAT_D24_UNORM_S8_UINT) {
buffer_type = TU_RESOLVE_GROUP_DEPTH_BUFFER;
}
uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, buffer_type);
event_blit_setup(cs, buffer_id, attachment, blit_event_type, clear_mask);
for_each_layer(i, attachment->clear_views, cmd->state.framebuffer->layers) {
event_blit_dst_view blt_view = blt_view_from_tu_view(iview, i);
@@ -4618,6 +4746,7 @@ template <chip CHIP>
void
tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_resolve_group *resolve_group,
uint32_t a,
bool cond_exec_allowed,
bool force_load)
@@ -4659,10 +4788,10 @@ tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
load_3d_blit<CHIP>(cmd, cs, iview, attachment, true);
} else {
if (load_common)
tu_emit_blit<CHIP>(cmd, cs, iview, attachment, NULL, BLIT_EVENT_LOAD, false);
tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, attachment, NULL, BLIT_EVENT_LOAD, false);
if (load_stencil)
tu_emit_blit<CHIP>(cmd, cs, iview, attachment, NULL, BLIT_EVENT_LOAD, true);
tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, attachment, NULL, BLIT_EVENT_LOAD, true);
}
if (cond_exec)
@@ -4928,6 +5057,7 @@ template <chip CHIP>
void
tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_resolve_group *resolve_group,
uint32_t a,
uint32_t gmem_a,
uint32_t layers,
@@ -4982,9 +5112,9 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
/* use fast path when render area is aligned, except for unsupported resolve cases */
if (use_fast_path) {
if (store_common)
tu_emit_blit<CHIP>(cmd, cs, iview, src, clear_value, BLIT_EVENT_STORE, false);
tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, src, clear_value, BLIT_EVENT_STORE, false);
if (store_separate_stencil)
tu_emit_blit<CHIP>(cmd, cs, iview, src, clear_value, BLIT_EVENT_STORE, true);
tu_emit_blit<CHIP>(cmd, cs, resolve_group, iview, src, clear_value, BLIT_EVENT_STORE, true);
if (cond_exec) {
tu_end_load_store_cond_exec(cmd, cs, false);

@@ -34,6 +34,17 @@ tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
uint32_t layers,
const VkRect2D *rect);
struct tu_resolve_group {
uint32_t color_buffer_id;
bool pending_resolves;
};
template <chip CHIP>
void
tu_emit_resolve_group(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_resolve_group *resolve_group);
template <chip CHIP>
void
tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
@@ -44,17 +55,20 @@ template <chip CHIP>
void
tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_resolve_group *resolve_group,
uint32_t a);
void
tu7_generic_clear_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_resolve_group *resolve_group,
uint32_t a);
template <chip CHIP>
void
tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_resolve_group *resolve_group,
uint32_t a,
bool cond_exec_allowed,
bool force_load);
@@ -64,6 +78,7 @@ template <chip CHIP>
void
tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
struct tu_resolve_group *resolve_group,
uint32_t a,
uint32_t gmem_a,
uint32_t layers,

@@ -44,7 +44,7 @@ tu_clone_trace(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
}
template <chip CHIP>
static void
void
tu_emit_raw_event_write(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
enum vgt_event_type event,
@@ -67,6 +67,7 @@ tu_emit_raw_event_write(struct tu_cmd_buffer *cmd,
tu_cs_emit(cs, 0);
}
}
TU_GENX(tu_emit_raw_event_write);
template <chip CHIP>
void
@@ -1241,6 +1242,8 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu6_emit_blit_scissor(cmd, cs, true);
struct tu_resolve_group resolve_group = {};
/* Resolve should happen before store in case BLIT_EVENT_STORE_AND_CLEAR is
* used for a store.
*/
@@ -1249,8 +1252,8 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
uint32_t a = subpass->resolve_attachments[i].attachment;
if (a != VK_ATTACHMENT_UNUSED) {
uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
tu_store_gmem_attachment<CHIP>(cmd, cs, a, gmem_a, fb->layers,
subpass->multiview_mask, false);
tu_store_gmem_attachment<CHIP>(cmd, cs, &resolve_group, a, gmem_a,
fb->layers, subpass->multiview_mask, false);
}
}
}
@@ -1259,12 +1262,14 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
if (pass->attachments[a].gmem) {
const bool cond_exec_allowed = cmd->state.tiling->binning_possible &&
cmd->state.pass->has_cond_load_store;
tu_store_gmem_attachment<CHIP>(cmd, cs, a, a,
tu_store_gmem_attachment<CHIP>(cmd, cs, &resolve_group, a, a,
fb->layers, subpass->multiview_mask,
cond_exec_allowed);
}
}
tu_emit_resolve_group<CHIP>(cmd, cs, &resolve_group);
if (pass->has_fdm)
tu_cs_set_writeable(cs, false);
}
@@ -1295,10 +1300,20 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
* change per-RP and don't require a WFI to take effect, only CCU inval/flush
* events are required.
*/
tu_cs_emit_regs(cs, RB_CCU_CNTL(CHIP,
enum a7xx_concurrent_resolve_mode resolve_mode = CONCURRENT_RESOLVE_MODE_2;
if (TU_DEBUG(NO_CONCURRENT_RESOLVES))
resolve_mode = CONCURRENT_RESOLVE_MODE_DISABLED;
enum a7xx_concurrent_unresolve_mode unresolve_mode = CONCURRENT_UNRESOLVE_MODE_FULL;
if (TU_DEBUG(NO_CONCURRENT_UNRESOLVES))
unresolve_mode = CONCURRENT_UNRESOLVE_MODE_DISABLED;
tu_cs_emit_regs(cs, RB_CCU_CNTL(A7XX,
.gmem_fast_clear_disable =
!dev->physical_device->info->a6xx.has_gmem_fast_clear,
.concurrent_resolve = dev->physical_device->info->a6xx.concurrent_resolve,
!dev->physical_device->info->a6xx.has_gmem_fast_clear,
.concurrent_resolve_mode = resolve_mode,
.concurrent_unresolve_mode = unresolve_mode,
));
}
@@ -4467,7 +4482,7 @@ tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
template <chip CHIP>
static void
tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd)
tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *resolve_group)
{
struct tu_cs *cs = &cmd->draw_cs;
uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses;
@@ -4498,7 +4513,8 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd)
tu6_emit_blit_scissor(cmd, cs, true);
emitted_scissor = true;
}
tu_load_gmem_attachment<CHIP>(cmd, cs, i, cond_load_allowed, false);
tu_load_gmem_attachment<CHIP>(cmd, cs, resolve_group, i,
cond_load_allowed, false);
}
}
@@ -4513,7 +4529,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd)
tu6_emit_blit_scissor(cmd, cs, false);
emitted_scissor = true;
}
tu_clear_gmem_attachment<CHIP>(cmd, cs, i);
tu_clear_gmem_attachment<CHIP>(cmd, cs, resolve_group, i);
}
}
}
@@ -4546,7 +4562,7 @@ tu_emit_subpass_begin_sysmem(struct tu_cmd_buffer *cmd)
}
static void
tu7_emit_subpass_clear(struct tu_cmd_buffer *cmd)
tu7_emit_subpass_clear(struct tu_cmd_buffer *cmd, struct tu_resolve_group *resolve_group)
{
if (cmd->state.render_area.extent.width == 0 ||
cmd->state.render_area.extent.height == 0)
@@ -4564,7 +4580,7 @@ tu7_emit_subpass_clear(struct tu_cmd_buffer *cmd)
tu6_emit_blit_scissor(cmd, cs, false);
emitted_scissor = true;
}
tu7_generic_clear_attachment(cmd, cs, i);
tu7_generic_clear_attachment(cmd, cs, resolve_group, i);
}
}
}
@@ -4582,12 +4598,16 @@ tu_emit_subpass_begin(struct tu_cmd_buffer *cmd)
{
tu_fill_render_pass_state(&cmd->state.vk_rp, cmd->state.pass, cmd->state.subpass);
tu_emit_subpass_begin_gmem<CHIP>(cmd);
struct tu_resolve_group resolve_group = {};
tu_emit_subpass_begin_gmem<CHIP>(cmd, &resolve_group);
tu_emit_subpass_begin_sysmem<CHIP>(cmd);
if (cmd->device->physical_device->info->a7xx.has_generic_clear) {
tu7_emit_subpass_clear(cmd);
tu7_emit_subpass_clear(cmd, &resolve_group);
}
tu_emit_resolve_group<CHIP>(cmd, &cmd->draw_cs, &resolve_group);
tu6_emit_zs<CHIP>(cmd, cmd->state.subpass, &cmd->draw_cs);
tu6_emit_mrt<CHIP>(cmd, cmd->state.subpass, &cmd->draw_cs);
tu6_emit_render_cntl<CHIP>(cmd, cmd->state.subpass, &cmd->draw_cs, false);
@@ -4964,6 +4984,8 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
if (subpass->resolve_attachments) {
tu6_emit_blit_scissor(cmd, cs, true);
struct tu_resolve_group resolve_group = {};
for (unsigned i = 0; i < subpass->resolve_count; i++) {
uint32_t a = subpass->resolve_attachments[i].attachment;
if (a == VK_ATTACHMENT_UNUSED)
@@ -4971,8 +4993,8 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
tu_store_gmem_attachment<CHIP>(cmd, cs, a, gmem_a, fb->layers,
subpass->multiview_mask, false);
tu_store_gmem_attachment<CHIP>(cmd, cs, &resolve_group, a, gmem_a,
fb->layers, subpass->multiview_mask, false);
if (!pass->attachments[a].gmem)
continue;
@@ -4981,8 +5003,10 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
* if it is, should be doing a GMEM->GMEM resolve instead of GMEM->MEM->GMEM..
*/
perf_debug(cmd->device, "TODO: missing GMEM->GMEM resolve path\n");
tu_load_gmem_attachment<CHIP>(cmd, cs, a, false, true);
tu_load_gmem_attachment<CHIP>(cmd, cs, &resolve_group, a, false, true);
}
tu_emit_resolve_group<CHIP>(cmd, cs, &resolve_group);
}
tu_cond_exec_end(cs);

@@ -679,6 +679,13 @@ void tu_cmd_render(struct tu_cmd_buffer *cmd);
enum fd_gpu_event : uint32_t;
template <chip CHIP>
void
tu_emit_raw_event_write(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
enum vgt_event_type event,
bool needs_seqno);
template <chip CHIP>
void
tu_emit_event_write(struct tu_cmd_buffer *cmd,

@@ -43,6 +43,8 @@ static const struct debug_control tu_debug_options[] = {
{ "noconform", TU_DEBUG_NOCONFORM },
{ "rd", TU_DEBUG_RD },
{ "hiprio", TU_DEBUG_HIPRIO },
{ "noconcurrentresolves", TU_DEBUG_NO_CONCURRENT_RESOLVES },
{ "noconcurrentunresolves", TU_DEBUG_NO_CONCURRENT_UNRESOLVES },
{ NULL, 0 }
};

@@ -49,6 +49,8 @@ enum tu_debug_flags
TU_DEBUG_NOCONFORM = 1 << 24,
TU_DEBUG_RD = 1 << 25,
TU_DEBUG_HIPRIO = 1 << 26,
TU_DEBUG_NO_CONCURRENT_RESOLVES = 1 << 27,
TU_DEBUG_NO_CONCURRENT_UNRESOLVES = 1 << 28,
};
struct tu_env {