tu: Implement VK_QCOM_subpass_shader_resolve

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38451>
Connor Abbott 2025-05-05 13:37:14 -04:00 committed by Marge Bot
parent 7691f1b70d
commit ad84ae2719
14 changed files with 403 additions and 88 deletions


@@ -763,6 +763,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_load_const_ir3:
case nir_intrinsic_load_frag_size_ir3:
case nir_intrinsic_load_frag_offset_ir3:
case nir_intrinsic_load_gmem_frag_scale_ir3:
case nir_intrinsic_load_gmem_frag_offset_ir3:
case nir_intrinsic_bindless_resource_ir3:
case nir_intrinsic_ray_intersection_ir3:
case nir_intrinsic_load_attribute_payload_intel:


@@ -1517,6 +1517,11 @@ intrinsic("load_frag_size_ir3", src_comp=[1], dest_comp=2, indices=[RANGE],
flags=[CAN_ELIMINATE, CAN_REORDER], bit_sizes=[32])
intrinsic("load_frag_offset_ir3", src_comp=[1], dest_comp=2, indices=[RANGE],
flags=[CAN_ELIMINATE, CAN_REORDER], bit_sizes=[32])
# Per-view GMEM FragCoord scale and offset.
intrinsic("load_gmem_frag_scale_ir3", src_comp=[1], dest_comp=2, indices=[RANGE],
flags=[CAN_ELIMINATE, CAN_REORDER], bit_sizes=[32])
intrinsic("load_gmem_frag_offset_ir3", src_comp=[1], dest_comp=2, indices=[RANGE],
flags=[CAN_ELIMINATE, CAN_REORDER], bit_sizes=[32])
# IR3-specific load/store intrinsics. These access a buffer used to pass data
# between geometry stages - perhaps it's explicit access to the vertex cache.
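For context on the new load_gmem_frag_scale_ir3/load_gmem_frag_offset_ir3
intrinsics added above: together with the values turnip uploads (see the
fdm_apply_fs_params() hunk further down), they remap the unscaled,
framebuffer-space FragCoord back into the scaled GMEM rendering space,
per view:

    gmem_frag_scale   = (1 / frag_area.width, 1 / frag_area.height)
    gmem_frag_offset  = per-bin offset
    FragCoord_gmem.xy = FragCoord.xy * gmem_frag_scale + gmem_frag_offset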


@@ -3113,26 +3113,42 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
dst[0] = create_driver_param(ctx, IR3_DP_FS(frag_invocation_count));
break;
case nir_intrinsic_load_frag_size_ir3:
case nir_intrinsic_load_frag_offset_ir3: {
unsigned param =
intr->intrinsic == nir_intrinsic_load_frag_size_ir3 ?
IR3_DP_FS(frag_size) : IR3_DP_FS(frag_offset);
case nir_intrinsic_load_frag_offset_ir3:
case nir_intrinsic_load_gmem_frag_scale_ir3:
case nir_intrinsic_load_gmem_frag_offset_ir3: {
unsigned param;
switch (intr->intrinsic) {
case nir_intrinsic_load_frag_size_ir3:
param = IR3_DP_FS(frag_size);
break;
case nir_intrinsic_load_frag_offset_ir3:
param = IR3_DP_FS(frag_offset);
break;
case nir_intrinsic_load_gmem_frag_scale_ir3:
param = IR3_DP_FS(gmem_frag_scale);
break;
case nir_intrinsic_load_gmem_frag_offset_ir3:
param = IR3_DP_FS(gmem_frag_offset);
break;
default:
UNREACHABLE("bad intrinsic");
}
if (nir_src_is_const(intr->src[0])) {
uint32_t view = nir_src_as_uint(intr->src[0]);
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param(ctx, param + 4 * view + i);
dst[i] = create_driver_param(ctx, param + 8 * view + i);
}
create_rpt = true;
} else {
struct ir3_instruction *view = ir3_get_src(ctx, &intr->src[0])[0];
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param_indirect(ctx, param + i,
ir3_get_addr0(ctx, view, 4));
ir3_get_addr0(ctx, view, 8));
}
ctx->so->constlen =
MAX2(ctx->so->constlen,
const_state->allocs.consts[IR3_CONST_ALLOC_DRIVER_PARAMS].offset_vec4 +
param / 4 + nir_intrinsic_range(intr));
param / 4 + nir_intrinsic_range(intr) * 2);
}
break;
}


@@ -1434,11 +1434,19 @@ ir3_get_driver_param_info(const nir_shader *shader, nir_intrinsic_instr *intr,
break;
case nir_intrinsic_load_frag_size_ir3:
param_info->offset = IR3_DP_FS(frag_size);
param_info->extra_size = 4 * (nir_intrinsic_range(intr) - 1);
param_info->extra_size = 8 * (nir_intrinsic_range(intr) - 1);
break;
case nir_intrinsic_load_frag_offset_ir3:
param_info->offset = IR3_DP_FS(frag_offset);
param_info->extra_size = 4 * (nir_intrinsic_range(intr) - 1);
param_info->extra_size = 8 * (nir_intrinsic_range(intr) - 1);
break;
case nir_intrinsic_load_gmem_frag_scale_ir3:
param_info->offset = IR3_DP_FS(gmem_frag_scale);
param_info->extra_size = 8 * (nir_intrinsic_range(intr) - 1);
break;
case nir_intrinsic_load_gmem_frag_offset_ir3:
param_info->offset = IR3_DP_FS(gmem_frag_offset);
param_info->extra_size = 8 * (nir_intrinsic_range(intr) - 1);
break;
case nir_intrinsic_load_frag_invocation_count:
param_info->offset = IR3_DP_FS(frag_invocation_count);


@@ -107,7 +107,11 @@ struct ir3_driver_params_fs {
uint32_t frag_size;
uint32_t __pad_09;
uint32_t frag_offset;
uint32_t __pad_11_12[2];
uint32_t __pad_11;
uint32_t gmem_frag_scale;
uint32_t __pad_13;
uint32_t gmem_frag_offset;
uint32_t __pad_15;
};
#define IR3_DP_FS(name) dword_offsetof(struct ir3_driver_params_fs, name)
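A minimal sketch of the addressing this layout implies, mirroring the
"param + 8 * view + i" logic in the emit_intrinsic() hunk above;
fs_driver_param_dword() is a hypothetical helper, not part of the tree:

    #include <stdint.h>

    /* Each view's copy of the FS driver params occupies 8 dwords (four
     * two-component params: frag_size, frag_offset, gmem_frag_scale and
     * gmem_frag_offset), so view N's copy of a param starts 8 * N dwords
     * after IR3_DP_FS(param). */
    static inline uint32_t
    fs_driver_param_dword(uint32_t param_base, uint32_t view, uint32_t comp)
    {
       return param_base + 8 * view + comp;
    }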


@@ -4043,6 +4043,7 @@ struct apply_sysmem_clear_coords_state {
unsigned layer;
float z_clear_val;
VkRect2D rect;
bool custom_resolve;
};
static void
@@ -4053,7 +4054,8 @@ fdm_apply_sysmem_clear_coords(struct tu_cmd_buffer *cmd,
const VkOffset2D *hw_viewport_offsets,
unsigned views,
const VkExtent2D *frag_areas,
const VkRect2D *bins)
const VkRect2D *bins,
bool binning)
{
const struct apply_sysmem_clear_coords_state *state =
(const struct apply_sysmem_clear_coords_state *)data;
@@ -4064,9 +4066,15 @@ fdm_apply_sysmem_clear_coords(struct tu_cmd_buffer *cmd,
hw_viewport_offsets[MIN2(state->view, views - 1)];
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin, common_bin_offset);
offset.x -= hw_viewport_offset.x;
offset.y -= hw_viewport_offset.y;
if (state->custom_resolve && !binning) {
offset = (VkOffset2D) {};
frag_area = (VkExtent2D) { 1, 1 };
}
unsigned x1 = state->rect.offset.x / frag_area.width + offset.x;
unsigned x2 = DIV_ROUND_UP(state->rect.offset.x + state->rect.extent.width,
frag_area.width) + offset.x;
@@ -4251,6 +4259,7 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
.layer = rects[i].baseArrayLayer + layer,
.z_clear_val = z_clear_val,
.rect = rects[i].rect,
.custom_resolve = subpass->custom_resolve,
};
tu_create_fdm_bin_patchpoint(cmd, cs, 4, TU_FDM_NONE,
fdm_apply_sysmem_clear_coords,
@@ -4323,6 +4332,7 @@ clear_gmem_attachment(struct tu_cmd_buffer *cmd,
struct apply_gmem_clear_coords_state {
unsigned view;
VkRect2D rect;
bool custom_resolve;
};
static void
@@ -4333,7 +4343,8 @@ fdm_apply_gmem_clear_coords(struct tu_cmd_buffer *cmd,
const VkOffset2D *hw_viewport_offsets,
unsigned views,
const VkExtent2D *frag_areas,
const VkRect2D *bins)
const VkRect2D *bins,
bool binning)
{
const struct apply_gmem_clear_coords_state *state =
(const struct apply_gmem_clear_coords_state *)data;
@@ -4343,6 +4354,11 @@ fdm_apply_gmem_clear_coords(struct tu_cmd_buffer *cmd,
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin, common_bin_offset);
if (state->custom_resolve) {
offset = (VkOffset2D) {};
frag_area = (VkExtent2D) { 1, 1 };
}
unsigned x1 = state->rect.offset.x / frag_area.width + offset.x;
unsigned x2 = DIV_ROUND_UP(state->rect.offset.x + state->rect.extent.width,
frag_area.width) + offset.x - 1;
@@ -4491,6 +4507,7 @@ tu_clear_attachments(struct tu_cmd_buffer *cmd,
const VkClearRect *pRects)
{
struct tu_cs *cs = &cmd->draw_cs;
const struct tu_subpass *subpass = cmd->state.subpass;
/* sysmem path behaves like a draw, note we don't have a way of using different
* flushes for sysmem/gmem, so this needs to be outside of the cond_exec
@@ -4504,8 +4521,11 @@ tu_clear_attachments(struct tu_cmd_buffer *cmd,
*
* Similarly, we also use the 3D path when in a secondary command buffer that
* doesn't know the GMEM layout that will be chosen by the primary.
*
* Don't use the GMEM path if we are in a custom resolve.
*/
if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT) {
if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT ||
subpass->custom_resolve) {
tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
return;
}
@@ -4514,7 +4534,6 @@ tu_clear_attachments(struct tu_cmd_buffer *cmd,
* binning time, then emit the clear as a 3D draw so that it contributes to
* that visibility.
*/
const struct tu_subpass *subpass = cmd->state.subpass;
for (uint32_t i = 0; i < attachmentCount; i++) {
uint32_t a;
if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
@@ -4561,6 +4580,7 @@ tu7_clear_attachment_generic_single_rect(
struct apply_gmem_clear_coords_state state = {
.view = 0,
.rect = rect->rect,
.custom_resolve = subpass->custom_resolve,
};
tu_create_fdm_bin_patchpoint(cmd, cs, 3, TU_FDM_SKIP_BINNING,
fdm_apply_gmem_clear_coords, state);
@@ -4589,6 +4609,7 @@ tu7_clear_attachment_generic_single_rect(
struct apply_gmem_clear_coords_state state = {
.view = layer,
.rect = rect->rect,
.custom_resolve = subpass->custom_resolve,
};
tu_create_fdm_bin_patchpoint(cmd, cs, 3, TU_FDM_SKIP_BINNING,
fdm_apply_gmem_clear_coords, state);
@@ -4970,7 +4991,8 @@ fdm_apply_load_coords(struct tu_cmd_buffer *cmd,
const VkOffset2D *hw_viewport_offsets,
unsigned views,
const VkExtent2D *frag_areas,
const VkRect2D *bins)
const VkRect2D *bins,
bool binning)
{
const struct apply_load_coords_state *state =
(const struct apply_load_coords_state *)data;
@@ -5435,6 +5457,8 @@ tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
subpass->color_attachments[j].attachment;
if (tu_attachment_store_mismatched_mutability(cmd, a, gmem_a))
cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
if (subpass->custom_resolve)
cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
}
}
@@ -5454,7 +5478,8 @@ fdm_apply_store_coords(struct tu_cmd_buffer *cmd,
const VkOffset2D *hw_viewport_offsets,
unsigned views,
const VkExtent2D *frag_areas,
const VkRect2D *bins)
const VkRect2D *bins,
bool binning)
{
const struct apply_store_coords_state *state =
(const struct apply_store_coords_state *)data;


@@ -1412,6 +1412,64 @@ tu_fdm_num_layers(const struct tu_cmd_buffer *cmd)
(cmd->state.fdm_per_layer ? cmd->state.framebuffer->layers : 1);
}
template <chip CHIP>
static void
tu6_emit_bin_size_gmem(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
enum a6xx_buffers_location buffers_location,
bool disable_lrz)
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
const struct tu_tiling_config *tiling = cmd->state.tiling;
bool hw_binning = use_hw_binning(cmd);
tu6_emit_bin_size<CHIP>(
cs, buffers_location == BUFFERS_IN_GMEM ? tiling->tile0.width : 0,
buffers_location == BUFFERS_IN_GMEM ? tiling->tile0.height : 0,
{
.render_mode = RENDERING_PASS,
.force_lrz_write_dis = !phys_dev->info->props.has_lrz_feedback,
.buffers_location = buffers_location,
.lrz_feedback_zmode_mask =
phys_dev->info->props.has_lrz_feedback
? (hw_binning ? LRZ_FEEDBACK_EARLY_Z_OR_EARLY_Z_LATE_Z :
LRZ_FEEDBACK_EARLY_Z_LATE_Z)
: LRZ_FEEDBACK_NONE,
.force_lrz_dis = CHIP >= A7XX && disable_lrz,
});
}
/* Set always-identical registers used specifically for GMEM */
template <chip CHIP>
static void
tu7_emit_tile_render_begin_regs(struct tu_cs *cs)
{
tu_cs_emit_regs(cs, RB_BUFFER_CNTL(CHIP, 0x0));
tu_cs_emit_regs(cs, RB_CLEAR_TARGET(CHIP, .clear_mode = CLEAR_MODE_GMEM));
}
/* Set always-identical registers used specifically for sysmem */
template <chip CHIP>
static void
tu7_emit_sysmem_render_begin_regs(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
tu_cs_emit_regs(cs, RB_BUFFER_CNTL(CHIP,
.z_sysmem = true,
.s_sysmem = true,
.rt0_sysmem = true,
.rt1_sysmem = true,
.rt2_sysmem = true,
.rt3_sysmem = true,
.rt4_sysmem = true,
.rt5_sysmem = true,
.rt6_sysmem = true,
.rt7_sysmem = true,
));
tu_cs_emit_regs(cs, RB_CLEAR_TARGET(CHIP, .clear_mode = CLEAR_MODE_SYSMEM));
}
template <chip CHIP>
static void
tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
@@ -1419,7 +1477,6 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
const struct tu_tile_config *tile,
bool fdm, const VkOffset2D *fdm_offsets)
{
struct tu_physical_device *phys_dev = cmd->device->physical_device;
const struct tu_tiling_config *tiling = cmd->state.tiling;
const struct tu_framebuffer *fb = cmd->state.framebuffer;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
@@ -1470,19 +1527,15 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
if (fdm_offsets && (tile->pos.x == 0 || tile->pos.y == 0))
disable_lrz = true;
tu6_emit_bin_size<CHIP>(
cs, tiling->tile0.width, tiling->tile0.height,
{
.render_mode = RENDERING_PASS,
.force_lrz_write_dis = !phys_dev->info->props.has_lrz_feedback,
.buffers_location = BUFFERS_IN_GMEM,
.lrz_feedback_zmode_mask =
phys_dev->info->props.has_lrz_feedback && !bin_is_scaled
? (hw_binning ? LRZ_FEEDBACK_EARLY_Z_OR_EARLY_Z_LATE_Z :
LRZ_FEEDBACK_EARLY_Z_LATE_Z)
: LRZ_FEEDBACK_NONE,
.force_lrz_dis = CHIP >= A7XX && disable_lrz,
});
/* When using custom resolve we need to re-emit these regs as they are
* overwritten when switching to sysmem.
*/
if (CHIP >= A7XX &&
cmd->state.pass->subpasses[cmd->state.pass->subpass_count - 1].custom_resolve) {
tu7_emit_tile_render_begin_regs<CHIP>(cs);
}
tu6_emit_bin_size_gmem<CHIP>(cmd, cs, BUFFERS_IN_GMEM, disable_lrz);
tu_cs_emit_regs(cs,
A6XX_VFD_RENDER_MODE(RENDERING_PASS));
@@ -1634,7 +1687,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size);
tu_cs_emit_qw(cs, patch->iova);
patch->apply(cmd, cs, patch->data, (VkOffset2D) { x1, y1 },
frag_offsets, views, tile->frag_areas, bins);
frag_offsets, views, tile->frag_areas, bins, false);
}
/* Make the CP wait until the CP_MEM_WRITE's to the command buffers
@@ -2111,15 +2164,6 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
tu_cs_emit(cs, A6XX_SP_VS_CONST_CONFIG_CONSTLEN(8) | A6XX_SP_VS_CONST_CONFIG_ENABLED);
}
/* Set always-identical registers used specifically for GMEM */
template <chip CHIP>
static void
tu7_emit_tile_render_begin_regs(struct tu_cs *cs)
{
tu_cs_emit_regs(cs, RB_BUFFER_CNTL(CHIP, 0x0));
tu_cs_emit_regs(cs, RB_CLEAR_TARGET(CHIP, .clear_mode = CLEAR_MODE_GMEM));
}
/* Emit the bin restore preamble, which runs in between bins when L1
* preemption with skipsaverestore happens and we switch back to this context.
* We need to restore static registers normally programmed at cmdbuf start
@@ -2435,7 +2479,7 @@ tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size);
tu_cs_emit_qw(cs, patch->iova);
patch->apply(cmd, cs, patch->data, (VkOffset2D) {0, 0}, frag_offsets,
num_views, unscaled_frag_areas, bins);
num_views, unscaled_frag_areas, bins, true);
}
tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
@@ -2532,6 +2576,8 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
bool gmem)
{
const struct tu_tiling_config *tiling = cmd->state.tiling;
uint32_t layers = MAX2(cmd->state.framebuffer->layers,
cmd->state.pass->num_views);
/* note: we can probably emit input attachments just once for the whole
* renderpass, this would avoid emitting both sysmem/gmem versions
@@ -2621,7 +2667,11 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
gmem_offset = att->gmem_offset_stencil[cmd->state.gmem_layout];
}
if (!gmem || !subpass->input_attachments[i / 2].patch_input_gmem) {
if (!gmem || !subpass->input_attachments[i / 2].patch_input_gmem ||
/* Skip GMEM patching when tiling is impossible as we may get
* assertion failures from register packing below.
*/
!tiling->possible) {
memcpy(&texture.map[i * A6XX_TEX_CONST_DWORDS], dst, sizeof(dst));
continue;
}
@@ -2647,10 +2697,17 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
dst[2] =
A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
A6XX_TEX_CONST_2_PITCH(tiling->tile0.width * cpp);
/* Note: it seems the HW implicitly calculates the array pitch with the
* GMEM tiling, so we don't need to specify the pitch ourselves.
/* Note: it seems the HW implicitly calculates the array pitch, except
* when rendering to sysmem (i.e. in a custom resolve subpass). We only
* guarantee the pitch is valid when there is more than 1 layer, so skip
* emitting it otherwise to avoid asserts.
*/
dst[3] = 0;
if (layers > 1) {
dst[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(tiling->tile0.width *
tiling->tile0.height * cpp);
} else {
dst[3] = 0;
}
dst[4] = cmd->device->physical_device->gmem_base + gmem_offset;
dst[5] &= A6XX_TEX_CONST_5_DEPTH__MASK;
for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
@@ -2985,20 +3042,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
});
if (CHIP == A7XX) {
tu_cs_emit_regs(cs, RB_BUFFER_CNTL(CHIP,
.z_sysmem = true,
.s_sysmem = true,
.rt0_sysmem = true,
.rt1_sysmem = true,
.rt2_sysmem = true,
.rt3_sysmem = true,
.rt4_sysmem = true,
.rt5_sysmem = true,
.rt6_sysmem = true,
.rt7_sysmem = true,
));
tu_cs_emit_regs(cs, RB_CLEAR_TARGET(CHIP, .clear_mode = CLEAR_MODE_SYSMEM));
tu7_emit_sysmem_render_begin_regs<CHIP>(cmd, cs);
}
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
@@ -6400,9 +6444,14 @@ static void
tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *resolve_group)
{
struct tu_cs *cs = &cmd->draw_cs;
uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses;
const struct tu_subpass *subpass = cmd->state.subpass;
uint32_t subpass_idx = subpass - cmd->state.pass->subpasses;
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling);
/* Shader resolve subpasses don't use GMEM */
if (subpass->custom_resolve)
return;
/* If we might choose to bin, then put the loads under a check for geometry
* having been binned to this tile. If we don't choose to bin in the end,
* then we will have manually set those registers to say geometry is present.
@@ -6496,9 +6545,11 @@ tu_emit_subpass_begin_sysmem(struct tu_cmd_buffer *cmd)
return;
struct tu_cs *cs = &cmd->draw_cs;
uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses;
const struct tu_subpass *subpass = cmd->state.subpass;
uint32_t subpass_idx = subpass - cmd->state.pass->subpasses;
tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
if (!subpass->custom_resolve)
tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
tu6_emit_sysmem_unresolves<CHIP>(cmd, cs, cmd->state.subpass);
@@ -6508,7 +6559,8 @@ tu_emit_subpass_begin_sysmem(struct tu_cmd_buffer *cmd)
tu_clear_sysmem_attachment<CHIP>(cmd, cs, i);
}
tu_cond_exec_end(cs); /* sysmem */
if (!subpass->custom_resolve)
tu_cond_exec_end(cs); /* sysmem */
}
static void
@@ -6584,6 +6636,84 @@ tu7_emit_subpass_shading_rate(struct tu_cmd_buffer *cmd,
cmd->prev_fsr_is_null = false;
}
/* If this is a shader resolve subpass, switch to writing to sysmem.
*/
template <chip CHIP>
static void
tu_emit_subpass_custom_resolve(struct tu_cmd_buffer *cmd)
{
struct tu_cs *cs = &cmd->draw_cs;
const struct tu_subpass *subpass = cmd->state.subpass;
const struct tu_framebuffer *fb = cmd->state.framebuffer;
const struct tu_tiling_config *tiling = cmd->state.tiling;
if (!subpass->custom_resolve)
return;
/* Since a7xx, buffer location can be controlled per-buffer. We also have
* to update the steering register so that generic clears use sysmem.
*/
if (CHIP >= A7XX) {
tu7_emit_sysmem_render_begin_regs<CHIP>(cmd, cs);
/* Disable foveation offset here. It's not necessary for custom resolve.
*/
tu_cs_emit_regs(cs, GRAS_BIN_FOVEAT(CHIP));
tu_cs_emit_regs(cs, RB_BIN_FOVEAT(CHIP));
} else {
tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
/* On a6xx the location is set in *_BIN_CONTROL */
tu6_emit_bin_size_gmem<CHIP>(cmd, cs, BUFFERS_IN_SYSMEM, false);
tu_cond_exec_end(cs);
}
/* With FDM and non-subsampled images, we switch from rendering space to
* framebuffer space in the custom resolve subpass when not in the binning
* pass because we are writing directly to the user-visible attachment. We
* already aren't relying on the window scissor whenever FDM is enabled,
* but it can get in the way if FDM offset is being used because it is
* specified in rendering space, so the origin is shifted to the right and
* down compared to the framebuffer-space bin coordinates and part of the
* bin gets incorrectly clipped. Just disable it here by setting it to the
* entire framebuffer. Add an extra tile size for when we are in the
* binning pass and still using rendering space.
*/
if (tu_enable_fdm_offset(cmd)) {
tu6_emit_window_scissor<CHIP>(cs, 0, 0,
fb->width + tiling->tile0.width - 1,
fb->height + tiling->tile0.height - 1);
}
/* If FDM is enabled, we need to re-emit all FDM-related state. */
if (cmd->state.pass->fragment_density_map.attachment !=
VK_ATTACHMENT_UNUSED) {
cmd->state.dirty |= TU_CMD_DIRTY_FDM;
}
}
/* If the last subpass is a shader resolve pass, emit flushes after switching
* to sysmem, similar to fixed-function 3D resolves. Our flushing code assumes
that the CCU isn't in use while in GMEM mode, so we have to flush it ourselves.
*/
template<chip CHIP>
static void
tu_emit_custom_resolve_end(struct tu_cmd_buffer *cmd)
{
struct tu_cs *cs = &cmd->draw_cs;
const struct tu_subpass *subpass = cmd->state.subpass;
if (!subpass->custom_resolve)
return;
if (subpass->color_count)
tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED)
tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_DEPTH);
}
/* emit loads, clears, and mrt/zs/msaa/ubwc state for the subpass that is
* starting (either at vkCmdBeginRenderPass2() or vkCmdNextSubpass2())
*
@@ -6599,6 +6729,7 @@ tu_emit_subpass_begin(struct tu_cmd_buffer *cmd)
struct tu_resolve_group resolve_group = {};
tu_emit_subpass_custom_resolve<CHIP>(cmd);
tu_emit_subpass_begin_gmem<CHIP>(cmd, &resolve_group);
tu_emit_subpass_begin_sysmem<CHIP>(cmd);
if (cmd->device->physical_device->info->props.has_generic_clear) {
@@ -7582,6 +7713,7 @@ fs_params_size(struct tu_cmd_buffer *cmd)
struct apply_fs_params_state {
unsigned num_consts;
bool custom_resolve;
};
static void
@@ -7592,13 +7724,14 @@ fdm_apply_fs_params(struct tu_cmd_buffer *cmd,
const VkOffset2D *hw_viewport_offsets,
unsigned views,
const VkExtent2D *frag_areas,
const VkRect2D *bins)
const VkRect2D *bins,
bool binning)
{
const struct apply_fs_params_state *state =
(const struct apply_fs_params_state *)data;
unsigned num_consts = state->num_consts;
for (unsigned i = 0; i < num_consts; i++) {
for (unsigned i = 0; i < DIV_ROUND_UP(num_consts, 2); i++) {
/* FDM per layer may be enabled in the shader but not in the renderpass,
* in which case views will be 1 and we have to replicate the one view
* to all of the layers.
@@ -7607,10 +7740,38 @@ fdm_apply_fs_params(struct tu_cmd_buffer *cmd,
VkRect2D bin = bins[MIN2(i, views - 1)];
VkOffset2D offset = tu_fdm_per_bin_offset(area, bin, common_bin_offset);
tu_cs_emit(cs, area.width);
tu_cs_emit(cs, area.height);
tu_cs_emit(cs, fui(offset.x));
tu_cs_emit(cs, fui(offset.y));
/* For custom resolve, we switch to rendering directly to sysmem and so
* the fragment size becomes 1x1. This means we have to scale down
* FragCoord when accessing GMEM input attachments.
*
* TODO: When we support subsampled images, this should also only happen
* for non-subsampled images.
*/
if (state->custom_resolve) {
tu_cs_emit(cs, 1 /* width */);
tu_cs_emit(cs, 1 /* height */);
tu_cs_emit(cs, fui(0.0));
tu_cs_emit(cs, fui(0.0));
} else {
tu_cs_emit(cs, area.width);
tu_cs_emit(cs, area.height);
tu_cs_emit(cs, fui(offset.x));
tu_cs_emit(cs, fui(offset.y));
}
if (i * 2 + 1 < num_consts) {
if (state->custom_resolve) {
tu_cs_emit(cs, fui(1. / area.width));
tu_cs_emit(cs, fui(1. / area.height));
tu_cs_emit(cs, fui(offset.x));
tu_cs_emit(cs, fui(offset.y));
} else {
tu_cs_emit(cs, fui(1.0));
tu_cs_emit(cs, fui(1.0));
tu_cs_emit(cs, fui(0.0));
tu_cs_emit(cs, fui(0.0));
}
}
}
}
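A worked example with assumed values: for a bin whose fragment area is 2x2
and whose per-bin offset is (-32.0, 0.0), the loop above emits, per view:

    normal subpass:  (2, 2, -32.0, 0.0)  then  (1.0, 1.0, 0.0, 0.0)
    custom resolve:  (1, 1, 0.0, 0.0)    then  (0.5, 0.5, -32.0, 0.0)

so in the custom resolve case the shader reconstructs the GMEM coordinate
as FragCoord.xy * (0.5, 0.5) + (-32.0, 0.0).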
@@ -7632,16 +7793,23 @@ tu_emit_fdm_params(struct tu_cmd_buffer *cmd,
if (fs->fs.has_fdm) {
struct apply_fs_params_state state = {
.num_consts = num_units - 1,
.custom_resolve = cmd->state.subpass->custom_resolve,
};
tu_create_fdm_bin_patchpoint(cmd, cs, 4 * (num_units - 1),
TU_FDM_SKIP_BINNING,
fdm_apply_fs_params, state);
} else {
for (unsigned i = 1; i < num_units; i++) {
for (unsigned i = 0; i < DIV_ROUND_UP((num_units - 1), 2); i++) {
tu_cs_emit(cs, 1);
tu_cs_emit(cs, 1);
tu_cs_emit(cs, fui(0.0f));
tu_cs_emit(cs, fui(0.0f));
if (i * 2 + 1 < num_units - 1) {
tu_cs_emit(cs, fui(1.0));
tu_cs_emit(cs, fui(1.0));
tu_cs_emit(cs, fui(0.0));
tu_cs_emit(cs, fui(0.0));
}
}
}
}
@@ -9174,6 +9342,8 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
fdm_offsets = test_offsets;
}
TU_CALLX(cmd_buffer->device, tu_emit_custom_resolve_end)(cmd_buffer);
tu_cs_end(&cmd_buffer->draw_cs);
tu_cs_end(&cmd_buffer->draw_epilogue_cs);
TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer, fdm_offsets);
@@ -9202,6 +9372,8 @@ tu_CmdEndRendering2EXT(VkCommandBuffer commandBuffer,
*/
TU_CALLX(cmd_buffer->device, tu_lrz_flush_valid_during_renderpass)
(cmd_buffer, &cmd_buffer->draw_cs);
} else {
TU_CALLX(cmd_buffer->device, tu_emit_custom_resolve_end)(cmd_buffer);
}
const VkRenderPassFragmentDensityMapOffsetEndInfoEXT *fdm_offset_info =


@@ -858,7 +858,8 @@ typedef void (*tu_fdm_bin_apply_t)(struct tu_cmd_buffer *cmd,
const VkOffset2D *hw_viewport_offsets,
unsigned views,
const VkExtent2D *frag_areas,
const VkRect2D *bins);
const VkRect2D *bins,
bool binning);
enum tu_fdm_flags {
TU_FDM_NONE = 0,
@@ -926,7 +927,7 @@ _tu_create_fdm_bin_patchpoint(struct tu_cmd_buffer *cmd,
};
hw_viewport_offsets[i] = (VkOffset2D) { 0, 0 };
}
apply(cmd, cs, state, (VkOffset2D) {0, 0}, hw_viewport_offsets, num_views, unscaled_frag_areas, bins);
apply(cmd, cs, state, (VkOffset2D) {0, 0}, hw_viewport_offsets, num_views, unscaled_frag_areas, bins, false);
assert(tu_cs_get_cur_iova(cs) == patch.iova + patch.size * sizeof(uint32_t));
util_dynarray_append(&cmd->fdm_bin_patchpoints, patch);
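The new trailing bool tells an apply callback whether it is running for the
binning pass: this series passes true only from tu6_emit_binning_pass() and
false both here at patchpoint-creation time and when patching per-tile. A
minimal callback skeleton under that assumption (my_apply and its body are
illustrative, not part of the tree):

    static void
    my_apply(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
             VkOffset2D common_bin_offset,
             const VkOffset2D *hw_viewport_offsets,
             unsigned views, const VkExtent2D *frag_areas,
             const VkRect2D *bins, bool binning)
    {
       /* Custom-resolve subpasses render in framebuffer space except during
        * binning, so only apply the FDM bin transform when binning is true. */
       if (!binning) {
          /* emit unscaled, framebuffer-space values */
       }
    }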


@@ -350,6 +350,7 @@ get_device_extensions(const struct tu_physical_device *device,
.IMG_filter_cubic = device->info->props.has_tex_filter_cubic,
.NV_compute_shader_derivatives = device->info->chip >= 7,
.QCOM_fragment_density_map_offset = true,
.QCOM_render_pass_shader_resolve = true,
.VALVE_fragment_density_map_layered = true,
.VALVE_mutable_descriptor_type = true,
} };


@@ -969,7 +969,8 @@ tu_subpass_use_attachment(struct tu_render_pass *pass, int i, uint32_t a, const
struct tu_subpass *subpass = &pass->subpasses[i];
struct tu_render_pass_attachment *att = &pass->attachments[a];
att->gmem = true;
if (!subpass->custom_resolve)
att->gmem = true;
update_samples(subpass, att->samples);
att->used_views |= subpass->multiview_mask;
@@ -1182,6 +1183,8 @@ tu_CreateRenderPass2(VkDevice _device,
subpass->srgb_cntl = 0;
subpass->legacy_dithering_enabled = desc->flags &
VK_SUBPASS_DESCRIPTION_ENABLE_LEGACY_DITHERING_BIT_EXT;
subpass->custom_resolve = desc->flags &
VK_SUBPASS_DESCRIPTION_SHADER_RESOLVE_BIT_QCOM;
const BITMASK_ENUM(VkSubpassDescriptionFlagBits) raster_order_access_bits =
VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_COLOR_ACCESS_BIT_EXT |


@@ -82,6 +82,8 @@ struct tu_subpass
bool depth_used;
bool stencil_used;
bool custom_resolve;
VkSampleCountFlagBits samples;
uint32_t srgb_cntl;


@@ -1773,6 +1773,12 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
~attachments_referenced;
}
if (builder->state &
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
keys[MESA_SHADER_FRAGMENT].custom_resolve =
builder->graphics_state.rp->custom_resolve;
}
if (builder->create_flags &
VK_PIPELINE_CREATE_2_LINK_TIME_OPTIMIZATION_BIT_EXT) {
for (unsigned i = 0; i < builder->num_libraries; i++) {
@@ -2578,6 +2584,7 @@ struct apply_viewport_state {
bool share_scale;
/* See tu_pipeline::fake_single_viewport */
bool fake_single_viewport;
bool custom_resolve;
};
/* It's a hardware restriction that the window offset (i.e. common_bin_offset)
@@ -2624,7 +2631,8 @@ fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
VkOffset2D common_bin_offset,
const VkOffset2D *hw_viewport_offsets,
unsigned views,
const VkExtent2D *frag_areas, const VkRect2D *bins)
const VkExtent2D *frag_areas, const VkRect2D *bins,
bool binning)
{
const struct apply_viewport_state *state =
(const struct apply_viewport_state *)data;
@@ -2653,9 +2661,16 @@ fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
*/
VkViewport viewport =
state->fake_single_viewport ? state->vp.viewports[0] : state->vp.viewports[i];
if (frag_area.width == 1 && frag_area.height == 1 &&
common_bin_offset.x == bin.offset.x &&
common_bin_offset.y == bin.offset.y) {
if ((frag_area.width == 1 && frag_area.height == 1 &&
common_bin_offset.x == bin.offset.x &&
common_bin_offset.y == bin.offset.y) ||
/* When in a custom resolve operation (TODO: and using
* non-subsampled images) we switch to framebuffer coordinates so we
* shouldn't apply the transform. However the binning pass isn't
* aware of this, so we have to keep applying the transform for
* binning.
*/
(state->custom_resolve && !binning)) {
vp.viewports[i] = viewport;
continue;
}
@@ -2692,6 +2707,7 @@ tu6_emit_viewport_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd,
.share_scale = !cmd->state.per_view_viewport &&
!cmd->state.per_layer_viewport,
.fake_single_viewport = cmd->state.fake_single_viewport,
.custom_resolve = cmd->state.subpass->custom_resolve,
};
if (cmd->state.per_view_viewport)
state.vp.viewport_count = num_views;
@@ -2753,7 +2769,8 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
VkOffset2D common_bin_offset,
const VkOffset2D *hw_viewport_offsets,
unsigned views,
const VkExtent2D *frag_areas, const VkRect2D *bins)
const VkExtent2D *frag_areas, const VkRect2D *bins,
bool binning)
{
const struct apply_viewport_state *state =
(const struct apply_viewport_state *)data;
@@ -2781,6 +2798,19 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
common_bin_offset);
offset.x -= hw_viewport_offset.x;
offset.y -= hw_viewport_offset.y;
/* Disable scaling and offset when doing a custom resolve to a
* non-subsampled image and not in the binning pass, because we
* use framebuffer coordinates.
*
* TODO: When we support subsampled images, only do this for
* non-subsampled images.
*/
if (state->custom_resolve && !binning) {
offset = (VkOffset2D) {};
frag_area = (VkExtent2D) {1, 1};
}
VkOffset2D min = {
scissor.offset.x / frag_area.width + offset.x,
scissor.offset.y / frag_area.height + offset.y,
@@ -2791,12 +2821,20 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
};
/* Intersect scissor with the scaled bin, this essentially replaces the
* window scissor.
* window scissor. With custom resolve (TODO: and non-subsampled images)
* we have to use the unscaled bin instead.
*/
uint32_t scaled_width = bin.extent.width / frag_area.width;
uint32_t scaled_height = bin.extent.height / frag_area.height;
uint32_t bin_x = common_bin_offset.x - hw_viewport_offset.x;
uint32_t bin_y = common_bin_offset.y - hw_viewport_offset.y;
int32_t bin_x;
int32_t bin_y;
if (state->custom_resolve && !binning) {
bin_x = bin.offset.x;
bin_y = bin.offset.y;
} else {
bin_x = common_bin_offset.x - hw_viewport_offset.x;
bin_y = common_bin_offset.y - hw_viewport_offset.y;
}
vp.scissors[i].offset.x = MAX2(min.x, bin_x);
vp.scissors[i].offset.y = MAX2(min.y, bin_y);
vp.scissors[i].extent.width =
@@ -2818,6 +2856,7 @@ tu6_emit_scissor_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd,
.share_scale = !cmd->state.per_view_viewport &&
!cmd->state.per_layer_viewport,
.fake_single_viewport = cmd->state.fake_single_viewport,
.custom_resolve = cmd->state.subpass->custom_resolve,
};
if (cmd->state.per_view_viewport)
state.vp.scissor_count = num_views;
@@ -4426,6 +4465,8 @@ tu_fill_render_pass_state(struct vk_render_pass_state *rp,
rp->color_attachment_formats[i] = pass->attachments[a].format;
rp->attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(i);
}
rp->custom_resolve = subpass->custom_resolve;
}
static void


@@ -643,20 +643,37 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
return true;
case nir_intrinsic_load_frag_size_ir3:
case nir_intrinsic_load_frag_offset_ir3: {
case nir_intrinsic_load_frag_offset_ir3:
case nir_intrinsic_load_gmem_frag_scale_ir3:
case nir_intrinsic_load_gmem_frag_offset_ir3: {
if (!dev->compiler->load_shader_consts_via_preamble)
return false;
unsigned param =
instr->intrinsic == nir_intrinsic_load_frag_size_ir3 ?
IR3_DP_FS(frag_size) : IR3_DP_FS(frag_offset);
unsigned param;
switch (instr->intrinsic) {
case nir_intrinsic_load_frag_size_ir3:
param = IR3_DP_FS(frag_size);
break;
case nir_intrinsic_load_frag_offset_ir3:
param = IR3_DP_FS(frag_offset);
break;
case nir_intrinsic_load_gmem_frag_scale_ir3:
param = IR3_DP_FS(gmem_frag_scale);
break;
case nir_intrinsic_load_gmem_frag_offset_ir3:
param = IR3_DP_FS(gmem_frag_offset);
break;
default:
UNREACHABLE("bad intrinsic");
}
unsigned offset = param - IR3_DP_FS_DYNAMIC;
unsigned base = param - IR3_DP_FS_DYNAMIC;
nir_def *view = instr->src[0].ssa;
nir_def *offset = nir_imul_imm(b, view, 2);
nir_def *result =
ir3_load_driver_ubo_indirect(b, 2, &shader->const_state.fdm_ubo,
offset, view, nir_intrinsic_range(instr));
base, offset, nir_intrinsic_range(instr) * 2);
nir_def_replace(&instr->def, result);
return true;
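With load_shader_consts_via_preamble, each view's record in the FDM driver
UBO now spans two vec4s, hence the nir_imul_imm(b, view, 2) and the doubled
range above. A hypothetical C view of one record, matching the dwords
written by fdm_apply_fs_params() in tu_cmd_buffer.cc (field names are
illustrative):

    struct fdm_view_record {
       uint32_t frag_area_w,   frag_area_h;   /* vec4 0: load_frag_size_ir3 */
       float    frag_offset_x, frag_offset_y; /*   load_frag_offset_ir3 */
       float    gmem_scale_x,  gmem_scale_y;  /* vec4 1: load_gmem_frag_scale_ir3 */
       float    gmem_offset_x, gmem_offset_y; /*   load_gmem_frag_offset_ir3 */
    };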
@@ -1147,6 +1164,7 @@ struct lower_fdm_options {
unsigned num_views;
bool adjust_fragcoord;
bool use_layer;
bool adjust_gmem_fragcoord;
};
static bool
@@ -1211,7 +1229,22 @@ lower_fdm_instr(struct nir_builder *b, nir_instr *instr, void *data)
}
if (intrin->intrinsic == nir_intrinsic_load_frag_coord_gmem_ir3) {
return nir_load_frag_coord_unscaled_ir3(b);
nir_def *unscaled_coord = nir_load_frag_coord_unscaled_ir3(b);
if (!options->adjust_gmem_fragcoord)
return unscaled_coord;
nir_def *frag_offset =
nir_load_gmem_frag_offset_ir3(b, view, .range = options->num_views);
nir_def *frag_scale =
nir_load_gmem_frag_scale_ir3(b, view, .range = options->num_views);
nir_def *xy = nir_trim_vector(b, unscaled_coord, 2);
xy = nir_fadd(b, nir_fmul(b, xy, frag_scale), frag_offset);
return nir_vec4(b,
nir_channel(b, xy, 0),
nir_channel(b, xy, 1),
nir_channel(b, unscaled_coord, 2),
nir_channel(b, unscaled_coord, 3));
}
assert(intrin->intrinsic == nir_intrinsic_load_frag_size);
@@ -2802,6 +2835,7 @@ tu_shader_create(struct tu_device *dev,
key->max_fdm_layers, 1),
.adjust_fragcoord = key->fragment_density_map,
.use_layer = !key->multiview_mask,
.adjust_gmem_fragcoord = key->fragment_density_map && key->custom_resolve,
};
NIR_PASS(_, nir, tu_nir_lower_fdm, &fdm_options);


@@ -128,6 +128,7 @@ struct tu_shader_key {
bool robust_storage_access2;
bool robust_uniform_access2;
bool lower_view_index_to_device_index;
bool custom_resolve;
enum ir3_wavesize_option api_wavesize, real_wavesize;
};