mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 11:18:08 +02:00
tu: Implement VK_QCOM_render_pass_shader_resolve
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38451>
This commit is contained in:
parent
7691f1b70d
commit
ad84ae2719
14 changed files with 403 additions and 88 deletions
|
|
@ -763,6 +763,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
|||
case nir_intrinsic_load_const_ir3:
|
||||
case nir_intrinsic_load_frag_size_ir3:
|
||||
case nir_intrinsic_load_frag_offset_ir3:
|
||||
case nir_intrinsic_load_gmem_frag_scale_ir3:
|
||||
case nir_intrinsic_load_gmem_frag_offset_ir3:
|
||||
case nir_intrinsic_bindless_resource_ir3:
|
||||
case nir_intrinsic_ray_intersection_ir3:
|
||||
case nir_intrinsic_load_attribute_payload_intel:
|
||||
|
|
|
|||
|
|
@ -1517,6 +1517,11 @@ intrinsic("load_frag_size_ir3", src_comp=[1], dest_comp=2, indices=[RANGE],
|
|||
flags=[CAN_ELIMINATE, CAN_REORDER], bit_sizes=[32])
|
||||
intrinsic("load_frag_offset_ir3", src_comp=[1], dest_comp=2, indices=[RANGE],
|
||||
flags=[CAN_ELIMINATE, CAN_REORDER], bit_sizes=[32])
|
||||
# Per-view GMEM FragCoord scale and offset.
|
||||
intrinsic("load_gmem_frag_scale_ir3", src_comp=[1], dest_comp=2, indices=[RANGE],
|
||||
flags=[CAN_ELIMINATE, CAN_REORDER], bit_sizes=[32])
|
||||
intrinsic("load_gmem_frag_offset_ir3", src_comp=[1], dest_comp=2, indices=[RANGE],
|
||||
flags=[CAN_ELIMINATE, CAN_REORDER], bit_sizes=[32])
|
||||
|
||||
# IR3-specific load/store intrinsics. These access a buffer used to pass data
|
||||
# between geometry stages - perhaps it's explicit access to the vertex cache.
|
||||
|
|
|
|||
|
|
@ -3113,26 +3113,42 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
|||
dst[0] = create_driver_param(ctx, IR3_DP_FS(frag_invocation_count));
|
||||
break;
|
||||
case nir_intrinsic_load_frag_size_ir3:
|
||||
case nir_intrinsic_load_frag_offset_ir3: {
|
||||
unsigned param =
|
||||
intr->intrinsic == nir_intrinsic_load_frag_size_ir3 ?
|
||||
IR3_DP_FS(frag_size) : IR3_DP_FS(frag_offset);
|
||||
case nir_intrinsic_load_frag_offset_ir3:
|
||||
case nir_intrinsic_load_gmem_frag_scale_ir3:
|
||||
case nir_intrinsic_load_gmem_frag_offset_ir3: {
|
||||
unsigned param;
|
||||
switch (intr->intrinsic) {
|
||||
case nir_intrinsic_load_frag_size_ir3:
|
||||
param = IR3_DP_FS(frag_size);
|
||||
break;
|
||||
case nir_intrinsic_load_frag_offset_ir3:
|
||||
param = IR3_DP_FS(frag_offset);
|
||||
break;
|
||||
case nir_intrinsic_load_gmem_frag_scale_ir3:
|
||||
param = IR3_DP_FS(gmem_frag_scale);
|
||||
break;
|
||||
case nir_intrinsic_load_gmem_frag_offset_ir3:
|
||||
param = IR3_DP_FS(gmem_frag_offset);
|
||||
break;
|
||||
default:
|
||||
UNREACHABLE("bad intrinsic");
|
||||
}
|
||||
if (nir_src_is_const(intr->src[0])) {
|
||||
uint32_t view = nir_src_as_uint(intr->src[0]);
|
||||
for (int i = 0; i < dest_components; i++) {
|
||||
dst[i] = create_driver_param(ctx, param + 4 * view + i);
|
||||
dst[i] = create_driver_param(ctx, param + 8 * view + i);
|
||||
}
|
||||
create_rpt = true;
|
||||
} else {
|
||||
struct ir3_instruction *view = ir3_get_src(ctx, &intr->src[0])[0];
|
||||
for (int i = 0; i < dest_components; i++) {
|
||||
dst[i] = create_driver_param_indirect(ctx, param + i,
|
||||
ir3_get_addr0(ctx, view, 4));
|
||||
ir3_get_addr0(ctx, view, 8));
|
||||
}
|
||||
ctx->so->constlen =
|
||||
MAX2(ctx->so->constlen,
|
||||
const_state->allocs.consts[IR3_CONST_ALLOC_DRIVER_PARAMS].offset_vec4 +
|
||||
param / 4 + nir_intrinsic_range(intr));
|
||||
param / 4 + nir_intrinsic_range(intr) * 2);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1434,11 +1434,19 @@ ir3_get_driver_param_info(const nir_shader *shader, nir_intrinsic_instr *intr,
|
|||
break;
|
||||
case nir_intrinsic_load_frag_size_ir3:
|
||||
param_info->offset = IR3_DP_FS(frag_size);
|
||||
param_info->extra_size = 4 * (nir_intrinsic_range(intr) - 1);
|
||||
param_info->extra_size = 8 * (nir_intrinsic_range(intr) - 1);
|
||||
break;
|
||||
case nir_intrinsic_load_frag_offset_ir3:
|
||||
param_info->offset = IR3_DP_FS(frag_offset);
|
||||
param_info->extra_size = 4 * (nir_intrinsic_range(intr) - 1);
|
||||
param_info->extra_size = 8 * (nir_intrinsic_range(intr) - 1);
|
||||
break;
|
||||
case nir_intrinsic_load_gmem_frag_scale_ir3:
|
||||
param_info->offset = IR3_DP_FS(gmem_frag_scale);
|
||||
param_info->extra_size = 8 * (nir_intrinsic_range(intr) - 1);
|
||||
break;
|
||||
case nir_intrinsic_load_gmem_frag_offset_ir3:
|
||||
param_info->offset = IR3_DP_FS(gmem_frag_offset);
|
||||
param_info->extra_size = 8 * (nir_intrinsic_range(intr) - 1);
|
||||
break;
|
||||
case nir_intrinsic_load_frag_invocation_count:
|
||||
param_info->offset = IR3_DP_FS(frag_invocation_count);
|
||||
|
|
|
|||
|
|
@ -107,7 +107,11 @@ struct ir3_driver_params_fs {
|
|||
uint32_t frag_size;
|
||||
uint32_t __pad_09;
|
||||
uint32_t frag_offset;
|
||||
uint32_t __pad_11_12[2];
|
||||
uint32_t __pad_11;
|
||||
uint32_t gmem_frag_scale;
|
||||
uint32_t __pad_13;
|
||||
uint32_t gmem_frag_offset;
|
||||
uint32_t __pad_15;
|
||||
};
|
||||
#define IR3_DP_FS(name) dword_offsetof(struct ir3_driver_params_fs, name)
|
||||
|
||||
|
|
|
|||
|
|
@ -4043,6 +4043,7 @@ struct apply_sysmem_clear_coords_state {
|
|||
unsigned layer;
|
||||
float z_clear_val;
|
||||
VkRect2D rect;
|
||||
bool custom_resolve;
|
||||
};
|
||||
|
||||
static void
|
||||
|
|
@ -4053,7 +4054,8 @@ fdm_apply_sysmem_clear_coords(struct tu_cmd_buffer *cmd,
|
|||
const VkOffset2D *hw_viewport_offsets,
|
||||
unsigned views,
|
||||
const VkExtent2D *frag_areas,
|
||||
const VkRect2D *bins)
|
||||
const VkRect2D *bins,
|
||||
bool binning)
|
||||
{
|
||||
const struct apply_sysmem_clear_coords_state *state =
|
||||
(const struct apply_sysmem_clear_coords_state *)data;
|
||||
|
|
@ -4064,9 +4066,15 @@ fdm_apply_sysmem_clear_coords(struct tu_cmd_buffer *cmd,
|
|||
hw_viewport_offsets[MIN2(state->view, views - 1)];
|
||||
|
||||
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin, common_bin_offset);
|
||||
|
||||
offset.x -= hw_viewport_offset.x;
|
||||
offset.y -= hw_viewport_offset.y;
|
||||
|
||||
if (state->custom_resolve && !binning) {
|
||||
offset = (VkOffset2D) {};
|
||||
frag_area = (VkExtent2D) { 1, 1 };
|
||||
}
|
||||
|
||||
unsigned x1 = state->rect.offset.x / frag_area.width + offset.x;
|
||||
unsigned x2 = DIV_ROUND_UP(state->rect.offset.x + state->rect.extent.width,
|
||||
frag_area.width) + offset.x;
|
||||
|
|
@ -4251,6 +4259,7 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
|
|||
.layer = rects[i].baseArrayLayer + layer,
|
||||
.z_clear_val = z_clear_val,
|
||||
.rect = rects[i].rect,
|
||||
.custom_resolve = subpass->custom_resolve,
|
||||
};
|
||||
tu_create_fdm_bin_patchpoint(cmd, cs, 4, TU_FDM_NONE,
|
||||
fdm_apply_sysmem_clear_coords,
|
||||
|
|
@ -4323,6 +4332,7 @@ clear_gmem_attachment(struct tu_cmd_buffer *cmd,
|
|||
/* State captured for the fdm_apply_gmem_clear_coords() bin patchpoint. */
struct apply_gmem_clear_coords_state {
|
||||
/* Set to 0 or to the layer index by the callers that build this state.
 * NOTE(review): presumably selects the per-view frag area/bin -- confirm.
 */
unsigned view;
|
||||
/* Clear rectangle; the apply callback divides it by the fragment area and
 * shifts it by the per-bin offset.
 */
VkRect2D rect;
|
||||
/* Copied from tu_subpass::custom_resolve. When set, the apply callback uses
 * a zero offset and a 1x1 fragment area instead of the per-bin values.
 */
bool custom_resolve;
|
||||
};
|
||||
|
||||
static void
|
||||
|
|
@ -4333,7 +4343,8 @@ fdm_apply_gmem_clear_coords(struct tu_cmd_buffer *cmd,
|
|||
const VkOffset2D *hw_viewport_offsets,
|
||||
unsigned views,
|
||||
const VkExtent2D *frag_areas,
|
||||
const VkRect2D *bins)
|
||||
const VkRect2D *bins,
|
||||
bool binning)
|
||||
{
|
||||
const struct apply_gmem_clear_coords_state *state =
|
||||
(const struct apply_gmem_clear_coords_state *)data;
|
||||
|
|
@ -4343,6 +4354,11 @@ fdm_apply_gmem_clear_coords(struct tu_cmd_buffer *cmd,
|
|||
|
||||
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin, common_bin_offset);
|
||||
|
||||
if (state->custom_resolve) {
|
||||
offset = (VkOffset2D) {};
|
||||
frag_area = (VkExtent2D) { 1, 1 };
|
||||
}
|
||||
|
||||
unsigned x1 = state->rect.offset.x / frag_area.width + offset.x;
|
||||
unsigned x2 = DIV_ROUND_UP(state->rect.offset.x + state->rect.extent.width,
|
||||
frag_area.width) + offset.x - 1;
|
||||
|
|
@ -4491,6 +4507,7 @@ tu_clear_attachments(struct tu_cmd_buffer *cmd,
|
|||
const VkClearRect *pRects)
|
||||
{
|
||||
struct tu_cs *cs = &cmd->draw_cs;
|
||||
const struct tu_subpass *subpass = cmd->state.subpass;
|
||||
|
||||
/* sysmem path behaves like a draw, note we don't have a way of using different
|
||||
* flushes for sysmem/gmem, so this needs to be outside of the cond_exec
|
||||
|
|
@ -4504,8 +4521,11 @@ tu_clear_attachments(struct tu_cmd_buffer *cmd,
|
|||
*
|
||||
* Similarly, we also use the 3D path when in a secondary command buffer that
|
||||
* doesn't know the GMEM layout that will be chosen by the primary.
|
||||
*
|
||||
* Don't use the GMEM path if we are in a custom resolve.
|
||||
*/
|
||||
if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT) {
|
||||
if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT ||
|
||||
subpass->custom_resolve) {
|
||||
tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
|
||||
return;
|
||||
}
|
||||
|
|
@ -4514,7 +4534,6 @@ tu_clear_attachments(struct tu_cmd_buffer *cmd,
|
|||
* binning time, then emit the clear as a 3D draw so that it contributes to
|
||||
* that visibility.
|
||||
*/
|
||||
const struct tu_subpass *subpass = cmd->state.subpass;
|
||||
for (uint32_t i = 0; i < attachmentCount; i++) {
|
||||
uint32_t a;
|
||||
if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
|
||||
|
|
@ -4561,6 +4580,7 @@ tu7_clear_attachment_generic_single_rect(
|
|||
struct apply_gmem_clear_coords_state state = {
|
||||
.view = 0,
|
||||
.rect = rect->rect,
|
||||
.custom_resolve = subpass->custom_resolve,
|
||||
};
|
||||
tu_create_fdm_bin_patchpoint(cmd, cs, 3, TU_FDM_SKIP_BINNING,
|
||||
fdm_apply_gmem_clear_coords, state);
|
||||
|
|
@ -4589,6 +4609,7 @@ tu7_clear_attachment_generic_single_rect(
|
|||
struct apply_gmem_clear_coords_state state = {
|
||||
.view = layer,
|
||||
.rect = rect->rect,
|
||||
.custom_resolve = subpass->custom_resolve,
|
||||
};
|
||||
tu_create_fdm_bin_patchpoint(cmd, cs, 3, TU_FDM_SKIP_BINNING,
|
||||
fdm_apply_gmem_clear_coords, state);
|
||||
|
|
@ -4970,7 +4991,8 @@ fdm_apply_load_coords(struct tu_cmd_buffer *cmd,
|
|||
const VkOffset2D *hw_viewport_offsets,
|
||||
unsigned views,
|
||||
const VkExtent2D *frag_areas,
|
||||
const VkRect2D *bins)
|
||||
const VkRect2D *bins,
|
||||
bool binning)
|
||||
{
|
||||
const struct apply_load_coords_state *state =
|
||||
(const struct apply_load_coords_state *)data;
|
||||
|
|
@ -5435,6 +5457,8 @@ tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
|
|||
subpass->color_attachments[j].attachment;
|
||||
if (tu_attachment_store_mismatched_mutability(cmd, a, gmem_a))
|
||||
cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
|
||||
if (subpass->custom_resolve)
|
||||
cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -5454,7 +5478,8 @@ fdm_apply_store_coords(struct tu_cmd_buffer *cmd,
|
|||
const VkOffset2D *hw_viewport_offsets,
|
||||
unsigned views,
|
||||
const VkExtent2D *frag_areas,
|
||||
const VkRect2D *bins)
|
||||
const VkRect2D *bins,
|
||||
bool binning)
|
||||
{
|
||||
const struct apply_store_coords_state *state =
|
||||
(const struct apply_store_coords_state *)data;
|
||||
|
|
|
|||
|
|
@ -1412,6 +1412,64 @@ tu_fdm_num_layers(const struct tu_cmd_buffer *cmd)
|
|||
(cmd->state.fdm_per_layer ? cmd->state.framebuffer->layers : 1);
|
||||
}
|
||||
|
||||
/* Emit the bin-size state for a rendering pass via tu6_emit_bin_size().
 *
 * cmd: command buffer; its tiling config, physical-device info and
 *      HW-binning state (use_hw_binning()) are read.
 * cs: command stream passed on to tu6_emit_bin_size().
 * buffers_location: forwarded as .buffers_location. The tile0 width/height
 *      are only passed when it is BUFFERS_IN_GMEM; otherwise 0x0 is passed.
 * disable_lrz: sets .force_lrz_dis, but only when CHIP >= A7XX.
 *
 * NOTE(review): the inline code in tu6_emit_tile_select() that this helper
 * replaces also required !bin_is_scaled before enabling LRZ feedback. That
 * condition is not applied here -- confirm this is intentional.
 */
template <chip CHIP>
|
||||
static void
|
||||
tu6_emit_bin_size_gmem(struct tu_cmd_buffer *cmd,
|
||||
struct tu_cs *cs,
|
||||
enum a6xx_buffers_location buffers_location,
|
||||
bool disable_lrz)
|
||||
{
|
||||
struct tu_physical_device *phys_dev = cmd->device->physical_device;
|
||||
const struct tu_tiling_config *tiling = cmd->state.tiling;
|
||||
bool hw_binning = use_hw_binning(cmd);
|
||||
|
||||
tu6_emit_bin_size<CHIP>(
|
||||
/* Pass the tile0 size only for GMEM; 0x0 otherwise. */
cs, buffers_location == BUFFERS_IN_GMEM ? tiling->tile0.width : 0,
|
||||
buffers_location == BUFFERS_IN_GMEM ? tiling->tile0.height : 0,
|
||||
{
|
||||
.render_mode = RENDERING_PASS,
|
||||
.force_lrz_write_dis = !phys_dev->info->props.has_lrz_feedback,
|
||||
.buffers_location = buffers_location,
|
||||
/* LRZ feedback mode depends on whether HW binning is in use, and is
 * NONE when the device has no LRZ feedback support.
 */
.lrz_feedback_zmode_mask =
|
||||
phys_dev->info->props.has_lrz_feedback
|
||||
? (hw_binning ? LRZ_FEEDBACK_EARLY_Z_OR_EARLY_Z_LATE_Z :
|
||||
LRZ_FEEDBACK_EARLY_Z_LATE_Z)
|
||||
: LRZ_FEEDBACK_NONE,
|
||||
.force_lrz_dis = CHIP >= A7XX && disable_lrz,
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
/* Set always-identical registers used specifically for GMEM.
 *
 * Writes RB_BUFFER_CNTL as 0x0 and selects CLEAR_MODE_GMEM in
 * RB_CLEAR_TARGET.
 *
 * cs: command stream the register writes are emitted into.
 */
|
||||
template <chip CHIP>
|
||||
static void
|
||||
tu7_emit_tile_render_begin_regs(struct tu_cs *cs)
|
||||
{
|
||||
tu_cs_emit_regs(cs, RB_BUFFER_CNTL(CHIP, 0x0));
|
||||
tu_cs_emit_regs(cs, RB_CLEAR_TARGET(CHIP, .clear_mode = CLEAR_MODE_GMEM));
|
||||
}
|
||||
|
||||
/* Set always-identical registers used specifically for sysmem.
 *
 * Writes RB_BUFFER_CNTL with the z, s and rt0..rt7 *_sysmem fields all set,
 * then selects CLEAR_MODE_SYSMEM in RB_CLEAR_TARGET.
 *
 * cmd: not referenced by the body.
 * cs: command stream the register writes are emitted into.
 */
|
||||
template <chip CHIP>
|
||||
static void
|
||||
tu7_emit_sysmem_render_begin_regs(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
|
||||
{
|
||||
tu_cs_emit_regs(cs, RB_BUFFER_CNTL(CHIP,
|
||||
.z_sysmem = true,
|
||||
.s_sysmem = true,
|
||||
.rt0_sysmem = true,
|
||||
.rt1_sysmem = true,
|
||||
.rt2_sysmem = true,
|
||||
.rt3_sysmem = true,
|
||||
.rt4_sysmem = true,
|
||||
.rt5_sysmem = true,
|
||||
.rt6_sysmem = true,
|
||||
.rt7_sysmem = true,
|
||||
));
|
||||
|
||||
tu_cs_emit_regs(cs, RB_CLEAR_TARGET(CHIP, .clear_mode = CLEAR_MODE_SYSMEM));
|
||||
}
|
||||
|
||||
template <chip CHIP>
|
||||
static void
|
||||
tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
|
||||
|
|
@ -1419,7 +1477,6 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
|
|||
const struct tu_tile_config *tile,
|
||||
bool fdm, const VkOffset2D *fdm_offsets)
|
||||
{
|
||||
struct tu_physical_device *phys_dev = cmd->device->physical_device;
|
||||
const struct tu_tiling_config *tiling = cmd->state.tiling;
|
||||
const struct tu_framebuffer *fb = cmd->state.framebuffer;
|
||||
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
|
||||
|
|
@ -1470,19 +1527,15 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
|
|||
if (fdm_offsets && (tile->pos.x == 0 || tile->pos.y == 0))
|
||||
disable_lrz = true;
|
||||
|
||||
tu6_emit_bin_size<CHIP>(
|
||||
cs, tiling->tile0.width, tiling->tile0.height,
|
||||
{
|
||||
.render_mode = RENDERING_PASS,
|
||||
.force_lrz_write_dis = !phys_dev->info->props.has_lrz_feedback,
|
||||
.buffers_location = BUFFERS_IN_GMEM,
|
||||
.lrz_feedback_zmode_mask =
|
||||
phys_dev->info->props.has_lrz_feedback && !bin_is_scaled
|
||||
? (hw_binning ? LRZ_FEEDBACK_EARLY_Z_OR_EARLY_Z_LATE_Z :
|
||||
LRZ_FEEDBACK_EARLY_Z_LATE_Z)
|
||||
: LRZ_FEEDBACK_NONE,
|
||||
.force_lrz_dis = CHIP >= A7XX && disable_lrz,
|
||||
});
|
||||
/* When using custom resolve we need to re-emit these regs as they are
|
||||
* overwritten when switching to sysmem.
|
||||
*/
|
||||
if (CHIP >= A7XX &&
|
||||
cmd->state.pass->subpasses[cmd->state.pass->subpass_count - 1].custom_resolve) {
|
||||
tu7_emit_tile_render_begin_regs<CHIP>(cs);
|
||||
}
|
||||
|
||||
tu6_emit_bin_size_gmem<CHIP>(cmd, cs, BUFFERS_IN_GMEM, disable_lrz);
|
||||
|
||||
tu_cs_emit_regs(cs,
|
||||
A6XX_VFD_RENDER_MODE(RENDERING_PASS));
|
||||
|
|
@ -1634,7 +1687,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
|
|||
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size);
|
||||
tu_cs_emit_qw(cs, patch->iova);
|
||||
patch->apply(cmd, cs, patch->data, (VkOffset2D) { x1, y1 },
|
||||
frag_offsets, views, tile->frag_areas, bins);
|
||||
frag_offsets, views, tile->frag_areas, bins, false);
|
||||
}
|
||||
|
||||
/* Make the CP wait until the CP_MEM_WRITE's to the command buffers
|
||||
|
|
@ -2111,15 +2164,6 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs)
|
|||
tu_cs_emit(cs, A6XX_SP_VS_CONST_CONFIG_CONSTLEN(8) | A6XX_SP_VS_CONST_CONFIG_ENABLED);
|
||||
}
|
||||
|
||||
/* Set always-identical registers used specifically for GMEM */
|
||||
template <chip CHIP>
|
||||
static void
|
||||
tu7_emit_tile_render_begin_regs(struct tu_cs *cs)
|
||||
{
|
||||
tu_cs_emit_regs(cs, RB_BUFFER_CNTL(CHIP, 0x0));
|
||||
tu_cs_emit_regs(cs, RB_CLEAR_TARGET(CHIP, .clear_mode = CLEAR_MODE_GMEM));
|
||||
}
|
||||
|
||||
/* Emit the bin restore preamble, which runs in between bins when L1
|
||||
* preemption with skipsaverestore happens and we switch back to this context.
|
||||
* We need to restore static registers normally programmed at cmdbuf start
|
||||
|
|
@ -2435,7 +2479,7 @@ tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
|||
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size);
|
||||
tu_cs_emit_qw(cs, patch->iova);
|
||||
patch->apply(cmd, cs, patch->data, (VkOffset2D) {0, 0}, frag_offsets,
|
||||
num_views, unscaled_frag_areas, bins);
|
||||
num_views, unscaled_frag_areas, bins, true);
|
||||
}
|
||||
|
||||
tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
|
||||
|
|
@ -2532,6 +2576,8 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
|
|||
bool gmem)
|
||||
{
|
||||
const struct tu_tiling_config *tiling = cmd->state.tiling;
|
||||
uint32_t layers = MAX2(cmd->state.framebuffer->layers,
|
||||
cmd->state.pass->num_views);
|
||||
|
||||
/* note: we can probably emit input attachments just once for the whole
|
||||
* renderpass, this would avoid emitting both sysmem/gmem versions
|
||||
|
|
@ -2621,7 +2667,11 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
|
|||
gmem_offset = att->gmem_offset_stencil[cmd->state.gmem_layout];
|
||||
}
|
||||
|
||||
if (!gmem || !subpass->input_attachments[i / 2].patch_input_gmem) {
|
||||
if (!gmem || !subpass->input_attachments[i / 2].patch_input_gmem ||
|
||||
/* Skip GMEM patching when tiling is impossible as we may get
|
||||
* assertion failures from register packing below.
|
||||
*/
|
||||
!tiling->possible) {
|
||||
memcpy(&texture.map[i * A6XX_TEX_CONST_DWORDS], dst, sizeof(dst));
|
||||
continue;
|
||||
}
|
||||
|
|
@ -2647,10 +2697,17 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
|
|||
dst[2] =
|
||||
A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
|
||||
A6XX_TEX_CONST_2_PITCH(tiling->tile0.width * cpp);
|
||||
/* Note: it seems the HW implicitly calculates the array pitch with the
|
||||
* GMEM tiling, so we don't need to specify the pitch ourselves.
|
||||
/* Note: it seems the HW implicitly calculates the array pitch, except
|
||||
* when rendering to sysmem (i.e. in a custom resolve subpass). We only
|
||||
* guarantee the pitch is valid when there is more than 1 layer, so skip
|
||||
* emitting it otherwise to avoid asserts.
|
||||
*/
|
||||
dst[3] = 0;
|
||||
if (layers > 1) {
|
||||
dst[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(tiling->tile0.width *
|
||||
tiling->tile0.height * cpp);
|
||||
} else {
|
||||
dst[3] = 0;
|
||||
}
|
||||
dst[4] = cmd->device->physical_device->gmem_base + gmem_offset;
|
||||
dst[5] &= A6XX_TEX_CONST_5_DEPTH__MASK;
|
||||
for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
|
||||
|
|
@ -2985,20 +3042,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
|
|||
});
|
||||
|
||||
if (CHIP == A7XX) {
|
||||
tu_cs_emit_regs(cs, RB_BUFFER_CNTL(CHIP,
|
||||
.z_sysmem = true,
|
||||
.s_sysmem = true,
|
||||
.rt0_sysmem = true,
|
||||
.rt1_sysmem = true,
|
||||
.rt2_sysmem = true,
|
||||
.rt3_sysmem = true,
|
||||
.rt4_sysmem = true,
|
||||
.rt5_sysmem = true,
|
||||
.rt6_sysmem = true,
|
||||
.rt7_sysmem = true,
|
||||
));
|
||||
|
||||
tu_cs_emit_regs(cs, RB_CLEAR_TARGET(CHIP, .clear_mode = CLEAR_MODE_SYSMEM));
|
||||
tu7_emit_sysmem_render_begin_regs<CHIP>(cmd, cs);
|
||||
}
|
||||
|
||||
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
|
||||
|
|
@ -6400,9 +6444,14 @@ static void
|
|||
tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *resolve_group)
|
||||
{
|
||||
struct tu_cs *cs = &cmd->draw_cs;
|
||||
uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses;
|
||||
const struct tu_subpass *subpass = cmd->state.subpass;
|
||||
uint32_t subpass_idx = subpass - cmd->state.pass->subpasses;
|
||||
const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling);
|
||||
|
||||
/* Shader resolve subpasses don't use GMEM */
|
||||
if (subpass->custom_resolve)
|
||||
return;
|
||||
|
||||
/* If we might choose to bin, then put the loads under a check for geometry
|
||||
* having been binned to this tile. If we don't choose to bin in the end,
|
||||
* then we will have manually set those registers to say geometry is present.
|
||||
|
|
@ -6496,9 +6545,11 @@ tu_emit_subpass_begin_sysmem(struct tu_cmd_buffer *cmd)
|
|||
return;
|
||||
|
||||
struct tu_cs *cs = &cmd->draw_cs;
|
||||
uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses;
|
||||
const struct tu_subpass *subpass = cmd->state.subpass;
|
||||
uint32_t subpass_idx = subpass - cmd->state.pass->subpasses;
|
||||
|
||||
tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
|
||||
if (!subpass->custom_resolve)
|
||||
tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
|
||||
|
||||
tu6_emit_sysmem_unresolves<CHIP>(cmd, cs, cmd->state.subpass);
|
||||
|
||||
|
|
@ -6508,7 +6559,8 @@ tu_emit_subpass_begin_sysmem(struct tu_cmd_buffer *cmd)
|
|||
tu_clear_sysmem_attachment<CHIP>(cmd, cs, i);
|
||||
}
|
||||
|
||||
tu_cond_exec_end(cs); /* sysmem */
|
||||
if (!subpass->custom_resolve)
|
||||
tu_cond_exec_end(cs); /* sysmem */
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
@ -6584,6 +6636,84 @@ tu7_emit_subpass_shading_rate(struct tu_cmd_buffer *cmd,
|
|||
cmd->prev_fsr_is_null = false;
|
||||
}
|
||||
|
||||
/* If this is a shader resolve subpass, switch to writing to sysmem.
|
||||
*/
|
||||
/* Emits into cmd->draw_cs and does nothing unless the current subpass has
 * custom_resolve set. Steps, in order:
 *  - CHIP >= A7XX: emit the sysmem render-begin registers and write
 *    GRAS_BIN_FOVEAT / RB_BIN_FOVEAT with no fields set.
 *    Otherwise: re-emit the bin size with BUFFERS_IN_SYSMEM, inside a
 *    RENDER_MODE_GMEM conditional-execution region.
 *  - If tu_enable_fdm_offset(cmd), widen the window scissor to the
 *    framebuffer size plus one tile.
 *  - If the render pass has a fragment density map attachment, mark
 *    TU_CMD_DIRTY_FDM.
 */
template <chip CHIP>
|
||||
static void
|
||||
tu_emit_subpass_custom_resolve(struct tu_cmd_buffer *cmd)
|
||||
{
|
||||
struct tu_cs *cs = &cmd->draw_cs;
|
||||
const struct tu_subpass *subpass = cmd->state.subpass;
|
||||
const struct tu_framebuffer *fb = cmd->state.framebuffer;
|
||||
const struct tu_tiling_config *tiling = cmd->state.tiling;
|
||||
|
||||
/* Nothing to do for ordinary subpasses. */
if (!subpass->custom_resolve)
|
||||
return;
|
||||
|
||||
/* Since a7xx, buffer location can be controlled per-buffer. We also have
|
||||
* to update the steering register so that generic clears use sysmem.
|
||||
*/
|
||||
if (CHIP >= A7XX) {
|
||||
tu7_emit_sysmem_render_begin_regs<CHIP>(cmd, cs);
|
||||
|
||||
/* Disable foveation offset here. It's not necessary for custom resolve.
|
||||
*/
|
||||
tu_cs_emit_regs(cs, GRAS_BIN_FOVEAT(CHIP));
|
||||
tu_cs_emit_regs(cs, RB_BIN_FOVEAT(CHIP));
|
||||
} else {
|
||||
/* Only applies when the render mode is GMEM. */
tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
|
||||
|
||||
/* On a6xx the location is set in *_BIN_CONTROL */
|
||||
tu6_emit_bin_size_gmem<CHIP>(cmd, cs, BUFFERS_IN_SYSMEM, false);
|
||||
|
||||
tu_cond_exec_end(cs);
|
||||
}
|
||||
|
||||
/* With FDM and non-subsampled images, we switch from rendering space to
|
||||
* framebuffer space in the custom resolve subpass when not in the binning
|
||||
* pass because we are writing directly to the user-visible attachment. We
|
||||
* already aren't relying on the window scissor whenever FDM is enabled,
|
||||
* but it can get in the way if FDM offset is being used because it is
|
||||
* specified in rendering space, so the origin is shifted to the right and
|
||||
* down compared to the framebuffer-space bin coordinates and part of the
|
||||
* bin gets incorrectly clipped. Just disable it here by setting it to the
|
||||
* entire framebuffer. Add an extra tile size for when we are in the
|
||||
* binning pass and still using rendering space.
|
||||
*/
|
||||
if (tu_enable_fdm_offset(cmd)) {
|
||||
tu6_emit_window_scissor<CHIP>(cs, 0, 0,
|
||||
fb->width + tiling->tile0.width - 1,
|
||||
fb->height + tiling->tile0.height - 1);
|
||||
}
|
||||
|
||||
/* If FDM is enabled, we need to re-emit all FDM-related state. */
|
||||
if (cmd->state.pass->fragment_density_map.attachment !=
|
||||
VK_ATTACHMENT_UNUSED) {
|
||||
cmd->state.dirty |= TU_CMD_DIRTY_FDM;
|
||||
}
|
||||
}
|
||||
|
||||
/* If the last subpass is a shader resolve pass, emit flushes after switching
|
||||
* to sysmem, similar to fixed-function 3D resolves. Our flushing code assumes
|
||||
* that when in GMEM mode CCU isn't in use so we have to flush it ourselves.
|
||||
*/
|
||||
/* Emits into cmd->draw_cs and does nothing unless the current subpass has
 * custom_resolve set. FD_CCU_CLEAN_COLOR is emitted when the subpass has any
 * color attachments, and FD_CCU_CLEAN_DEPTH when it has a depth/stencil
 * attachment.
 */
template<chip CHIP>
|
||||
static void
|
||||
tu_emit_custom_resolve_end(struct tu_cmd_buffer *cmd)
|
||||
{
|
||||
struct tu_cs *cs = &cmd->draw_cs;
|
||||
|
||||
const struct tu_subpass *subpass = cmd->state.subpass;
|
||||
|
||||
/* Nothing to flush for ordinary subpasses. */
if (!subpass->custom_resolve)
|
||||
return;
|
||||
|
||||
if (subpass->color_count)
|
||||
tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
|
||||
if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED)
|
||||
tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_DEPTH);
|
||||
}
|
||||
|
||||
/* emit loads, clears, and mrt/zs/msaa/ubwc state for the subpass that is
|
||||
* starting (either at vkCmdBeginRenderPass2() or vkCmdNextSubpass2())
|
||||
*
|
||||
|
|
@ -6599,6 +6729,7 @@ tu_emit_subpass_begin(struct tu_cmd_buffer *cmd)
|
|||
|
||||
struct tu_resolve_group resolve_group = {};
|
||||
|
||||
tu_emit_subpass_custom_resolve<CHIP>(cmd);
|
||||
tu_emit_subpass_begin_gmem<CHIP>(cmd, &resolve_group);
|
||||
tu_emit_subpass_begin_sysmem<CHIP>(cmd);
|
||||
if (cmd->device->physical_device->info->props.has_generic_clear) {
|
||||
|
|
@ -7582,6 +7713,7 @@ fs_params_size(struct tu_cmd_buffer *cmd)
|
|||
|
||||
/* State captured for the fdm_apply_fs_params() bin patchpoint. */
struct apply_fs_params_state {
|
||||
/* Number of 4-dword constants the apply callback emits (frag size/offset
 * entries alternating with GMEM frag scale/offset entries).
 * tu_emit_fdm_params() sets it to num_units - 1.
 */
unsigned num_consts;
|
||||
/* Copied from the current subpass. When set, the callback emits a 1x1
 * fragment size with zero offset and puts the reciprocal fragment area and
 * per-bin offset in the GMEM scale/offset entry instead.
 */
bool custom_resolve;
|
||||
};
|
||||
|
||||
static void
|
||||
|
|
@ -7592,13 +7724,14 @@ fdm_apply_fs_params(struct tu_cmd_buffer *cmd,
|
|||
const VkOffset2D *hw_viewport_offsets,
|
||||
unsigned views,
|
||||
const VkExtent2D *frag_areas,
|
||||
const VkRect2D *bins)
|
||||
const VkRect2D *bins,
|
||||
bool binning)
|
||||
{
|
||||
const struct apply_fs_params_state *state =
|
||||
(const struct apply_fs_params_state *)data;
|
||||
unsigned num_consts = state->num_consts;
|
||||
|
||||
for (unsigned i = 0; i < num_consts; i++) {
|
||||
for (unsigned i = 0; i < DIV_ROUND_UP(num_consts, 2); i++) {
|
||||
/* FDM per layer may be enabled in the shader but not in the renderpass,
|
||||
* in which case views will be 1 and we have to replicate the one view
|
||||
* to all of the layers.
|
||||
|
|
@ -7607,10 +7740,38 @@ fdm_apply_fs_params(struct tu_cmd_buffer *cmd,
|
|||
VkRect2D bin = bins[MIN2(i, views - 1)];
|
||||
VkOffset2D offset = tu_fdm_per_bin_offset(area, bin, common_bin_offset);
|
||||
|
||||
tu_cs_emit(cs, area.width);
|
||||
tu_cs_emit(cs, area.height);
|
||||
tu_cs_emit(cs, fui(offset.x));
|
||||
tu_cs_emit(cs, fui(offset.y));
|
||||
/* For custom resolve, we switch to rendering directly to sysmem and so
|
||||
* the fragment size becomes 1x1. This means we have to scale down
|
||||
* FragCoord when accessing GMEM input attachments.
|
||||
*
|
||||
* TODO: When we support subsampled images, this should also only happen
|
||||
* for non-subsampled images.
|
||||
*/
|
||||
if (state->custom_resolve) {
|
||||
tu_cs_emit(cs, 1 /* width */);
|
||||
tu_cs_emit(cs, 1 /* height */);
|
||||
tu_cs_emit(cs, fui(0.0));
|
||||
tu_cs_emit(cs, fui(0.0));
|
||||
} else {
|
||||
tu_cs_emit(cs, area.width);
|
||||
tu_cs_emit(cs, area.height);
|
||||
tu_cs_emit(cs, fui(offset.x));
|
||||
tu_cs_emit(cs, fui(offset.y));
|
||||
}
|
||||
|
||||
if (i * 2 + 1 < num_consts) {
|
||||
if (state->custom_resolve) {
|
||||
tu_cs_emit(cs, fui(1. / area.width));
|
||||
tu_cs_emit(cs, fui(1. / area.height));
|
||||
tu_cs_emit(cs, fui(offset.x));
|
||||
tu_cs_emit(cs, fui(offset.y));
|
||||
} else {
|
||||
tu_cs_emit(cs, fui(1.0));
|
||||
tu_cs_emit(cs, fui(1.0));
|
||||
tu_cs_emit(cs, fui(0.0));
|
||||
tu_cs_emit(cs, fui(0.0));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -7632,16 +7793,23 @@ tu_emit_fdm_params(struct tu_cmd_buffer *cmd,
|
|||
if (fs->fs.has_fdm) {
|
||||
struct apply_fs_params_state state = {
|
||||
.num_consts = num_units - 1,
|
||||
.custom_resolve = cmd->state.subpass->custom_resolve,
|
||||
};
|
||||
tu_create_fdm_bin_patchpoint(cmd, cs, 4 * (num_units - 1),
|
||||
TU_FDM_SKIP_BINNING,
|
||||
fdm_apply_fs_params, state);
|
||||
} else {
|
||||
for (unsigned i = 1; i < num_units; i++) {
|
||||
for (unsigned i = 0; i < DIV_ROUND_UP((num_units - 1), 2); i++) {
|
||||
tu_cs_emit(cs, 1);
|
||||
tu_cs_emit(cs, 1);
|
||||
tu_cs_emit(cs, fui(0.0f));
|
||||
tu_cs_emit(cs, fui(0.0f));
|
||||
if (i * 2 + 1 < num_units - 1) {
|
||||
tu_cs_emit(cs, fui(1.0));
|
||||
tu_cs_emit(cs, fui(1.0));
|
||||
tu_cs_emit(cs, fui(0.0));
|
||||
tu_cs_emit(cs, fui(0.0));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -9174,6 +9342,8 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
|
|||
fdm_offsets = test_offsets;
|
||||
}
|
||||
|
||||
TU_CALLX(cmd_buffer->device, tu_emit_custom_resolve_end)(cmd_buffer);
|
||||
|
||||
tu_cs_end(&cmd_buffer->draw_cs);
|
||||
tu_cs_end(&cmd_buffer->draw_epilogue_cs);
|
||||
TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer, fdm_offsets);
|
||||
|
|
@ -9202,6 +9372,8 @@ tu_CmdEndRendering2EXT(VkCommandBuffer commandBuffer,
|
|||
*/
|
||||
TU_CALLX(cmd_buffer->device, tu_lrz_flush_valid_during_renderpass)
|
||||
(cmd_buffer, &cmd_buffer->draw_cs);
|
||||
} else {
|
||||
TU_CALLX(cmd_buffer->device, tu_emit_custom_resolve_end)(cmd_buffer);
|
||||
}
|
||||
|
||||
const VkRenderPassFragmentDensityMapOffsetEndInfoEXT *fdm_offset_info =
|
||||
|
|
|
|||
|
|
@ -858,7 +858,8 @@ typedef void (*tu_fdm_bin_apply_t)(struct tu_cmd_buffer *cmd,
|
|||
const VkOffset2D *hw_viewport_offsets,
|
||||
unsigned views,
|
||||
const VkExtent2D *frag_areas,
|
||||
const VkRect2D *bins);
|
||||
const VkRect2D *bins,
|
||||
bool binning);
|
||||
|
||||
enum tu_fdm_flags {
|
||||
TU_FDM_NONE = 0,
|
||||
|
|
@ -926,7 +927,7 @@ _tu_create_fdm_bin_patchpoint(struct tu_cmd_buffer *cmd,
|
|||
};
|
||||
hw_viewport_offsets[i] = (VkOffset2D) { 0, 0 };
|
||||
}
|
||||
apply(cmd, cs, state, (VkOffset2D) {0, 0}, hw_viewport_offsets, num_views, unscaled_frag_areas, bins);
|
||||
apply(cmd, cs, state, (VkOffset2D) {0, 0}, hw_viewport_offsets, num_views, unscaled_frag_areas, bins, false);
|
||||
assert(tu_cs_get_cur_iova(cs) == patch.iova + patch.size * sizeof(uint32_t));
|
||||
|
||||
util_dynarray_append(&cmd->fdm_bin_patchpoints, patch);
|
||||
|
|
|
|||
|
|
@ -350,6 +350,7 @@ get_device_extensions(const struct tu_physical_device *device,
|
|||
.IMG_filter_cubic = device->info->props.has_tex_filter_cubic,
|
||||
.NV_compute_shader_derivatives = device->info->chip >= 7,
|
||||
.QCOM_fragment_density_map_offset = true,
|
||||
.QCOM_render_pass_shader_resolve = true,
|
||||
.VALVE_fragment_density_map_layered = true,
|
||||
.VALVE_mutable_descriptor_type = true,
|
||||
} };
|
||||
|
|
|
|||
|
|
@ -969,7 +969,8 @@ tu_subpass_use_attachment(struct tu_render_pass *pass, int i, uint32_t a, const
|
|||
struct tu_subpass *subpass = &pass->subpasses[i];
|
||||
struct tu_render_pass_attachment *att = &pass->attachments[a];
|
||||
|
||||
att->gmem = true;
|
||||
if (!subpass->custom_resolve)
|
||||
att->gmem = true;
|
||||
update_samples(subpass, att->samples);
|
||||
att->used_views |= subpass->multiview_mask;
|
||||
|
||||
|
|
@ -1182,6 +1183,8 @@ tu_CreateRenderPass2(VkDevice _device,
|
|||
subpass->srgb_cntl = 0;
|
||||
subpass->legacy_dithering_enabled = desc->flags &
|
||||
VK_SUBPASS_DESCRIPTION_ENABLE_LEGACY_DITHERING_BIT_EXT;
|
||||
subpass->custom_resolve = desc->flags &
|
||||
VK_SUBPASS_DESCRIPTION_SHADER_RESOLVE_BIT_QCOM;
|
||||
|
||||
const BITMASK_ENUM(VkSubpassDescriptionFlagBits) raster_order_access_bits =
|
||||
VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_COLOR_ACCESS_BIT_EXT |
|
||||
|
|
|
|||
|
|
@ -82,6 +82,8 @@ struct tu_subpass
|
|||
bool depth_used;
|
||||
bool stencil_used;
|
||||
|
||||
bool custom_resolve;
|
||||
|
||||
VkSampleCountFlagBits samples;
|
||||
|
||||
uint32_t srgb_cntl;
|
||||
|
|
|
|||
|
|
@ -1773,6 +1773,12 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
|
|||
~attachments_referenced;
|
||||
}
|
||||
|
||||
if (builder->state &
|
||||
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
|
||||
keys[MESA_SHADER_FRAGMENT].custom_resolve =
|
||||
builder->graphics_state.rp->custom_resolve;
|
||||
}
|
||||
|
||||
if (builder->create_flags &
|
||||
VK_PIPELINE_CREATE_2_LINK_TIME_OPTIMIZATION_BIT_EXT) {
|
||||
for (unsigned i = 0; i < builder->num_libraries; i++) {
|
||||
|
|
@ -2578,6 +2584,7 @@ struct apply_viewport_state {
|
|||
bool share_scale;
|
||||
/* See tu_pipeline::fake_single_viewport */
|
||||
bool fake_single_viewport;
|
||||
bool custom_resolve;
|
||||
};
|
||||
|
||||
/* It's a hardware restriction that the window offset (i.e. common_bin_offset)
|
||||
|
|
@ -2624,7 +2631,8 @@ fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
|
|||
VkOffset2D common_bin_offset,
|
||||
const VkOffset2D *hw_viewport_offsets,
|
||||
unsigned views,
|
||||
const VkExtent2D *frag_areas, const VkRect2D *bins)
|
||||
const VkExtent2D *frag_areas, const VkRect2D *bins,
|
||||
bool binning)
|
||||
{
|
||||
const struct apply_viewport_state *state =
|
||||
(const struct apply_viewport_state *)data;
|
||||
|
|
@ -2653,9 +2661,16 @@ fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
|
|||
*/
|
||||
VkViewport viewport =
|
||||
state->fake_single_viewport ? state->vp.viewports[0] : state->vp.viewports[i];
|
||||
if (frag_area.width == 1 && frag_area.height == 1 &&
|
||||
common_bin_offset.x == bin.offset.x &&
|
||||
common_bin_offset.y == bin.offset.y) {
|
||||
if ((frag_area.width == 1 && frag_area.height == 1 &&
|
||||
common_bin_offset.x == bin.offset.x &&
|
||||
common_bin_offset.y == bin.offset.y) ||
|
||||
/* When in a custom resolve operation (TODO: and using
|
||||
* non-subsampled images) we switch to framebuffer coordinates so we
|
||||
* shouldn't apply the transform. However the binning pass isn't
|
||||
* aware of this, so we have to keep applying the transform for
|
||||
* binning.
|
||||
*/
|
||||
(state->custom_resolve && !binning)) {
|
||||
vp.viewports[i] = viewport;
|
||||
continue;
|
||||
}
|
||||
|
|
@ -2692,6 +2707,7 @@ tu6_emit_viewport_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd,
|
|||
.share_scale = !cmd->state.per_view_viewport &&
|
||||
!cmd->state.per_layer_viewport,
|
||||
.fake_single_viewport = cmd->state.fake_single_viewport,
|
||||
.custom_resolve = cmd->state.subpass->custom_resolve,
|
||||
};
|
||||
if (cmd->state.per_view_viewport)
|
||||
state.vp.viewport_count = num_views;
|
||||
|
|
@ -2753,7 +2769,8 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
|
|||
VkOffset2D common_bin_offset,
|
||||
const VkOffset2D *hw_viewport_offsets,
|
||||
unsigned views,
|
||||
const VkExtent2D *frag_areas, const VkRect2D *bins)
|
||||
const VkExtent2D *frag_areas, const VkRect2D *bins,
|
||||
bool binning)
|
||||
{
|
||||
const struct apply_viewport_state *state =
|
||||
(const struct apply_viewport_state *)data;
|
||||
|
|
@ -2781,6 +2798,19 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
|
|||
common_bin_offset);
|
||||
offset.x -= hw_viewport_offset.x;
|
||||
offset.y -= hw_viewport_offset.y;
|
||||
|
||||
/* Disable scaling and offset when doing a custom resolve to a
|
||||
* non-subsampled image and not in the binning pass, because we
|
||||
* use framebuffer coordinates.
|
||||
*
|
||||
* TODO: When we support subsampled images, only do this for
|
||||
* non-subsampled images.
|
||||
*/
|
||||
if (state->custom_resolve && !binning) {
|
||||
offset = (VkOffset2D) {};
|
||||
frag_area = (VkExtent2D) {1, 1};
|
||||
}
|
||||
|
||||
VkOffset2D min = {
|
||||
scissor.offset.x / frag_area.width + offset.x,
|
||||
scissor.offset.y / frag_area.width + offset.y,
|
||||
|
|
@ -2791,12 +2821,20 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
|
|||
};
|
||||
|
||||
/* Intersect scissor with the scaled bin, this essentially replaces the
|
||||
* window scissor.
|
||||
* window scissor. With custom resolve (TODO: and non-subsampled images)
|
||||
* we have to use the unscaled bin instead.
|
||||
*/
|
||||
uint32_t scaled_width = bin.extent.width / frag_area.width;
|
||||
uint32_t scaled_height = bin.extent.height / frag_area.height;
|
||||
uint32_t bin_x = common_bin_offset.x - hw_viewport_offset.x;
|
||||
uint32_t bin_y = common_bin_offset.y - hw_viewport_offset.y;
|
||||
int32_t bin_x;
|
||||
int32_t bin_y;
|
||||
if (state->custom_resolve && !binning) {
|
||||
bin_x = bin.offset.x;
|
||||
bin_y = bin.offset.y;
|
||||
} else {
|
||||
bin_x = common_bin_offset.x - hw_viewport_offset.x;
|
||||
bin_y = common_bin_offset.y - hw_viewport_offset.y;
|
||||
}
|
||||
vp.scissors[i].offset.x = MAX2(min.x, bin_x);
|
||||
vp.scissors[i].offset.y = MAX2(min.y, bin_y);
|
||||
vp.scissors[i].extent.width =
|
||||
|
|
@ -2818,6 +2856,7 @@ tu6_emit_scissor_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd,
|
|||
.share_scale = !cmd->state.per_view_viewport &&
|
||||
!cmd->state.per_layer_viewport,
|
||||
.fake_single_viewport = cmd->state.fake_single_viewport,
|
||||
.custom_resolve = cmd->state.subpass->custom_resolve,
|
||||
};
|
||||
if (cmd->state.per_view_viewport)
|
||||
state.vp.scissor_count = num_views;
|
||||
|
|
@ -4426,6 +4465,8 @@ tu_fill_render_pass_state(struct vk_render_pass_state *rp,
|
|||
rp->color_attachment_formats[i] = pass->attachments[a].format;
|
||||
rp->attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(i);
|
||||
}
|
||||
|
||||
rp->custom_resolve = subpass->custom_resolve;
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
|
|||
|
|
@ -643,20 +643,37 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
|
|||
return true;
|
||||
|
||||
case nir_intrinsic_load_frag_size_ir3:
|
||||
case nir_intrinsic_load_frag_offset_ir3: {
|
||||
case nir_intrinsic_load_frag_offset_ir3:
|
||||
case nir_intrinsic_load_gmem_frag_scale_ir3:
|
||||
case nir_intrinsic_load_gmem_frag_offset_ir3: {
|
||||
if (!dev->compiler->load_shader_consts_via_preamble)
|
||||
return false;
|
||||
|
||||
unsigned param =
|
||||
instr->intrinsic == nir_intrinsic_load_frag_size_ir3 ?
|
||||
IR3_DP_FS(frag_size) : IR3_DP_FS(frag_offset);
|
||||
unsigned param;
|
||||
switch (instr->intrinsic) {
|
||||
case nir_intrinsic_load_frag_size_ir3:
|
||||
param = IR3_DP_FS(frag_size);
|
||||
break;
|
||||
case nir_intrinsic_load_frag_offset_ir3:
|
||||
param = IR3_DP_FS(frag_offset);
|
||||
break;
|
||||
case nir_intrinsic_load_gmem_frag_scale_ir3:
|
||||
param = IR3_DP_FS(gmem_frag_scale);
|
||||
break;
|
||||
case nir_intrinsic_load_gmem_frag_offset_ir3:
|
||||
param = IR3_DP_FS(gmem_frag_offset);
|
||||
break;
|
||||
default:
|
||||
UNREACHABLE("bad intrinsic");
|
||||
}
|
||||
|
||||
unsigned offset = param - IR3_DP_FS_DYNAMIC;
|
||||
unsigned base = param - IR3_DP_FS_DYNAMIC;
|
||||
|
||||
nir_def *view = instr->src[0].ssa;
|
||||
nir_def *offset = nir_imul_imm(b, view, 2);
|
||||
nir_def *result =
|
||||
ir3_load_driver_ubo_indirect(b, 2, &shader->const_state.fdm_ubo,
|
||||
offset, view, nir_intrinsic_range(instr));
|
||||
base, offset, nir_intrinsic_range(instr) * 2);
|
||||
|
||||
nir_def_replace(&instr->def, result);
|
||||
return true;
|
||||
|
|
@ -1147,6 +1164,7 @@ struct lower_fdm_options {
|
|||
unsigned num_views;
|
||||
bool adjust_fragcoord;
|
||||
bool use_layer;
|
||||
bool adjust_gmem_fragcoord;
|
||||
};
|
||||
|
||||
static bool
|
||||
|
|
@ -1211,7 +1229,22 @@ lower_fdm_instr(struct nir_builder *b, nir_instr *instr, void *data)
|
|||
}
|
||||
|
||||
if (intrin->intrinsic == nir_intrinsic_load_frag_coord_gmem_ir3) {
|
||||
return nir_load_frag_coord_unscaled_ir3(b);
|
||||
nir_def *unscaled_coord = nir_load_frag_coord_unscaled_ir3(b);
|
||||
|
||||
if (!options->adjust_gmem_fragcoord)
|
||||
return unscaled_coord;
|
||||
|
||||
nir_def *frag_offset =
|
||||
nir_load_gmem_frag_offset_ir3(b, view, .range = options->num_views);
|
||||
nir_def *frag_scale =
|
||||
nir_load_gmem_frag_scale_ir3(b, view, .range = options->num_views);
|
||||
nir_def *xy = nir_trim_vector(b, unscaled_coord, 2);
|
||||
xy = nir_fadd(b, nir_fmul(b, xy, frag_scale), frag_offset);
|
||||
return nir_vec4(b,
|
||||
nir_channel(b, xy, 0),
|
||||
nir_channel(b, xy, 1),
|
||||
nir_channel(b, unscaled_coord, 2),
|
||||
nir_channel(b, unscaled_coord, 3));
|
||||
}
|
||||
|
||||
assert(intrin->intrinsic == nir_intrinsic_load_frag_size);
|
||||
|
|
@ -2802,6 +2835,7 @@ tu_shader_create(struct tu_device *dev,
|
|||
key->max_fdm_layers, 1),
|
||||
.adjust_fragcoord = key->fragment_density_map,
|
||||
.use_layer = !key->multiview_mask,
|
||||
.adjust_gmem_fragcoord = key->fragment_density_map && key->custom_resolve,
|
||||
};
|
||||
NIR_PASS(_, nir, tu_nir_lower_fdm, &fdm_options);
|
||||
|
||||
|
|
|
|||
|
|
@ -128,6 +128,7 @@ struct tu_shader_key {
|
|||
bool robust_storage_access2;
|
||||
bool robust_uniform_access2;
|
||||
bool lower_view_index_to_device_index;
|
||||
bool custom_resolve;
|
||||
enum ir3_wavesize_option api_wavesize, real_wavesize;
|
||||
};
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue