diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index 7d5dc731d0d..ddec11ff5cd 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -763,6 +763,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) case nir_intrinsic_load_const_ir3: case nir_intrinsic_load_frag_size_ir3: case nir_intrinsic_load_frag_offset_ir3: + case nir_intrinsic_load_gmem_frag_scale_ir3: + case nir_intrinsic_load_gmem_frag_offset_ir3: case nir_intrinsic_bindless_resource_ir3: case nir_intrinsic_ray_intersection_ir3: case nir_intrinsic_load_attribute_payload_intel: diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 568d91763da..2b52eb985f4 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1517,6 +1517,11 @@ intrinsic("load_frag_size_ir3", src_comp=[1], dest_comp=2, indices=[RANGE], flags=[CAN_ELIMINATE, CAN_REORDER], bit_sizes=[32]) intrinsic("load_frag_offset_ir3", src_comp=[1], dest_comp=2, indices=[RANGE], flags=[CAN_ELIMINATE, CAN_REORDER], bit_sizes=[32]) +# Per-view GMEM FragCoord scale and offset. +intrinsic("load_gmem_frag_scale_ir3", src_comp=[1], dest_comp=2, indices=[RANGE], + flags=[CAN_ELIMINATE, CAN_REORDER], bit_sizes=[32]) +intrinsic("load_gmem_frag_offset_ir3", src_comp=[1], dest_comp=2, indices=[RANGE], + flags=[CAN_ELIMINATE, CAN_REORDER], bit_sizes=[32]) # IR3-specific load/store intrinsics. These access a buffer used to pass data # between geometry stages - perhaps it's explicit access to the vertex cache. diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index 1d559641431..1f0c109f6e1 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -3113,26 +3113,42 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) dst[0] = create_driver_param(ctx, IR3_DP_FS(frag_invocation_count)); break; case nir_intrinsic_load_frag_size_ir3: - case nir_intrinsic_load_frag_offset_ir3: { - unsigned param = - intr->intrinsic == nir_intrinsic_load_frag_size_ir3 ? - IR3_DP_FS(frag_size) : IR3_DP_FS(frag_offset); + case nir_intrinsic_load_frag_offset_ir3: + case nir_intrinsic_load_gmem_frag_scale_ir3: + case nir_intrinsic_load_gmem_frag_offset_ir3: { + unsigned param; + switch (intr->intrinsic) { + case nir_intrinsic_load_frag_size_ir3: + param = IR3_DP_FS(frag_size); + break; + case nir_intrinsic_load_frag_offset_ir3: + param = IR3_DP_FS(frag_offset); + break; + case nir_intrinsic_load_gmem_frag_scale_ir3: + param = IR3_DP_FS(gmem_frag_scale); + break; + case nir_intrinsic_load_gmem_frag_offset_ir3: + param = IR3_DP_FS(gmem_frag_offset); + break; + default: + UNREACHABLE("bad intrinsic"); + } if (nir_src_is_const(intr->src[0])) { uint32_t view = nir_src_as_uint(intr->src[0]); for (int i = 0; i < dest_components; i++) { - dst[i] = create_driver_param(ctx, param + 4 * view + i); + dst[i] = create_driver_param(ctx, param + 8 * view + i); } create_rpt = true; } else { struct ir3_instruction *view = ir3_get_src(ctx, &intr->src[0])[0]; for (int i = 0; i < dest_components; i++) { dst[i] = create_driver_param_indirect(ctx, param + i, - ir3_get_addr0(ctx, view, 4)); + ir3_get_addr0(ctx, view, 8)); } ctx->so->constlen = MAX2(ctx->so->constlen, const_state->allocs.consts[IR3_CONST_ALLOC_DRIVER_PARAMS].offset_vec4 + - param / 4 + nir_intrinsic_range(intr)); + param / 4 + nir_intrinsic_range(intr) * 2); } break; } diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index 15538e458cf..92bcc45dd51 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -1434,11 +1434,19 @@ ir3_get_driver_param_info(const nir_shader *shader, nir_intrinsic_instr *intr, break; case nir_intrinsic_load_frag_size_ir3: param_info->offset = IR3_DP_FS(frag_size); - param_info->extra_size = 4 * (nir_intrinsic_range(intr) - 1); + param_info->extra_size = 8 * (nir_intrinsic_range(intr) - 1); break; case nir_intrinsic_load_frag_offset_ir3: param_info->offset = IR3_DP_FS(frag_offset); - param_info->extra_size = 4 * (nir_intrinsic_range(intr) - 1); + param_info->extra_size = 8 * (nir_intrinsic_range(intr) - 1); + break; + case nir_intrinsic_load_gmem_frag_scale_ir3: + param_info->offset = IR3_DP_FS(gmem_frag_scale); + param_info->extra_size = 8 * (nir_intrinsic_range(intr) - 1); + break; + case nir_intrinsic_load_gmem_frag_offset_ir3: + param_info->offset = IR3_DP_FS(gmem_frag_offset); + param_info->extra_size = 8 * (nir_intrinsic_range(intr) - 1); break; case nir_intrinsic_load_frag_invocation_count: param_info->offset = IR3_DP_FS(frag_invocation_count); diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h index 81416f465b1..e70701dcba5 100644 --- a/src/freedreno/ir3/ir3_shader.h +++ b/src/freedreno/ir3/ir3_shader.h @@ -107,7 +107,11 @@ struct ir3_driver_params_fs { uint32_t frag_size; uint32_t __pad_09; uint32_t frag_offset; - uint32_t __pad_11_12[2]; + uint32_t __pad_11; + uint32_t gmem_frag_scale; + uint32_t __pad_13; + uint32_t gmem_frag_offset; + uint32_t __pad_15; }; #define IR3_DP_FS(name) dword_offsetof(struct ir3_driver_params_fs, name) diff --git a/src/freedreno/vulkan/tu_clear_blit.cc b/src/freedreno/vulkan/tu_clear_blit.cc index bed74ac6806..32955539cd8 100644 --- a/src/freedreno/vulkan/tu_clear_blit.cc +++ b/src/freedreno/vulkan/tu_clear_blit.cc @@ -4043,6 +4043,7 @@ struct apply_sysmem_clear_coords_state { unsigned layer; float z_clear_val; VkRect2D rect; + bool custom_resolve; }; static void @@ -4053,7 +4054,8 @@ fdm_apply_sysmem_clear_coords(struct tu_cmd_buffer *cmd, const VkOffset2D *hw_viewport_offsets, unsigned views, const VkExtent2D *frag_areas, - const VkRect2D *bins) + const VkRect2D *bins, + bool binning) { const struct apply_sysmem_clear_coords_state *state = (const struct apply_sysmem_clear_coords_state *)data; @@ -4064,9 +4066,15 @@ fdm_apply_sysmem_clear_coords(struct tu_cmd_buffer *cmd, hw_viewport_offsets[MIN2(state->view, views - 1)]; VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin, common_bin_offset); + offset.x -= hw_viewport_offset.x; offset.y -= hw_viewport_offset.y; + if (state->custom_resolve && !binning) { + offset = (VkOffset2D) {}; + frag_area = (VkExtent2D) { 1, 1 }; + } + unsigned x1 = state->rect.offset.x / frag_area.width + offset.x; unsigned x2 = DIV_ROUND_UP(state->rect.offset.x + state->rect.extent.width, frag_area.width) + offset.x; @@ -4251,6 +4259,7 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd, .layer = rects[i].baseArrayLayer + layer, .z_clear_val = z_clear_val, .rect = rects[i].rect, + .custom_resolve = subpass->custom_resolve, }; tu_create_fdm_bin_patchpoint(cmd, cs, 4, TU_FDM_NONE, fdm_apply_sysmem_clear_coords, @@ -4323,6 +4332,7 @@ clear_gmem_attachment(struct tu_cmd_buffer *cmd, struct apply_gmem_clear_coords_state { unsigned view; VkRect2D rect; + bool custom_resolve; }; static void @@ -4333,7 +4343,8 @@ fdm_apply_gmem_clear_coords(struct tu_cmd_buffer *cmd, const VkOffset2D *hw_viewport_offsets, unsigned views, const VkExtent2D *frag_areas, - const VkRect2D *bins) + const VkRect2D *bins, + bool binning) { const struct apply_gmem_clear_coords_state *state = (const struct apply_gmem_clear_coords_state *)data; @@ -4343,6 +4354,11 @@ fdm_apply_gmem_clear_coords(struct tu_cmd_buffer *cmd, VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin, common_bin_offset); + if (state->custom_resolve) { + offset = (VkOffset2D) {}; + frag_area = (VkExtent2D) { 1, 1 }; + } + unsigned x1 = state->rect.offset.x / frag_area.width + offset.x; unsigned x2 = DIV_ROUND_UP(state->rect.offset.x + state->rect.extent.width, frag_area.width) + offset.x - 1; @@ -4491,6 +4507,7 @@ tu_clear_attachments(struct tu_cmd_buffer *cmd, const VkClearRect *pRects) { struct tu_cs *cs = &cmd->draw_cs; + const struct tu_subpass *subpass = cmd->state.subpass; /* sysmem path behaves like a draw, note we don't have a way of using different * flushes for sysmem/gmem, so this needs to be outside of the cond_exec @@ -4504,8 +4521,11 @@ tu_clear_attachments(struct tu_cmd_buffer *cmd, * * Similarly, we also use the 3D path when in a secondary command buffer that * doesn't know the GMEM layout that will be chosen by the primary. + * + * Don't use the GMEM path if we are in a custom resolve. */ - if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT) { + if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT || + subpass->custom_resolve) { tu_clear_sysmem_attachments(cmd, attachmentCount, pAttachments, rectCount, pRects); return; } @@ -4514,7 +4534,6 @@ tu_clear_attachments(struct tu_cmd_buffer *cmd, * binning time, then emit the clear as a 3D draw so that it contributes to * that visibility. */ - const struct tu_subpass *subpass = cmd->state.subpass; for (uint32_t i = 0; i < attachmentCount; i++) { uint32_t a; if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { @@ -4561,6 +4580,7 @@ tu7_clear_attachment_generic_single_rect( struct apply_gmem_clear_coords_state state = { .view = 0, .rect = rect->rect, + .custom_resolve = subpass->custom_resolve, }; tu_create_fdm_bin_patchpoint(cmd, cs, 3, TU_FDM_SKIP_BINNING, fdm_apply_gmem_clear_coords, state); @@ -4589,6 +4609,7 @@ tu7_clear_attachment_generic_single_rect( struct apply_gmem_clear_coords_state state = { .view = layer, .rect = rect->rect, + .custom_resolve = subpass->custom_resolve, }; tu_create_fdm_bin_patchpoint(cmd, cs, 3, TU_FDM_SKIP_BINNING, fdm_apply_gmem_clear_coords, state); @@ -4970,7 +4991,8 @@ fdm_apply_load_coords(struct tu_cmd_buffer *cmd, const VkOffset2D *hw_viewport_offsets, unsigned views, const VkExtent2D *frag_areas, - const VkRect2D *bins) + const VkRect2D *bins, + bool binning) { const struct apply_load_coords_state *state = (const struct apply_load_coords_state *)data; @@ -5435,6 +5457,8 @@ tu_choose_gmem_layout(struct tu_cmd_buffer *cmd) subpass->color_attachments[j].attachment; if (tu_attachment_store_mismatched_mutability(cmd, a, gmem_a)) cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU; + if (subpass->custom_resolve) + cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU; } } @@ -5454,7 +5478,8 @@ fdm_apply_store_coords(struct tu_cmd_buffer *cmd, const VkOffset2D *hw_viewport_offsets, unsigned views, const VkExtent2D *frag_areas, - const VkRect2D *bins) + const VkRect2D *bins, + bool binning) { const struct apply_store_coords_state *state = (const struct apply_store_coords_state *)data; diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index e331c5966e2..b61e105ff86 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -1412,6 +1412,64 @@ tu_fdm_num_layers(const struct tu_cmd_buffer *cmd) (cmd->state.fdm_per_layer ? cmd->state.framebuffer->layers : 1); } +template +static void +tu6_emit_bin_size_gmem(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + enum a6xx_buffers_location buffers_location, + bool disable_lrz) +{ + struct tu_physical_device *phys_dev = cmd->device->physical_device; + const struct tu_tiling_config *tiling = cmd->state.tiling; + bool hw_binning = use_hw_binning(cmd); + + tu6_emit_bin_size( + cs, buffers_location == BUFFERS_IN_GMEM ? tiling->tile0.width : 0, + buffers_location == BUFFERS_IN_GMEM ? tiling->tile0.height : 0, + { + .render_mode = RENDERING_PASS, + .force_lrz_write_dis = !phys_dev->info->props.has_lrz_feedback, + .buffers_location = buffers_location, + .lrz_feedback_zmode_mask = + phys_dev->info->props.has_lrz_feedback + ? (hw_binning ? LRZ_FEEDBACK_EARLY_Z_OR_EARLY_Z_LATE_Z : + LRZ_FEEDBACK_EARLY_Z_LATE_Z) + : LRZ_FEEDBACK_NONE, + .force_lrz_dis = CHIP >= A7XX && disable_lrz, + }); + +} + +/* Set always-identical registers used specifically for GMEM */ +template +static void +tu7_emit_tile_render_begin_regs(struct tu_cs *cs) +{ + tu_cs_emit_regs(cs, RB_BUFFER_CNTL(CHIP, 0x0)); + tu_cs_emit_regs(cs, RB_CLEAR_TARGET(CHIP, .clear_mode = CLEAR_MODE_GMEM)); +} + +/* Set always-identical registers used specifically for sysmem */ +template +static void +tu7_emit_sysmem_render_begin_regs(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + tu_cs_emit_regs(cs, RB_BUFFER_CNTL(CHIP, + .z_sysmem = true, + .s_sysmem = true, + .rt0_sysmem = true, + .rt1_sysmem = true, + .rt2_sysmem = true, + .rt3_sysmem = true, + .rt4_sysmem = true, + .rt5_sysmem = true, + .rt6_sysmem = true, + .rt7_sysmem = true, + )); + + tu_cs_emit_regs(cs, RB_CLEAR_TARGET(CHIP, .clear_mode = CLEAR_MODE_SYSMEM)); +} + template static void tu6_emit_tile_select(struct tu_cmd_buffer *cmd, @@ -1419,7 +1477,6 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd, const struct tu_tile_config *tile, bool fdm, const VkOffset2D *fdm_offsets) { - struct tu_physical_device *phys_dev = cmd->device->physical_device; const struct tu_tiling_config *tiling = cmd->state.tiling; const struct tu_framebuffer *fb = cmd->state.framebuffer; const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling); @@ -1470,19 +1527,15 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd, if (fdm_offsets && (tile->pos.x == 0 || tile->pos.y == 0)) disable_lrz = true; - tu6_emit_bin_size( - cs, tiling->tile0.width, tiling->tile0.height, - { - .render_mode = RENDERING_PASS, - .force_lrz_write_dis = !phys_dev->info->props.has_lrz_feedback, - .buffers_location = BUFFERS_IN_GMEM, - .lrz_feedback_zmode_mask = - phys_dev->info->props.has_lrz_feedback && !bin_is_scaled - ? (hw_binning ? LRZ_FEEDBACK_EARLY_Z_OR_EARLY_Z_LATE_Z : - LRZ_FEEDBACK_EARLY_Z_LATE_Z) - : LRZ_FEEDBACK_NONE, - .force_lrz_dis = CHIP >= A7XX && disable_lrz, - }); + /* When using custom resolve we need to re-emit these regs as they are + * overwritten when switching to sysmem. + */ + if (CHIP >= A7XX && + cmd->state.pass->subpasses[cmd->state.pass->subpass_count - 1].custom_resolve) { + tu7_emit_tile_render_begin_regs(cs); + } + + tu6_emit_bin_size_gmem(cmd, cs, BUFFERS_IN_GMEM, disable_lrz); tu_cs_emit_regs(cs, A6XX_VFD_RENDER_MODE(RENDERING_PASS)); @@ -1634,7 +1687,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd, tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size); tu_cs_emit_qw(cs, patch->iova); patch->apply(cmd, cs, patch->data, (VkOffset2D) { x1, y1 }, - frag_offsets, views, tile->frag_areas, bins); + frag_offsets, views, tile->frag_areas, bins, false); } /* Make the CP wait until the CP_MEM_WRITE's to the command buffers @@ -2111,15 +2164,6 @@ tu6_init_static_regs(struct tu_device *dev, struct tu_cs *cs) tu_cs_emit(cs, A6XX_SP_VS_CONST_CONFIG_CONSTLEN(8) | A6XX_SP_VS_CONST_CONFIG_ENABLED); } -/* Set always-identical registers used specifically for GMEM */ -template -static void -tu7_emit_tile_render_begin_regs(struct tu_cs *cs) -{ - tu_cs_emit_regs(cs, RB_BUFFER_CNTL(CHIP, 0x0)); - tu_cs_emit_regs(cs, RB_CLEAR_TARGET(CHIP, .clear_mode = CLEAR_MODE_GMEM)); -} - /* Emit the bin restore preamble, which runs in between bins when L1 * preemption with skipsaverestore happens and we switch back to this context. * We need to restore static registers normally programmed at cmdbuf start @@ -2435,7 +2479,7 @@ tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs, tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size); tu_cs_emit_qw(cs, patch->iova); patch->apply(cmd, cs, patch->data, (VkOffset2D) {0, 0}, frag_offsets, - num_views, unscaled_frag_areas, bins); + num_views, unscaled_frag_areas, bins, true); } tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); @@ -2532,6 +2576,8 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd, bool gmem) { const struct tu_tiling_config *tiling = cmd->state.tiling; + uint32_t layers = MAX2(cmd->state.framebuffer->layers, + cmd->state.pass->num_views); /* note: we can probably emit input attachments just once for the whole * renderpass, this would avoid emitting both sysmem/gmem versions @@ -2621,7 +2667,11 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd, gmem_offset = att->gmem_offset_stencil[cmd->state.gmem_layout]; } - if (!gmem || !subpass->input_attachments[i / 2].patch_input_gmem) { + if (!gmem || !subpass->input_attachments[i / 2].patch_input_gmem || + /* Skip GMEM patching when tiling is impossible as we may get + * assertion failures from register packing below. + */ + !tiling->possible) { memcpy(&texture.map[i * A6XX_TEX_CONST_DWORDS], dst, sizeof(dst)); continue; } @@ -2647,10 +2697,17 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd, dst[2] = A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) | A6XX_TEX_CONST_2_PITCH(tiling->tile0.width * cpp); - /* Note: it seems the HW implicitly calculates the array pitch with the - * GMEM tiling, so we don't need to specify the pitch ourselves. + /* Note: it seems the HW implicitly calculates the array pitch, except + * when rendering to sysmem (i.e. in a custom resolve subpass). We only + * guarantee the pitch is valid when there is more than 1 layer, so skip + * emitting it otherwise to avoid asserts. */ - dst[3] = 0; + if (layers > 1) { + dst[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(tiling->tile0.width * + tiling->tile0.height * cpp); + } else { + dst[3] = 0; + } dst[4] = cmd->device->physical_device->gmem_base + gmem_offset; dst[5] &= A6XX_TEX_CONST_5_DEPTH__MASK; for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++) @@ -2985,20 +3042,7 @@ tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, }); if (CHIP == A7XX) { - tu_cs_emit_regs(cs, RB_BUFFER_CNTL(CHIP, - .z_sysmem = true, - .s_sysmem = true, - .rt0_sysmem = true, - .rt1_sysmem = true, - .rt2_sysmem = true, - .rt3_sysmem = true, - .rt4_sysmem = true, - .rt5_sysmem = true, - .rt6_sysmem = true, - .rt7_sysmem = true, - )); - - tu_cs_emit_regs(cs, RB_CLEAR_TARGET(CHIP, .clear_mode = CLEAR_MODE_SYSMEM)); + tu7_emit_sysmem_render_begin_regs(cmd, cs); } tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); @@ -6400,9 +6444,14 @@ static void tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *resolve_group) { struct tu_cs *cs = &cmd->draw_cs; - uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses; + const struct tu_subpass *subpass = cmd->state.subpass; + uint32_t subpass_idx = subpass - cmd->state.pass->subpasses; const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling); + /* Shader resolve subpasses don't use GMEM */ + if (subpass->custom_resolve) + return; + /* If we might choose to bin, then put the loads under a check for geometry * having been binned to this tile. If we don't choose to bin in the end, * then we will have manually set those registers to say geometry is present. @@ -6496,9 +6545,11 @@ tu_emit_subpass_begin_sysmem(struct tu_cmd_buffer *cmd) return; struct tu_cs *cs = &cmd->draw_cs; - uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses; + const struct tu_subpass *subpass = cmd->state.subpass; + uint32_t subpass_idx = subpass - cmd->state.pass->subpasses; - tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM); + if (!subpass->custom_resolve) + tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM); tu6_emit_sysmem_unresolves(cmd, cs, cmd->state.subpass); @@ -6508,7 +6559,8 @@ tu_emit_subpass_begin_sysmem(struct tu_cmd_buffer *cmd) tu_clear_sysmem_attachment(cmd, cs, i); } - tu_cond_exec_end(cs); /* sysmem */ + if (!subpass->custom_resolve) + tu_cond_exec_end(cs); /* sysmem */ } static void @@ -6584,6 +6636,84 @@ tu7_emit_subpass_shading_rate(struct tu_cmd_buffer *cmd, cmd->prev_fsr_is_null = false; } +/* If this is a shader resolve subpass, switch to writing to sysmem. + */ +template +static void +tu_emit_subpass_custom_resolve(struct tu_cmd_buffer *cmd) +{ + struct tu_cs *cs = &cmd->draw_cs; + const struct tu_subpass *subpass = cmd->state.subpass; + const struct tu_framebuffer *fb = cmd->state.framebuffer; + const struct tu_tiling_config *tiling = cmd->state.tiling; + + if (!subpass->custom_resolve) + return; + + /* Since a7xx, buffer location can be controlled per-buffer. We also have + * to update the steering register so that generic clears use sysmem. + */ + if (CHIP >= A7XX) { + tu7_emit_sysmem_render_begin_regs(cmd, cs); + + /* Disable foveation offset here. It's not necessary for custom resolve. + */ + tu_cs_emit_regs(cs, GRAS_BIN_FOVEAT(CHIP)); + tu_cs_emit_regs(cs, RB_BIN_FOVEAT(CHIP)); + } else { + tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM); + + /* On a6xx the location is set in *_BIN_CONTROL */ + tu6_emit_bin_size_gmem(cmd, cs, BUFFERS_IN_SYSMEM, false); + + tu_cond_exec_end(cs); + } + + /* With FDM and non-subsampled images, we switch from rendering space to + * framebuffer space in the custom resolve subpass when not in the binning + * pass because we are writing directly to the user-visible attachment. We + * already aren't relying on the window scissor whenever FDM is enabled, + * but it can get in the way if FDM offset is being used because it is + * specified in rendering space, so the origin is shifted to the right and + * down compared to the framebuffer-space bin coordinates and part of the + * bin gets incorrectly clipped. Just disable it here by setting it to the + * entire framebuffer. Add an extra tile size for when we are in the + * binning pass and still using rendering space. + */ + if (tu_enable_fdm_offset(cmd)) { + tu6_emit_window_scissor(cs, 0, 0, + fb->width + tiling->tile0.width - 1, + fb->height + tiling->tile0.height - 1); + } + + /* If FDM is enabled, we need to re-emit all FDM-related state. */ + if (cmd->state.pass->fragment_density_map.attachment != + VK_ATTACHMENT_UNUSED) { + cmd->state.dirty |= TU_CMD_DIRTY_FDM; + } +} + +/* If the last subpass is a shader resolve pass, emit flushes after switching + * to sysmem, similar to fixed-function 3D resolves. Our flushing code assumes + * that when in GMEM mode CCU isn't in use so we have to flush it ourselves. + */ +template +static void +tu_emit_custom_resolve_end(struct tu_cmd_buffer *cmd) +{ + struct tu_cs *cs = &cmd->draw_cs; + + const struct tu_subpass *subpass = cmd->state.subpass; + + if (!subpass->custom_resolve) + return; + + if (subpass->color_count) + tu_emit_event_write(cmd, cs, FD_CCU_CLEAN_COLOR); + if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) + tu_emit_event_write(cmd, cs, FD_CCU_CLEAN_DEPTH); +} + /* emit loads, clears, and mrt/zs/msaa/ubwc state for the subpass that is * starting (either at vkCmdBeginRenderPass2() or vkCmdNextSubpass2()) * @@ -6599,6 +6729,7 @@ tu_emit_subpass_begin(struct tu_cmd_buffer *cmd) struct tu_resolve_group resolve_group = {}; + tu_emit_subpass_custom_resolve(cmd); tu_emit_subpass_begin_gmem(cmd, &resolve_group); tu_emit_subpass_begin_sysmem(cmd); if (cmd->device->physical_device->info->props.has_generic_clear) { @@ -7582,6 +7713,7 @@ fs_params_size(struct tu_cmd_buffer *cmd) struct apply_fs_params_state { unsigned num_consts; + bool custom_resolve; }; static void @@ -7592,13 +7724,14 @@ fdm_apply_fs_params(struct tu_cmd_buffer *cmd, const VkOffset2D *hw_viewport_offsets, unsigned views, const VkExtent2D *frag_areas, - const VkRect2D *bins) + const VkRect2D *bins, + bool binning) { const struct apply_fs_params_state *state = (const struct apply_fs_params_state *)data; unsigned num_consts = state->num_consts; - for (unsigned i = 0; i < num_consts; i++) { + for (unsigned i = 0; i < DIV_ROUND_UP(num_consts, 2); i++) { /* FDM per layer may be enabled in the shader but not in the renderpass, * in which case views will be 1 and we have to replicate the one view * to all of the layers. @@ -7607,10 +7740,38 @@ fdm_apply_fs_params(struct tu_cmd_buffer *cmd, VkRect2D bin = bins[MIN2(i, views - 1)]; VkOffset2D offset = tu_fdm_per_bin_offset(area, bin, common_bin_offset); - tu_cs_emit(cs, area.width); - tu_cs_emit(cs, area.height); - tu_cs_emit(cs, fui(offset.x)); - tu_cs_emit(cs, fui(offset.y)); + /* For custom resolve, we switch to rendering directly to sysmem and so + * the fragment size becomes 1x1. This means we have to scale down + * FragCoord when accessing GMEM input attachments. + * + * TODO: When we support subsampled images, this should also only happen + * for non-subsampled images. + */ + if (state->custom_resolve) { + tu_cs_emit(cs, 1 /* width */); + tu_cs_emit(cs, 1 /* height */); + tu_cs_emit(cs, fui(0.0)); + tu_cs_emit(cs, fui(0.0)); + } else { + tu_cs_emit(cs, area.width); + tu_cs_emit(cs, area.height); + tu_cs_emit(cs, fui(offset.x)); + tu_cs_emit(cs, fui(offset.y)); + } + + if (i * 2 + 1 < num_consts) { + if (state->custom_resolve) { + tu_cs_emit(cs, fui(1. / area.width)); + tu_cs_emit(cs, fui(1. / area.height)); + tu_cs_emit(cs, fui(offset.x)); + tu_cs_emit(cs, fui(offset.y)); + } else { + tu_cs_emit(cs, fui(1.0)); + tu_cs_emit(cs, fui(1.0)); + tu_cs_emit(cs, fui(0.0)); + tu_cs_emit(cs, fui(0.0)); + } + } } } @@ -7632,16 +7793,23 @@ tu_emit_fdm_params(struct tu_cmd_buffer *cmd, if (fs->fs.has_fdm) { struct apply_fs_params_state state = { .num_consts = num_units - 1, + .custom_resolve = cmd->state.subpass->custom_resolve, }; tu_create_fdm_bin_patchpoint(cmd, cs, 4 * (num_units - 1), TU_FDM_SKIP_BINNING, fdm_apply_fs_params, state); } else { - for (unsigned i = 1; i < num_units; i++) { + for (unsigned i = 0; i < DIV_ROUND_UP((num_units - 1), 2); i++) { tu_cs_emit(cs, 1); tu_cs_emit(cs, 1); tu_cs_emit(cs, fui(0.0f)); tu_cs_emit(cs, fui(0.0f)); + if (i * 2 + 1 < num_units - 1) { + tu_cs_emit(cs, fui(1.0)); + tu_cs_emit(cs, fui(1.0)); + tu_cs_emit(cs, fui(0.0)); + tu_cs_emit(cs, fui(0.0)); + } } } } @@ -9174,6 +9342,8 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer, fdm_offsets = test_offsets; } + TU_CALLX(cmd_buffer->device, tu_emit_custom_resolve_end)(cmd_buffer); + tu_cs_end(&cmd_buffer->draw_cs); tu_cs_end(&cmd_buffer->draw_epilogue_cs); TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer, fdm_offsets); @@ -9202,6 +9372,8 @@ tu_CmdEndRendering2EXT(VkCommandBuffer commandBuffer, */ TU_CALLX(cmd_buffer->device, tu_lrz_flush_valid_during_renderpass) (cmd_buffer, &cmd_buffer->draw_cs); + } else { + TU_CALLX(cmd_buffer->device, tu_emit_custom_resolve_end)(cmd_buffer); } const VkRenderPassFragmentDensityMapOffsetEndInfoEXT *fdm_offset_info = diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h index a6814a70676..f83948a74b6 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.h +++ b/src/freedreno/vulkan/tu_cmd_buffer.h @@ -858,7 +858,8 @@ typedef void (*tu_fdm_bin_apply_t)(struct tu_cmd_buffer *cmd, const VkOffset2D *hw_viewport_offsets, unsigned views, const VkExtent2D *frag_areas, - const VkRect2D *bins); + const VkRect2D *bins, + bool binning); enum tu_fdm_flags { TU_FDM_NONE = 0, @@ -926,7 +927,7 @@ _tu_create_fdm_bin_patchpoint(struct tu_cmd_buffer *cmd, }; hw_viewport_offsets[i] = (VkOffset2D) { 0, 0 }; } - apply(cmd, cs, state, (VkOffset2D) {0, 0}, hw_viewport_offsets, num_views, unscaled_frag_areas, bins); + apply(cmd, cs, state, (VkOffset2D) {0, 0}, hw_viewport_offsets, num_views, unscaled_frag_areas, bins, false); assert(tu_cs_get_cur_iova(cs) == patch.iova + patch.size * sizeof(uint32_t)); util_dynarray_append(&cmd->fdm_bin_patchpoints, patch); diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index a67478d0dfe..036088a8fdb 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -350,6 +350,7 @@ get_device_extensions(const struct tu_physical_device *device, .IMG_filter_cubic = device->info->props.has_tex_filter_cubic, .NV_compute_shader_derivatives = device->info->chip >= 7, .QCOM_fragment_density_map_offset = true, + .QCOM_render_pass_shader_resolve = true, .VALVE_fragment_density_map_layered = true, .VALVE_mutable_descriptor_type = true, } }; diff --git a/src/freedreno/vulkan/tu_pass.cc b/src/freedreno/vulkan/tu_pass.cc index 2e7d455e7d2..05fe8e74f26 100644 --- a/src/freedreno/vulkan/tu_pass.cc +++ b/src/freedreno/vulkan/tu_pass.cc @@ -969,7 +969,8 @@ tu_subpass_use_attachment(struct tu_render_pass *pass, int i, uint32_t a, const struct tu_subpass *subpass = &pass->subpasses[i]; struct tu_render_pass_attachment *att = &pass->attachments[a]; - att->gmem = true; + if (!subpass->custom_resolve) + att->gmem = true; update_samples(subpass, att->samples); att->used_views |= subpass->multiview_mask; @@ -1182,6 +1183,8 @@ tu_CreateRenderPass2(VkDevice _device, subpass->srgb_cntl = 0; subpass->legacy_dithering_enabled = desc->flags & VK_SUBPASS_DESCRIPTION_ENABLE_LEGACY_DITHERING_BIT_EXT; + subpass->custom_resolve = desc->flags & + VK_SUBPASS_DESCRIPTION_SHADER_RESOLVE_BIT_QCOM; const BITMASK_ENUM(VkSubpassDescriptionFlagBits) raster_order_access_bits = VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_COLOR_ACCESS_BIT_EXT | diff --git a/src/freedreno/vulkan/tu_pass.h b/src/freedreno/vulkan/tu_pass.h index cca5cda4abe..da92babc657 100644 --- a/src/freedreno/vulkan/tu_pass.h +++ b/src/freedreno/vulkan/tu_pass.h @@ -82,6 +82,8 @@ struct tu_subpass bool depth_used; bool stencil_used; + bool custom_resolve; + VkSampleCountFlagBits samples; uint32_t srgb_cntl; diff --git a/src/freedreno/vulkan/tu_pipeline.cc b/src/freedreno/vulkan/tu_pipeline.cc index 82f46d1968b..2c43ad9bd92 100644 --- a/src/freedreno/vulkan/tu_pipeline.cc +++ b/src/freedreno/vulkan/tu_pipeline.cc @@ -1773,6 +1773,12 @@ tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder, ~attachments_referenced; } + if (builder->state & + VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) { + keys[MESA_SHADER_FRAGMENT].custom_resolve = + builder->graphics_state.rp->custom_resolve; + } + if (builder->create_flags & VK_PIPELINE_CREATE_2_LINK_TIME_OPTIMIZATION_BIT_EXT) { for (unsigned i = 0; i < builder->num_libraries; i++) { @@ -2578,6 +2584,7 @@ struct apply_viewport_state { bool share_scale; /* See tu_pipeline::fake_single_viewport */ bool fake_single_viewport; + bool custom_resolve; }; /* It's a hardware restriction that the window offset (i.e. common_bin_offset) @@ -2624,7 +2631,8 @@ fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data, VkOffset2D common_bin_offset, const VkOffset2D *hw_viewport_offsets, unsigned views, - const VkExtent2D *frag_areas, const VkRect2D *bins) + const VkExtent2D *frag_areas, const VkRect2D *bins, + bool binning) { const struct apply_viewport_state *state = (const struct apply_viewport_state *)data; @@ -2653,9 +2661,16 @@ fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data, */ VkViewport viewport = state->fake_single_viewport ? state->vp.viewports[0] : state->vp.viewports[i]; - if (frag_area.width == 1 && frag_area.height == 1 && - common_bin_offset.x == bin.offset.x && - common_bin_offset.y == bin.offset.y) { + if ((frag_area.width == 1 && frag_area.height == 1 && + common_bin_offset.x == bin.offset.x && + common_bin_offset.y == bin.offset.y) || + /* When in a custom resolve operation (TODO: and using + * non-subsampled images) we switch to framebuffer coordinates so we + * shouldn't apply the transform. However the binning pass isn't + * aware of this, so we have to keep applying the transform for + * binning. + */ + (state->custom_resolve && !binning)) { vp.viewports[i] = viewport; continue; } @@ -2692,6 +2707,7 @@ tu6_emit_viewport_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd, .share_scale = !cmd->state.per_view_viewport && !cmd->state.per_layer_viewport, .fake_single_viewport = cmd->state.fake_single_viewport, + .custom_resolve = cmd->state.subpass->custom_resolve, }; if (cmd->state.per_view_viewport) state.vp.viewport_count = num_views; @@ -2753,7 +2769,8 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data, VkOffset2D common_bin_offset, const VkOffset2D *hw_viewport_offsets, unsigned views, - const VkExtent2D *frag_areas, const VkRect2D *bins) + const VkExtent2D *frag_areas, const VkRect2D *bins, + bool binning) { const struct apply_viewport_state *state = (const struct apply_viewport_state *)data; @@ -2781,6 +2798,19 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data, common_bin_offset); offset.x -= hw_viewport_offset.x; offset.y -= hw_viewport_offset.y; + + /* Disable scaling and offset when doing a custom resolve to a + * non-subsampled image and not in the binning pass, because we + * use framebuffer coordinates. + * + * TODO: When we support subsampled images, only do this for + * non-subsampled images. + */ + if (state->custom_resolve && !binning) { + offset = (VkOffset2D) {}; + frag_area = (VkExtent2D) {1, 1}; + } + VkOffset2D min = { scissor.offset.x / frag_area.width + offset.x, scissor.offset.y / frag_area.width + offset.y, @@ -2791,12 +2821,20 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data, }; /* Intersect scissor with the scaled bin, this essentially replaces the - * window scissor. + * window scissor. With custom resolve (TODO: and non-subsampled images) + * we have to use the unscaled bin instead. */ uint32_t scaled_width = bin.extent.width / frag_area.width; uint32_t scaled_height = bin.extent.height / frag_area.height; - uint32_t bin_x = common_bin_offset.x - hw_viewport_offset.x; - uint32_t bin_y = common_bin_offset.y - hw_viewport_offset.y; + int32_t bin_x; + int32_t bin_y; + if (state->custom_resolve && !binning) { + bin_x = bin.offset.x; + bin_y = bin.offset.y; + } else { + bin_x = common_bin_offset.x - hw_viewport_offset.x; + bin_y = common_bin_offset.y - hw_viewport_offset.y; + } vp.scissors[i].offset.x = MAX2(min.x, bin_x); vp.scissors[i].offset.y = MAX2(min.y, bin_y); vp.scissors[i].extent.width = @@ -2818,6 +2856,7 @@ tu6_emit_scissor_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd, .share_scale = !cmd->state.per_view_viewport && !cmd->state.per_layer_viewport, .fake_single_viewport = cmd->state.fake_single_viewport, + .custom_resolve = cmd->state.subpass->custom_resolve, }; if (cmd->state.per_view_viewport) state.vp.scissor_count = num_views; @@ -4426,6 +4465,8 @@ tu_fill_render_pass_state(struct vk_render_pass_state *rp, rp->color_attachment_formats[i] = pass->attachments[a].format; rp->attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(i); } + + rp->custom_resolve = subpass->custom_resolve; } static void diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc index eb8fc560328..e431f6ae12b 100644 --- a/src/freedreno/vulkan/tu_shader.cc +++ b/src/freedreno/vulkan/tu_shader.cc @@ -643,20 +643,37 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr, return true; case nir_intrinsic_load_frag_size_ir3: - case nir_intrinsic_load_frag_offset_ir3: { + case nir_intrinsic_load_frag_offset_ir3: + case nir_intrinsic_load_gmem_frag_scale_ir3: + case nir_intrinsic_load_gmem_frag_offset_ir3: { if (!dev->compiler->load_shader_consts_via_preamble) return false; - unsigned param = - instr->intrinsic == nir_intrinsic_load_frag_size_ir3 ? - IR3_DP_FS(frag_size) : IR3_DP_FS(frag_offset); + unsigned param; + switch (instr->intrinsic) { + case nir_intrinsic_load_frag_size_ir3: + param = IR3_DP_FS(frag_size); + break; + case nir_intrinsic_load_frag_offset_ir3: + param = IR3_DP_FS(frag_offset); + break; + case nir_intrinsic_load_gmem_frag_scale_ir3: + param = IR3_DP_FS(gmem_frag_scale); + break; + case nir_intrinsic_load_gmem_frag_offset_ir3: + param = IR3_DP_FS(gmem_frag_offset); + break; + default: + UNREACHABLE("bad intrinsic"); + } - unsigned offset = param - IR3_DP_FS_DYNAMIC; + unsigned base = param - IR3_DP_FS_DYNAMIC; nir_def *view = instr->src[0].ssa; + nir_def *offset = nir_imul_imm(b, view, 2); nir_def *result = ir3_load_driver_ubo_indirect(b, 2, &shader->const_state.fdm_ubo, - offset, view, nir_intrinsic_range(instr)); + base, offset, nir_intrinsic_range(instr) * 2); nir_def_replace(&instr->def, result); return true; @@ -1147,6 +1164,7 @@ struct lower_fdm_options { unsigned num_views; bool adjust_fragcoord; bool use_layer; + bool adjust_gmem_fragcoord; }; static bool @@ -1211,7 +1229,22 @@ lower_fdm_instr(struct nir_builder *b, nir_instr *instr, void *data) } if (intrin->intrinsic == nir_intrinsic_load_frag_coord_gmem_ir3) { - return nir_load_frag_coord_unscaled_ir3(b); + nir_def *unscaled_coord = nir_load_frag_coord_unscaled_ir3(b); + + if (!options->adjust_gmem_fragcoord) + return unscaled_coord; + + nir_def *frag_offset = + nir_load_gmem_frag_offset_ir3(b, view, .range = options->num_views); + nir_def *frag_scale = + nir_load_gmem_frag_scale_ir3(b, view, .range = options->num_views); + nir_def *xy = nir_trim_vector(b, unscaled_coord, 2); + xy = nir_fadd(b, nir_fmul(b, xy, frag_scale), frag_offset); + return nir_vec4(b, + nir_channel(b, xy, 0), + nir_channel(b, xy, 1), + nir_channel(b, unscaled_coord, 2), + nir_channel(b, unscaled_coord, 3)); } assert(intrin->intrinsic == nir_intrinsic_load_frag_size); @@ -2802,6 +2835,7 @@ tu_shader_create(struct tu_device *dev, key->max_fdm_layers, 1), .adjust_fragcoord = key->fragment_density_map, .use_layer = !key->multiview_mask, + .adjust_gmem_fragcoord = key->fragment_density_map && key->custom_resolve, }; NIR_PASS(_, nir, tu_nir_lower_fdm, &fdm_options); diff --git a/src/freedreno/vulkan/tu_shader.h b/src/freedreno/vulkan/tu_shader.h index 61887585b02..8bad650d960 100644 --- a/src/freedreno/vulkan/tu_shader.h +++ b/src/freedreno/vulkan/tu_shader.h @@ -128,6 +128,7 @@ struct tu_shader_key { bool robust_storage_access2; bool robust_uniform_access2; bool lower_view_index_to_device_index; + bool custom_resolve; enum ir3_wavesize_option api_wavesize, real_wavesize; };