diff --git a/docs/features.txt b/docs/features.txt
index 4500c5b078e..796a09a2ca7 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -694,6 +694,7 @@ Khronos extensions that are not part of any Vulkan version:
   VK_EXT_map_memory_placed                              DONE (anv, nvk, radv, tu)
   VK_MESA_image_alignment_control                       DONE (anv, nvk, radv)
   VK_EXT_legacy_dithering                               DONE (anv, tu, vn)
+  VK_QCOM_fragment_density_map_offset                   DONE (tu)

Clover OpenCL 1.0 -- all DONE:

diff --git a/src/freedreno/vulkan/tu_clear_blit.cc b/src/freedreno/vulkan/tu_clear_blit.cc
index a3c94f570aa..1222a4d7233 100644
--- a/src/freedreno/vulkan/tu_clear_blit.cc
+++ b/src/freedreno/vulkan/tu_clear_blit.cc
@@ -1363,6 +1363,22 @@ r3d_src_gmem(struct tu_cmd_buffer *cmd,
    if (!iview->view.is_mutable)
       desc[0] &= ~A6XX_TEX_CONST_0_SWAP__MASK;
    desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
+
+   /* If FDM offset is used, the last row and column extend beyond the
+    * framebuffer but are shifted over when storing. Expand the width and
+    * height to account for that.
+    */
+   if (tu_enable_fdm_offset(cmd)) {
+      uint32_t width = desc[1] & A6XX_TEX_CONST_1_WIDTH__MASK;
+      uint32_t height = (desc[1] & A6XX_TEX_CONST_1_HEIGHT__MASK) >>
+                        A6XX_TEX_CONST_1_HEIGHT__SHIFT;
+      width += cmd->state.tiling->tile0.width;
+      height += cmd->state.tiling->tile0.height;
+      desc[1] = (desc[1] & ~(A6XX_TEX_CONST_1_WIDTH__MASK |
+                             A6XX_TEX_CONST_1_HEIGHT__MASK)) |
+                A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
+   }
+
    desc[2] =
       A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
       A6XX_TEX_CONST_2_PITCH(cmd->state.tiling->tile0.width * cpp);
@@ -3910,17 +3926,19 @@ static void
 fdm_apply_sysmem_clear_coords(struct tu_cmd_buffer *cmd,
                               struct tu_cs *cs,
                               void *data,
-                              VkRect2D bin,
+                              VkOffset2D common_bin_offset,
                               unsigned views,
-                              const VkExtent2D *frag_areas)
+                              const VkExtent2D *frag_areas,
+                              const VkRect2D *bins)
 {
    const struct apply_sysmem_clear_coords_state *state =
       (const struct apply_sysmem_clear_coords_state *)data;

    assert(state->view < views);
    VkExtent2D frag_area = frag_areas[state->view];
+   VkRect2D bin = bins[state->view];

-   VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
+   VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin, common_bin_offset);

    unsigned x1 = state->rect.offset.x / frag_area.width + offset.x;
    unsigned x2 = DIV_ROUND_UP(state->rect.offset.x + state->rect.extent.width,
@@ -4182,17 +4200,19 @@ static void
 fdm_apply_gmem_clear_coords(struct tu_cmd_buffer *cmd,
                             struct tu_cs *cs,
                             void *data,
-                            VkRect2D bin,
+                            VkOffset2D common_bin_offset,
                             unsigned views,
-                            const VkExtent2D *frag_areas)
+                            const VkExtent2D *frag_areas,
+                            const VkRect2D *bins)
 {
    const struct apply_gmem_clear_coords_state *state =
       (const struct apply_gmem_clear_coords_state *)data;

    assert(state->view < views);
    VkExtent2D frag_area = frag_areas[state->view];
+   VkRect2D bin = bins[state->view];

-   VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
+   VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin, common_bin_offset);

    unsigned x1 = state->rect.offset.x / frag_area.width + offset.x;
    unsigned x2 = DIV_ROUND_UP(state->rect.offset.x + state->rect.extent.width,
@@ -4816,14 +4836,16 @@ static void
 fdm_apply_load_coords(struct tu_cmd_buffer *cmd,
                       struct tu_cs *cs,
                       void *data,
-                      VkRect2D bin,
+                      VkOffset2D common_bin_offset,
                       unsigned views,
-                      const VkExtent2D *frag_areas)
+                      const VkExtent2D *frag_areas,
+                      const VkRect2D *bins)
 {
    const struct apply_load_coords_state *state =
       (const struct apply_load_coords_state *)data;
    assert(state->view < views);
    VkExtent2D frag_area = frag_areas[state->view];
+   VkRect2D bin = bins[state->view];

    assert(bin.extent.width % frag_area.width == 0);
    assert(bin.extent.height % frag_area.height == 0);
@@ -4831,10 +4853,10 @@ fdm_apply_load_coords(struct tu_cmd_buffer *cmd,
    uint32_t scaled_height = bin.extent.height / frag_area.height;

    const float coords[] = {
-      bin.offset.x, bin.offset.y,
-      bin.offset.x, bin.offset.y,
-      bin.offset.x + scaled_width, bin.offset.y + scaled_height,
-      bin.offset.x + bin.extent.width, bin.offset.y + bin.extent.height,
+      common_bin_offset.x, common_bin_offset.y,
+      bin.offset.x, bin.offset.y,
+      common_bin_offset.x + scaled_width, common_bin_offset.y + scaled_height,
+      bin.offset.x + bin.extent.width, bin.offset.y + bin.extent.height,
    };
    r3d_coords_raw(cmd, cs, coords);
 }
@@ -5050,6 +5072,19 @@ store_cp_blit(struct tu_cmd_buffer *cmd,
    enum a6xx_format format = fmt.fmt;
    fixup_src_format(&src_format, dst_format, &format);

+   uint32_t src_width = dst_iview->vk.extent.width;
+   uint32_t src_height = dst_iview->vk.extent.height;
+
+   /* With FDM offset, we may blit from an extra row/column of tiles whose
+    * source coordinates are outside of the attachment. Add an extra tile
+    * width/height to the size to avoid clipping the source.
+    */
+   if (tu_enable_fdm_offset(cmd)) {
+      const struct tu_tiling_config *tiling = cmd->state.tiling;
+      src_width += tiling->tile0.width;
+      src_height += tiling->tile0.height;
+   }
+
    tu_cs_emit_regs(cs,
                    SP_PS_2D_SRC_INFO(CHIP,
                       .color_format = format,
@@ -5063,8 +5098,8 @@ store_cp_blit(struct tu_cmd_buffer *cmd,
                       .unk22 = 1,
                       .mutableen = src_iview->view.is_mutable),
                    SP_PS_2D_SRC_SIZE(CHIP,
-                      .width = dst_iview->vk.extent.width,
-                      .height = dst_iview->vk.extent.height),
+                      .width = src_width,
+                      .height = src_height),
                    SP_PS_2D_SRC(CHIP, .qword = cmd->device->physical_device->gmem_base + gmem_offset),
                    SP_PS_2D_SRC_PITCH(CHIP, .pitch = cmd->state.tiling->tile0.width * cpp));
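The load and store patchpoints above and below share one idea: the GMEM-side rectangle starts at the common window offset and is downscaled by the fragment area, while the framebuffer-side rectangle covers the full per-view bin. A worked example with made-up numbers (nothing here is taken from the patch):

```c
#include <assert.h>
#include <stdint.h>

int
main(void)
{
   /* Hypothetical bin: per-view start 192, extent 256, common window
    * offset 256, fragment area 2 (in x; y behaves the same way). */
   uint32_t bin_x = 192, bin_w = 256;
   uint32_t common_x = 256;
   uint32_t frag_w = 2;

   /* The bin extent must be a multiple of the fragment area. */
   assert(bin_w % frag_w == 0);
   uint32_t scaled_w = bin_w / frag_w; /* 128 */

   /* GMEM-side rect: [common_x, common_x + scaled_w) = [256, 384).
    * Framebuffer-side rect: [bin_x, bin_x + bin_w) = [192, 448). */
   assert(common_x + scaled_w == 384);
   assert(bin_x + bin_w == 448);
   return 0;
}
```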
@@ -5274,14 +5309,16 @@ static void
 fdm_apply_store_coords(struct tu_cmd_buffer *cmd,
                        struct tu_cs *cs,
                        void *data,
-                       VkRect2D bin,
+                       VkOffset2D common_bin_offset,
                        unsigned views,
-                       const VkExtent2D *frag_areas)
+                       const VkExtent2D *frag_areas,
+                       const VkRect2D *bins)
 {
    const struct apply_store_coords_state *state =
       (const struct apply_store_coords_state *)data;
    assert(state->view < views);
    VkExtent2D frag_area = frag_areas[state->view];
+   VkRect2D bin = bins[state->view];

    /* The bin width/height must be a multiple of the frag_area to make sure
     * that the scaling happens correctly. This means there may be some
@@ -5299,10 +5336,10 @@ fdm_apply_store_coords(struct tu_cmd_buffer *cmd,
                    A6XX_GRAS_2D_DST_BR(.x = bin.offset.x + bin.extent.width - 1,
                                        .y = bin.offset.y + bin.extent.height - 1));
    tu_cs_emit_regs(cs,
-                   A6XX_GRAS_2D_SRC_TL_X(bin.offset.x),
-                   A6XX_GRAS_2D_SRC_BR_X(bin.offset.x + scaled_width - 1),
-                   A6XX_GRAS_2D_SRC_TL_Y(bin.offset.y),
-                   A6XX_GRAS_2D_SRC_BR_Y(bin.offset.y + scaled_height - 1));
+                   A6XX_GRAS_2D_SRC_TL_X(common_bin_offset.x),
+                   A6XX_GRAS_2D_SRC_BR_X(common_bin_offset.x + scaled_width - 1),
+                   A6XX_GRAS_2D_SRC_TL_Y(common_bin_offset.y),
+                   A6XX_GRAS_2D_SRC_BR_Y(common_bin_offset.y + scaled_height - 1));
 }

 template <chip CHIP>
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index 573475d558a..9b16cb2f71b 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -700,7 +700,8 @@ tu6_emit_render_cntl(struct tu_cmd_buffer *cmd,
 }

 static void
-tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
+tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align,
+                      bool used_by_sysmem)
 {
    struct tu_physical_device *phys_dev = cmd->device->physical_device;
    const VkRect2D *render_area = &cmd->state.render_area;
@@ -727,9 +728,42 @@ tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
       y2 = ALIGN_POT(y2 + 1, phys_dev->info->gmem_align_h) - 1;
    }

-   tu_cs_emit_regs(cs,
-                   A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
-                   A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
+   /* With FDM offset, bins are shifted to the right in GMEM space compared to
+    * framebuffer space. We do not use RB_BLIT_SCISSOR_* for loads and stores
+    * because those do not use the fast path, but we do use it for
+    * LOAD_OP_CLEAR. Expand the render area so that GMEM clears work
+    * correctly. We may over-clear but that's ok because the store is clipped
+    * to the render area.
+    */
+   if (tu_enable_fdm_offset(cmd)) {
+      const struct tu_tiling_config *tiling = cmd->state.tiling;
+
+      /* If this is a generic clear that's also used in sysmem mode then we
+       * need to emit the unmodified render area in sysmem mode because
+       * over-clearing is not allowed.
+       */
+      if (used_by_sysmem) {
+         tu_cs_emit_regs(cs,
+                         A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
+                         A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
+         tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
+                                CP_COND_REG_EXEC_0_GMEM);
+      }
+
+      x2 += tiling->tile0.width;
+      y2 += tiling->tile0.height;
+      tu_cs_emit_regs(cs,
+                      A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
+                      A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
+
+      if (used_by_sysmem) {
+         tu_cond_exec_end(cs);
+      }
+   } else {
+      tu_cs_emit_regs(cs,
+                      A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
+                      A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
+   }
 }

 void
@@ -950,12 +984,20 @@ tu6_update_msaa_disable(struct tu_cmd_buffer *cmd)
    }
 }

+static const struct tu_vsc_config *
+tu_vsc_config(struct tu_cmd_buffer *cmd, const struct tu_tiling_config *tiling)
+{
+   if (tu_enable_fdm_offset(cmd))
+      return &tiling->fdm_offset_vsc;
+   return &tiling->vsc;
+}
+
 static bool
 use_hw_binning(struct tu_cmd_buffer *cmd)
 {
    const struct tu_framebuffer *fb = cmd->state.framebuffer;
    const struct tu_tiling_config *tiling = &fb->tiling[cmd->state.gmem_layout];
-   const struct tu_vsc_config *vsc = &tiling->vsc;
+   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);

    /* XFB commands are emitted for BINNING || SYSMEM, which makes it
     * incompatible with non-hw binning GMEM rendering. This is required because
@@ -1014,7 +1056,7 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd,
       return true;
    }

-   const struct tu_vsc_config *vsc = &cmd->state.tiling->vsc;
+   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling);

    /* XFB is incompatible with non-hw binning GMEM rendering, see use_hw_binning */
    if (cmd->state.rp.xfb_used && !vsc->binning_possible) {
@@ -1059,7 +1101,7 @@ static void
 tu6_emit_cond_for_load_stores(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                               uint32_t pipe, uint32_t slot, bool skip_wfm)
 {
-   const struct tu_vsc_config *vsc = &cmd->state.tiling->vsc;
+   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling);

    if (vsc->binning_possible &&
        cmd->state.pass->has_cond_load_store) {
@@ -1080,16 +1122,48 @@ struct tu_tile_config {
    VkExtent2D frag_areas[MAX_VIEWS];
 };

+/* For bin offsetting we want to do "Euclidean division," where the remainder
+ * (i.e. the offset of the bin) is always non-negative. Unfortunately C/C++
+ * remainder and division don't do this, so we have to implement it ourselves.
+ *
+ * For example, we should have:
+ *
+ *    euclid_rem(-3, 4) = 1
+ *    euclid_rem(-4, 4) = 0
+ *    euclid_rem(-5, 4) = 3
+ */
+
+static int32_t
+euclid_rem(int32_t dividend, int32_t divisor)
+{
+   if (dividend >= 0)
+      return dividend % divisor;
+   int32_t tmp = divisor - (-dividend % divisor);
+   return tmp == divisor ? 0 : tmp;
+}
+
+/* Calculate how much the bins for a given view should be shifted to the left
+ * and upwards, given the application-provided FDM offset.
+ */
+static VkOffset2D
+tu_bin_offset(VkOffset2D fdm_offset, const struct tu_tiling_config *tiling)
+{
+   return (VkOffset2D) {
+      euclid_rem(-fdm_offset.x, tiling->tile0.width),
+      euclid_rem(-fdm_offset.y, tiling->tile0.height),
+   };
+}
+
 template <chip CHIP>
 static void
 tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
                      struct tu_cs *cs,
                      const struct tu_tile_config *tile,
-                     bool fdm)
+                     bool fdm, const VkOffset2D *fdm_offsets)
 {
    struct tu_physical_device *phys_dev = cmd->device->physical_device;
    const struct tu_tiling_config *tiling = cmd->state.tiling;
-   const struct tu_vsc_config *vsc = &tiling->vsc;
+   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
    bool hw_binning = use_hw_binning(cmd);

    tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
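A quick standalone check of euclid_rem() and the bin-offset arithmetic it feeds (hypothetical test harness, not part of the patch; the 256-wide tile is illustrative):

```c
#include <assert.h>
#include <stdint.h>

/* Same definition as in the patch. */
static int32_t
euclid_rem(int32_t dividend, int32_t divisor)
{
   if (dividend >= 0)
      return dividend % divisor;
   int32_t tmp = divisor - (-dividend % divisor);
   return tmp == divisor ? 0 : tmp;
}

int
main(void)
{
   /* The documented examples. */
   assert(euclid_rem(-3, 4) == 1);
   assert(euclid_rem(-4, 4) == 0);
   assert(euclid_rem(-5, 4) == 3);

   /* tu_bin_offset() with an assumed tile0.width of 256: a user FDM offset
    * of 64 shifts the bins 192 to the left, which is equivalent to shifting
    * the content 64 to the right; a whole-tile offset resets to zero. */
   assert(euclid_rem(-64, 256) == 192);
   assert(euclid_rem(-256, 256) == 0);
   return 0;
}
```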
@@ -1118,6 +1192,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,

    const uint32_t x1 = tiling->tile0.width * tile->pos.x;
    const uint32_t y1 = tiling->tile0.height * tile->pos.y;
+
    const uint32_t x2 = MIN2(x1 + tiling->tile0.width, MAX_VIEWPORT_SIZE);
    const uint32_t y2 = MIN2(y1 + tiling->tile0.height, MAX_VIEWPORT_SIZE);
    tu6_emit_window_scissor(cs, x1, y1, x2 - 1, y2 - 1);
@@ -1161,11 +1236,29 @@
          { x1, y1 },
          { (x2 - x1) * tile->extent.width, (y2 - y1) * tile->extent.height }
       };
+      VkRect2D bins[views];
+      for (unsigned i = 0; i < views; i++) {
+         if (!fdm_offsets || cmd->state.rp.shared_viewport) {
+            bins[i] = bin;
+            continue;
+         }
+
+         VkOffset2D bin_offset = tu_bin_offset(fdm_offsets[i], tiling);
+
+         bins[i].offset.x = MAX2(0, (int32_t)x1 - bin_offset.x);
+         bins[i].offset.y = MAX2(0, (int32_t)y1 - bin_offset.y);
+         bins[i].extent.width =
+            MAX2(MIN2((int32_t)x1 + bin.extent.width - bin_offset.x,
+                      MAX_VIEWPORT_SIZE) - bins[i].offset.x, 0);
+         bins[i].extent.height =
+            MAX2(MIN2((int32_t)y1 + bin.extent.height - bin_offset.y,
+                      MAX_VIEWPORT_SIZE) - bins[i].offset.y, 0);
+      }
+
       util_dynarray_foreach (&cmd->fdm_bin_patchpoints,
                              struct tu_fdm_bin_patchpoint, patch) {
          tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size);
          tu_cs_emit_qw(cs, patch->iova);
-         patch->apply(cmd, cs, patch->data, bin, views, tile->frag_areas);
+         patch->apply(cmd, cs, patch->data, (VkOffset2D) { x1, y1 }, views,
+                      tile->frag_areas, bins);
       }

       /* Make the CP wait until the CP_MEM_WRITE's to the command buffers
@@ -1252,7 +1345,7 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
    const struct tu_render_pass *pass = cmd->state.pass;
    const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1];
    const struct tu_framebuffer *fb = cmd->state.framebuffer;
-   const struct tu_vsc_config *vsc = &cmd->state.tiling->vsc;
+   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling);

    if (pass->has_fdm)
       tu_cs_set_writeable(cs, true);
@@ -1261,7 +1354,7 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
    tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_RESOLVE) |
                   A6XX_CP_SET_MARKER_0_USES_GMEM);

-   tu6_emit_blit_scissor(cmd, cs, true);
+   tu6_emit_blit_scissor(cmd, cs, true, false);

    struct tu_resolve_group resolve_group = {};

@@ -1646,13 +1739,31 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
    tu_cs_sanity_check(cs);
 }

+bool
+tu_enable_fdm_offset(struct tu_cmd_buffer *cmd)
+{
+   if (!cmd->state.pass)
+      return false;
+
+   if (!cmd->state.pass->has_fdm)
+      return false;
+
+   unsigned fdm_a = cmd->state.pass->fragment_density_map.attachment;
+   if (fdm_a == VK_ATTACHMENT_UNUSED)
+      return TU_DEBUG(FDM_OFFSET);
+
+   const struct tu_image_view *fdm = cmd->state.attachments[fdm_a];
+   return fdm->image->vk.create_flags &
+      VK_IMAGE_CREATE_FRAGMENT_DENSITY_MAP_OFFSET_BIT_QCOM;
+}
+
 static void
 update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                 uint32_t num_vsc_pipes)
 {
    const struct tu_tiling_config *tiling = cmd->state.tiling;
-   const struct tu_vsc_config *vsc = &tiling->vsc;
+   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);

    tu_cs_emit_regs(cs,
                    A6XX_VSC_BIN_SIZE(.width = tiling->tile0.width,
@@ -1680,7 +1791,7 @@ static void
 emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 {
    const struct tu_tiling_config *tiling = cmd->state.tiling;
-   const struct tu_vsc_config *vsc = &tiling->vsc;
+   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);

    const uint32_t used_pipe_count =
       vsc->pipe_count.width * vsc->pipe_count.height;
@@ -1711,36 +1822,70 @@ emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)

 template <chip CHIP>
 static void
-tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
+                      const VkOffset2D *fdm_offsets)
 {
    struct tu_physical_device *phys_dev = cmd->device->physical_device;
    const struct tu_framebuffer *fb = cmd->state.framebuffer;
+   const struct tu_tiling_config *tiling = cmd->state.tiling;

    /* If this command buffer may be executed multiple times, then
     * viewports/scissor states may have been changed by previous executions
-    * and we need to reset them before executing the binning IB.
+    * and we need to reset them before executing the binning IB. With FDM
+    * offset the viewport also needs to be transformed during the binning
+    * phase.
     */
-   if (!(cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) &&
-       cmd->fdm_bin_patchpoints.size != 0) {
+   if ((!(cmd->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) ||
+        fdm_offsets) && cmd->fdm_bin_patchpoints.size != 0) {
       unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
       VkExtent2D unscaled_frag_areas[num_views];
-      for (unsigned i = 0; i < num_views; i++)
+      VkRect2D bins[num_views];
+      for (unsigned i = 0; i < num_views; i++) {
          unscaled_frag_areas[i] = (VkExtent2D) { 1, 1 };
-      VkRect2D bin = { { 0, 0 }, { fb->width, fb->height } };
+         if (fdm_offsets && !cmd->state.rp.shared_viewport) {
+            /* We need to shift over the viewport and scissor during the
+             * binning pass to match the shift applied when rendering. The way
+             * to do this is to make the per-view bin start negative. In the
+             * actual rendering pass the per-view bin start is also shifted in
+             * the negative direction, but there the first bin is clipped so
+             * that its start is never negative; here we keep the negative
+             * start to avoid clipping the user scissor to a non-zero common
+             * bin start. We skip patching loads and stores below in order to
+             * avoid patching them to a nonsensical negative-offset bin. The
+             * parts of the framebuffer left of or above the origin correspond
+             * to the non-visible parts of the left or top bins that will be
+             * discarded. The framebuffer still needs to extend to the
+             * original bottom and right, to avoid incorrectly clipping the
+             * user scissor, so we need to add to the width and height to
+             * compensate.
+             */
+            VkOffset2D bin_offset = tu_bin_offset(fdm_offsets[i], tiling);
+            bins[i] = {
+               { -bin_offset.x, -bin_offset.y },
+               { fb->width + bin_offset.x, fb->height + bin_offset.y },
+            };
+         } else {
+            bins[i] = { { 0, 0 }, { fb->width, fb->height } };
+         }
+      }

       util_dynarray_foreach (&cmd->fdm_bin_patchpoints,
                              struct tu_fdm_bin_patchpoint, patch) {
          if (patch->flags & TU_FDM_SKIP_BINNING)
            continue;
          tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + patch->size);
          tu_cs_emit_qw(cs, patch->iova);
-         patch->apply(cmd, cs, patch->data, bin, num_views, unscaled_frag_areas);
+         patch->apply(cmd, cs, patch->data, (VkOffset2D) {0, 0}, num_views,
+                      unscaled_frag_areas, bins);
       }

       tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
       tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
    }

-   tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
+   uint32_t width = fb->width + (fdm_offsets ? tiling->tile0.width : 0);
+   uint32_t height = fb->height + (fdm_offsets ? tiling->tile0.height : 0);
+
+   tu6_emit_window_scissor(cs, 0, 0, width - 1, height - 1);

    tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
    tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BIN_VISIBILITY));
@@ -1929,6 +2074,22 @@ tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
       if (!iview->view.is_mutable)
          dst[0] &= ~A6XX_TEX_CONST_0_SWAP__MASK;
       dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
+
+      /* If FDM offset is used, the last row and column extend beyond the
+       * framebuffer but are shifted over when storing. Expand the width and
+       * height to account for that.
+       */
+      if (tu_enable_fdm_offset(cmd)) {
+         uint32_t width = dst[1] & A6XX_TEX_CONST_1_WIDTH__MASK;
+         uint32_t height = (dst[1] & A6XX_TEX_CONST_1_HEIGHT__MASK) >>
+                           A6XX_TEX_CONST_1_HEIGHT__SHIFT;
+         width += cmd->state.tiling->tile0.width;
+         height += cmd->state.tiling->tile0.height;
+         dst[1] = (dst[1] & ~(A6XX_TEX_CONST_1_WIDTH__MASK |
+                              A6XX_TEX_CONST_1_HEIGHT__MASK)) |
+                  A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
+      }
+
       dst[2] =
          A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
          A6XX_TEX_CONST_2_PITCH(tiling->tile0.width * cpp);
@@ -2177,11 +2338,12 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,

 template <chip CHIP>
 static void
 tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
-                      struct tu_renderpass_result *autotune_result)
+                      struct tu_renderpass_result *autotune_result,
+                      const VkOffset2D *fdm_offsets)
 {
    struct tu_physical_device *phys_dev = cmd->device->physical_device;
    const struct tu_tiling_config *tiling = cmd->state.tiling;
-   const struct tu_vsc_config *vsc = &tiling->vsc;
+   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);

    tu_lrz_tiling_begin(cmd, cs);

    tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
@@ -2225,7 +2387,7 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs,

       tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, true);

-      tu6_emit_binning_pass(cmd, cs);
+      tu6_emit_binning_pass(cmd, cs, fdm_offsets);

    if (CHIP == A6XX) {
       tu_cs_emit_regs(cs,
@@ -2270,9 +2432,9 @@ template <chip CHIP>
 static void
 tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                 const struct tu_tile_config *tile,
-                bool fdm)
+                bool fdm, const VkOffset2D *fdm_offsets)
 {
-   tu6_emit_tile_select(cmd, &cmd->cs, tile, fdm);
+   tu6_emit_tile_select(cmd, &cmd->cs, tile, fdm, fdm_offsets);
    tu_lrz_before_tile(cmd, &cmd->cs);

    trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs);
@@ -2338,7 +2500,8 @@ tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs,

 static void
 tu_calc_frag_area(struct tu_cmd_buffer *cmd,
                   struct tu_tile_config *tile,
-                  const struct tu_image_view *fdm)
+                  const struct tu_image_view *fdm,
+                  const VkOffset2D *fdm_offsets)
 {
    const struct tu_tiling_config *tiling = cmd->state.tiling;
    const uint32_t x1 = tiling->tile0.width * tile->pos.x;
@@ -2351,11 +2514,71 @@ tu_calc_frag_area(struct tu_cmd_buffer *cmd,
    const struct tu_framebuffer *fb = cmd->state.framebuffer;
    struct tu_frag_area raw_areas[views];
    if (fdm) {
-      tu_fragment_density_map_sample(fdm,
-                                     (x1 + MIN2(x2, fb->width)) / 2,
-                                     (y1 + MIN2(y2, fb->height)) / 2,
-                                     fb->width, fb->height, views,
-                                     raw_areas);
+      for (unsigned i = 0; i < views; i++) {
+         VkOffset2D sample_pos = { 0, 0 };
+
+         /* Offsets less than a tile size are accomplished by sliding the
+          * tiles. However once we shift a whole tile size then we reset the
+          * tiles back to where they were at the beginning and we need to
+          * adjust where each bin is sampling from:
+          *
+          * x offset = 0:
+          *
+          * ------------------------------------
+          * |   *    |   *    |   *    | (unused) |
+          * ------------------------------------
+          *
+          * x offset = 4:
+          *
+          * -------------------------
+          * |   *    |   *    |   *    |   *    |
+          * -------------------------
+          *
+          * x offset = 8:
+          *
+          * ------------------------------------
+          * |   *    |   *    |   *    | (unused) |
+          * ------------------------------------
+          *
+          * As the user's offset increases we slide the tiles to the right,
+          * until we reach the whole tile size and reset the tile positions.
+          * tu_bin_offset() returns an amount to shift to the left, negating
+          * the offset.
+          *
+          * If we were forced to use a shared viewport, then we must not shift
+          * over the tiles and instead must only shift when sampling because
+          * we cannot shift the tiles differently per view. This disables
+          * smooth transitions of the fragment density map and effectively
+          * defeats the purpose of the extension.
+          *
+          * Note that we cannot clamp x2/y2 to the framebuffer size, as we
+          * normally would do, because then tiles along the edge would
+          * incorrectly nudge the sample_pos towards the center of the
+          * framebuffer. If we shift one complete tile over towards the
+          * center and reset the tiles as above, the sample_pos would
+          * then shift back towards the edge and we could get a "pop" from
+          * suddenly changing density due to the slight shift.
+          */
+         if (fdm_offsets) {
+            VkOffset2D offset = fdm_offsets[i];
+            if (!cmd->state.rp.shared_viewport) {
+               VkOffset2D bin_offset = tu_bin_offset(fdm_offsets[i], tiling);
+               offset.x += bin_offset.x;
+               offset.y += bin_offset.y;
+            }
+            sample_pos.x = (x1 + x2) / 2 - offset.x;
+            sample_pos.y = (y1 + y2) / 2 - offset.y;
+         } else {
+            sample_pos.x = (x1 + MIN2(x2, fb->width)) / 2;
+            sample_pos.y = (y1 + MIN2(y2, fb->height)) / 2;
+         }
+
+         tu_fragment_density_map_sample(fdm,
+                                        sample_pos.x,
+                                        sample_pos.y,
+                                        fb->width, fb->height, i,
+                                        &raw_areas[i]);
+      }
    } else {
       for (unsigned i = 0; i < views; i++)
          raw_areas[i].width = raw_areas[i].height = 1.0f;
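To make the sampling adjustment concrete, a small worked check under assumed numbers (tile0.width = 256, user FDM offset 64; all values illustrative, not from the driver):

```c
#include <assert.h>
#include <stdint.h>

int
main(void)
{
   int32_t tile_w = 256, fdm_offset_x = 64;

   /* tu_bin_offset(): euclid_rem(-64, 256) = 192, shifting bins left. */
   int32_t bin_offset_x = 192;

   /* A bin nominally spanning [256, 512) actually covers [64, 320) in
    * framebuffer space once shifted. */
   int32_t x1 = 256, x2 = x1 + tile_w;

   /* Sample position per tu_calc_frag_area(): the nominal bin center minus
    * the combined user offset and bin shift... */
   int32_t sample_x = (x1 + x2) / 2 - (fdm_offset_x + bin_offset_x);

   /* ...which equals the shifted bin's center (192) corrected by the
    * density map's own 64-texel shift: 192 - 64 = 128. */
   assert(sample_x == 128);
   return 0;
}
```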
@@ -2388,10 +2611,24 @@ tu_calc_frag_area(struct tu_cmd_buffer *cmd,
       width = 1u << util_logbase2(width);
       height = 1u << util_logbase2(height);

+      /* When FDM offset is enabled, the fragment area has to divide the
+       * offset to make sure that we don't have tiles with partial fragments.
+       * It would be bad to have the fragment area change as a function of the
+       * offset, because we'd get "popping" as the resolution changes with the
+       * offset, so just make sure it divides the offset granularity. This
+       * should mean it always divides the offset for any possible offset.
+       */
+      if (fdm_offsets) {
+         width = MIN2(width, TU_FDM_OFFSET_GRANULARITY);
+         height = MIN2(height, TU_FDM_OFFSET_GRANULARITY);
+      }
+
       /* Make sure that the width/height divides the tile width/height so
        * we don't have to do extra awkward clamping of the edges of each
-       * bin when resolving. Note that because the tile width is rounded to
-       * a multiple of 32 any power of two 32 or less will work.
+       * bin when resolving. It also has to divide the fdm offset, if any.
+       * Note that because the tile width is rounded to a multiple of 32 any
+       * power of two 32 or less will work, and if there is an offset then it
+       * must be a multiple of 4 so 2 or 4 will definitely work.
        *
        * TODO: Try to take advantage of the total area allowance here, too.
        */
@@ -2486,7 +2723,8 @@ template <chip CHIP>
 void
 tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
                    uint32_t tx1, uint32_t ty1, uint32_t tx2, uint32_t ty2,
-                   const struct tu_image_view *fdm)
+                   const struct tu_image_view *fdm,
+                   const VkOffset2D *fdm_offsets)
 {
    uint32_t width = tx2 - tx1;
    uint32_t height = ty2 - ty1;
@@ -2505,7 +2743,7 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
          tile->extent = { 1, 1 };
          tile->pipe = pipe;
          tile->slot_mask = 1u << (width * y + x);
-         tu_calc_frag_area(cmd, tile, fdm);
+         tu_calc_frag_area(cmd, tile, fdm, fdm_offsets);
       }
    }
@@ -2549,7 +2787,8 @@ tu_render_pipe_fdm(struct tu_cmd_buffer *cmd, uint32_t pipe,
          if (merged_tiles & (1u << tile_idx))
            continue;

-         tu6_render_tile(cmd, &cmd->cs, &tiles[tile_idx], true);
+         tu6_render_tile(cmd, &cmd->cs, &tiles[tile_idx],
+                         true, fdm_offsets);
       }
    }
 }
@@ -2557,10 +2796,11 @@ template <chip CHIP>
 static void
 tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
-                    struct tu_renderpass_result *autotune_result)
+                    struct tu_renderpass_result *autotune_result,
+                    const VkOffset2D *fdm_offsets)
 {
    const struct tu_tiling_config *tiling = cmd->state.tiling;
-   const struct tu_vsc_config *vsc = &tiling->vsc;
+   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, tiling);
    const struct tu_image_view *fdm = NULL;

    if (cmd->state.pass->fragment_density_map.attachment != VK_ATTACHMENT_UNUSED) {
@@ -2571,6 +2811,10 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
    bool merge_tiles = has_fdm && !TU_DEBUG(NO_BIN_MERGING) &&
       cmd->device->physical_device->info->a6xx.has_bin_mask;

+   /* If not using FDM make sure not to accidentally apply the offsets */
+   if (!has_fdm)
+      fdm_offsets = NULL;
+
    /* Create gmem stores now (at EndRenderPass time) because they needed to
     * know whether to allow their conditional execution, which was tied to a
     * state that was known only at the end of the renderpass. They will be
@@ -2582,7 +2826,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,

    cmd->trace_renderpass_end = u_trace_end_iterator(&cmd->trace);

-   tu6_tile_render_begin(cmd, &cmd->cs, autotune_result);
+   tu6_tile_render_begin(cmd, &cmd->cs, autotune_result, fdm_offsets);

    /* Note: we reverse the order of walking the pipes and tiles on every
     * other row, to improve texture cache locality compared to raster order.
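A sketch of that serpentine walk (the shape is assumed for illustration; the driver's actual loop variables and per-pipe work differ):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static void
render_pipe(uint32_t px, uint32_t py) /* stand-in for the real per-pipe work */
{
   printf("pipe (%u, %u)\n", px, py);
}

static void
walk_pipes(uint32_t cols, uint32_t rows)
{
   for (uint32_t py = 0; py < rows; py++) {
      bool reverse = py & 1; /* odd rows run right-to-left */
      for (uint32_t i = 0; i < cols; i++) {
         uint32_t px = reverse ? cols - 1 - i : i;
         render_pipe(px, py);
      }
   }
}
```

Walking back and forth keeps consecutive bins spatially adjacent, so texture fetches near bin boundaries are more likely to hit the cache than with plain raster order.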
@@ -2602,7 +2846,8 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
          uint32_t ty2 = MIN2(ty1 + vsc->pipe0.height, vsc->tile_count.height);

          if (merge_tiles) {
-            tu_render_pipe_fdm(cmd, pipe, tx1, ty1, tx2, ty2, fdm);
+            tu_render_pipe_fdm(cmd, pipe, tx1, ty1, tx2, ty2, fdm,
+                               fdm_offsets);
             continue;
          }

@@ -2623,9 +2868,10 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd,
                .extent = { 1, 1 },
             };
             if (has_fdm)
-               tu_calc_frag_area(cmd, &tile, fdm);
+               tu_calc_frag_area(cmd, &tile, fdm, fdm_offsets);

-            tu6_render_tile(cmd, &cmd->cs, &tile, has_fdm);
+            tu6_render_tile(cmd, &cmd->cs, &tile, has_fdm,
+                            fdm_offsets);
          }
          slot_row += tile_row_stride;
       }
@@ -2676,7 +2922,8 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd,

 template <chip CHIP>
 void
-tu_cmd_render(struct tu_cmd_buffer *cmd_buffer)
+tu_cmd_render(struct tu_cmd_buffer *cmd_buffer,
+              const VkOffset2D *fdm_offsets)
 {
    if (cmd_buffer->state.rp.has_tess)
       tu6_lazy_emit_tessfactor_addr(cmd_buffer);
@@ -2685,7 +2932,7 @@ tu_cmd_render(struct tu_cmd_buffer *cmd_buffer)
    if (use_sysmem_rendering(cmd_buffer, &autotune_result))
       tu_cmd_render_sysmem(cmd_buffer, autotune_result);
    else
-      tu_cmd_render_tiles(cmd_buffer, autotune_result);
+      tu_cmd_render_tiles(cmd_buffer, autotune_result, fdm_offsets);

    /* Outside of renderpasses we assume all draw states are disabled. We do
     * this outside the draw CS for the normal case where 3d gmem stores aren't
@@ -4771,7 +5018,7 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
       */
      tu_restore_suspended_pass(cmd, cmd);

-      TU_CALLX(cmd->device, tu_cmd_render)(cmd);
+      TU_CALLX(cmd->device, tu_cmd_render)(cmd, NULL);
      if (cmd->state.suspend_resume == SR_IN_CHAIN)
         cmd->state.suspend_resume = SR_NONE;
      else
@@ -4877,7 +5124,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r
 {
    struct tu_cs *cs = &cmd->draw_cs;
    uint32_t subpass_idx = cmd->state.subpass - cmd->state.pass->subpasses;
-   const struct tu_vsc_config *vsc = &cmd->state.tiling->vsc;
+   const struct tu_vsc_config *vsc = tu_vsc_config(cmd, cmd->state.tiling);

    /* If we might choose to bin, then put the loads under a check for geometry
     * having been binned to this tile. If we don't choose to bin in the end,
@@ -4902,7 +5149,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r
       struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[i];
       if ((att->load || att->load_stencil) && att->first_subpass_idx == subpass_idx) {
          if (!emitted_scissor) {
-            tu6_emit_blit_scissor(cmd, cs, true);
+            tu6_emit_blit_scissor(cmd, cs, true, false);
             emitted_scissor = true;
          }
          tu_load_gmem_attachment(cmd, cs, resolve_group, i,
@@ -4918,7 +5165,7 @@ tu_emit_subpass_begin_gmem(struct tu_cmd_buffer *cmd, struct tu_resolve_group *r
          &cmd->state.pass->attachments[i];
       if (att->clear_mask && att->first_subpass_idx == subpass_idx) {
          if (!emitted_scissor) {
-            tu6_emit_blit_scissor(cmd, cs, false);
+            tu6_emit_blit_scissor(cmd, cs, false, false);
             emitted_scissor = true;
          }
          tu_clear_gmem_attachment(cmd, cs, resolve_group, i);
@@ -4969,7 +5216,7 @@ tu7_emit_subpass_clear(struct tu_cmd_buffer *cmd, struct tu_resolve_group *resolve_group)
          &cmd->state.pass->attachments[i];
       if (att->clear_mask && att->first_subpass_idx == subpass_idx) {
          if (!emitted_scissor) {
-            tu6_emit_blit_scissor(cmd, cs, false);
+            tu6_emit_blit_scissor(cmd, cs, false, true);
             emitted_scissor = true;
          }
          tu7_generic_clear_attachment(cmd, cs, resolve_group, i);
@@ -5432,7 +5679,7 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);

    if (subpass->resolve_attachments) {
-      tu6_emit_blit_scissor(cmd, cs, true);
+      tu6_emit_blit_scissor(cmd, cs, true, false);

       struct tu_resolve_group resolve_group = {};

@@ -5908,9 +6155,10 @@ static void
 fdm_apply_fs_params(struct tu_cmd_buffer *cmd,
                     struct tu_cs *cs,
                     void *data,
-                    VkRect2D bin,
+                    VkOffset2D common_bin_offset,
                     unsigned views,
-                    const VkExtent2D *frag_areas)
+                    const VkExtent2D *frag_areas,
+                    const VkRect2D *bins)
 {
    const struct apply_fs_params_state *state =
       (const struct apply_fs_params_state *)data;
@@ -5919,7 +6167,8 @@ fdm_apply_fs_params(struct tu_cmd_buffer *cmd,
    for (unsigned i = 0; i < num_consts; i++) {
       assert(i < views);
       VkExtent2D area = frag_areas[i];
-      VkOffset2D offset = tu_fdm_per_bin_offset(area, bin);
+      VkRect2D bin = bins[i];
+      VkOffset2D offset = tu_fdm_per_bin_offset(area, bin, common_bin_offset);

       tu_cs_emit(cs, area.width);
       tu_cs_emit(cs, area.height);
@@ -7443,9 +7692,25 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
       return;
    }

+   const VkSubpassFragmentDensityMapOffsetEndInfoQCOM *fdm_offset_info =
+      vk_find_struct_const(pSubpassEndInfo->pNext,
+                           SUBPASS_FRAGMENT_DENSITY_MAP_OFFSET_END_INFO_QCOM);
+   const VkOffset2D *fdm_offsets =
+      (fdm_offset_info && fdm_offset_info->fragmentDensityOffsetCount > 0) ?
+      fdm_offset_info->pFragmentDensityOffsets : NULL;
+
+   VkOffset2D test_offsets[MAX_VIEWS];
+   if (TU_DEBUG(FDM) && TU_DEBUG(FDM_OFFSET)) {
+      for (unsigned i = 0;
+           i < MAX2(cmd_buffer->state.pass->num_views, 1); i++) {
+         test_offsets[i] = { 64, 64 };
+      }
+      fdm_offsets = test_offsets;
+   }
+
    tu_cs_end(&cmd_buffer->draw_cs);
    tu_cs_end(&cmd_buffer->draw_epilogue_cs);
-   TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer);
+   TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer, fdm_offsets);

    cmd_buffer->state.cache.pending_flush_bits |=
       cmd_buffer->state.renderpass_cache.pending_flush_bits;
@@ -7483,7 +7748,16 @@ tu_CmdEndRendering(VkCommandBuffer commandBuffer)
       */
      tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs);
   } else {
-      TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer);
+      VkOffset2D test_offsets[MAX_VIEWS];
+      const VkOffset2D *fdm_offsets = NULL;
+      if (TU_DEBUG(FDM) && TU_DEBUG(FDM_OFFSET)) {
+         for (unsigned i = 0;
+              i < MAX2(cmd_buffer->state.pass->num_views, 1); i++) {
+            test_offsets[i] = { 64, 64 };
+         }
+         fdm_offsets = test_offsets;
+      }
+      TU_CALLX(cmd_buffer->device, tu_cmd_render)(cmd_buffer, fdm_offsets);
   }

   tu_reset_render_pass(cmd_buffer);
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h
index 5afad09f62a..3cb5476c108 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.h
+++ b/src/freedreno/vulkan/tu_cmd_buffer.h
@@ -695,7 +695,7 @@ tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
                           struct tu_cmd_buffer *suspended);

 template <chip CHIP>
-void tu_cmd_render(struct tu_cmd_buffer *cmd);
+void tu_cmd_render(struct tu_cmd_buffer *cmd, const VkOffset2D *fdm_offsets);

 void tu_dispatch_unaligned(VkCommandBuffer commandBuffer,
                            uint32_t x, uint32_t y, uint32_t z);
@@ -748,12 +748,15 @@ void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
 void tu6_apply_depth_bounds_workaround(struct tu_device *device,
                                        uint32_t *rb_depth_cntl);

+bool tu_enable_fdm_offset(struct tu_cmd_buffer *cmd);
+
 typedef void (*tu_fdm_bin_apply_t)(struct tu_cmd_buffer *cmd,
                                    struct tu_cs *cs,
                                    void *data,
-                                   VkRect2D bin,
+                                   VkOffset2D common_bin_offset,
                                    unsigned views,
-                                   const VkExtent2D *frag_areas);
+                                   const VkExtent2D *frag_areas,
+                                   const VkRect2D *bins);

 enum tu_fdm_flags {
    TU_FDM_NONE = 0,
@@ -807,13 +810,15 @@ _tu_create_fdm_bin_patchpoint(struct tu_cmd_buffer *cmd,
    */
   unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
   VkExtent2D unscaled_frag_areas[num_views];
+   VkRect2D bins[num_views];
   for (unsigned i = 0; i < num_views; i++) {
      unscaled_frag_areas[i] = (VkExtent2D) { 1, 1 };
-   }
-   apply(cmd, cs, state, (VkRect2D) {
+      bins[i] = (VkRect2D) {
         { 0, 0 },
         { MAX_VIEWPORT_SIZE, MAX_VIEWPORT_SIZE },
-   }, num_views, unscaled_frag_areas);
+      };
+   }
+   apply(cmd, cs, state, (VkOffset2D) {0, 0}, num_views, unscaled_frag_areas, bins);

   assert(tu_cs_get_cur_iova(cs) == patch.iova + patch.size * sizeof(uint32_t));

   util_dynarray_append(&cmd->fdm_bin_patchpoints,
diff --git a/src/freedreno/vulkan/tu_common.h b/src/freedreno/vulkan/tu_common.h
index af4a1aaf539..a8700e276c3 100644
--- a/src/freedreno/vulkan/tu_common.h
+++ b/src/freedreno/vulkan/tu_common.h
@@ -138,6 +138,18 @@
 #define MAX_FDM_TEXEL_SIZE_LOG2 10
 #define MAX_FDM_TEXEL_SIZE (1u << MAX_FDM_TEXEL_SIZE_LOG2)

+/* This granularity is arbitrary, but there are two competing concerns here:
+ *
+ * - The fragment area has to always divide the offset, and we don't want the
+ *   fragment area changing with the offset, so we have to clamp the fragment
+ *   area to this granularity. Therefore larger granularities lead to lower
+ *   minimum resolution.
+ * - The larger the offset granularity, the choppier the motion is.
+ *
+ * Choose 8 as a compromise between the two.
+ */
+#define TU_FDM_OFFSET_GRANULARITY 8
+
 #define TU_GENX(FUNC_NAME) FD_GENX(FUNC_NAME)
 #define TU_CALLX(device, thing) FD_CALLX((device)->physical_device->info, thing)

diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc
index 6573d83bf22..9fdcf803159 100644
--- a/src/freedreno/vulkan/tu_device.cc
+++ b/src/freedreno/vulkan/tu_device.cc
@@ -333,6 +333,7 @@ get_device_extensions(const struct tu_physical_device *device,
       .GOOGLE_user_type = true,
       .IMG_filter_cubic = device->info->a6xx.has_tex_filter_cubic,
       .NV_compute_shader_derivatives = device->info->chip >= 7,
+      .QCOM_fragment_density_map_offset = true,
       .VALVE_mutable_descriptor_type = true,
    } };

@@ -747,6 +748,9 @@ tu_get_features(struct tu_physical_device *pdevice,
    /* VK_KHR_subgroup_rotate */
    features->shaderSubgroupRotate = true;
    features->shaderSubgroupRotateClustered = true;
+
+   /* VK_QCOM_fragment_density_map_offset */
+   features->fragmentDensityMapOffset = true;
 }

 static void
@@ -1385,6 +1389,11 @@ tu_get_properties(struct tu_physical_device *pdevice,
    props->degenerateLinesRasterized = false;
    props->fullyCoveredFragmentShaderInputVariable = false;
    props->conservativeRasterizationPostDepthCoverage = false;
+
+   /* VK_QCOM_fragment_density_map_offset */
+   props->fragmentDensityOffsetGranularity = (VkExtent2D) {
+      TU_FDM_OFFSET_GRANULARITY, TU_FDM_OFFSET_GRANULARITY
+   };
 }

 static const struct vk_pipeline_cache_object_ops *const cache_import_ops[] = {
diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h
index e8277967a0b..fc9b898eea0 100644
--- a/src/freedreno/vulkan/tu_device.h
+++ b/src/freedreno/vulkan/tu_device.h
@@ -488,7 +488,7 @@ struct tu_tiling_config {
    /* Whether using GMEM is even possible with this configuration */
    bool possible;

-   struct tu_vsc_config vsc;
+   struct tu_vsc_config vsc, fdm_offset_vsc;
 };

 struct tu_framebuffer
diff --git a/src/freedreno/vulkan/tu_dynamic_rendering.cc b/src/freedreno/vulkan/tu_dynamic_rendering.cc
index 1ff7d4b631e..e7618b47281 100644
--- a/src/freedreno/vulkan/tu_dynamic_rendering.cc
+++ b/src/freedreno/vulkan/tu_dynamic_rendering.cc
@@ -152,7 +152,7 @@ tu_insert_dynamic_cmdbufs(struct tu_device *dev,
                           old_cmds[i]->pre_chain.trace_renderpass_end);
       }

-      TU_CALLX(dev, tu_cmd_render)(cmd_buffer);
+      TU_CALLX(dev, tu_cmd_render)(cmd_buffer, NULL);

       tu_cs_emit_pkt7(&cmd_buffer->cs, CP_MEM_WRITE, 3);
       tu_cs_emit_qw(&cmd_buffer->cs,
diff --git a/src/freedreno/vulkan/tu_image.cc b/src/freedreno/vulkan/tu_image.cc
index da5e1e520a4..16da996a4d0 100644
--- a/src/freedreno/vulkan/tu_image.cc
+++ b/src/freedreno/vulkan/tu_image.cc
@@ -1163,10 +1163,10 @@ tu_DestroyImageView(VkDevice _device,
  */
 void
 tu_fragment_density_map_sample(const struct tu_image_view *fdm,
-                               uint32_t x, uint32_t y,
+                               int32_t x, int32_t y,
                                uint32_t width, uint32_t height,
-                               uint32_t layers,
-                               struct tu_frag_area *areas)
+                               uint32_t layer,
+                               struct tu_frag_area *area)
 {
    assert(fdm->image->layout[0].tile_mode == TILE6_LINEAR);

@@ -1176,20 +1176,19 @@ tu_fragment_density_map_sample(const struct tu_image_view *fdm,
    fdm_shift_x = CLAMP(fdm_shift_x, MIN_FDM_TEXEL_SIZE_LOG2, MAX_FDM_TEXEL_SIZE_LOG2);
    fdm_shift_y = CLAMP(fdm_shift_y, MIN_FDM_TEXEL_SIZE_LOG2, MAX_FDM_TEXEL_SIZE_LOG2);

-   uint32_t i = x >> fdm_shift_x;
-   uint32_t j = y >> fdm_shift_y;
+   int32_t i = x >> fdm_shift_x;
+   int32_t j = y >> fdm_shift_y;
+
+   i = CLAMP(i, 0, fdm->vk.extent.width - 1);
+   j = CLAMP(j, 0, fdm->vk.extent.height - 1);

    unsigned cpp = fdm->image->layout[0].cpp;
    unsigned pitch = fdm->view.pitch;

-   void *pixel = (char *)fdm->image->map + fdm->view.offset + cpp * i + pitch * j;
-   for (unsigned i = 0; i < layers; i++) {
-      float density_src[4], density[4];
-      util_format_unpack_rgba(fdm->view.format, density_src, pixel, 1);
-      pipe_swizzle_4f(density, density_src, fdm->swizzle);
-      areas[i].width = 1.0f / density[0];
-      areas[i].height = 1.0f / density[1];
-
-      pixel = (char *)pixel + fdm->view.layer_size;
-   }
+   void *pixel = (char *)fdm->image->map + fdm->view.offset + fdm->view.layer_size * layer + cpp * i + pitch * j;
+   float density_src[4], density[4];
+   util_format_unpack_rgba(fdm->view.format, density_src, pixel, 1);
+   pipe_swizzle_4f(density, density_src, fdm->swizzle);
+   area->width = 1.0f / density[0];
+   area->height = 1.0f / density[1];
 }
diff --git a/src/freedreno/vulkan/tu_image.h b/src/freedreno/vulkan/tu_image.h
index 5d47327b7c9..6bfb48c7bc0 100644
--- a/src/freedreno/vulkan/tu_image.h
+++ b/src/freedreno/vulkan/tu_image.h
@@ -129,9 +129,9 @@ struct tu_frag_area {

 void
 tu_fragment_density_map_sample(const struct tu_image_view *fdm,
-                               uint32_t x, uint32_t y,
+                               int32_t x, int32_t y,
                                uint32_t width, uint32_t height,
-                               uint32_t layers, struct tu_frag_area *areas);
+                               uint32_t layer, struct tu_frag_area *area);

 VkResult
 tu_image_update_layout(struct tu_device *device, struct tu_image *image,
diff --git a/src/freedreno/vulkan/tu_lrz.cc b/src/freedreno/vulkan/tu_lrz.cc
index 9c352346fc3..b26cfe3d7c9 100644
--- a/src/freedreno/vulkan/tu_lrz.cc
+++ b/src/freedreno/vulkan/tu_lrz.cc
@@ -157,7 +157,7 @@ tu6_write_lrz_cntl(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                    struct A6XX_GRAS_LRZ_CNTL cntl)
 {
    if (CHIP >= A7XX) {
-      // A7XX split LRZ_CNTL into two seperate registers.
+      /* A7XX split LRZ_CNTL into two separate registers. */
       struct tu_reg_value cntl2 = A7XX_GRAS_LRZ_CNTL2(
          .disable_on_wrong_dir = cntl.disable_on_wrong_dir,
          .fc_enable = cntl.fc_enable,
diff --git a/src/freedreno/vulkan/tu_pipeline.cc b/src/freedreno/vulkan/tu_pipeline.cc
index 47ac853229f..52c9303e12b 100644
--- a/src/freedreno/vulkan/tu_pipeline.cc
+++ b/src/freedreno/vulkan/tu_pipeline.cc
@@ -2548,44 +2548,49 @@ struct apply_viewport_state {
    bool share_scale;
 };

-/* It's a hardware restriction that the window offset (i.e. bin.offset) must
- * be the same for all views. This means that GMEM coordinates cannot be a
- * simple scaling of framebuffer coordinates, because this would require us to
- * scale the window offset and the scale may be different per view. Instead we
- * have to apply a per-bin offset to the GMEM coordinate transform to make
- * sure that the window offset maps to itself. Specifically we need an offset
- * o to the transform:
+/* It's a hardware restriction that the window offset (i.e. common_bin_offset)
+ * must be the same for all views. This means that GMEM coordinates cannot be
+ * a simple scaling of framebuffer coordinates, because this would require us
+ * to scale the window offset and the scale may be different per view. Instead
+ * we have to apply a per-bin offset to the GMEM coordinate transform to make
+ * sure that the window offset maps to the per-view bin coordinate, which will
+ * be the same if there is no offset. Specifically we need an offset o to the
+ * transform:
  *
  *    x' = s * x + o
  *
- * so that when we plug in the bin start b_s:
+ * so that when we plug in the per-view bin start b_s and the common window
+ * offset b_cs:
  *
- *    b_s = s * b_s + o
+ *    b_cs = s * b_s + o
  *
  * and we get:
  *
- *    o = b_s - s * b_s
+ *    o = b_cs - s * b_s
  *
- * We use this form exactly, because we know the bin offset is a multiple of
+ * We use this form exactly, because we know the bin start is a multiple of
  * the frag area so s * b_s is an integer and we can compute an exact result
- * easily.
+ * easily. We also have to make sure that the bin offset is a multiple of the
+ * frag area by restricting the frag area.
  */
 VkOffset2D
-tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin)
+tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin,
+                      VkOffset2D common_bin_offset)
 {
    assert(bin.offset.x % frag_area.width == 0);
    assert(bin.offset.y % frag_area.height == 0);

    return (VkOffset2D) {
-      bin.offset.x - bin.offset.x / frag_area.width,
-      bin.offset.y - bin.offset.y / frag_area.height
+      common_bin_offset.x - bin.offset.x / frag_area.width,
+      common_bin_offset.y - bin.offset.y / frag_area.height
    };
 }

 static void
 fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
-                    VkRect2D bin, unsigned views, const VkExtent2D *frag_areas)
+                    VkOffset2D common_bin_offset, unsigned views,
+                    const VkExtent2D *frag_areas, const VkRect2D *bins)
 {
    const struct apply_viewport_state *state =
       (const struct apply_viewport_state *)data;
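A standalone check of this derivation (hypothetical numbers; here s = 1/frag_area, so s * b_s is computed as b_s / frag_area):

```c
#include <assert.h>
#include <stdint.h>

int
main(void)
{
   /* Illustrative values: 2x2 fragment area, per-view bin start b_s = 192,
    * common window offset b_cs = 256. */
   int32_t frag_w = 2, b_s = 192, b_cs = 256;

   /* o = b_cs - s * b_s, exact because frag_w divides b_s. */
   assert(b_s % frag_w == 0);
   int32_t o = b_cs - b_s / frag_w;   /* 256 - 96 = 160 */

   /* Plugging the bin start back into x' = s * x + o must land exactly on
    * the common window offset. */
   assert(b_s / frag_w + o == b_cs);
   return 0;
}
```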
@@ -2603,9 +2608,12 @@ fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
        * replicate it across all viewports.
        */
       VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
+      VkRect2D bin = state->share_scale ? bins[0] : bins[i];
       VkViewport viewport =
          state->share_scale ? state->vp.viewports[i] : state->vp.viewports[0];
-      if (frag_area.width == 1 && frag_area.height == 1) {
+      if (frag_area.width == 1 && frag_area.height == 1 &&
+          common_bin_offset.x == bin.offset.x &&
+          common_bin_offset.y == bin.offset.y) {
          vp.viewports[i] = viewport;
          continue;
       }
@@ -2618,7 +2626,8 @@ fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
       vp.viewports[i].width = viewport.width * scale_x;
       vp.viewports[i].height = viewport.height * scale_y;

-      VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
+      VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin,
+                                                common_bin_offset);

       vp.viewports[i].x = scale_x * viewport.x + offset.x;
       vp.viewports[i].y = scale_y * viewport.y + offset.y;
@@ -2694,7 +2703,8 @@ tu6_emit_scissor(struct tu_cs *cs, const struct vk_viewport_state *vp)

 static void
 fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
-                   VkRect2D bin, unsigned views, const VkExtent2D *frag_areas)
+                   VkOffset2D common_bin_offset, unsigned views,
+                   const VkExtent2D *frag_areas, const VkRect2D *bins)
 {
    const struct apply_viewport_state *state =
       (const struct apply_viewport_state *)data;
@@ -2703,12 +2713,9 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
    for (unsigned i = 0; i < vp.scissor_count; i++) {
       VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
+      VkRect2D bin = state->share_scale ? bins[0] : bins[i];
       VkRect2D scissor =
          state->share_scale ? state->vp.scissors[i] : state->vp.scissors[0];
-      if (frag_area.width == 1 && frag_area.height == 1) {
-         vp.scissors[i] = scissor;
-         continue;
-      }

       /* Transform the scissor following the viewport. It's unclear how this
        * is supposed to handle cases where the scissor isn't aligned to the
        * fragment area; we avoid dropping fragments if the scissor size
        * equals the framebuffer size and it isn't aligned to the fragment
        * area.
        */
-      VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
+      VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin,
+                                                common_bin_offset);

       VkOffset2D min = {
          scissor.offset.x / frag_area.width + offset.x,
          scissor.offset.y / frag_area.width + offset.y,
       };
@@ -2731,12 +2739,12 @@ fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
        */
       uint32_t scaled_width = bin.extent.width / frag_area.width;
       uint32_t scaled_height = bin.extent.height / frag_area.height;
-      vp.scissors[i].offset.x = MAX2(min.x, bin.offset.x);
-      vp.scissors[i].offset.y = MAX2(min.y, bin.offset.y);
+      vp.scissors[i].offset.x = MAX2(min.x, common_bin_offset.x);
+      vp.scissors[i].offset.y = MAX2(min.y, common_bin_offset.y);
       vp.scissors[i].extent.width =
-         MIN2(max.x, bin.offset.x + scaled_width) - vp.scissors[i].offset.x;
+         MIN2(max.x, common_bin_offset.x + scaled_width) - vp.scissors[i].offset.x;
       vp.scissors[i].extent.height =
-         MIN2(max.y, bin.offset.y + scaled_height) - vp.scissors[i].offset.y;
+         MIN2(max.y, common_bin_offset.y + scaled_height) - vp.scissors[i].offset.y;
    }

    TU_CALLX(cs->device, tu6_emit_scissor)(cs, &vp);
diff --git a/src/freedreno/vulkan/tu_pipeline.h b/src/freedreno/vulkan/tu_pipeline.h
index f16fed7d8c1..547c14756c6 100644
--- a/src/freedreno/vulkan/tu_pipeline.h
+++ b/src/freedreno/vulkan/tu_pipeline.h
@@ -243,7 +243,8 @@ TU_DECL_PIPELINE_DOWNCAST(graphics, TU_PIPELINE_GRAPHICS)
 TU_DECL_PIPELINE_DOWNCAST(graphics_lib, TU_PIPELINE_GRAPHICS_LIB)
 TU_DECL_PIPELINE_DOWNCAST(compute, TU_PIPELINE_COMPUTE)

-VkOffset2D tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin);
+VkOffset2D tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin,
+                                 VkOffset2D common_bin_offset);

 template <chip CHIP>
 uint32_t tu_emit_draw_state(struct tu_cmd_buffer *cmd);
diff --git a/src/freedreno/vulkan/tu_util.cc b/src/freedreno/vulkan/tu_util.cc
index cf4b88772f1..36e6c6d561a 100644
--- a/src/freedreno/vulkan/tu_util.cc
+++ b/src/freedreno/vulkan/tu_util.cc
@@ -49,6 +49,7 @@ static const struct debug_control tu_debug_options[] = {
    { "dumpas", TU_DEBUG_DUMPAS },
    { "nobinmerging", TU_DEBUG_NO_BIN_MERGING },
    { "perfcraw", TU_DEBUG_PERFCRAW },
+   { "fdmoffset", TU_DEBUG_FDM_OFFSET },
    { NULL, 0 }
 };

@@ -454,6 +455,16 @@ tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
       tu_tiling_config_update_pipe_layout(vsc, device, pass->has_fdm);
       tu_tiling_config_update_pipes(vsc, device);
       tu_tiling_config_update_binning(vsc, device);
+
+      if (pass->has_fdm) {
+         struct tu_vsc_config *fdm_offset_vsc = &tiling->fdm_offset_vsc;
+         fdm_offset_vsc->tile_count = (VkExtent2D) {
+            vsc->tile_count.width + 1, vsc->tile_count.height + 1
+         };
+         tu_tiling_config_update_pipe_layout(fdm_offset_vsc, device, true);
+         tu_tiling_config_update_pipes(fdm_offset_vsc, device);
+         tu_tiling_config_update_binning(fdm_offset_vsc, device);
+      }
    }
 }

diff --git a/src/freedreno/vulkan/tu_util.h b/src/freedreno/vulkan/tu_util.h
index 173c8ace984..5ebdcd26a9d 100644
--- a/src/freedreno/vulkan/tu_util.h
+++ b/src/freedreno/vulkan/tu_util.h
@@ -69,6 +69,7 @@ enum tu_debug_flags : uint64_t
    TU_DEBUG_DUMPAS = BITFIELD64_BIT(28),
    TU_DEBUG_NO_BIN_MERGING = BITFIELD64_BIT(29),
    TU_DEBUG_PERFCRAW = BITFIELD64_BIT(30),
+   TU_DEBUG_FDM_OFFSET = BITFIELD64_BIT(31),
 };

 struct tu_env {
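The fdm_offset_vsc configuration in tu_util.cc above sizes the bin grid one tile larger in each dimension: once bins slide left by up to nearly a full tile, the same framebuffer can straddle one extra tile column and row. A quick sanity check of that arithmetic (illustrative numbers, not from the driver):

```c
#include <assert.h>
#include <stdint.h>

/* Ceil-divide, the usual way a framebuffer is split into tiles. */
static uint32_t
div_round_up(uint32_t a, uint32_t b)
{
   return (a + b - 1) / b;
}

int
main(void)
{
   /* Assumed numbers: 1920-wide framebuffer, 256-wide tiles. */
   uint32_t fb_width = 1920, tile_w = 256;

   uint32_t tiles = div_round_up(fb_width, tile_w);                  /* 8 */

   /* With a non-zero offset the bins can slide left by up to tile_w - 1,
    * so the framebuffer can cover one more tile column. */
   uint32_t tiles_with_offset = div_round_up(fb_width + tile_w - 1, tile_w);

   assert(tiles == 8);
   assert(tiles_with_offset == tiles + 1);                           /* 9 */
   return 0;
}
```

For testing without an offset-aware application, the patch also wires the new debug flag together with the existing one: setting both FDM debug flags (TU_DEBUG(FDM) and TU_DEBUG(FDM_OFFSET)) forces a { 64, 64 } offset for every view at the end of the render pass.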